Introduce extended_financial_data code (#357)

Introduce the previously developed method for fetching financial data via table parsing (the "data-lake-like" solution). The change is non-destructive: the existing RegEx-based extraction remains the default, and table parsing is only used when explicitly requested.
2025-08-23 15:10:04 +02:00 · 2023-11-11 14:10:20 +01:00
parent e5b61bc19c b0bcdc6fe1
commit a6d486209a
5 changed files with 2154 additions and 364 deletions
--- a/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb
+++ b/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb
--- a/poetry.lock
+++ b/poetry.lock
@@ -7361,11 +7361,11 @@ test = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]

 [extras]
-ingest = ["deutschland", "selenium", "xmltodict"]
+ingest = ["deutschland", "html5lib", "selenium", "xmltodict"]
 transformation = ["spacy", "spacy-sentiws", "torch", "torchaudio", "torchvision", "transformers"]
 web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "networkx", "seaborn"]

 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.11,<3.13"
-content-hash = "0fb643247c09a91aeef5aae1286426f9296688dadff30f5a4a6085c3abe5399e"
+content-hash = "5ca44ede811dc417faeda6b976c032682be7b4edadc16fc6c81e2ffe3dc4f946"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -81,9 +81,10 @@ torchvision = {version = "*", source = "torch-cpu"}
 tqdm = "^4.66.1"
 transformers = {version = "*", extras = ["torch"]}
 xmltodict = "^0.13.0"
+html5lib = "^1.1"

 [tool.poetry.extras]
-ingest = ["selenium", "deutschland", "xmltodict"]
+ingest = ["selenium", "deutschland", "xmltodict", "html5lib"]
 transformation = ["torch", "torchaudio", "torchvision", "transformers", "spacy-sentiws", "spacy"]
 web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn", "networkx"]

--- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
@@ -1,6 +1,8 @@
 """Fetch data from Bundesanzeiger."""
 import re
+from io import StringIO

+import numpy as np
 import pandas as pd
 from bs4 import BeautifulSoup
 from deutschland.bundesanzeiger import Bundesanzeiger as Ba
@@ -14,12 +16,15 @@ pd.options.mode.chained_assignment = None  # type: ignore
 class Bundesanzeiger:
    """Bundesanzeiger wrapper to export relevant information."""

-    def get_information(self, company_name: str, city: str | None) -> pd.DataFrame:
+    def get_information(
+        self, company_name: str, city: str | None, finance_from_tables: bool = False
+    ) -> pd.DataFrame:
        """Extract relevant information from all found yearly results for the given company.

        Args:
            company_name (str): Name of the company to search for
            city (Optional[str]): City where the company is registered
+            finance_from_tables (bool, optional): If True, financial information is extracted from tables. If False, financial information will be extracted from text via RegEx. Defaults to False.

        Returns:
            pd.DataFrame: Result
@@ -49,12 +54,17 @@ class Bundesanzeiger:
        df_data["auditors"] = audits

        # Add Financial information
-        df_data["financial_results"] = df_data.raw_report.apply(
-            self.extract_financial_results
-        )
+        if finance_from_tables is True:
+            df_data["financial_results"] = df_data.raw_report.apply(
+                self.parse_tables_to_kpis
+            )
+        else:
+            df_data["financial_results"] = df_data.raw_report.apply(
+                self.extract_financial_results
+            )

        # Remove irrelevant columns
-        return df_data.drop(["raw_report"], axis=1)
+        return df_data

    @staticmethod
    def filter_reports(df_reports: pd.DataFrame) -> pd.DataFrame:
@@ -189,6 +199,114 @@ class Bundesanzeiger:
        )
        return self.__extract_kpis__(report_parsed)

+    def __extract_tables_from_report__(self, report: str) -> list[pd.DataFrame]:
+        result = []
+        soup = BeautifulSoup(report, features="html.parser")
+        for table in soup.find_all("table", {"class": "std_table"}):
+            try:
+                results = pd.read_html(
+                    StringIO(str(table)), flavor="bs4", thousands=".", decimal=","
+                )
+                if len(results) > 0:
+                    data_frame = results[0]
+                    result.append(data_frame)
+            # ruff: noqa: S112
+            except Exception:
+                continue
+        return result
+
+    # ruff: noqa: PLR0912
+    def parse_tables_to_kpis(self, report: str) -> dict[str, float]:
+        """Extract KPIs from tables included in a report.
+
+        Args:
+            report (str): Raw report
+
+        Returns:
+            dict: Extracted KPIs
+        """
+        kpis = {}
+        tables = self.__extract_tables_from_report__(report)
+        for table in tables:
+
+            def cleanse_string(value: str) -> str | None:
+                if value is not None and isinstance(value, str):
+                    return re.sub(r"(.+\.).", "", value)
+                return None
+
+            def parse_string_to_float(value: str | float) -> float | None:
+                if value is None:
+                    return None
+                try:
+                    return float(value)
+                except Exception:
+                    return None
+
+            def apply_factor(value: str, factor: float) -> float | None:
+                transformed_value = parse_string_to_float(value)
+                if transformed_value is None or isinstance(transformed_value, str):
+                    return None
+                return transformed_value * factor
+
+            table[table.columns[0]] = table[table.columns[0]].apply(cleanse_string)  # type: ignore
+
+            converter = {
+                "Mio€": 1 * 10**6,
+                "Mio": 1 * 10**6,
+                "T€": 1 * 10**3,
+                "TEUR": 1 * 10**3,
+                "EUR": 1,
+                "€": 1,
+            }
+
+            for column in table.columns[1:]:
+                if isinstance(column, tuple):
+                    for c in column:
+                        for x, factor in converter.items():
+                            if x in c:
+                                table[column] = table[column].apply(
+                                    lambda x, factor=factor: apply_factor(x, factor)
+                                )
+                                break
+                else:
+                    for x, factor in converter.items():  # noqa: PLW2901
+                        parts = str(column).split(" ")
+                        for y in parts:
+                            if re.match(x, y):
+                                table[column] = table[column].apply(
+                                    lambda x, factor=factor: apply_factor(x, factor)
+                                )
+                                table = table.rename(  # noqa: PLW2901
+                                    {column: parts[0]}, axis=1
+                                )
+                                break
+
+            table = table.dropna(axis=0, how="all")  # noqa: PLW2901
+            table = table.dropna(axis=1, how="all")  # noqa: PLW2901
+
+            columns_to_prune = []
+            for column_index, column_type in enumerate(table.dtypes[1:]):
+                if column_type in ["object", "str"]:
+                    columns_to_prune.append(column_index + 1)
+
+            table = table.drop(  # noqa: PLW2901
+                table.columns[columns_to_prune], axis="columns"
+            )
+            table = table.replace(to_replace="None", value=np.nan)  # noqa: PLW2901
+            table = table.dropna()  # noqa: PLW2901
+            if len(table.columns) <= 1:
+                continue
+
+            exps = [r"^[0-9a-zA-Z]+[\.\)] ", r"[\+\=\-\_]"]
+            for _index, row in table.iterrows():
+                name_cleansed = row.iloc[0]
+                if not isinstance(name_cleansed, str):
+                    continue
+                for exp in exps:
+                    name_cleansed = re.sub(exp, "", name_cleansed.strip())
+                kpis[name_cleansed] = row.iloc[1]
+        return kpis
+

 if __name__ == "__main__":
    ba_wrapper = Bundesanzeiger()
--- a/tests/utils/data_extraction/bundesanzeiger_test.py
+++ b/tests/utils/data_extraction/bundesanzeiger_test.py
@@ -136,3 +136,110 @@ def test_get_information_no_results(mock_bundesanzeiger: Mock) -> None:
    ba = Bundesanzeiger()
    result = ba.get_information("PRJ 23 Transparenzregister GmbH", "Iserlohn")
    assert len(result) == 0
+
+
+def test_extract_tables_from_reports() -> None:
+    report = """
+        <table>
+        </table>
+        <div>
+            Möge die Macht mir dir sein
+            <table class="std_table">
+                <tr>
+                    <th>Column A</th>
+                    <th>Column B</th>
+                </tr>
+                <tr>
+                    <td>42</td>
+                    <td>4711</td>
+                </tr>
+            </table>
+        </div>
+    """
+    ba = Bundesanzeiger()
+    result = ba.__extract_tables_from_report__(report)
+    assert len(result) == 1
+
+
+def test_parse_tables_to_kpis() -> None:
+    report = """
+        <table class="std_table">
+            <tr>
+                <th>Position</th>
+            </tr>
+            <tr>
+                <td>a) Umlaufvermögen</td>
+            </tr>
+        </table>
+        <table class="std_table">
+            <tr>
+                <th>Position</th>
+                <th>Test</th>
+            </tr>
+            <tr>
+                <td>4711</td>
+                <td>4711</td>
+            </tr>
+        </table>
+        <div>
+            Möge die Macht mir dir sein
+            <table class="std_table">
+                <tr>
+                    <th>Position</th>
+                    <th>2023 in T€</th>
+                    <th>1997 in €</th>
+                </tr>
+                <tr>
+                    <td>a) Umlaufvermögen</td>
+                    <td>12,13</td>
+                    <td>4711</td>
+                </tr>
+                <tr>
+                    <td>+EBIT</td>
+                    <td>1123</td>
+                    <td>4711</td>
+                </tr>
+                <tr>
+                    <td>To be ignored</td>
+                    <td>I've tried so hard and got so far, but in the end it doesn't even matter</td>
+                    <td>4711</td>
+                </tr>
+                <tr>
+                    <td>Gewinn</td>
+                    <td></td>
+                    <td>4711</td>
+                </tr>
+                <tr>
+                    <td>Jahresüberschuss</td>
+                    <td>4.130,12</td>
+                    <td>4711</td>
+                </tr>
+            </table>
+            <table class="std_table">
+            <thead>
+                <tr>
+                    <th>Position</th>
+                    <th>Betrag in</th>
+                </tr>
+                 <tr>
+                    <th>Hallo</th>
+                    <th>€</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>I. Schulden</td>
+                    <td>0,12</td>
+                </tr>
+            </tbody>
+        </table>
+        </div>
+    """
+    ba = Bundesanzeiger()
+    result = ba.parse_tables_to_kpis(report)
+    assert result == {
+        "Umlaufvermögen": 12130.0,
+        "EBIT": 1123000.0,
+        "Jahresüberschuss": 4130120.0,
+        "Schulden": 0.12,
+    }