feat(data-extraction): Minimal Financial data fetch

2025-07-13 12:00:08 +02:00 · 2023-08-18 16:44:49 +02:00
parent 1e15656028
commit e8e354932c
1 changed file with 109 additions and 7 deletions
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
@ -24,21 +24,43 @@ class Bundesanzeiger:
        Returns:
            pd.DataFrame: Result
        """
+        # Get Bundesanzeiger entries for company
        reports = self.__ba.get_reports(company_name)
+        # Transform to list of data
        report_contents = []
        for key in reports:
            report_contents.append(reports[key])

+        # Transform to DataFrame and filter out irrelevant entries
        df_data = pd.DataFrame(report_contents)
-        df_data["type"] = df_data.name.apply(lambda name: name.split(" ")[0])
-        df_data = df_data.loc[df_data.type == "Jahresabschluss"]
-        df_data["jahr"] = df_data.name.apply(
+        df_data = self.filter_reports(df_data)
+
+        # Add Auditor information
+        df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors)
+
+        # Add Financial information
+        df_data["financial_results"] = df_data.raw_report.apply(
+            self.extract_financial_results
+        )
+
+        # Remove irrelevant columns
+        return df_data.drop(["raw_report"], axis=1)
+
+    def filter_reports(self, df_reports: pd.DataFrame) -> pd.DataFrame:
+        """Keep only reports of type `Jahresabschluss` and extract their year.
+
+        The report type is the first space-separated word of the `name` column;
+        the year (`jahr`) is whatever follows the last `.` in the last word of
+        `name`. The helper columns `name`, `report` and `type` are dropped from
+        the result.
+
+        Note: a `type` column is written into the passed-in DataFrame as a side
+        effect; the filtered result itself is an independent copy.
+
+        Args:
+            df_reports (pd.DataFrame): DataFrame containing list of reports,
+                with at least the columns `name` and `report`
+
+        Returns:
+            pd.DataFrame: Filtered and pruned DataFrame
+        """
+        df_reports["type"] = df_reports.name.apply(lambda name: name.split(" ")[0])
+        # Explicit copy: adding a column to a bare `.loc` slice is chained
+        # assignment and raises SettingWithCopyWarning on pandas < 3.
+        df_reports = df_reports.loc[df_reports.type == "Jahresabschluss"].copy()
+        df_reports["jahr"] = df_reports.name.apply(
            lambda name: name.split(" ")[-1].split(".")[-1]
        )
-        df_data = df_data.drop(["name", "report", "type"], axis=1)
-
-        df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors)
-        return df_data
+        return df_reports.drop(["name", "report", "type"], axis=1)

    def extract_auditor_company(self, report: str) -> str | None:
        """Extract the name of an auditor company from the given yearly results report.
@ -73,3 +95,83 @@ class Bundesanzeiger:
            Auditor(hit.replace(", Wirtschaftsprüfer", "").lstrip(), auditor_company)
            for hit in hits
        ]
+
+    def __extract_kpis__(self, report: str) -> dict:
+        """Extract Key Performance Indicators (KPIs) from a financial report.
+
+        Adapted from https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd.
+
+        For every KPI the first place in the text where one of its labels
+        (matched case-insensitively) is followed by a number is located and that
+        number is parsed. Numbers are read in German notation: `.` is treated as
+        thousands separator and `,` as decimal separator. A directly attached
+        `m` or `b` suffix scales the value by 10^6 or 10^9. KPIs without a hit
+        or without a parsable number are left out of the result.
+
+        Args:
+            report (str): The yearly report as a parsed string
+
+        Returns:
+            dict: Mapping of KPI name (e.g. `revenue`) to its value as float.
+        """
+        # Shared tail of every pattern: optional colon/whitespace, then the
+        # number with an optional m(illion)/b(illion) suffix.
+        number_pattern = r"[:\s]*([\d,.]+[mb]?)"
+        multipliers = {"m": 1_000_000, "b": 1_000_000_000}
+
+        # Label alternatives (regex) to search for, per KPI
+        kpi_labels = {
+            "revenue": r"revenue|umsatz|erlöse",
+            "net_income": r"net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern",
+            "ebit": r"ebit|operating income",
+            "ebitda": r"ebitda",
+            "gross_profit": r"gross profit|bruttogewinn",
+            "operating_profit": r"operating profit|betriebsgewinn",
+            "assets": r"total assets|bilanzsumme",
+            "liabilities": r"total liabilities|gesamtverbindlichkeiten",
+            "equity": r"shareholders'? equity|eigenkapital",
+            "current_assets": r"current assets|umlaufvermögen",
+            "current_liabilities": r"current liabilities|kurzfristige verbindlichkeiten",
+            "long_term_debt": r"long[-\s]?term debt|langfristige verbindlichkeiten",
+            "short_term_debt": r"short[-\s]?term debt|kurzfristige verbindlichkeiten",
+            "cash_and_cash_equivalents": r"cash (?:and cash equivalents)?|barmittel",
+            "dividends": r"dividends?|dividende",
+            "cash_flow": r"cash flow|cashflow|cash flow from operating activities",
+        }
+
+        kpis = {}
+        for kpi, labels in kpi_labels.items():
+            match = re.search(
+                f"(?:{labels}){number_pattern}", report, flags=re.IGNORECASE
+            )
+            if not match:
+                continue
+            value = match.group(1)
+
+            # Split off the magnitude suffix before touching the separators
+            multiplier = multipliers.get(value[-1].lower(), 1)
+            if multiplier != 1:
+                value = value[:-1]
+
+            # The capture may end in sentence punctuation ("1.234,56,"); drop
+            # it, then convert German notation to what float() understands.
+            value = value.rstrip(".,").replace(".", "").replace(",", ".")
+            try:
+                kpis[kpi] = float(value) * multiplier
+            except ValueError:
+                # Nothing numeric left (e.g. only punctuation was captured)
+                continue
+        return kpis
+
+    def extract_financial_results(self, report: str) -> dict:
+        """Pull the financial KPIs out of a raw (HTML) report.
+
+        The markup is stripped with BeautifulSoup, line breaks are replaced by
+        spaces and the resulting text is handed to the KPI extraction.
+
+        Args:
+            report (str): Report to be analyzed
+
+        Returns:
+            dict: Extracted KPIs
+        """
+        soup = BeautifulSoup(report, features="html.parser")
+        flat_text = soup.get_text().replace("\n", " ")
+        return self.__extract_kpis__(flat_text)
+
+
+if __name__ == "__main__":
+    # Manual run against a sample company; the result is not used further.
+    Bundesanzeiger().get_information("Atos IT-Dienstleistung und Beratung GmbH")