From e8e354932ca14783eebf09fbb31791cbe188aec5 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Fri, 18 Aug 2023 16:44:49 +0200 Subject: [PATCH] feat(data-extraction): Minimal Financial data fetch --- .../utils/data_extraction/bundesanzeiger.py | 116 ++++++++++++++++-- 1 file changed, 109 insertions(+), 7 deletions(-) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py index c102062..bb756ab 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py @@ -24,21 +24,43 @@ class Bundesanzeiger: Returns: pd.DataFrame: Result """ + # Get Bundesanzeiger entries for company reports = self.__ba.get_reports(company_name) + # Transform to list of data report_contents = [] for key in reports: report_contents.append(reports[key]) + # Transform to DataFrame and filter out irrelevant entries df_data = pd.DataFrame(report_contents) - df_data["type"] = df_data.name.apply(lambda name: name.split(" ")[0]) - df_data = df_data.loc[df_data.type == "Jahresabschluss"] - df_data["jahr"] = df_data.name.apply( + df_data = self.filter_reports(df_data) + + # Add Auditor information + df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors) + + # Add Financial information + df_data["financial_results"] = df_data.raw_report.apply( + self.extract_financial_results + ) + + # Remove irrelevant columns + return df_data.drop(["raw_report"], axis=1) + + def filter_reports(self, df_reports: pd.DataFrame) -> pd.DataFrame: + """Returns only reports of type `Jahresabschluss` and extracts the year of the report. + + Args: + df_reports (pd.DataFrame): DataFrame containing list of reports + + Returns: + pd.DataFrame: Filtered and pruned DataFrame + """ + df_reports["type"] = df_reports.name.apply(lambda name: name.split(" ")[0]) + df_reports = df_reports.loc[df_reports.type == "Jahresabschluss"] + df_reports["jahr"] = df_reports.name.apply( lambda name: name.split(" ")[-1].split(".")[-1] ) - df_data = df_data.drop(["name", "report", "type"], axis=1) - - df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors) - return df_data + return df_reports.drop(["name", "report", "type"], axis=1) def extract_auditor_company(self, report: str) -> str | None: """Extract the name of an auditor company from the given yearly results report. @@ -73,3 +95,83 @@ class Bundesanzeiger: Auditor(hit.replace(", Wirtschaftsprüfer", "").lstrip(), auditor_company) for hit in hits ] + + def __extract_kpis__(self, report: str) -> dict: + """Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd. + + Extracts Key Performance Indicators (KPIs) from the financial reports. + + Args: + report (str): The yearly report as a parsed string + + Returns: + dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values. + """ + kpis = {} + + # Define KPI patterns to search for + kpi_patterns = { + "revenue": r"(?:revenue|umsatz|erlöse)[:\s]*([\d,.]+[mmb]?)", + "net_income": r"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\s]*([\d,.]+[mmb]?)", + "ebit": r"(?:ebit|operating income)[:\s]*([\d,.]+[mmb]?)", + "ebitda": r"(?:ebitda)[:\s]*([\d,.]+[mmb]?)", + "gross_profit": r"(?:gross profit|bruttogewinn)[:\s]*([\d,.]+[mmb]?)", + "operating_profit": r"(?:operating profit|betriebsgewinn)[:\s]*([\d,.]+[mmb]?)", + "assets": r"(?:total assets|bilanzsumme)[:\s]*([\d,.]+[mmb]?)", + "liabilities": r"(?:total liabilities|gesamtverbindlichkeiten)[:\s]*([\d,.]+[mmb]?)", + "equity": r"(?:shareholders'? equity|eigenkapital)[:\s]*([\d,.]+[mmb]?)", + "current_assets": r"(?:current assets|umlaufvermögen)[:\s]*([\d,.]+[mmb]?)", + "current_liabilities": r"(?:current liabilities|kurzfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)", + "long_term_debt": r"(?:long[-\s]?term debt|langfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)", + "short_term_debt": r"(?:short[-\s]?term debt|kurzfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)", + "cash_and_cash_equivalents": r"(?:cash (?:and cash equivalents)?|barmittel)[:\s]*([\d,.]+[mmb]?)", + "dividends": r"(?:dividends?|dividende)[:\s]*([\d,.]+[mmb]?)", + "cash_flow": r"(?:cash flow|cashflow|cash flow from operating activities)[:\s]*([\d,.]+[mmb]?)", + } + + for kpi, pattern in kpi_patterns.items(): + match = re.search(pattern, report, flags=re.IGNORECASE | re.UNICODE) + if match: + value = match.group(1) + + # Clean and validate the extracted number + try: + if not value: # Check if value is empty + cleaned_value = None + else: + multiplier = 1 + if value[-1].lower() == "m": + value = value[:-1] + multiplier = 1_000_000 + elif value[-1].lower() == "b": + value = value[:-1] + multiplier = 1_000_000_000 + + # Remove commas after checking for multipliers + value = value.replace(".", "").replace(",", ".").strip() + cleaned_value = float(value) * multiplier + except ValueError: + cleaned_value = None + + if cleaned_value is not None: + kpis[kpi] = cleaned_value + return kpis + + def extract_financial_results(self, report: str) -> dict: + """Extract financial data from given report. + + Args: + report (str): Report to be analyzed + + Returns: + dict: Results + """ + report_parsed = ( + BeautifulSoup(report, features="html.parser").get_text().replace("\n", " ") + ) + return self.__extract_kpis__(report_parsed) + + +if __name__ == "__main__": + ba_wrapper = Bundesanzeiger() + ba_wrapper.get_information("Atos IT-Dienstleistung und Beratung GmbH")