From e1b8397f9ed2711b3d9b26dd98a16b79784e30e9 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Fri, 10 Nov 2023 12:31:27 +0100 Subject: [PATCH] feat: Introduce switch for different financial extraction routines --- .../utils/data_extraction/bundesanzeiger.py | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py index 3d80abe..b0e22c4 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py @@ -16,12 +16,15 @@ pd.options.mode.chained_assignment = None # type: ignore class Bundesanzeiger: """Bundesanzeiger wrapper to export relevant information.""" - def get_information(self, company_name: str, city: str | None) -> pd.DataFrame: + def get_information( + self, company_name: str, city: str | None, finance_from_tables: bool = False + ) -> pd.DataFrame: """Extract relevant information from all found yearly results for the given company. Args: company_name (str): Name of the company to search for city (Optional[str]): City where the company is registered + finance_from_tables (bool, optional): If True, financial information is extracted from tables. If False, financial information will be extracted from text via RegEx. Defaults to False. Returns: pd.DataFrame: Result @@ -51,9 +54,14 @@ class Bundesanzeiger: df_data["auditors"] = audits # Add Financial information - df_data["financial_results"] = df_data.raw_report.apply( - self.parse_tables_to_kpis - ) + if finance_from_tables is True: + df_data["financial_results"] = df_data.raw_report.apply( + self.parse_tables_to_kpis + ) + else: + df_data["financial_results"] = df_data.raw_report.apply( + self.extract_financial_results + ) # Remove irrelevant columns return df_data @@ -262,27 +270,31 @@ class Bundesanzeiger: ) break else: - for x, factor in converter.items(): + for x, factor in converter.items(): # noqa: PLW2901 parts = str(column).split(" ") for y in parts: if re.match(x, y): table[column] = table[column].apply( lambda x, factor=factor: apply_factor(x, factor) ) - table = table.rename({column: parts[0]}, axis=1) + table = table.rename( # noqa: PLW2901 + {column: parts[0]}, axis=1 + ) break - table = table.dropna(axis=0, how="all") - table = table.dropna(axis=1, how="all") + table = table.dropna(axis=0, how="all") # noqa: PLW2901 + table = table.dropna(axis=1, how="all") # noqa: PLW2901 columns_to_prune = [] for column_index, column_type in enumerate(table.dtypes[1:]): if column_type in ["object", "str"]: columns_to_prune.append(column_index + 1) - table = table.drop(table.columns[columns_to_prune], axis="columns") - table = table.replace(to_replace="None", value=np.nan) - table = table.dropna() + table = table.drop( # noqa: PLW2901 + table.columns[columns_to_prune], axis="columns" + ) + table = table.replace(to_replace="None", value=np.nan) # noqa: PLW2901 + table = table.dropna() # noqa: PLW2901 if len(table.columns) <= 1: continue