feat: Introduce switch for different financial extraction routines

2025-07-29 19:11:03 +02:00 · 2023-11-10 12:31:27 +01:00
parent 9edf5b1dce
commit e1b8397f9e
1 changed file with 23 additions and 11 deletions
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
@@ -16,12 +16,15 @@ pd.options.mode.chained_assignment = None  # type: ignore
 class Bundesanzeiger:
    """Bundesanzeiger wrapper to export relevant information."""

-    def get_information(self, company_name: str, city: str | None) -> pd.DataFrame:
+    def get_information(
+        self, company_name: str, city: str | None, finance_from_tables: bool = False
+    ) -> pd.DataFrame:
        """Extract relevant information from all found yearly results for the given company.

        Args:
            company_name (str): Name of the company to search for
            city (Optional[str]): City where the company is registered
+            finance_from_tables (bool, optional): If True, financial information is extracted from tables. If False, financial information will be extracted from text via RegEx. Defaults to False.

        Returns:
            pd.DataFrame: Result
@@ -51,9 +54,14 @@ class Bundesanzeiger:
        df_data["auditors"] = audits

        # Add Financial information
-        df_data["financial_results"] = df_data.raw_report.apply(
-            self.parse_tables_to_kpis
-        )
+        if finance_from_tables is True:
+            df_data["financial_results"] = df_data.raw_report.apply(
+                self.parse_tables_to_kpis
+            )
+        else:
+            df_data["financial_results"] = df_data.raw_report.apply(
+                self.extract_financial_results
+            )

        # Remove irrelevant columns
        return df_data
@@ -262,27 +270,31 @@ class Bundesanzeiger:
                                )
                                break
                else:
-                    for x, factor in converter.items():
+                    for x, factor in converter.items():  # noqa: PLW2901
                        parts = str(column).split(" ")
                        for y in parts:
                            if re.match(x, y):
                                table[column] = table[column].apply(
                                    lambda x, factor=factor: apply_factor(x, factor)
                                )
-                                table = table.rename({column: parts[0]}, axis=1)
+                                table = table.rename(  # noqa: PLW2901
+                                    {column: parts[0]}, axis=1
+                                )
                                break

-            table = table.dropna(axis=0, how="all")
-            table = table.dropna(axis=1, how="all")
+            table = table.dropna(axis=0, how="all")  # noqa: PLW2901
+            table = table.dropna(axis=1, how="all")  # noqa: PLW2901

            columns_to_prune = []
            for column_index, column_type in enumerate(table.dtypes[1:]):
                if column_type in ["object", "str"]:
                    columns_to_prune.append(column_index + 1)

-            table = table.drop(table.columns[columns_to_prune], axis="columns")
-            table = table.replace(to_replace="None", value=np.nan)
-            table = table.dropna()
+            table = table.drop(  # noqa: PLW2901
+                table.columns[columns_to_prune], axis="columns"
+            )
+            table = table.replace(to_replace="None", value=np.nan)  # noqa: PLW2901
+            table = table.dropna()  # noqa: PLW2901
            if len(table.columns) <= 1:
                continue