From e1b8397f9ed2711b3d9b26dd98a16b79784e30e9 Mon Sep 17 00:00:00 2001
From: TrisNol <tristan.nolde@yahoo.de>
Date: Fri, 10 Nov 2023 12:31:27 +0100
Subject: [PATCH] feat: Introduce switch for different financial extraction
 routines

---
 .../utils/data_extraction/bundesanzeiger.py   | 34 +++++++++++++------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
index 3d80abe..b0e22c4 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
@@ -16,12 +16,15 @@ pd.options.mode.chained_assignment = None  # type: ignore
 class Bundesanzeiger:
     """Bundesanzeiger wrapper to export relevant information."""
 
-    def get_information(self, company_name: str, city: str | None) -> pd.DataFrame:
+    def get_information(
+        self, company_name: str, city: str | None, finance_from_tables: bool = False
+    ) -> pd.DataFrame:
         """Extract relevant information from all found yearly results for the given company.
 
         Args:
             company_name (str): Name of the company to search for
             city (Optional[str]): City where the company is registered
+            finance_from_tables (bool, optional): If True, financial information is extracted from tables; if False, it is extracted from the report text via RegEx. Defaults to False.
 
         Returns:
             pd.DataFrame: Result
@@ -51,9 +54,14 @@ class Bundesanzeiger:
         df_data["auditors"] = audits
 
         # Add Financial information
-        df_data["financial_results"] = df_data.raw_report.apply(
-            self.parse_tables_to_kpis
-        )
+        if finance_from_tables is True:
+            df_data["financial_results"] = df_data.raw_report.apply(
+                self.parse_tables_to_kpis
+            )
+        else:
+            df_data["financial_results"] = df_data.raw_report.apply(
+                self.extract_financial_results
+            )
 
         # Remove irrelevant columns
         return df_data
@@ -262,27 +270,31 @@ class Bundesanzeiger:
                                 )
                                 break
                 else:
-                    for x, factor in converter.items():
+                    for x, factor in converter.items():  # noqa: PLW2901
                         parts = str(column).split(" ")
                         for y in parts:
                             if re.match(x, y):
                                 table[column] = table[column].apply(
                                     lambda x, factor=factor: apply_factor(x, factor)
                                 )
-                                table = table.rename({column: parts[0]}, axis=1)
+                                table = table.rename(  # noqa: PLW2901
+                                    {column: parts[0]}, axis=1
+                                )
                                 break
 
-            table = table.dropna(axis=0, how="all")
-            table = table.dropna(axis=1, how="all")
+            table = table.dropna(axis=0, how="all")  # noqa: PLW2901
+            table = table.dropna(axis=1, how="all")  # noqa: PLW2901
 
             columns_to_prune = []
             for column_index, column_type in enumerate(table.dtypes[1:]):
                 if column_type in ["object", "str"]:
                     columns_to_prune.append(column_index + 1)
 
-            table = table.drop(table.columns[columns_to_prune], axis="columns")
-            table = table.replace(to_replace="None", value=np.nan)
-            table = table.dropna()
+            table = table.drop(  # noqa: PLW2901
+                table.columns[columns_to_prune], axis="columns"
+            )
+            table = table.replace(to_replace="None", value=np.nan)  # noqa: PLW2901
+            table = table.dropna()  # noqa: PLW2901
             if len(table.columns) <= 1:
                 continue