feat: Introduce switch for different financial extraction routines

This commit is contained in:
TrisNol 2023-11-10 12:31:27 +01:00
parent 9edf5b1dce
commit e1b8397f9e

View File

@ -16,12 +16,15 @@ pd.options.mode.chained_assignment = None # type: ignore
class Bundesanzeiger:
"""Bundesanzeiger wrapper to export relevant information."""
def get_information(self, company_name: str, city: str | None) -> pd.DataFrame:
def get_information(
self, company_name: str, city: str | None, finance_from_tables: bool = False
) -> pd.DataFrame:
"""Extract relevant information from all found yearly results for the given company.
Args:
company_name (str): Name of the company to search for
city (str | None): City where the company is registered
finance_from_tables (bool, optional): If True, financial information is extracted from the report's tables. If False, it is extracted from the report text via regular expressions. Defaults to False.
Returns:
pd.DataFrame: Result
@ -51,9 +54,14 @@ class Bundesanzeiger:
df_data["auditors"] = audits
# Add Financial information
df_data["financial_results"] = df_data.raw_report.apply(
self.parse_tables_to_kpis
)
if finance_from_tables is True:
df_data["financial_results"] = df_data.raw_report.apply(
self.parse_tables_to_kpis
)
else:
df_data["financial_results"] = df_data.raw_report.apply(
self.extract_financial_results
)
# Remove irrelevant columns
return df_data
@ -262,27 +270,31 @@ class Bundesanzeiger:
)
break
else:
for x, factor in converter.items():
for x, factor in converter.items(): # noqa: PLW2901
parts = str(column).split(" ")
for y in parts:
if re.match(x, y):
table[column] = table[column].apply(
lambda x, factor=factor: apply_factor(x, factor)
)
table = table.rename({column: parts[0]}, axis=1)
table = table.rename( # noqa: PLW2901
{column: parts[0]}, axis=1
)
break
table = table.dropna(axis=0, how="all")
table = table.dropna(axis=1, how="all")
table = table.dropna(axis=0, how="all") # noqa: PLW2901
table = table.dropna(axis=1, how="all") # noqa: PLW2901
columns_to_prune = []
for column_index, column_type in enumerate(table.dtypes[1:]):
if column_type in ["object", "str"]:
columns_to_prune.append(column_index + 1)
table = table.drop(table.columns[columns_to_prune], axis="columns")
table = table.replace(to_replace="None", value=np.nan)
table = table.dropna()
table = table.drop( # noqa: PLW2901
table.columns[columns_to_prune], axis="columns"
)
table = table.replace(to_replace="None", value=np.nan) # noqa: PLW2901
table = table.dropna() # noqa: PLW2901
if len(table.columns) <= 1:
continue