feat: Introduce switch for different financial extraction routines

This commit is contained in:
TrisNol 2023-11-10 12:31:27 +01:00
parent 9edf5b1dce
commit e1b8397f9e

View File

@ -16,12 +16,15 @@ pd.options.mode.chained_assignment = None # type: ignore
class Bundesanzeiger: class Bundesanzeiger:
"""Bundesanzeiger wrapper to export relevant information.""" """Bundesanzeiger wrapper to export relevant information."""
def get_information(self, company_name: str, city: str | None) -> pd.DataFrame: def get_information(
self, company_name: str, city: str | None, finance_from_tables: bool = False
) -> pd.DataFrame:
"""Extract relevant information from all found yearly results for the given company. """Extract relevant information from all found yearly results for the given company.
Args: Args:
company_name (str): Name of the company to search for company_name (str): Name of the company to search for
city (Optional[str]): City where the company is registered city (Optional[str]): City where the company is registered
finance_from_tables (bool, optional): If True, financial information is extracted from tables; if False, it is extracted from the report text via RegEx. Defaults to False.
Returns: Returns:
pd.DataFrame: Result pd.DataFrame: Result
@ -51,9 +54,14 @@ class Bundesanzeiger:
df_data["auditors"] = audits df_data["auditors"] = audits
# Add Financial information # Add Financial information
df_data["financial_results"] = df_data.raw_report.apply( if finance_from_tables is True:
self.parse_tables_to_kpis df_data["financial_results"] = df_data.raw_report.apply(
) self.parse_tables_to_kpis
)
else:
df_data["financial_results"] = df_data.raw_report.apply(
self.extract_financial_results
)
# Remove irrelevant columns # Remove irrelevant columns
return df_data return df_data
@ -262,27 +270,31 @@ class Bundesanzeiger:
) )
break break
else: else:
for x, factor in converter.items(): for x, factor in converter.items(): # noqa: PLW2901
parts = str(column).split(" ") parts = str(column).split(" ")
for y in parts: for y in parts:
if re.match(x, y): if re.match(x, y):
table[column] = table[column].apply( table[column] = table[column].apply(
lambda x, factor=factor: apply_factor(x, factor) lambda x, factor=factor: apply_factor(x, factor)
) )
table = table.rename({column: parts[0]}, axis=1) table = table.rename( # noqa: PLW2901
{column: parts[0]}, axis=1
)
break break
table = table.dropna(axis=0, how="all") table = table.dropna(axis=0, how="all") # noqa: PLW2901
table = table.dropna(axis=1, how="all") table = table.dropna(axis=1, how="all") # noqa: PLW2901
columns_to_prune = [] columns_to_prune = []
for column_index, column_type in enumerate(table.dtypes[1:]): for column_index, column_type in enumerate(table.dtypes[1:]):
if column_type in ["object", "str"]: if column_type in ["object", "str"]:
columns_to_prune.append(column_index + 1) columns_to_prune.append(column_index + 1)
table = table.drop(table.columns[columns_to_prune], axis="columns") table = table.drop( # noqa: PLW2901
table = table.replace(to_replace="None", value=np.nan) table.columns[columns_to_prune], axis="columns"
table = table.dropna() )
table = table.replace(to_replace="None", value=np.nan) # noqa: PLW2901
table = table.dropna() # noqa: PLW2901
if len(table.columns) <= 1: if len(table.columns) <= 1:
continue continue