Introduce extended_financial_data code (#357)

Introduce the previously developed method of fetching financial data
via table parsing (a.k.a. the "data-lake-like solution"). The change is
non-destructive: the current RegEx-based behaviour remains the default.
This commit is contained in:
Tristan Nolde
2023-11-11 14:10:20 +01:00
committed by GitHub
5 changed files with 2154 additions and 364 deletions

File diff suppressed because it is too large Load Diff

4
poetry.lock generated
View File

@ -7361,11 +7361,11 @@ test = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[extras] [extras]
ingest = ["deutschland", "selenium", "xmltodict"] ingest = ["deutschland", "html5lib", "selenium", "xmltodict"]
transformation = ["spacy", "spacy-sentiws", "torch", "torchaudio", "torchvision", "transformers"] transformation = ["spacy", "spacy-sentiws", "torch", "torchaudio", "torchvision", "transformers"]
web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "networkx", "seaborn"] web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "networkx", "seaborn"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.11,<3.13" python-versions = ">=3.11,<3.13"
content-hash = "0fb643247c09a91aeef5aae1286426f9296688dadff30f5a4a6085c3abe5399e" content-hash = "5ca44ede811dc417faeda6b976c032682be7b4edadc16fc6c81e2ffe3dc4f946"

View File

@ -81,9 +81,10 @@ torchvision = {version = "*", source = "torch-cpu"}
tqdm = "^4.66.1" tqdm = "^4.66.1"
transformers = {version = "*", extras = ["torch"]} transformers = {version = "*", extras = ["torch"]}
xmltodict = "^0.13.0" xmltodict = "^0.13.0"
html5lib = "^1.1"
[tool.poetry.extras] [tool.poetry.extras]
ingest = ["selenium", "deutschland", "xmltodict"] ingest = ["selenium", "deutschland", "xmltodict", "html5lib"]
transformation = ["torch", "torchaudio", "torchvision", "transformers", "spacy-sentiws", "spacy"] transformation = ["torch", "torchaudio", "torchvision", "transformers", "spacy-sentiws", "spacy"]
web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn", "networkx"] web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn", "networkx"]

View File

@ -1,6 +1,8 @@
"""Fetch data from Bundesanzeiger.""" """Fetch data from Bundesanzeiger."""
import re import re
from io import StringIO
import numpy as np
import pandas as pd import pandas as pd
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from deutschland.bundesanzeiger import Bundesanzeiger as Ba from deutschland.bundesanzeiger import Bundesanzeiger as Ba
@ -14,12 +16,15 @@ pd.options.mode.chained_assignment = None # type: ignore
class Bundesanzeiger: class Bundesanzeiger:
"""Bundesanzeiger wrapper to export relevant information.""" """Bundesanzeiger wrapper to export relevant information."""
def get_information(self, company_name: str, city: str | None) -> pd.DataFrame: def get_information(
self, company_name: str, city: str | None, finance_from_tables: bool = False
) -> pd.DataFrame:
"""Extract relevant information from all found yearly results for the given company. """Extract relevant information from all found yearly results for the given company.
Args: Args:
company_name (str): Name of the company to search for company_name (str): Name of the company to search for
city (Optional[str]): City where the company is registered city (Optional[str]): City where the company is registered
finance_from_tables (bool, optional): If True, financial information is extracted from tables. If False, financial information will be extracted from text via RegEx. Defaults to False.
Returns: Returns:
pd.DataFrame: Result pd.DataFrame: Result
@ -49,12 +54,17 @@ class Bundesanzeiger:
df_data["auditors"] = audits df_data["auditors"] = audits
# Add Financial information # Add Financial information
df_data["financial_results"] = df_data.raw_report.apply( if finance_from_tables is True:
self.extract_financial_results df_data["financial_results"] = df_data.raw_report.apply(
) self.parse_tables_to_kpis
)
else:
df_data["financial_results"] = df_data.raw_report.apply(
self.extract_financial_results
)
# Remove irrelevant columns # Remove irrelevant columns
return df_data.drop(["raw_report"], axis=1) return df_data
@staticmethod @staticmethod
def filter_reports(df_reports: pd.DataFrame) -> pd.DataFrame: def filter_reports(df_reports: pd.DataFrame) -> pd.DataFrame:
@ -189,6 +199,114 @@ class Bundesanzeiger:
) )
return self.__extract_kpis__(report_parsed) return self.__extract_kpis__(report_parsed)
def __extract_tables_from_report__(self, report: str) -> list[pd.DataFrame]:
    """Collect the ``std_table`` HTML tables of a report as DataFrames.

    Args:
        report (str): Raw report markup.

    Returns:
        list[pd.DataFrame]: One DataFrame per ``<table class="std_table">``
            element that could be parsed, in document order. Tables that
            fail to parse are skipped.
    """
    frames: list[pd.DataFrame] = []
    soup = BeautifulSoup(report, features="html.parser")
    for table in soup.find_all("table", {"class": "std_table"}):
        try:
            # "." is read as the thousands separator and "," as the decimal mark.
            parsed = pd.read_html(
                StringIO(str(table)), flavor="bs4", thousands=".", decimal=","
            )
        except Exception:  # noqa: S112
            # Deliberate best effort: one unparsable table must not abort the
            # extraction of the remaining tables. The suppression is scoped to
            # this line; a `# ruff: noqa: ...` comment would apply file-wide.
            continue
        if parsed:
            frames.append(parsed[0])
    return frames
# ruff: noqa: PLR0912
def parse_tables_to_kpis(self, report: str) -> dict[str, float]:
"""Extract KPIs from tables included in a report.

The tables returned by ``__extract_tables_from_report__`` are processed one
by one. The first column of a table supplies the KPI name; the headers of
the remaining columns are searched for a unit marker (see ``converter``)
and the cell values are scaled by the matching factor.

Args:
report (str): Raw report
Returns:
dict: Extracted KPIs, mapping the cleansed KPI name to the value found in
the second column of the cleaned table. A name that occurs more than once
keeps the value assigned last.
"""
kpis = {}
tables = self.__extract_tables_from_report__(report)
for table in tables:
# Helper: for string input, remove every match of "(.+\.)." (text up to a
# dot plus the character following it); anything else becomes None.
def cleanse_string(value: str) -> str | None:
if value is not None and isinstance(value, str):
return re.sub(r"(.+\.).", "", value)
return None
# Helper: float(value), or None when the value is None or not convertible.
def parse_string_to_float(value: str | float) -> float | None:
if value is None:
return None
try:
return float(value)
except Exception:
return None
# Helper: convert the value to float and scale it; None if not convertible.
def apply_factor(value: str, factor: float) -> float | None:
transformed_value = parse_string_to_float(value)
if transformed_value is None or isinstance(transformed_value, str):
return None
return transformed_value * factor
# Cleanse the first column, which is later read as the KPI name.
table[table.columns[0]] = table[table.columns[0]].apply(cleanse_string) # type: ignore
# Unit marker looked up in the column header -> multiplier applied to the
# column. The empty key is contained in / matches every string, so it acts
# as a catch-all with factor 1.
converter = {
"Mio€": 1 * 10**6,
"Mio": 1 * 10**6,
"T€": 1 * 10**3,
"TEUR": 1 * 10**3,
"EUR": 1,
"": 1,
}
for column in table.columns[1:]:
# Tuple labels (presumably multi-row headers - TODO confirm): every part
# of the label is checked for a unit marker by substring test.
if isinstance(column, tuple):
for c in column:
for x, factor in converter.items():
if x in c:
table[column] = table[column].apply(
lambda x, factor=factor: apply_factor(x, factor)
)
break
else:
# Plain labels: split the header on spaces and regex-match each part
# against the unit markers; on a hit the column is scaled and renamed
# to the first part of its header.
for x, factor in converter.items(): # noqa: PLW2901
parts = str(column).split(" ")
for y in parts:
if re.match(x, y):
table[column] = table[column].apply(
lambda x, factor=factor: apply_factor(x, factor)
)
table = table.rename( # noqa: PLW2901
{column: parts[0]}, axis=1
)
# NOTE(review): `break` leaves only the innermost loop. Confirm that a
# later `converter` entry (the empty key matches anything) cannot access
# `table[column]` again after the column has been renamed.
break
# Drop rows, then columns, that are entirely empty.
table = table.dropna(axis=0, how="all") # noqa: PLW2901
table = table.dropna(axis=1, how="all") # noqa: PLW2901
# Collect the positions of all columns after the first whose dtype is
# "object"/"str"; +1 maps the index within dtypes[1:] back to the table.
columns_to_prune = []
for column_index, column_type in enumerate(table.dtypes[1:]):
if column_type in ["object", "str"]:
columns_to_prune.append(column_index + 1)
table = table.drop( # noqa: PLW2901
table.columns[columns_to_prune], axis="columns"
)
# Treat the literal string "None" as missing, then drop every row that
# still has a missing value.
table = table.replace(to_replace="None", value=np.nan) # noqa: PLW2901
table = table.dropna() # noqa: PLW2901
# Without at least one value column next to the name column there is
# nothing to extract from this table.
if len(table.columns) <= 1:
continue
# Name clean-up: a leading alphanumeric enumeration followed by "." or ")"
# and a space, and any of the characters + = - _ wherever they occur.
exps = [r"^[0-9a-zA-Z]+[\.\)] ", r"[\+\=\-\_]"]
for _index, row in table.iterrows():
name_cleansed = row.iloc[0]
if not isinstance(name_cleansed, str):
continue
for exp in exps:
name_cleansed = re.sub(exp, "", name_cleansed.strip())
# The KPI value is taken from the second column of the cleaned table.
kpis[name_cleansed] = row.iloc[1]
return kpis
if __name__ == "__main__": if __name__ == "__main__":
ba_wrapper = Bundesanzeiger() ba_wrapper = Bundesanzeiger()

View File

@ -136,3 +136,110 @@ def test_get_information_no_results(mock_bundesanzeiger: Mock) -> None:
ba = Bundesanzeiger() ba = Bundesanzeiger()
result = ba.get_information("PRJ 23 Transparenzregister GmbH", "Iserlohn") result = ba.get_information("PRJ 23 Transparenzregister GmbH", "Iserlohn")
assert len(result) == 0 assert len(result) == 0
def test_extract_tables_from_reports() -> None:
    """Only the table carrying the ``std_table`` class is extracted."""
    html_report = """
<table>
</table>
<div>
Möge die Macht mir dir sein
<table class="std_table">
<tr>
<th>Column A</th>
<th>Column B</th>
</tr>
<tr>
<td>42</td>
<td>4711</td>
</tr>
</table>
</div>
"""
    wrapper = Bundesanzeiger()
    extracted_tables = wrapper.__extract_tables_from_report__(html_report)
    # The class-less, empty table must not show up in the result.
    assert len(extracted_tables) == 1
def test_parse_tables_to_kpis() -> None:
    """KPIs are read from the value tables and scaled by the header unit."""
    html_report = """
<table class="std_table">
<tr>
<th>Position</th>
</tr>
<tr>
<td>a) Umlaufvermögen</td>
</tr>
</table>
<table class="std_table">
<tr>
<th>Position</th>
<th>Test</th>
</tr>
<tr>
<td>4711</td>
<td>4711</td>
</tr>
</table>
<div>
Möge die Macht mir dir sein
<table class="std_table">
<tr>
<th>Position</th>
<th>2023 in T€</th>
<th>1997 in €</th>
</tr>
<tr>
<td>a) Umlaufvermögen</td>
<td>12,13</td>
<td>4711</td>
</tr>
<tr>
<td>+EBIT</td>
<td>1123</td>
<td>4711</td>
</tr>
<tr>
<td>To be ignored</td>
<td>I've tried so hard and got so far, but in the end it doesn't even matter</td>
<td>4711</td>
</tr>
<tr>
<td>Gewinn</td>
<td></td>
<td>4711</td>
</tr>
<tr>
<td>Jahresüberschuss</td>
<td>4.130,12</td>
<td>4711</td>
</tr>
</table>
<table class="std_table">
<thead>
<tr>
<th>Position</th>
<th>Betrag in</th>
</tr>
<tr>
<th>Hallo</th>
<th>€</th>
</tr>
</thead>
<tbody>
<tr>
<td>I. Schulden</td>
<td>0,12</td>
</tr>
</tbody>
</table>
</div>
"""
    # Values under the "T€" header are expected scaled by 1000; rows with a
    # non-numeric or empty value are expected to be absent.
    expected_kpis = {
        "Umlaufvermögen": 12130.0,
        "EBIT": 1123000.0,
        "Jahresüberschuss": 4130120.0,
        "Schulden": 0.12,
    }
    wrapper = Bundesanzeiger()
    extracted_kpis = wrapper.parse_tables_to_kpis(html_report)
    assert extracted_kpis == expected_kpis