Introduce extended_financial_data code (#357)

Introduce the previously developed method for fetching the financial data
via table parsing (aka the "data-lake-like solution"). The change is
non-destructive: the current RegEx-based behaviour remains the default.
This commit is contained in:
Tristan Nolde 2023-11-11 14:10:20 +01:00 committed by GitHub
commit a6d486209a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 2154 additions and 364 deletions

File diff suppressed because it is too large Load Diff

4
poetry.lock generated
View File

@ -7361,11 +7361,11 @@ test = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[extras]
ingest = ["deutschland", "selenium", "xmltodict"]
ingest = ["deutschland", "html5lib", "selenium", "xmltodict"]
transformation = ["spacy", "spacy-sentiws", "torch", "torchaudio", "torchvision", "transformers"]
web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "networkx", "seaborn"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.11,<3.13"
content-hash = "0fb643247c09a91aeef5aae1286426f9296688dadff30f5a4a6085c3abe5399e"
content-hash = "5ca44ede811dc417faeda6b976c032682be7b4edadc16fc6c81e2ffe3dc4f946"

View File

@ -81,9 +81,10 @@ torchvision = {version = "*", source = "torch-cpu"}
tqdm = "^4.66.1"
transformers = {version = "*", extras = ["torch"]}
xmltodict = "^0.13.0"
html5lib = "^1.1"
[tool.poetry.extras]
ingest = ["selenium", "deutschland", "xmltodict"]
ingest = ["selenium", "deutschland", "xmltodict", "html5lib"]
transformation = ["torch", "torchaudio", "torchvision", "transformers", "spacy-sentiws", "spacy"]
web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn", "networkx"]

View File

@ -1,6 +1,8 @@
"""Fetch data from Bundesanzeiger."""
import re
from io import StringIO
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from deutschland.bundesanzeiger import Bundesanzeiger as Ba
@ -14,12 +16,15 @@ pd.options.mode.chained_assignment = None # type: ignore
class Bundesanzeiger:
"""Bundesanzeiger wrapper to export relevant information."""
def get_information(self, company_name: str, city: str | None) -> pd.DataFrame:
def get_information(
self, company_name: str, city: str | None, finance_from_tables: bool = False
) -> pd.DataFrame:
"""Extract relevant information from all found yearly results for the given company.
Args:
company_name (str): Name of the company to search for
city (Optional[str]): City where the company is registered
finance_from_tables (bool, optional): If True, financial information is extracted from tables. If False, financial information will be extracted from text via RegEx. Defaults to False.
Returns:
pd.DataFrame: Result
@ -49,12 +54,17 @@ class Bundesanzeiger:
df_data["auditors"] = audits
# Add Financial information
df_data["financial_results"] = df_data.raw_report.apply(
self.extract_financial_results
)
if finance_from_tables is True:
df_data["financial_results"] = df_data.raw_report.apply(
self.parse_tables_to_kpis
)
else:
df_data["financial_results"] = df_data.raw_report.apply(
self.extract_financial_results
)
# Remove irrelevant columns
return df_data.drop(["raw_report"], axis=1)
return df_data
@staticmethod
def filter_reports(df_reports: pd.DataFrame) -> pd.DataFrame:
@ -189,6 +199,114 @@ class Bundesanzeiger:
)
return self.__extract_kpis__(report_parsed)
def __extract_tables_from_report__(self, report: str) -> list[pd.DataFrame]:
    """Collect the ``std_table`` HTML tables of a report as DataFrames.

    Each matching ``<table>`` element is parsed on its own with ``.`` as
    thousands separator and ``,`` as decimal separator. Tables that cannot be
    parsed are skipped silently.

    Args:
        report (str): Raw report markup

    Returns:
        list[pd.DataFrame]: The first DataFrame parsed from every matching table
    """
    frames: list[pd.DataFrame] = []
    markup = BeautifulSoup(report, features="html.parser")
    for table_tag in markup.find_all("table", {"class": "std_table"}):
        try:
            parsed = pd.read_html(
                StringIO(str(table_tag)), flavor="bs4", thousands=".", decimal=","
            )
        # ruff: noqa: S112
        except Exception:
            # Best effort: one broken table must not abort the whole report.
            continue
        else:
            if len(parsed) > 0:
                frames.append(parsed[0])
    return frames
# ruff: noqa: PLR0912
def parse_tables_to_kpis(self, report: str) -> dict[str, float]:
"""Extract KPIs from tables included in a report.
Args:
report (str): Raw report
Returns:
dict: Extracted KPIs
"""
kpis = {}
tables = self.__extract_tables_from_report__(report)
for table in tables:
def cleanse_string(value: str) -> str | None:
if value is not None and isinstance(value, str):
return re.sub(r"(.+\.).", "", value)
return None
def parse_string_to_float(value: str | float) -> float | None:
if value is None:
return None
try:
return float(value)
except Exception:
return None
def apply_factor(value: str, factor: float) -> float | None:
transformed_value = parse_string_to_float(value)
if transformed_value is None or isinstance(transformed_value, str):
return None
return transformed_value * factor
table[table.columns[0]] = table[table.columns[0]].apply(cleanse_string) # type: ignore
converter = {
"Mio€": 1 * 10**6,
"Mio": 1 * 10**6,
"T€": 1 * 10**3,
"TEUR": 1 * 10**3,
"EUR": 1,
"": 1,
}
for column in table.columns[1:]:
if isinstance(column, tuple):
for c in column:
for x, factor in converter.items():
if x in c:
table[column] = table[column].apply(
lambda x, factor=factor: apply_factor(x, factor)
)
break
else:
for x, factor in converter.items(): # noqa: PLW2901
parts = str(column).split(" ")
for y in parts:
if re.match(x, y):
table[column] = table[column].apply(
lambda x, factor=factor: apply_factor(x, factor)
)
table = table.rename( # noqa: PLW2901
{column: parts[0]}, axis=1
)
break
table = table.dropna(axis=0, how="all") # noqa: PLW2901
table = table.dropna(axis=1, how="all") # noqa: PLW2901
columns_to_prune = []
for column_index, column_type in enumerate(table.dtypes[1:]):
if column_type in ["object", "str"]:
columns_to_prune.append(column_index + 1)
table = table.drop( # noqa: PLW2901
table.columns[columns_to_prune], axis="columns"
)
table = table.replace(to_replace="None", value=np.nan) # noqa: PLW2901
table = table.dropna() # noqa: PLW2901
if len(table.columns) <= 1:
continue
exps = [r"^[0-9a-zA-Z]+[\.\)] ", r"[\+\=\-\_]"]
for _index, row in table.iterrows():
name_cleansed = row.iloc[0]
if not isinstance(name_cleansed, str):
continue
for exp in exps:
name_cleansed = re.sub(exp, "", name_cleansed.strip())
kpis[name_cleansed] = row.iloc[1]
return kpis
if __name__ == "__main__":
ba_wrapper = Bundesanzeiger()

View File

@ -136,3 +136,110 @@ def test_get_information_no_results(mock_bundesanzeiger: Mock) -> None:
ba = Bundesanzeiger()
result = ba.get_information("PRJ 23 Transparenzregister GmbH", "Iserlohn")
assert len(result) == 0
def test_extract_tables_from_reports() -> None:
    """Only the table carrying the ``std_table`` class is extracted."""
    html = """
<table>
</table>
<div>
Möge die Macht mir dir sein
<table class="std_table">
<tr>
<th>Column A</th>
<th>Column B</th>
</tr>
<tr>
<td>42</td>
<td>4711</td>
</tr>
</table>
</div>
"""
    tables = Bundesanzeiger().__extract_tables_from_report__(html)
    assert len(tables) == 1
def test_parse_tables_to_kpis() -> None:
    """KPIs are collected across several tables of one report.

    The fixture holds a single-column table, a table whose first column is
    numeric, a flat-header table with mixed cell contents and a table with a
    two-row header.
    """
    html = """
<table class="std_table">
<tr>
<th>Position</th>
</tr>
<tr>
<td>a) Umlaufvermögen</td>
</tr>
</table>
<table class="std_table">
<tr>
<th>Position</th>
<th>Test</th>
</tr>
<tr>
<td>4711</td>
<td>4711</td>
</tr>
</table>
<div>
Möge die Macht mir dir sein
<table class="std_table">
<tr>
<th>Position</th>
<th>2023 in T</th>
<th>1997 in </th>
</tr>
<tr>
<td>a) Umlaufvermögen</td>
<td>12,13</td>
<td>4711</td>
</tr>
<tr>
<td>+EBIT</td>
<td>1123</td>
<td>4711</td>
</tr>
<tr>
<td>To be ignored</td>
<td>I've tried so hard and got so far, but in the end it doesn't even matter</td>
<td>4711</td>
</tr>
<tr>
<td>Gewinn</td>
<td></td>
<td>4711</td>
</tr>
<tr>
<td>Jahresüberschuss</td>
<td>4.130,12</td>
<td>4711</td>
</tr>
</table>
<table class="std_table">
<thead>
<tr>
<th>Position</th>
<th>Betrag in</th>
</tr>
<tr>
<th>Hallo</th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<td>I. Schulden</td>
<td>0,12</td>
</tr>
</tbody>
</table>
</div>
"""
    expected = {
        "Umlaufvermögen": 12130.0,
        "EBIT": 1123000.0,
        "Jahresüberschuss": 4130120.0,
        "Schulden": 0.12,
    }
    kpis = Bundesanzeiger().parse_tables_to_kpis(html)
    assert kpis == expected