mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-06-21 23:33:54 +02:00
Introduce extended_financial_data code (#357)
Introduce the previously developed method for fetching financial data via table parsing (the "data-lake-like" solution). The change is non-destructive: the existing RegEx-based extraction remains the default behaviour.
This commit is contained in:
File diff suppressed because it is too large
Load Diff
4
poetry.lock
generated
4
poetry.lock
generated
@ -7361,11 +7361,11 @@ test = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
|
|||||||
testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
|
testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
|
||||||
|
|
||||||
[extras]
|
[extras]
|
||||||
ingest = ["deutschland", "selenium", "xmltodict"]
|
ingest = ["deutschland", "html5lib", "selenium", "xmltodict"]
|
||||||
transformation = ["spacy", "spacy-sentiws", "torch", "torchaudio", "torchvision", "transformers"]
|
transformation = ["spacy", "spacy-sentiws", "torch", "torchaudio", "torchvision", "transformers"]
|
||||||
web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "networkx", "seaborn"]
|
web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "networkx", "seaborn"]
|
||||||
|
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = ">=3.11,<3.13"
|
python-versions = ">=3.11,<3.13"
|
||||||
content-hash = "0fb643247c09a91aeef5aae1286426f9296688dadff30f5a4a6085c3abe5399e"
|
content-hash = "5ca44ede811dc417faeda6b976c032682be7b4edadc16fc6c81e2ffe3dc4f946"
|
||||||
|
@ -81,9 +81,10 @@ torchvision = {version = "*", source = "torch-cpu"}
|
|||||||
tqdm = "^4.66.1"
|
tqdm = "^4.66.1"
|
||||||
transformers = {version = "*", extras = ["torch"]}
|
transformers = {version = "*", extras = ["torch"]}
|
||||||
xmltodict = "^0.13.0"
|
xmltodict = "^0.13.0"
|
||||||
|
html5lib = "^1.1"
|
||||||
|
|
||||||
[tool.poetry.extras]
|
[tool.poetry.extras]
|
||||||
ingest = ["selenium", "deutschland", "xmltodict"]
|
ingest = ["selenium", "deutschland", "xmltodict", "html5lib"]
|
||||||
transformation = ["torch", "torchaudio", "torchvision", "transformers", "spacy-sentiws", "spacy"]
|
transformation = ["torch", "torchaudio", "torchvision", "transformers", "spacy-sentiws", "spacy"]
|
||||||
web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn", "networkx"]
|
web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn", "networkx"]
|
||||||
|
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
"""Fetch data from Bundesanzeiger."""
|
"""Fetch data from Bundesanzeiger."""
|
||||||
import re
|
import re
|
||||||
|
from io import StringIO
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from deutschland.bundesanzeiger import Bundesanzeiger as Ba
|
from deutschland.bundesanzeiger import Bundesanzeiger as Ba
|
||||||
@ -14,12 +16,15 @@ pd.options.mode.chained_assignment = None # type: ignore
|
|||||||
class Bundesanzeiger:
|
class Bundesanzeiger:
|
||||||
"""Bundesanzeiger wrapper to export relevant information."""
|
"""Bundesanzeiger wrapper to export relevant information."""
|
||||||
|
|
||||||
def get_information(self, company_name: str, city: str | None) -> pd.DataFrame:
|
def get_information(
|
||||||
|
self, company_name: str, city: str | None, finance_from_tables: bool = False
|
||||||
|
) -> pd.DataFrame:
|
||||||
"""Extract relevant information from all found yearly results for the given company.
|
"""Extract relevant information from all found yearly results for the given company.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
company_name (str): Name of the company to search for
|
company_name (str): Name of the company to search for
|
||||||
city (Optional[str]): City where the company is registered
|
city (Optional[str]): City where the company is registered
|
||||||
|
finance_from_tables (bool, optional): If True, financial information is extracted from tables. If False, financial information will be extracted from text via RegEx. Defaults to False.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
pd.DataFrame: Result
|
pd.DataFrame: Result
|
||||||
@ -49,12 +54,17 @@ class Bundesanzeiger:
|
|||||||
df_data["auditors"] = audits
|
df_data["auditors"] = audits
|
||||||
|
|
||||||
# Add Financial information
|
# Add Financial information
|
||||||
df_data["financial_results"] = df_data.raw_report.apply(
|
if finance_from_tables is True:
|
||||||
self.extract_financial_results
|
df_data["financial_results"] = df_data.raw_report.apply(
|
||||||
)
|
self.parse_tables_to_kpis
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
df_data["financial_results"] = df_data.raw_report.apply(
|
||||||
|
self.extract_financial_results
|
||||||
|
)
|
||||||
|
|
||||||
# Remove irrelevant columns
|
# Remove irrelevant columns
|
||||||
return df_data.drop(["raw_report"], axis=1)
|
return df_data
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def filter_reports(df_reports: pd.DataFrame) -> pd.DataFrame:
|
def filter_reports(df_reports: pd.DataFrame) -> pd.DataFrame:
|
||||||
@ -189,6 +199,114 @@ class Bundesanzeiger:
|
|||||||
)
|
)
|
||||||
return self.__extract_kpis__(report_parsed)
|
return self.__extract_kpis__(report_parsed)
|
||||||
|
|
||||||
|
def __extract_tables_from_report__(self, report: str) -> list[pd.DataFrame]:
    """Parse every ``std_table`` HTML table of a report into a DataFrame.

    Numbers are read with "." as thousands separator and "," as decimal
    separator.

    Args:
        report (str): Raw HTML report

    Returns:
        list[pd.DataFrame]: One frame per table that could be parsed, in
            document order. Tables that fail to parse are skipped.
    """
    result: list[pd.DataFrame] = []
    soup = BeautifulSoup(report, features="html.parser")
    for table in soup.find_all("table", {"class": "std_table"}):
        # Keep the try body minimal: only the parsing call is expected to fail.
        try:
            frames = pd.read_html(
                StringIO(str(table)), flavor="bs4", thousands=".", decimal=","
            )
        # Best effort on purpose: a single malformed table must not abort the
        # whole report. The suppression is line-level; a `# ruff: noqa`
        # directive would silence the rule for the entire file.
        except Exception:  # noqa: S112
            continue
        if frames:
            result.append(frames[0])
    return result
|
||||||
|
|
||||||
|
def parse_tables_to_kpis(self, report: str) -> dict[str, float]:  # noqa: PLR0912
    """Extract KPIs from tables included in a report.

    For every table the first column is treated as the KPI name and the
    first remaining numeric column as its value. Values are scaled to plain
    Euro according to the unit found in the column header (e.g. "T€").

    Args:
        report (str): Raw report

    Returns:
        dict: Extracted KPIs, mapping the cleansed KPI name to its value
    """
    # Checked in insertion order, so longer units come before their prefixes.
    unit_factors = {
        "Mio€": 1 * 10**6,
        "Mio": 1 * 10**6,
        "T€": 1 * 10**3,
        "TEUR": 1 * 10**3,
        "EUR": 1,
        "€": 1,
    }
    # Strip enumeration prefixes ("a) ", "1. ") and arithmetic decorations.
    name_patterns = [
        re.compile(r"^[0-9a-zA-Z]+[\.\)] "),
        re.compile(r"[\+\=\-\_]"),
    ]

    def cleanse_string(value: str) -> str | None:
        # Non-string labels (numbers, NaN) become None so the row is dropped.
        if value is not None and isinstance(value, str):
            return re.sub(r"(.+\.).", "", value)
        return None

    def parse_string_to_float(value: str | float) -> float | None:
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            return None

    def apply_factor(value: str | float, factor: float) -> float | None:
        number = parse_string_to_float(value)
        if number is None:
            return None
        return number * factor

    def find_unit_factor(column: object) -> int | None:
        """Return the factor of the first unit found in a header, if any."""
        if isinstance(column, tuple):
            # Multi-level header: the unit may sit on any level.
            for level in column:
                for unit, factor in unit_factors.items():
                    if unit in str(level):
                        return factor
            return None
        parts = str(column).split(" ")
        for unit, factor in unit_factors.items():
            if any(part.startswith(unit) for part in parts):
                return factor
        return None

    kpis: dict[str, float] = {}
    tables = self.__extract_tables_from_report__(report)
    for table in tables:
        table[table.columns[0]] = table[table.columns[0]].apply(cleanse_string)  # type: ignore

        for column in table.columns[1:]:
            factor = find_unit_factor(column)
            if factor is None:
                continue
            # Scale exactly once per column. Continuing the unit search after
            # a hit would either apply a second factor (multi-level headers)
            # or look up the already renamed column and raise a KeyError
            # (e.g. "Mio€" followed by its prefix "Mio").
            table[column] = table[column].apply(
                lambda cell, factor=factor: apply_factor(cell, factor)
            )
            if not isinstance(column, tuple):
                table = table.rename(  # noqa: PLW2901
                    {column: str(column).split(" ")[0]}, axis=1
                )

        table = table.dropna(axis=0, how="all")  # noqa: PLW2901
        table = table.dropna(axis=1, how="all")  # noqa: PLW2901

        # Drop every non-numeric column except the label column.
        columns_to_prune = [
            position
            for position, column_type in enumerate(table.dtypes.iloc[1:], start=1)
            if column_type in ["object", "str"]
        ]
        table = table.drop(  # noqa: PLW2901
            table.columns[columns_to_prune], axis="columns"
        )
        table = table.replace(to_replace="None", value=np.nan)  # noqa: PLW2901
        table = table.dropna()  # noqa: PLW2901
        if len(table.columns) <= 1:
            continue

        for _index, row in table.iterrows():
            name_cleansed = row.iloc[0]
            if not isinstance(name_cleansed, str):
                continue
            for pattern in name_patterns:
                name_cleansed = pattern.sub("", name_cleansed.strip())
            kpis[name_cleansed] = row.iloc[1]
    return kpis
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
ba_wrapper = Bundesanzeiger()
|
ba_wrapper = Bundesanzeiger()
|
||||||
|
@ -136,3 +136,110 @@ def test_get_information_no_results(mock_bundesanzeiger: Mock) -> None:
|
|||||||
ba = Bundesanzeiger()
|
ba = Bundesanzeiger()
|
||||||
result = ba.get_information("PRJ 23 Transparenzregister GmbH", "Iserlohn")
|
result = ba.get_information("PRJ 23 Transparenzregister GmbH", "Iserlohn")
|
||||||
assert len(result) == 0
|
assert len(result) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_tables_from_reports() -> None:
    """Only tables carrying the ``std_table`` class are extracted and parsed."""
    report = """
<table>
</table>
<div>
Möge die Macht mir dir sein
<table class="std_table">
<tr>
<th>Column A</th>
<th>Column B</th>
</tr>
<tr>
<td>42</td>
<td>4711</td>
</tr>
</table>
</div>
"""
    ba = Bundesanzeiger()
    result = ba.__extract_tables_from_report__(report)
    # The class-less table must be ignored.
    assert len(result) == 1
    # Pin the parsed content too, not just the number of tables.
    assert list(result[0].columns) == ["Column A", "Column B"]
    assert result[0].iloc[0].tolist() == [42, 4711]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_tables_to_kpis() -> None:
    """KPIs are read from unit-annotated tables; unusable tables and rows are skipped."""
    expected_kpis = {
        "Umlaufvermögen": 12130.0,
        "EBIT": 1123000.0,
        "Jahresüberschuss": 4130120.0,
        "Schulden": 0.12,
    }
    html_report = """
<table class="std_table">
<tr>
<th>Position</th>
</tr>
<tr>
<td>a) Umlaufvermögen</td>
</tr>
</table>
<table class="std_table">
<tr>
<th>Position</th>
<th>Test</th>
</tr>
<tr>
<td>4711</td>
<td>4711</td>
</tr>
</table>
<div>
Möge die Macht mir dir sein
<table class="std_table">
<tr>
<th>Position</th>
<th>2023 in T€</th>
<th>1997 in €</th>
</tr>
<tr>
<td>a) Umlaufvermögen</td>
<td>12,13</td>
<td>4711</td>
</tr>
<tr>
<td>+EBIT</td>
<td>1123</td>
<td>4711</td>
</tr>
<tr>
<td>To be ignored</td>
<td>I've tried so hard and got so far, but in the end it doesn't even matter</td>
<td>4711</td>
</tr>
<tr>
<td>Gewinn</td>
<td></td>
<td>4711</td>
</tr>
<tr>
<td>Jahresüberschuss</td>
<td>4.130,12</td>
<td>4711</td>
</tr>
</table>
<table class="std_table">
<thead>
<tr>
<th>Position</th>
<th>Betrag in</th>
</tr>
<tr>
<th>Hallo</th>
<th>€</th>
</tr>
</thead>
<tbody>
<tr>
<td>I. Schulden</td>
<td>0,12</td>
</tr>
</tbody>
</table>
</div>
"""
    wrapper = Bundesanzeiger()
    extracted_kpis = wrapper.parse_tables_to_kpis(html_report)
    assert extracted_kpis == expected_kpis
|
||||||
|
Reference in New Issue
Block a user