Mirror of https://github.com/fhswf/aki_prj23_transparenzregister.git, synced 2025-12-17 20:00:43 +01:00
69 KiB
Daten Extraktion aus dem Bundesanzeiger¶
Vorbereitung¶
In [77]:
# Fetch all published Bundesanzeiger reports for one company via the
# project's Bundesanzeiger wrapper; the result is a DataFrame with one
# row per report (used as df_reports throughout this notebook).
import pandas as pd
from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (
    Bundesanzeiger,
)
ba_wrapper = Bundesanzeiger()
# Alternative sample companies, kept for quick switching:
# df_reports = ba_wrapper.get_information("Törmer Energy Solar 1 GmbH & Co. KG", "")
# df_reports = ba_wrapper.get_information("Atos IT-Dienstleistung und Beratung GmbH", "")
# NOTE(review): the second argument is passed as "" — presumably a city/
# location filter; confirm against Bundesanzeiger.get_information.
df_reports = ba_wrapper.get_information(
    "Stadtwerke Haltern am See Gesellschaft mit beschränkter Haftung", ""
)
df_reports.head()
Out[77]:
Daten Extraktion¶
In [78]:
from bs4 import BeautifulSoup
from io import StringIO
In [79]:
sample_report = df_reports.iloc[1].raw_report
Aufsichtsrat¶
TODO: extract the supervisory board (Aufsichtsrat) members from the raw report.
Bilanz bzw. GuV¶
In [80]:
def parse_tables(report: str) -> list:
    """Collect every ``<table class="std_table">`` in *report* as a DataFrame.

    Each table's columns and dtypes are printed for manual inspection.
    """
    soup = BeautifulSoup(report, features="html.parser")
    frames = []
    for node in soup.find_all("table", {"class": "std_table"}):
        frame = pd.read_html(StringIO(str(node)), flavor="bs4")[0]
        print(frame.columns)
        print(frame.dtypes)
        frames.append(frame)
    return frames
tables = parse_tables(sample_report)
In [81]:
# The second parsed table is the financial-figures table of interest here.
current_table = tables[1]
current_table.head()
Out[81]:
In [82]:
import re
def cleanse_string(value: str) -> str:
if value is not None and isinstance(value, str):
return re.sub(r"(.+\.).", "", value)
return None
In [83]:
# Cleanse the label column in place.  The original chained assignment
# `current_table.iloc[index][0] = ...` wrote into a temporary Series
# copy and silently left the DataFrame unchanged; `.iloc[row, col]`
# assigns into the frame itself.  `row.iloc[0]` replaces the deprecated
# positional integer key `row[0]` (pandas 2.x).
for index, row in current_table.iterrows():
    current_table.iloc[index, 0] = cleanse_string(row.iloc[0])
current_table.head()
Out[83]:
In [84]:
def parse_string_to_float(value) -> float:
try:
if value is None:
return None
if isinstance(value, float):
return value
return float(value.replace(".", "").replace(",", "."))
except Exception as e:
return None
def apply_factor(value, factor: float):
transformed_value = parse_string_to_float(value)
if transformed_value is None or isinstance(transformed_value, str):
return None
result = transformed_value * factor
# print(result)
return result
In [85]:
# Unit suffixes seen in Bundesanzeiger column headers, mapped to the
# factor that converts the column's values to plain EUR.
converter = {
    "Mio€": 1 * 10**6,
    "Mio": 1 * 10**6,
    "T€": 1 * 10**3,
    "TEUR": 1 * 10**3,
    "EUR": 1,
    "€": 1,
}
for column in current_table.columns:
    if isinstance(column, tuple):
        # Multi-level header: look for a unit in any tuple element and
        # scale the column at most once.  The original's bare `next`
        # statements were no-ops, so e.g. a "Mio€" header matched both
        # "Mio€" and "Mio" and the column was scaled twice.
        scaled = False
        for part in column:
            for unit, factor in converter.items():
                if unit in part:
                    current_table[column] = current_table[column].apply(
                        lambda value, factor=factor: apply_factor(value, factor)
                    )
                    scaled = True
                    break
            if scaled:
                break
    else:
        # Flat string header such as "Umsatzerlöse T€": scale by the
        # first matching unit, then strip the suffix from the name.
        parts = column.split(" ")
        for unit, factor in converter.items():
            if any(re.match(unit, token) for token in parts):
                current_table[column] = current_table[column].apply(
                    lambda value, factor=factor: apply_factor(value, factor)
                )
                current_table.rename({column: parts[0]}, inplace=True, axis=1)
                break
# Drop rows/columns that are entirely empty after conversion.
current_table.dropna(axis=0, how="all", inplace=True)
current_table.dropna(axis=1, how="all", inplace=True)
current_table.head()
Out[85]:
In [86]:
current_table.dtypes
Out[86]:
In [87]:
# Remove columns holding non-numerics; the first column (row keys) is kept.
columns_to_prune = [
    position + 1
    for position, dtype in enumerate(current_table.dtypes[1:])
    if dtype in ["object", "str"]
]
current_table = current_table.drop(
    current_table.columns[columns_to_prune], axis="columns"
)
In [88]:
# Prune rows containing missing values; map the string "None" (produced
# by the cleansing step) to NaN first so dropna() catches it.
import numpy as np
current_table = current_table.replace("None", np.nan).dropna()
current_table
Out[88]:
In [89]:
# Collapse the two-column table into a {label: value} mapping.
kpis = {}
for _index, row in current_table.iterrows():
    # Positional access via .iloc: integer keys on a labelled Series
    # (row[0]) are a deprecated positional fallback in pandas 2.x.
    kpis[row.iloc[0]] = row.iloc[1]
kpis
Out[89]:
In [90]:
import re
def get_bilanz(report: str) -> dict:
    """Extract the Aktiva and Passiva balance-sheet tables from *report*.

    Searches the HTML for a bold heading matching each position and
    parses the first ``std_table`` following it.  A position without a
    heading — or without a following table — maps to an empty DataFrame.
    (The original annotated the return as the builtin ``any`` and
    crashed with ``str(None)`` when no table followed the heading.)
    """
    result = {}
    soup = BeautifulSoup(report, features="html.parser")
    for pos in ["Aktiva", "Passiva"]:
        tag = soup.find("b", string=re.compile(pos))
        # find_next is the modern bs4 spelling of the legacy findNext.
        table = tag.find_next("table", {"class": "std_table"}) if tag else None
        if table is not None:
            result[pos] = pd.read_html(StringIO(str(table)))[0]
        else:
            result[pos] = pd.DataFrame([])
    return result
In [91]:
# Extract the Aktiva/Passiva tables and preview the liabilities side.
bilanz = get_bilanz(sample_report)
bilanz["Passiva"].head()
Out[91]:
In [92]:
bilanz["Aktiva"].head()
Out[92]:
In [93]:
from IPython.display import display, HTML
# Render the full Passiva table as HTML (head() truncates long tables).
display(HTML(bilanz["Passiva"].to_html()))
In [94]:
def get_tables(raw_report: str) -> list:
    """Parse every ``<table class="std_table">`` in *raw_report*.

    Returns a list of DataFrames; a single ``<table>`` element that
    pandas splits into several frames contributes all of them.
    """
    soup = BeautifulSoup(raw_report, features="html.parser")
    dfs = []
    for table in soup.find_all("table", {"class": "std_table"}):
        # read_html may return more than one frame per <table>.
        dfs.extend(pd.read_html(StringIO(str(table))))
    return dfs
# Parse once and reuse — the original called get_tables twice.
tables = get_tables(sample_report)
for df in tables:
    print(df.columns)