mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-08-13 19:54:37 +02:00
20 KiB
20 KiB
Daten Extraktion aus dem Bundesanzeiger¶
Vorbereitung¶
In [2]:
# Fetch all reports published in the Bundesanzeiger (German Federal Gazette)
# for the given company via the project wrapper, and preview the result.
import pandas as pd
from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (
    Bundesanzeiger,
)
ba_wrapper = Bundesanzeiger()
# NOTE(review): network call — result shape depends on the wrapper; appears to
# be a DataFrame (head() is used below).
df_reports = ba_wrapper.get_information("Atos IT-Dienstleistung und Beratung GmbH")
df_reports.head()
Out[2]:
In [9]:
# Keep only the annual financial statements ("Jahresabschluss") and derive the
# reporting year from the document title.
# .copy() fixes the pandas SettingWithCopyWarning: the filtered frame is
# otherwise a view on df_reports, and the column assignment below may silently
# fail to propagate.
df_jahresabschluss = df_reports.loc[df_reports.type == "Jahresabschluss"].copy()
# The year is the last dot-separated token of the last whitespace-separated
# token of the title, e.g. "... zum 31.12.2020" -> "2020".
df_jahresabschluss["jahr"] = df_jahresabschluss.name.apply(
    lambda name: name.split(" ")[-1].split(".")[-1]
)
df_jahresabschluss = df_jahresabschluss.drop(["name", "report", "type"], axis=1)
df_jahresabschluss.head()
Out[9]:
Daten Extraktion¶
In [10]:
from bs4 import BeautifulSoup
from io import StringIO
In [11]:
# Pick the first annual statement as the working sample for the cells below.
sample_report = df_jahresabschluss.iloc[0].raw_report
# NOTE(review): identical to sample_report above — presumably one of the two
# was meant to hold the extracted text content instead; verify intent.
sample_report_content = df_jahresabschluss.iloc[0].raw_report
Wirtschaftsprüfer¶
In [18]:
import re
from aki_prj23_transparenzregister.models.auditor import Auditor
def extract_auditor_company(report: str) -> str | None:
    """Extract the name of the auditing company from a report's HTML.

    The company name is expected on the second line of the first <b> element
    that contains a <br> child.

    Args:
        report: Raw HTML of the report.

    Returns:
        The auditor company name, or None if no matching element is found.
        (The original annotation claimed ``str`` but None is a possible
        result.)
    """
    soup = BeautifulSoup(report, features="html.parser")
    for bold_tag in soup.find_all("b"):
        # A <b> broken over two lines with <br>: the company name is the
        # second line.
        if len(bold_tag.findChildren("br")) > 0:
            return bold_tag.text.split("\n")[1].strip()
    return None
def extract_auditors(report: str) -> list:
    """Extract the individual auditors ("Wirtschaftsprüfer") named in a report.

    Args:
        report: Raw HTML of the report.

    Returns:
        A list of Auditor objects, one per person found, each carrying the
        auditor company extracted from the same report.
    """
    auditor_company = extract_auditor_company(report)
    # Include German umlauts and ß in the character class so names such as
    # "Müller" or "Groß" are not truncated (the original class matched only
    # ASCII letters).
    auditor_regex = r"[a-z A-Z,.'\-äöüÄÖÜß]+, Wirtschaftsprüfer"
    hits = re.findall(auditor_regex, report)
    return [
        Auditor(hit.replace(", Wirtschaftsprüfer", "").lstrip(), auditor_company)
        for hit in hits
    ]
In [13]:
extract_auditors(sample_report)
Out[13]:
Aufsichtsrat¶
TODO: extraction of the supervisory board ("Aufsichtsrat") members is not yet implemented.
Bilanz bzw. GuV¶
In [14]:
def extract_kpis(report_content: str) -> dict:
    """Extract Key Performance Indicators (KPIs) from a report's plain text.

    Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd

    Args:
        report_content: Plain-text content of one financial report (e.g. the
            result of ``BeautifulSoup(...).get_text()``). The original
            docstring wrongly described a dict of reports.

    Returns:
        dict: Maps KPI names (e.g. "revenue", "ebit") to parsed float values.
        KPIs that are absent or fail to parse are omitted.
    """
    # KPI name -> regex; group 1 captures the number with an optional
    # magnitude suffix ("m" = million, "b" = billion).
    kpi_patterns = {
        "revenue": r"(?:revenue|umsatz|erlöse)[:\s]*([\d,.]+[mmb]?)",
        "net_income": r"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\s]*([\d,.]+[mmb]?)",
        "ebit": r"(?:ebit|operating income)[:\s]*([\d,.]+[mmb]?)",
        "ebitda": r"(?:ebitda)[:\s]*([\d,.]+[mmb]?)",
        "gross_profit": r"(?:gross profit|bruttogewinn)[:\s]*([\d,.]+[mmb]?)",
        "operating_profit": r"(?:operating profit|betriebsgewinn)[:\s]*([\d,.]+[mmb]?)",
        "assets": r"(?:total assets|bilanzsumme)[:\s]*([\d,.]+[mmb]?)",
        "liabilities": r"(?:total liabilities|gesamtverbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
        "equity": r"(?:shareholders'? equity|eigenkapital)[:\s]*([\d,.]+[mmb]?)",
        "current_assets": r"(?:current assets|umlaufvermögen)[:\s]*([\d,.]+[mmb]?)",
        "current_liabilities": r"(?:current liabilities|kurzfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
        "long_term_debt": r"(?:long[-\s]?term debt|langfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
        "short_term_debt": r"(?:short[-\s]?term debt|kurzfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
        "cash_and_cash_equivalents": r"(?:cash (?:and cash equivalents)?|barmittel)[:\s]*([\d,.]+[mmb]?)",
        "dividends": r"(?:dividends?|dividende)[:\s]*([\d,.]+[mmb]?)",
        "cash_flow": r"(?:cash flow|cashflow|cash flow from operating activities)[:\s]*([\d,.]+[mmb]?)",
    }
    report_kpis = {}
    for kpi, pattern in kpi_patterns.items():
        match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)
        if not match:
            continue
        value = match.group(1)
        # Clean and validate the extracted number.
        try:
            if not value:  # empty capture -> nothing to parse
                cleaned_value = None
            else:
                multiplier = 1
                if value[-1].lower() == "m":
                    value = value[:-1]
                    multiplier = 1_000_000
                elif value[-1].lower() == "b":
                    value = value[:-1]
                    multiplier = 1_000_000_000
                # German number format: "." is the thousands separator and
                # "," the decimal separator, e.g. "1.234,56" -> 1234.56.
                # Strip the suffix first, then normalize separators.
                value = value.replace(".", "").replace(",", ".").strip()
                cleaned_value = float(value) * multiplier
        except ValueError:
            cleaned_value = None
        if cleaned_value is not None:
            report_kpis[kpi] = cleaned_value
    return report_kpis
# Strip the HTML tags, flatten newlines to spaces, then run the KPI
# extraction on the resulting plain text.
extract_kpis(
    BeautifulSoup(sample_report, features="html.parser").get_text().replace("\n", " ")
)
Out[14]:
In [15]:
# NOTE(review): `os` appears unused in this cell — kept in case a later cell
# relies on it.
import os

# Dump the plain-text report to a scratch file for manual inspection.
# An explicit encoding avoids UnicodeEncodeError on platforms whose default
# text encoding cannot represent characters in the German report text.
with open("./temp.txt", "w", encoding="utf-8") as file:
    file.write(
        BeautifulSoup(sample_report, features="html.parser")
        .get_text()
        .replace("\n", " ")
    )
In [16]:
def parse_tables(report: str) -> dict:
    """Inspect every <table class="std_table"> in the report.

    Exploratory helper: prints each table's columns and dtypes. The returned
    dict is currently always empty — a placeholder for structured results.

    Args:
        report: Raw HTML of the report.

    Returns:
        dict: Always empty for now. (The original annotation said ``list``,
        but the function has always returned a dict.)
    """
    result = {}
    soup = BeautifulSoup(report, features="html.parser")
    for table in soup.find_all("table", {"class": "std_table"}):
        # Parse the isolated table HTML; read_html returns a list of frames.
        df = pd.read_html(StringIO(str(table)))[0]
        print(df.columns)
        print(df.dtypes)
    return result


parse_tables(sample_report)
Out[16]:
In [22]:
def get_bilanz(report: str) -> dict:
    """Extract the balance-sheet ("Bilanz") tables from a report.

    Finds the bold headings matching "Aktiva" / "Passiva" and parses the first
    <table class="std_table"> following each into a DataFrame.

    Args:
        report: Raw HTML of the report.

    Returns:
        dict: Keys "Aktiva" and "Passiva"; each value is the parsed DataFrame,
        or an empty DataFrame when the heading is not found. (The original
        annotation ``-> any`` referenced the builtin function, not a type.)
    """
    result = {}
    soup = BeautifulSoup(report, features="html.parser")
    for side in ["Aktiva", "Passiva"]:
        heading = soup.find("b", string=re.compile(side))
        if heading:
            # Parse the first standard table that follows the heading.
            result[side] = pd.read_html(
                StringIO(str(heading.findNext("table", {"class": "std_table"})))
            )[0]
        else:
            result[side] = pd.DataFrame([])
    return result
# Run the balance-sheet extraction on the sample and preview the liabilities
# ("Passiva") side.
bilanz = get_bilanz(sample_report)
bilanz["Passiva"].head()
Out[22]:
In [23]:
def get_tables(raw_report: str) -> list:
    """Parse every <table class="std_table"> in the report into DataFrames.

    Args:
        raw_report: Raw HTML of the report.

    Returns:
        A flat list of DataFrames — pandas may parse more than one frame per
        table, so all of them are collected.
    """
    parsed = BeautifulSoup(raw_report, features="html.parser")
    return [
        frame
        for table in parsed.find_all("table", {"class": "std_table"})
        for frame in pd.read_html(StringIO(str(table)))
    ]
# Print the column layout of every parsed table, then keep the list of
# tables for later use.
for df in get_tables(sample_report):
    print(df.columns)
tables = get_tables(sample_report)