mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-12-15 19:30:44 +01:00
28 KiB
28 KiB
Daten Extraktion aus dem Bundesanzeiger¶
In order to run this notebook, download the deutschland library source code from: TrisNol/deutschland and place it in the Jupyter/API-tests/Bundesanzeiger/deutschland directory. Since the PR adding the required features to the main repo has not been completed as of yet (see: PR), we have to include it in another way...
Vorbereitung¶
In [5]:
import pandas as pd
from deutschland.bundesanzeiger import Bundesanzeiger
In [6]:
# Create a Bundesanzeiger client and fetch all reports published for the
# given company name (presumably performs network I/O against the
# Bundesanzeiger service — confirm against the deutschland library docs).
ba = Bundesanzeiger()
reports = ba.get_reports("Atos IT-Dienstleistung und Beratung GmbH")
# reports maps report identifiers to per-report dicts.
print(reports.keys())
In [7]:
# Collect the per-report dicts into a flat list (dict iteration order is
# insertion order, so this matches iterating the keys one by one).
report_contents = list(reports.values())
In [8]:
# Build one DataFrame row per report and preview the first rows.
df_reports = pd.DataFrame(report_contents)
df_reports.head()
Out[8]:
In [9]:
# Derive a coarse report category from the first word of each report name
# (e.g. "Jahresabschluss zum ..." -> "Jahresabschluss").
df_reports["type"] = df_reports["name"].apply(lambda full_name: full_name.split(" ")[0])
df_reports.head()
Out[9]:
In [10]:
# Keep only the annual-report ("Jahresabschluss") entries.
# .copy() makes the slice an independent frame; assigning a new column to a
# bare .loc slice view would trigger pandas' SettingWithCopyWarning and may
# silently fail to write.
df_jahresabschluss = df_reports.loc[df_reports.type == "Jahresabschluss"].copy()
# The report year is the last dot-separated token of the last word of the
# name (e.g. "... zum Geschäftsjahr 31.12.2020" -> "2020").
df_jahresabschluss["jahr"] = df_jahresabschluss.name.apply(
    lambda name: name.split(" ")[-1].split(".")[-1]
)
# Drop columns that are no longer needed for the per-year overview.
df_jahresabschluss = df_jahresabschluss.drop(["name", "report", "type"], axis=1)
df_jahresabschluss.head()
Out[10]:
Daten Extraktion¶
In [11]:
from bs4 import BeautifulSoup
from io import StringIO
In [12]:
sample_report = df_jahresabschluss.iloc[0].raw_report
Wirtschaftsprüfer¶
In [14]:
import re


def extract_auditors(report: str) -> list:
    """Extract auditor names from a report text.

    Finds substrings of the form "<Name>, Wirtschaftsprüfer" and returns
    the bare names with the title and leading whitespace stripped.
    The character class includes German umlauts and ß so that names such
    as "Müller" are matched in full (the previous class only covered
    ASCII letters and truncated such names).
    """
    auditor_regex = r"[a-zA-ZäöüÄÖÜß ,.'-]+, Wirtschaftsprüfer"
    hits = re.findall(auditor_regex, report)
    return [hit.replace(", Wirtschaftsprüfer", "").lstrip() for hit in hits]
In [15]:
extract_auditors(sample_report)
Out[15]:
In [16]:
def extract_auditor_company(report: str) -> str:
    """Extract the auditing company's name from an HTML report.

    Scans for the first <b> element containing a <br>; by the report's
    layout convention the company name is on the second text line of that
    element. Returns None when no such element exists or when its text has
    no second line (the previous version raised IndexError in that case).
    """
    soup = BeautifulSoup(report, features="html.parser")
    for bold in soup.find_all("b"):
        # find_all is the current bs4 spelling of the legacy findChildren.
        if bold.find_all("br"):
            lines = bold.text.split("\n")
            if len(lines) > 1:
                return lines[1].strip()
    return None
In [17]:
extract_auditor_company(sample_report)
Out[17]:
Aufsichtsrat¶
TODO
Bilanz bzw. GuV¶
In [18]:
def get_bilanz(report: str) -> dict:
    """Extract the balance-sheet sides ("Aktiva"/"Passiva") from an HTML report.

    Returns a dict mapping each side to the first DataFrame parsed from the
    std_table following the matching <b> heading; sides whose heading is not
    found are omitted. (The previous annotation ``-> any`` referenced the
    builtin function ``any``, which is not a type.)
    """
    result = {}
    soup = BeautifulSoup(report, features="html.parser")
    for pos in ["Aktiva", "Passiva"]:
        tag = soup.find("b", string=re.compile(pos))
        if tag:
            # Wrap the HTML in StringIO: pandas deprecates passing literal
            # HTML strings to read_html directly.
            pos_results = pd.read_html(
                StringIO(str(tag.findNext("table", {"class": "std_table"})))
            )[0]
            result[pos] = pos_results
    return result
# Extract both balance-sheet sides from the sample report and preview the
# liabilities ("Passiva") side.
bilanz = get_bilanz(sample_report)
bilanz["Passiva"].head()
Out[18]:
In [19]:
def get_tables(raw_report: str) -> list:
    """Parse every std_table in the HTML report into DataFrames.

    A single table element can yield several DataFrames, so the result is
    the flattened list over all tables in document order.
    """
    soup = BeautifulSoup(raw_report, features="html.parser")
    return [
        frame
        for table in soup.find_all("table", {"class": "std_table"})
        for frame in pd.read_html(StringIO(str(table)))
    ]
# Parse the report once and reuse the result; the original called
# get_tables() twice, re-parsing the whole HTML document a second time.
tables = get_tables(sample_report)
for df in tables:
    print(df.columns)