Files
aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb

20 KiB

Datenextraktion aus dem Bundesanzeiger

Vorbereitung

In [2]:
import pandas as pd

from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (
    Bundesanzeiger,
)

# Create the project's Bundesanzeiger wrapper and query it for one example company.
ba_wrapper = Bundesanzeiger()
df_reports = ba_wrapper.get_information("Atos IT-Dienstleistung und Beratung GmbH")
# Preview the first rows of the result.
df_reports.head()
Out[2]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
date company raw_report jahr auditors
0 2023-07-07 Atos IT-Dienstleistung und Beratung GmbH <div class="publication_container">\n <div cla... 2021 []
2 2023-05-10 Atos IT-Dienstleistung und Beratung GmbH <div class="publication_container">\n <div cla... 2021 [Auditor(name='Eckhard Lewe', company='Grant T...
4 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH <div class="publication_container">\n <div cla... 2020 [Auditor(name='Eckhard Lewe', company='Warth &...
5 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH <div class="publication_container">\n <div cla... 2019 [Auditor(name='Eckhard Lewe', company='Warth &...
6 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH <div class="publication_container">\n <div cla... 2018 [Auditor(name='Ulrich Diersch', company='Warth...
In [9]:
# Keep only the rows whose type is "Jahresabschluss" and derive the year from
# the last space-separated token of the report name (the part after its last
# dot). Chaining .assign() on the filtered frame builds a new DataFrame instead
# of writing into a slice of df_reports, which avoids pandas'
# SettingWithCopyWarning.
# NOTE(review): this cell expects `type`, `name` and `report` columns in
# df_reports — confirm that the wrapper still returns them.
df_jahresabschluss = (
    df_reports.loc[df_reports["type"] == "Jahresabschluss"]
    .assign(
        jahr=lambda frame: frame["name"].apply(
            lambda name: name.split(" ")[-1].split(".")[-1]
        )
    )
    .drop(["name", "report", "type"], axis=1)
)
df_jahresabschluss.head()
Out[9]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
date company raw_report jahr
0 2023-07-11 Volkswagen Economy Service Erdle Bernhard Erdl... <div class="publication_container">\n <div cla... 2021
1 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... <div class="publication_container">\n <div cla... 2020
2 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... <div class="publication_container">\n <div cla... 2019

Datenextraktion

In [10]:
from bs4 import BeautifulSoup
from io import StringIO
In [11]:
# Use the raw HTML of the first row as the working sample; both names refer to
# the same report.
sample_report = sample_report_content = df_jahresabschluss.iloc[0].raw_report

Wirtschaftsprüfer

In [18]:
import re
from aki_prj23_transparenzregister.models.auditor import Auditor


def extract_auditor_company(report: str) -> "str | None":
    """Extract the auditing company from the raw HTML of a report.

    Looks at every ``<b>`` element that contains a ``<br>`` and returns the
    second line of that element's text, stripped of surrounding whitespace.
    Elements whose text has no second line are skipped instead of raising
    an ``IndexError``.

    Args:
        report (str): Raw HTML of the report.

    Returns:
        str | None: The second text line of the first matching ``<b>``
        element, or None if no element qualifies.
    """
    soup = BeautifulSoup(report, features="html.parser")
    for bold in soup.find_all("b"):
        # Only bold blocks containing a line break are candidates.
        if bold.find("br") is None:
            continue
        lines = bold.text.split("\n")
        if len(lines) > 1:
            return lines[1].strip()
    return None


def extract_auditors(report: str) -> list:
    """Extract all auditors named in the raw HTML of a report.

    An auditor is any run of letters, spaces and the punctuation ``, . ' -``
    that is directly followed by ``", Wirtschaftsprüfer"``. Every hit is
    paired with the company returned by ``extract_auditor_company``.

    Args:
        report (str): Raw HTML of the report.

    Returns:
        list: One ``Auditor`` per regex hit (empty if there is none).
    """
    auditor_company = extract_auditor_company(report)
    title_suffix = ", Wirtschaftsprüfer"
    # [^\W\d_] matches any Unicode letter, so names containing umlauts or ß
    # (e.g. "Jürgen Schäfer") are captured in full instead of being cut off
    # at the first non-ASCII character.
    auditor_regex = r"(?:[^\W\d_]|[ ,.'-])+" + title_suffix
    return [
        Auditor(hit.replace(title_suffix, "").lstrip(), auditor_company)
        for hit in re.findall(auditor_regex, report)
    ]
In [13]:
# Run the auditor extraction on the sample report selected above.
extract_auditors(sample_report)
Out[13]:
[]

Aufsichtsrat

TODO

Bilanz bzw. GuV

In [14]:
def extract_kpis(report_content: str) -> dict:
    """Extract key performance indicators (KPIs) from the text of one report.

    Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd

    Every KPI is searched with a case-insensitive regular expression made of
    English/German keywords followed by optional colons/whitespace and a
    number. Only the first match per KPI is used. Dots in the number are
    dropped and a comma is read as the decimal separator; a trailing ``m`` or
    ``b`` multiplies the value by 1e6 or 1e9 respectively.

    Args:
        report_content (str): Text of a single report.

    Returns:
        dict: Maps the KPI name to its value as float. KPIs that are not
        found, or whose number cannot be converted, are left out.
    """
    # Optional separator, then digits with , or . and an optional m/b suffix.
    number = r"[:\s]*([\d,.]+[mb]?)"

    # Keyword alternatives that introduce each KPI.
    kpi_keywords = {
        "revenue": r"revenue|umsatz|erlöse",
        "net_income": r"net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern",
        "ebit": r"ebit|operating income",
        "ebitda": r"ebitda",
        "gross_profit": r"gross profit|bruttogewinn",
        "operating_profit": r"operating profit|betriebsgewinn",
        "assets": r"total assets|bilanzsumme",
        "liabilities": r"total liabilities|gesamtverbindlichkeiten",
        "equity": r"shareholders'? equity|eigenkapital",
        "current_assets": r"current assets|umlaufvermögen",
        "current_liabilities": r"current liabilities|kurzfristige verbindlichkeiten",
        "long_term_debt": r"long[-\s]?term debt|langfristige verbindlichkeiten",
        "short_term_debt": r"short[-\s]?term debt|kurzfristige verbindlichkeiten",
        "cash_and_cash_equivalents": r"cash (?:and cash equivalents)?|barmittel",
        "dividends": r"dividends?|dividende",
        "cash_flow": r"cash flow|cashflow|cash flow from operating activities",
    }
    multipliers = {"m": 1_000_000, "b": 1_000_000_000}

    report_kpis = {}
    for kpi, keywords in kpi_keywords.items():
        match = re.search(
            rf"(?:{keywords}){number}",
            report_content,
            flags=re.IGNORECASE | re.UNICODE,
        )
        if match is None:
            continue

        # The capture group requires at least one character, so it is never empty.
        value = match.group(1)
        multiplier = multipliers.get(value[-1].lower(), 1)
        if multiplier != 1:
            value = value[:-1]

        # Drop dots, then turn the decimal comma into a dot.
        value = value.replace(".", "").replace(",", ".").strip()
        try:
            report_kpis[kpi] = float(value) * multiplier
        except ValueError:
            # Matches such as "..." or "1,2,3" are not numbers; skip the KPI.
            continue
    return report_kpis


# Strip the HTML markup and flatten line breaks to spaces before running the
# KPI patterns over the text.
extract_kpis(
    BeautifulSoup(sample_report, features="html.parser").get_text().replace("\n", " ")
)
Out[14]:
{'net_income': 100238.5, 'equity': 165322.34, 'current_assets': 435344.07}
In [15]:
import os

# Dump the plain text of the sample report to a file for manual inspection.
# The encoding is set explicitly so that umlauts are written the same way on
# every platform instead of depending on the locale's default encoding.
with open("./temp.txt", "w", encoding="utf-8") as text_file:
    text_file.write(
        BeautifulSoup(sample_report, features="html.parser")
        .get_text()
        .replace("\n", " ")
    )
In [16]:
def parse_tables(report: str) -> dict:
    """Print the column labels and dtypes of every ``std_table`` in a report.

    Exploration helper: each ``<table class="std_table">`` is parsed with
    ``pd.read_html`` and only its first resulting frame is inspected.

    Args:
        report (str): Raw HTML of the report.

    Returns:
        dict: Currently always empty; nothing is collected yet.
    """
    result = {}
    soup = BeautifulSoup(report, features="html.parser")
    for table in soup.find_all("table", {"class": "std_table"}):
        table_frame = pd.read_html(StringIO(str(table)))[0]
        print(table_frame.columns)
        print(table_frame.dtypes)
    return result


# Inspect the table structure of the sample report.
parse_tables(sample_report)
MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),
            ('Aktiva',    '31.12.2021  EUR'),
            ('Aktiva',    '31.12.2020  EUR')],
           )
Aktiva  Unnamed: 0_level_1    object
        31.12.2021  EUR       object
        31.12.2020  EUR       object
dtype: object
MultiIndex([('Passiva', 'Unnamed: 0_level_1'),
            ('Passiva',    '31.12.2021  EUR'),
            ('Passiva',    '31.12.2020  EUR')],
           )
Passiva  Unnamed: 0_level_1    object
         31.12.2021  EUR       object
         31.12.2020  EUR       object
dtype: object
Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')
Angaben zur Identifikation der Gesellschaft laut Registergericht      object
Angaben zur Identifikation der Gesellschaft laut Registergericht.1    object
dtype: object
MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),
            (           'Betrag',                'EUR')],
           )
Kreditentwicklung  Unnamed: 0_level_1    object
Betrag             EUR                   object
dtype: object
Out[16]:
{}
In [22]:
def get_bilanz(report: str) -> dict:
    """Extract the "Aktiva" and "Passiva" tables from a report.

    For each of the two positions the first ``<b>`` element whose string
    matches the position name is located, and the next
    ``<table class="std_table">`` after it is parsed with ``pd.read_html``.

    Args:
        report (str): Raw HTML of the report.

    Returns:
        dict: Maps "Aktiva" and "Passiva" to a DataFrame. The frame is empty
        if the heading or the table following it is missing.
    """
    soup = BeautifulSoup(report, features="html.parser")
    result = {}
    for pos in ["Aktiva", "Passiva"]:
        heading = soup.find("b", string=re.compile(pos))
        table = (
            heading.find_next("table", {"class": "std_table"})
            if heading is not None
            else None
        )
        if table is None:
            # Without this guard str(None) would be handed to read_html,
            # which raises because it contains no table.
            result[pos] = pd.DataFrame([])
            continue
        result[pos] = pd.read_html(StringIO(str(table)))[0]
    return result


# Extract both sides of the balance sheet and preview the "Passiva" table.
bilanz = get_bilanz(sample_report)
bilanz["Passiva"].head()
Out[22]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
In [23]:
def get_tables(raw_report: str) -> list:
    """Parse every ``std_table`` of a report into DataFrames.

    Args:
        raw_report (str): Raw HTML of the report.

    Returns:
        list: All frames that ``pd.read_html`` yields for the
        ``<table class="std_table">`` elements, in document order.
    """
    parsed_report = BeautifulSoup(raw_report, features="html.parser")
    frames = []
    for table_tag in parsed_report.find_all("table", {"class": "std_table"}):
        # read_html returns a list of frames; keep all of them.
        frames.extend(pd.read_html(StringIO(str(table_tag))))
    return frames


# Parse the sample report once and reuse the result, instead of running
# get_tables twice over the same HTML.
tables = get_tables(sample_report)

for df in tables:
    print(df.columns)
MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),
            ('Aktiva',    '31.12.2021  EUR'),
            ('Aktiva',    '31.12.2020  EUR')],
           )
MultiIndex([('Passiva', 'Unnamed: 0_level_1'),
            ('Passiva',    '31.12.2021  EUR'),
            ('Passiva',    '31.12.2020  EUR')],
           )
Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')
MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),
            (           'Betrag',                'EUR')],
           )