mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-22 11:42:55 +02:00
622 lines
20 KiB
Plaintext
622 lines
20 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Datenextraktion aus dem Bundesanzeiger"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Vorbereitung"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>date</th>\n",
|
|
" <th>company</th>\n",
|
|
" <th>raw_report</th>\n",
|
|
" <th>jahr</th>\n",
|
|
" <th>auditors</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>2023-07-07</td>\n",
|
|
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2021</td>\n",
|
|
" <td>[]</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>2023-05-10</td>\n",
|
|
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2021</td>\n",
|
|
" <td>[Auditor(name='Eckhard Lewe', company='Grant T...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>2022-03-25</td>\n",
|
|
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2020</td>\n",
|
|
" <td>[Auditor(name='Eckhard Lewe', company='Warth &...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>5</th>\n",
|
|
" <td>2021-03-11</td>\n",
|
|
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2019</td>\n",
|
|
" <td>[Auditor(name='Eckhard Lewe', company='Warth &...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>6</th>\n",
|
|
" <td>2020-03-24</td>\n",
|
|
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2018</td>\n",
|
|
" <td>[Auditor(name='Ulrich Diersch', company='Warth...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" date company \\\n",
|
|
"0 2023-07-07 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
"2 2023-05-10 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
"4 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
"5 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
"6 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
"\n",
|
|
" raw_report jahr \\\n",
|
|
"0 <div class=\"publication_container\">\\n <div cla... 2021 \n",
|
|
"2 <div class=\"publication_container\">\\n <div cla... 2021 \n",
|
|
"4 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
|
"5 <div class=\"publication_container\">\\n <div cla... 2019 \n",
|
|
"6 <div class=\"publication_container\">\\n <div cla... 2018 \n",
|
|
"\n",
|
|
" auditors \n",
|
|
"0 [] \n",
|
|
"2 [Auditor(name='Eckhard Lewe', company='Grant T... \n",
|
|
"4 [Auditor(name='Eckhard Lewe', company='Warth &... \n",
|
|
"5 [Auditor(name='Eckhard Lewe', company='Warth &... \n",
|
|
"6 [Auditor(name='Ulrich Diersch', company='Warth... "
|
|
]
|
|
},
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (\n",
|
|
" Bundesanzeiger,\n",
|
|
")\n",
|
|
"\n",
|
|
"ba_wrapper = Bundesanzeiger()\n",
|
|
"df_reports = ba_wrapper.get_information(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
|
"df_reports.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>date</th>\n",
|
|
" <th>company</th>\n",
|
|
" <th>raw_report</th>\n",
|
|
" <th>jahr</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>2023-07-11</td>\n",
|
|
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2021</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>2023-05-25</td>\n",
|
|
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2020</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>2023-05-24</td>\n",
|
|
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2019</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" date company \\\n",
|
|
"0 2023-07-11 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
|
"1 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
|
"2 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
|
"\n",
|
|
" raw_report jahr \n",
|
|
"0 <div class=\"publication_container\">\\n <div cla... 2021 \n",
|
|
"1 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
|
"2 <div class=\"publication_container\">\\n <div cla... 2019 "
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df_jahresabschluss = df_reports.loc[df_reports.type == \"Jahresabschluss\"]\n",
|
|
"df_jahresabschluss[\"jahr\"] = df_jahresabschluss.name.apply(\n",
|
|
" lambda name: name.split(\" \")[-1].split(\".\")[-1]\n",
|
|
")\n",
|
|
"df_jahresabschluss = df_jahresabschluss.drop([\"name\", \"report\", \"type\"], axis=1)\n",
|
|
"df_jahresabschluss.head()"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Datenextraktion"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from bs4 import BeautifulSoup\n",
|
|
"from io import StringIO"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"sample_report = df_jahresabschluss.iloc[0].raw_report\n",
|
|
"sample_report_content = df_jahresabschluss.iloc[0].raw_report"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Wirtschaftsprüfer"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import re\n",
|
|
"from aki_prj23_transparenzregister.models.auditor import Auditor\n",
|
|
"\n",
|
|
"\n",
|
|
"def extract_auditor_company(report: str) -> str:\n",
|
|
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
|
" temp = soup.find_all(\"b\")\n",
|
|
" for elem in temp:\n",
|
|
" br = elem.findChildren(\"br\")\n",
|
|
" if len(br) > 0:\n",
|
|
" return elem.text.split(\"\\n\")[1].strip()\n",
|
|
" return None\n",
|
|
"\n",
|
|
"\n",
|
|
"def extract_auditors(report: str) -> list:\n",
|
|
" auditor_company = extract_auditor_company(report)\n",
|
|
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
|
|
" hits = re.findall(auditor_regex, report)\n",
|
|
" return [\n",
|
|
" Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
|
|
" for hit in hits\n",
|
|
" ]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[]"
|
|
]
|
|
},
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"extract_auditors(sample_report)"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Aufsichtsrat"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"**TODO:** Die Extraktion der Aufsichtsratsmitglieder ist noch nicht umgesetzt."
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Bilanz bzw. GuV"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'net_income': 100238.5, 'equity': 165322.34, 'current_assets': 435344.07}"
|
|
]
|
|
},
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"def extract_kpis(report_content) -> dict:\n",
|
|
" \"\"\"\n",
|
|
" Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
|
|
" Extracts Key Performance Indicators (KPIs) from the text of a single financial report.\n",
|
|
" Args:\n",
|
|
" report_content (str): Text of the report in which the KPI patterns are searched.\n",
|
|
" Returns:\n",
|
|
" dict: A dictionary mapping the name of each KPI that was found (e.g. 'net_income') to its parsed value as a float.\n",
|
|
" \"\"\"\n",
|
|
"\n",
|
|
" kpis = {}\n",
|
|
"\n",
|
|
" # Define KPI patterns to search for\n",
|
|
" kpi_patterns = {\n",
|
|
" \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
|
" }\n",
|
|
"\n",
|
|
" report_kpis = {}\n",
|
|
" for kpi, pattern in kpi_patterns.items():\n",
|
|
" match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
|
|
" if match:\n",
|
|
" value = match.group(1)\n",
|
|
"\n",
|
|
" # Clean and validate the extracted number\n",
|
|
" try:\n",
|
|
" if not value: # Check if value is empty\n",
|
|
" cleaned_value = None\n",
|
|
" else:\n",
|
|
" multiplier = 1\n",
|
|
" if value[-1].lower() == \"m\":\n",
|
|
" value = value[:-1]\n",
|
|
" multiplier = 1_000_000\n",
|
|
" elif value[-1].lower() == \"b\":\n",
|
|
" value = value[:-1]\n",
|
|
" multiplier = 1_000_000_000\n",
|
|
"\n",
|
|
" # German number format: drop the thousands dots, then turn the decimal comma into a dot\n",
|
|
" value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
|
|
" cleaned_value = float(value) * multiplier\n",
|
|
" except ValueError:\n",
|
|
" cleaned_value = None\n",
|
|
"\n",
|
|
" if cleaned_value is not None:\n",
|
|
" report_kpis[kpi] = cleaned_value\n",
|
|
" return report_kpis\n",
|
|
"\n",
|
|
"\n",
|
|
"extract_kpis(\n",
|
|
" BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"\n",
|
|
"with open(\"./temp.txt\", \"w\") as file:\n",
|
|
" file.write(\n",
|
|
" BeautifulSoup(sample_report, features=\"html.parser\")\n",
|
|
" .get_text()\n",
|
|
" .replace(\"\\n\", \" \")\n",
|
|
" )"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
|
|
" ('Aktiva', '31.12.2021 EUR'),\n",
|
|
" ('Aktiva', '31.12.2020 EUR')],\n",
|
|
" )\n",
|
|
"Aktiva Unnamed: 0_level_1 object\n",
|
|
" 31.12.2021 EUR object\n",
|
|
" 31.12.2020 EUR object\n",
|
|
"dtype: object\n",
|
|
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
|
|
" ('Passiva', '31.12.2021 EUR'),\n",
|
|
" ('Passiva', '31.12.2020 EUR')],\n",
|
|
" )\n",
|
|
"Passiva Unnamed: 0_level_1 object\n",
|
|
" 31.12.2021 EUR object\n",
|
|
" 31.12.2020 EUR object\n",
|
|
"dtype: object\n",
|
|
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
|
|
"Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
|
|
"Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
|
|
"dtype: object\n",
|
|
"MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),\n",
|
|
" ( 'Betrag', 'EUR')],\n",
|
|
" )\n",
|
|
"Kreditentwicklung Unnamed: 0_level_1 object\n",
|
|
"Betrag EUR object\n",
|
|
"dtype: object\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{}"
|
|
]
|
|
},
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"def parse_tables(report: str) -> list:\n",
|
|
" result = {}\n",
|
|
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
|
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
|
|
" df = pd.read_html(StringIO(str(table)))[0]\n",
|
|
" print(df.columns)\n",
|
|
" print(df.dtypes)\n",
|
|
" return result\n",
|
|
"\n",
|
|
"\n",
|
|
"parse_tables(sample_report)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
"Empty DataFrame\n",
|
|
"Columns: []\n",
|
|
"Index: []"
|
|
]
|
|
},
|
|
"execution_count": 22,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"def get_bilanz(report: str) -> any:\n",
|
|
" result = {}\n",
|
|
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
|
" for pos in [\"Aktiva\", \"Passiva\"]:\n",
|
|
" tag = soup.find(\"b\", string=re.compile(pos))\n",
|
|
" if tag:\n",
|
|
" pos_results = pd.read_html(\n",
|
|
" StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n",
|
|
" )[0]\n",
|
|
" result[pos] = pos_results\n",
|
|
" else:\n",
|
|
" result[pos] = pd.DataFrame([])\n",
|
|
" return result\n",
|
|
"\n",
|
|
"\n",
|
|
"bilanz = get_bilanz(sample_report)\n",
|
|
"bilanz[\"Passiva\"].head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
|
|
" ('Aktiva', '31.12.2021 EUR'),\n",
|
|
" ('Aktiva', '31.12.2020 EUR')],\n",
|
|
" )\n",
|
|
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
|
|
" ('Passiva', '31.12.2021 EUR'),\n",
|
|
" ('Passiva', '31.12.2020 EUR')],\n",
|
|
" )\n",
|
|
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
|
|
"MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),\n",
|
|
" ( 'Betrag', 'EUR')],\n",
|
|
" )\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def get_tables(raw_report: str) -> list:\n",
|
|
" soup = BeautifulSoup(raw_report, features=\"html.parser\")\n",
|
|
" tables = soup.find_all(\"table\", {\"class\": \"std_table\"})\n",
|
|
" dfs = []\n",
|
|
" for table in tables:\n",
|
|
" for df in pd.read_html(StringIO(str(table))):\n",
|
|
" dfs.append(df)\n",
|
|
" return dfs\n",
|
|
"\n",
|
|
"\n",
|
|
"for df in get_tables(sample_report):\n",
|
|
" print(df.columns)\n",
|
|
"\n",
|
|
"tables = get_tables(sample_report)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.3"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|