2023-08-18 14:15:05 +02:00

720 lines
26 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Daten Extraktion aus dem Bundesanzeiger"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Vorbereitung"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'deutschland'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mdeutschland\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbundesanzeiger\u001b[39;00m \u001b[39mimport\u001b[39;00m Bundesanzeiger\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'deutschland'"
]
}
],
"source": [
"import pandas as pd\n",
"from deutschland.bundesanzeiger import Bundesanzeiger"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_keys(['c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n"
]
}
],
"source": [
"ba = Bundesanzeiger()\n",
"reports = ba.get_reports(\n",
" \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n",
") # \"Atos IT-Dienstleistung und Beratung GmbH\")\n",
"print(reports.keys())"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"report_contents = []\n",
"for key in reports.keys():\n",
" report_contents.append(reports[key])"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>name</th>\n",
" <th>company</th>\n",
" <th>report</th>\n",
" <th>raw_report</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-05-25</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023-05-24</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date name \\\n",
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"\n",
" company \\\n",
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"\n",
" report \\\n",
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"\n",
" raw_report \n",
"0 <div class=\"publication_container\">\\n <div cla... \n",
"1 <div class=\"publication_container\">\\n <div cla... "
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_reports = pd.DataFrame(report_contents)\n",
"df_reports.head()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>name</th>\n",
" <th>company</th>\n",
" <th>report</th>\n",
" <th>raw_report</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-05-25</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023-05-24</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date name \\\n",
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"\n",
" company \\\n",
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"\n",
" report \\\n",
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"\n",
" raw_report type \n",
"0 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_reports[\"type\"] = df_reports.name.apply(lambda name: name.split(\" \")[0])\n",
"df_reports.head()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>company</th>\n",
" <th>raw_report</th>\n",
" <th>jahr</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-05-25</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2020</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023-05-24</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2019</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date company \\\n",
"0 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"1 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"\n",
" raw_report jahr \n",
"0 <div class=\"publication_container\">\\n <div cla... 2020 \n",
"1 <div class=\"publication_container\">\\n <div cla... 2019 "
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_jahresabschluss = df_reports.loc[df_reports.type == \"Jahresabschluss\"]\n",
"df_jahresabschluss[\"jahr\"] = df_jahresabschluss.name.apply(\n",
" lambda name: name.split(\" \")[-1].split(\".\")[-1]\n",
")\n",
"df_jahresabschluss = df_jahresabschluss.drop([\"name\", \"report\", \"type\"], axis=1)\n",
"df_jahresabschluss.head()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Daten Extraktion"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"from io import StringIO"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"sample_report = df_jahresabschluss.iloc[0].raw_report\n",
"sample_report_content = df_jahresabschluss.iloc[0].raw_report"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Wirtschaftsprüfer"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from dataclasses import dataclass\n",
"\n",
"\n",
"@dataclass\n",
"class Auditor:\n",
" name: str\n",
" company: str\n",
"\n",
"\n",
"def extract_auditor_company(report: str) -> str:\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" temp = soup.find_all(\"b\")\n",
" for elem in temp:\n",
" br = elem.findChildren(\"br\")\n",
" if len(br) > 0:\n",
" return elem.text.split(\"\\n\")[1].strip()\n",
" return None\n",
"\n",
"\n",
"def extract_auditors(report: str) -> list:\n",
" auditor_company = extract_auditor_company(report)\n",
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
" hits = re.findall(auditor_regex, report)\n",
" return [\n",
" Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
" for hit in hits\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_auditors(sample_report)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Aufsichtsrat"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"**TODO**"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Bilanz bzw. GuV"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def extract_kpis(report_content) -> dict:\n",
" \"\"\"\n",
" Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
" Extracts Key Performance Indicators (KPIs) from the financial reports.\n",
" Args:\n",
" reports (dict): A dictionary containing the financial reports with their hash as keys and report details as values.\n",
" Returns:\n",
" dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.\n",
" \"\"\"\n",
"\n",
" kpis = {}\n",
"\n",
" # Define KPI patterns to search for\n",
" kpi_patterns = {\n",
" \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" }\n",
"\n",
" report_kpis = {}\n",
" for kpi, pattern in kpi_patterns.items():\n",
" match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
" if match:\n",
" value = match.group(1)\n",
"\n",
" # Clean and validate the extracted number\n",
" try:\n",
" if not value: # Check if value is empty\n",
" cleaned_value = None\n",
" else:\n",
" multiplier = 1\n",
" if value[-1].lower() == \"m\":\n",
" value = value[:-1]\n",
" multiplier = 1_000_000\n",
" elif value[-1].lower() == \"b\":\n",
" value = value[:-1]\n",
" multiplier = 1_000_000_000\n",
"\n",
" # Remove commas after checking for multipliers\n",
" value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
" cleaned_value = float(value) * multiplier\n",
" except ValueError:\n",
" cleaned_value = None\n",
"\n",
" if cleaned_value is not None:\n",
" report_kpis[kpi] = cleaned_value\n",
" return report_kpis\n",
"\n",
"\n",
"extract_kpis(\n",
" BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"with open(\"./temp.txt\", \"w\") as file:\n",
" file.write(\n",
" BeautifulSoup(sample_report, features=\"html.parser\")\n",
" .get_text()\n",
" .replace(\"\\n\", \" \")\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
" ('Aktiva', '31.12.2020 EUR'),\n",
" ('Aktiva', '31.12.2019 EUR')],\n",
" )\n",
"Aktiva Unnamed: 0_level_1 object\n",
" 31.12.2020 EUR object\n",
" 31.12.2019 EUR object\n",
"dtype: object\n",
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
" ('Passiva', '31.12.2020 EUR'),\n",
" ('Passiva', '31.12.2019 EUR')],\n",
" )\n",
"Passiva Unnamed: 0_level_1 object\n",
" 31.12.2020 EUR object\n",
" 31.12.2019 EUR object\n",
"dtype: object\n",
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
"Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
"Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
"dtype: object\n"
]
},
{
"data": {
"text/plain": [
"{}"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def parse_tables(report: str) -> list:\n",
" result = {}\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
" df = pd.read_html(StringIO(str(table)))[0]\n",
" print(df.columns)\n",
" print(df.dtypes)\n",
" return result\n",
"\n",
"\n",
"parse_tables(sample_report)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'Passiva'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n",
"\u001b[1;31mKeyError\u001b[0m: 'Passiva'"
]
}
],
"source": [
"def get_bilanz(report: str) -> any:\n",
" result = {}\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for pos in [\"Aktiva\", \"Passiva\"]:\n",
" tag = soup.find(\"b\", string=re.compile(pos))\n",
" if tag:\n",
" pos_results = pd.read_html(\n",
" StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n",
" )[0]\n",
" result[pos] = pos_results\n",
" return result\n",
"\n",
"\n",
"bilanz = get_bilanz(sample_report)\n",
"bilanz[\"Passiva\"].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Int64Index([0, 1], dtype='int64')\n",
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
" 'Vorjahr TEUR'],\n",
" dtype='object')\n",
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Int64Index([0, 1, 2], dtype='int64')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
" )\n",
"Int64Index([0, 1], dtype='int64')\n",
"MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
" )\n",
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
" '2019'],\n",
" dtype='object')\n",
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
" 'Veränderung TEUR'],\n",
" dtype='object')\n",
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
]
}
],
"source": [
"def get_tables(raw_report: str) -> list:\n",
" soup = BeautifulSoup(raw_report, features=\"html.parser\")\n",
" tables = soup.find_all(\"table\", {\"class\": \"std_table\"})\n",
" dfs = []\n",
" for table in tables:\n",
" for df in pd.read_html(StringIO(str(table))):\n",
" dfs.append(df)\n",
" return dfs\n",
"\n",
"\n",
"for df in get_tables(sample_report):\n",
" print(df.columns)\n",
"\n",
"tables = get_tables(sample_report)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}