mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-22 11:42:55 +02:00
708 lines
25 KiB
Plaintext
708 lines
25 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Datenextraktion aus dem Bundesanzeiger"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Vorbereitung"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from deutschland.bundesanzeiger import Bundesanzeiger"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"dict_keys(['c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Query the Bundesanzeiger client for the reports published by one company.\n",
|
||
"# Alternative company kept for experiments: \"Atos IT-Dienstleistung und Beratung GmbH\"\n",
|
||
"ba = Bundesanzeiger()\n",
|
||
"reports = ba.get_reports(\"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\")\n",
|
||
"print(reports.keys())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Keep only the report payloads (the dict values), in the dict's iteration order.\n",
|
||
"report_contents = list(reports.values())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>company</th>\n",
|
||
" <th>report</th>\n",
|
||
" <th>raw_report</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2023-05-25</td>\n",
|
||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2023-05-24</td>\n",
|
||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" date name \\\n",
|
||
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||
"\n",
|
||
" company \\\n",
|
||
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||
"\n",
|
||
" report \\\n",
|
||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||
"\n",
|
||
" raw_report \n",
|
||
"0 <div class=\"publication_container\">\\n <div cla... \n",
|
||
"1 <div class=\"publication_container\">\\n <div cla... "
|
||
]
|
||
},
|
||
"execution_count": 35,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Build a DataFrame with one row per entry of report_contents and preview it.\n",
|
||
"df_reports = pd.DataFrame(report_contents)\n",
|
||
"df_reports.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 36,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>company</th>\n",
|
||
" <th>report</th>\n",
|
||
" <th>raw_report</th>\n",
|
||
" <th>type</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2023-05-25</td>\n",
|
||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>Jahresabschluss</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2023-05-24</td>\n",
|
||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>Jahresabschluss</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" date name \\\n",
|
||
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||
"\n",
|
||
" company \\\n",
|
||
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||
"\n",
|
||
" report \\\n",
|
||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||
"\n",
|
||
" raw_report type \n",
|
||
"0 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
|
||
]
|
||
},
|
||
"execution_count": 36,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# The report type is the first space-separated word of the report name.\n",
|
||
"df_reports[\"type\"] = [title.split(\" \")[0] for title in df_reports[\"name\"]]\n",
|
||
"df_reports.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>company</th>\n",
|
||
" <th>raw_report</th>\n",
|
||
" <th>jahr</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2023-05-25</td>\n",
|
||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>2020</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2023-05-24</td>\n",
|
||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>2019</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" date company \\\n",
|
||
"0 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||
"1 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||
"\n",
|
||
" raw_report jahr \n",
|
||
"0 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||
"1 <div class=\"publication_container\">\\n <div cla... 2019 "
|
||
]
|
||
},
|
||
"execution_count": 37,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Filter and derive in one chain so that a new frame is produced: assigning a\n",
|
||
"# column onto the filtered slice of df_reports triggers SettingWithCopyWarning.\n",
|
||
"df_jahresabschluss = (\n",
|
||
"    df_reports.loc[df_reports[\"type\"] == \"Jahresabschluss\"]\n",
|
||
"    # Year = text after the last \".\" of the last space-separated word of the\n",
|
||
"    # name (presumably a date such as 31.12.2020 - verify against other reports).\n",
|
||
"    .assign(\n",
|
||
"        jahr=lambda frame: frame[\"name\"].apply(\n",
|
||
"            lambda name: name.split(\" \")[-1].split(\".\")[-1]\n",
|
||
"        )\n",
|
||
"    )\n",
|
||
"    .drop(columns=[\"name\", \"report\", \"type\"])\n",
|
||
")\n",
|
||
"df_jahresabschluss.head()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Datenextraktion"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 38,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from bs4 import BeautifulSoup\n",
|
||
"from io import StringIO"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 39,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Both names refer to the raw_report value of the first row of df_jahresabschluss.\n",
|
||
"sample_report = sample_report_content = df_jahresabschluss.iloc[0][\"raw_report\"]"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Wirtschaftsprüfer"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import re\n",
|
||
"from dataclasses import dataclass\n",
|
||
"\n",
|
||
"\n",
|
||
"@dataclass\n",
|
||
"class Auditor:\n",
|
||
"    \"\"\"Auditor (Wirtschaftsprüfer) found in a report, together with the audit firm.\"\"\"\n",
|
||
"\n",
|
||
"    # Name as matched in the report text.\n",
|
||
"    name: str\n",
|
||
"    # Audit firm taken from extract_auditor_company; that function can return None.\n",
|
||
"    company: str\n",
|
||
"\n",
|
||
"\n",
|
||
"def extract_auditor_company(report: str) -> \"str | None\":\n",
|
||
"    \"\"\"Return the audit firm named in a report's HTML, if one can be found.\n",
|
||
"\n",
|
||
"    The value is read from the first <b> element that contains a <br> tag and\n",
|
||
"    whose text has at least two lines: the second line, stripped of whitespace.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        report: Raw HTML of the report.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        The extracted text, or None when no such <b> element exists.\n",
|
||
"    \"\"\"\n",
|
||
"    soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||
"    for bold in soup.find_all(\"b\"):\n",
|
||
"        if bold.find(\"br\") is None:\n",
|
||
"            continue\n",
|
||
"        lines = bold.text.split(\"\\n\")\n",
|
||
"        # A <br> does not guarantee a newline in the text; skip such elements\n",
|
||
"        # instead of failing with an IndexError.\n",
|
||
"        if len(lines) > 1:\n",
|
||
"            return lines[1].strip()\n",
|
||
"    return None\n",
|
||
"\n",
|
||
"\n",
|
||
"def extract_auditors(report: str) -> list:\n",
|
||
"    \"\"\"Find all persons labelled \"Wirtschaftsprüfer\" in a report.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        report: Raw HTML of the report.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        One Auditor per regex hit, each carrying the value returned by\n",
|
||
"        extract_auditor_company (possibly None). Empty when nothing matches.\n",
|
||
"    \"\"\"\n",
|
||
"    auditor_company = extract_auditor_company(report)\n",
|
||
"    suffix = \", Wirtschaftsprüfer\"\n",
|
||
"    # [^\\W\\d_] matches any Unicode letter, so names with umlauts or ß (e.g.\n",
|
||
"    # \"Müller\") are captured in full; an ASCII-only class cuts them off at the\n",
|
||
"    # first non-ASCII letter. The lazy quantifier stops at the first suffix.\n",
|
||
"    auditor_regex = r\"(?:[^\\W\\d_]|[ ,.'-])+?\" + re.escape(suffix)\n",
|
||
"    return [\n",
|
||
"        Auditor(hit[: -len(suffix)].strip(), auditor_company)\n",
|
||
"        for hit in re.findall(auditor_regex, report)\n",
|
||
"    ]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"[]"
|
||
]
|
||
},
|
||
"execution_count": 41,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Run the auditor extraction on the sample report.\n",
|
||
"extract_auditors(sample_report)"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Aufsichtsrat"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"**TODO**"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Bilanz bzw. GuV"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 42,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}"
|
||
]
|
||
},
|
||
"execution_count": 42,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"def _parse_kpi_number(value):\n",
|
||
"    \"\"\"Convert a matched number string (optional m/b suffix) to float, or None.\"\"\"\n",
|
||
"    if not value:\n",
|
||
"        return None\n",
|
||
"    multiplier = 1\n",
|
||
"    suffix = value[-1].lower()\n",
|
||
"    if suffix == \"m\":\n",
|
||
"        value, multiplier = value[:-1], 1_000_000\n",
|
||
"    elif suffix == \"b\":\n",
|
||
"        value, multiplier = value[:-1], 1_000_000_000\n",
|
||
"    # German notation: \".\" groups thousands and \",\" is the decimal separator.\n",
|
||
"    normalized = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
|
||
"    try:\n",
|
||
"        return float(normalized) * multiplier\n",
|
||
"    except ValueError:\n",
|
||
"        # e.g. a match consisting only of separators such as \"...\"\n",
|
||
"        return None\n",
|
||
"\n",
|
||
"\n",
|
||
"def extract_kpis(report_content) -> dict:\n",
|
||
"    \"\"\"Extract Key Performance Indicators (KPIs) from the text of one report.\n",
|
||
"\n",
|
||
"    Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        report_content (str): Text of a single financial report.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        dict: KPI name (e.g. \"net_income\") mapped to a float. Only the first\n",
|
||
"        match per KPI is used; KPIs without a match or with an unparsable\n",
|
||
"        number are left out.\n",
|
||
"    \"\"\"\n",
|
||
"    # Every label may be followed by \":\"/whitespace and then a number that can\n",
|
||
"    # carry an \"m\" (x 1,000,000) or \"b\" (x 1,000,000,000) suffix.\n",
|
||
"    number = r\"[:\\s]*([\\d,.]+[mb]?)\"\n",
|
||
"    kpi_labels = {\n",
|
||
"        \"revenue\": r\"revenue|umsatz|erlöse\",\n",
|
||
"        \"net_income\": r\"net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern\",\n",
|
||
"        \"ebit\": r\"ebit|operating income\",\n",
|
||
"        \"ebitda\": r\"ebitda\",\n",
|
||
"        \"gross_profit\": r\"gross profit|bruttogewinn\",\n",
|
||
"        \"operating_profit\": r\"operating profit|betriebsgewinn\",\n",
|
||
"        \"assets\": r\"total assets|bilanzsumme\",\n",
|
||
"        \"liabilities\": r\"total liabilities|gesamtverbindlichkeiten\",\n",
|
||
"        \"equity\": r\"shareholders'? equity|eigenkapital\",\n",
|
||
"        \"current_assets\": r\"current assets|umlaufvermögen\",\n",
|
||
"        \"current_liabilities\": r\"current liabilities|kurzfristige verbindlichkeiten\",\n",
|
||
"        \"long_term_debt\": r\"long[-\\s]?term debt|langfristige verbindlichkeiten\",\n",
|
||
"        \"short_term_debt\": r\"short[-\\s]?term debt|kurzfristige verbindlichkeiten\",\n",
|
||
"        \"cash_and_cash_equivalents\": r\"cash (?:and cash equivalents)?|barmittel\",\n",
|
||
"        \"dividends\": r\"dividends?|dividende\",\n",
|
||
"        \"cash_flow\": r\"cash flow|cashflow|cash flow from operating activities\",\n",
|
||
"    }\n",
|
||
"\n",
|
||
"    report_kpis = {}\n",
|
||
"    for kpi, labels in kpi_labels.items():\n",
|
||
"        match = re.search(\n",
|
||
"            f\"(?:{labels}){number}\", report_content, flags=re.IGNORECASE | re.UNICODE\n",
|
||
"        )\n",
|
||
"        if match is None:\n",
|
||
"            continue\n",
|
||
"        cleaned_value = _parse_kpi_number(match.group(1))\n",
|
||
"        if cleaned_value is not None:\n",
|
||
"            report_kpis[kpi] = cleaned_value\n",
|
||
"    return report_kpis\n",
|
||
"\n",
|
||
"\n",
|
||
"# Flatten the report HTML to a single line of text before searching for KPIs.\n",
|
||
"extract_kpis(\n",
|
||
"    BeautifulSoup(sample_report, features=\"html.parser\")\n",
|
||
"    .get_text()\n",
|
||
"    .replace(\"\\n\", \" \")\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 43,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"\n",
|
||
"# Dump the flattened report text to ./temp.txt for manual inspection. The\n",
|
||
"# explicit encoding keeps the write independent of the platform's locale default.\n",
|
||
"with open(\"./temp.txt\", \"w\", encoding=\"utf-8\") as file:\n",
|
||
"    file.write(\n",
|
||
"        BeautifulSoup(sample_report, features=\"html.parser\")\n",
|
||
"        .get_text()\n",
|
||
"        .replace(\"\\n\", \" \")\n",
|
||
"    )"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 46,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
|
||
" ('Aktiva', '31.12.2020 EUR'),\n",
|
||
" ('Aktiva', '31.12.2019 EUR')],\n",
|
||
" )\n",
|
||
"Aktiva Unnamed: 0_level_1 object\n",
|
||
" 31.12.2020 EUR object\n",
|
||
" 31.12.2019 EUR object\n",
|
||
"dtype: object\n",
|
||
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
|
||
" ('Passiva', '31.12.2020 EUR'),\n",
|
||
" ('Passiva', '31.12.2019 EUR')],\n",
|
||
" )\n",
|
||
"Passiva Unnamed: 0_level_1 object\n",
|
||
" 31.12.2020 EUR object\n",
|
||
" 31.12.2019 EUR object\n",
|
||
"dtype: object\n",
|
||
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
|
||
"Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
|
||
"Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
|
||
"dtype: object\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"{}"
|
||
]
|
||
},
|
||
"execution_count": 46,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"def parse_tables(report: str) -> dict:\n",
|
||
"    \"\"\"Print the column labels and dtypes of every std_table in a report.\n",
|
||
"\n",
|
||
"    Exploration helper: nothing is collected yet, so the returned dict is\n",
|
||
"    always empty.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        report: Raw HTML of the report.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        An empty dict.\n",
|
||
"    \"\"\"\n",
|
||
"    result = {}\n",
|
||
"    soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||
"    for table_tag in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
|
||
"        # read_html returns a list of frames; only the first one is inspected.\n",
|
||
"        frame = pd.read_html(StringIO(str(table_tag)))[0]\n",
|
||
"        print(frame.columns)\n",
|
||
"        print(frame.dtypes)\n",
|
||
"    return result\n",
|
||
"\n",
|
||
"\n",
|
||
"parse_tables(sample_report)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 45,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"ename": "KeyError",
|
||
"evalue": "'Passiva'",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
|
||
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n",
|
||
"\u001b[1;31mKeyError\u001b[0m: 'Passiva'"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"def get_bilanz(report: str) -> dict:\n",
|
||
"    \"\"\"Extract the balance sheet tables (\"Aktiva\" / \"Passiva\") of a report.\n",
|
||
"\n",
|
||
"    For each side, the first <b> element whose string matches the side's name\n",
|
||
"    is located and the next std_table after it is parsed with pandas.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        report: Raw HTML of the report.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        Dict mapping \"Aktiva\" and/or \"Passiva\" to a DataFrame. A side is left\n",
|
||
"        out when its heading or a following table is not found.\n",
|
||
"    \"\"\"\n",
|
||
"    result = {}\n",
|
||
"    soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||
"    for pos in [\"Aktiva\", \"Passiva\"]:\n",
|
||
"        heading = soup.find(\"b\", string=re.compile(pos))\n",
|
||
"        if heading is None:\n",
|
||
"            continue\n",
|
||
"        table = heading.find_next(\"table\", {\"class\": \"std_table\"})\n",
|
||
"        # Without this guard str(None) would be handed to read_html, which\n",
|
||
"        # raises ValueError because the text contains no table.\n",
|
||
"        if table is None:\n",
|
||
"            continue\n",
|
||
"        result[pos] = pd.read_html(StringIO(str(table)))[0]\n",
|
||
"    return result\n",
|
||
"\n",
|
||
"\n",
|
||
"bilanz = get_bilanz(sample_report)\n",
|
||
"# A side is absent when its heading was not found; list the sides that were\n",
|
||
"# found instead of raising KeyError.\n",
|
||
"bilanz[\"Passiva\"].head() if \"Passiva\" in bilanz else sorted(bilanz)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Int64Index([0, 1], dtype='int64')\n",
|
||
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
|
||
" 'Vorjahr TEUR'],\n",
|
||
" dtype='object')\n",
|
||
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||
"Int64Index([0, 1, 2], dtype='int64')\n",
|
||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
||
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
|
||
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
|
||
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
|
||
" )\n",
|
||
"Int64Index([0, 1], dtype='int64')\n",
|
||
"MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
|
||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
|
||
" )\n",
|
||
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
|
||
" ( 'Abschreibungen', ...),\n",
|
||
" ( 'Abschreibungen', ...),\n",
|
||
" ( 'Abschreibungen', ...),\n",
|
||
" ( 'Abschreibungen', ...)],\n",
|
||
" )\n",
|
||
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
||
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
|
||
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
|
||
" )\n",
|
||
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
|
||
" '2019'],\n",
|
||
" dtype='object')\n",
|
||
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
|
||
" 'Veränderung TEUR'],\n",
|
||
" dtype='object')\n",
|
||
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"def get_tables(raw_report: str) -> list:\n",
|
||
"    \"\"\"Parse every std_table of a report into pandas DataFrames.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        raw_report: Raw HTML of the report.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        All frames returned by read_html for each table, in document order.\n",
|
||
"    \"\"\"\n",
|
||
"    document = BeautifulSoup(raw_report, features=\"html.parser\")\n",
|
||
"    frames = []\n",
|
||
"    for table_tag in document.find_all(\"table\", {\"class\": \"std_table\"}):\n",
|
||
"        frames.extend(pd.read_html(StringIO(str(table_tag))))\n",
|
||
"    return frames\n",
|
||
"\n",
|
||
"\n",
|
||
"# Parse the report once and reuse the result for the column overview.\n",
|
||
"tables = get_tables(sample_report)\n",
|
||
"for df in tables:\n",
|
||
"    print(df.columns)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.7"
|
||
},
|
||
"orig_nbformat": 4
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|