Files
aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb

1790 lines
69 KiB
Plaintext

{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Daten Extraktion aus dem Bundesanzeiger"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Vorbereitung"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>company</th>\n",
" <th>raw_report</th>\n",
" <th>jahr</th>\n",
" <th>auditors</th>\n",
" <th>financial_results</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2022-10-21</td>\n",
" <td>Stadtwerke Haltern am See Gesellschaft mit bes...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2021</td>\n",
" <td>[Auditor(name='Volker Voelcker', company='Pric...</td>\n",
" <td>{'revenue': 46275.0, 'net_income': 1757.0, 'eb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2021-10-12</td>\n",
" <td>Stadtwerke Haltern am See Gesellschaft mit bes...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2020</td>\n",
" <td>[Auditor(name='Hubert Ahlers', company='Pricew...</td>\n",
" <td>{'revenue': 47459.0, 'net_income': 1661.0, 'eb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2020-12-03</td>\n",
" <td>Stadtwerke Haltern am See Gesellschaft mit bes...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2019</td>\n",
" <td>[Auditor(name='Hubert Ahlers', company='Pricew...</td>\n",
" <td>{'revenue': 45575.0, 'net_income': 1599.0, 'eb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2020-01-09</td>\n",
" <td>Stadtwerke Haltern am See Gesellschaft mit bes...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2018</td>\n",
" <td>[Auditor(name='Hubert Ahlers', company='Pricew...</td>\n",
" <td>{'revenue': 43898.0, 'net_income': 2043.0, 'eb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>2019-10-10</td>\n",
" <td>Stadtwerke Haltern am See Gesellschaft mit bes...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2017</td>\n",
" <td>[]</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date company \\\n",
"1 2022-10-21 Stadtwerke Haltern am See Gesellschaft mit bes... \n",
"3 2021-10-12 Stadtwerke Haltern am See Gesellschaft mit bes... \n",
"5 2020-12-03 Stadtwerke Haltern am See Gesellschaft mit bes... \n",
"6 2020-01-09 Stadtwerke Haltern am See Gesellschaft mit bes... \n",
"7 2019-10-10 Stadtwerke Haltern am See Gesellschaft mit bes... \n",
"\n",
" raw_report jahr \\\n",
"1 <div class=\"publication_container\">\\n <div cla... 2021 \n",
"3 <div class=\"publication_container\">\\n <div cla... 2020 \n",
"5 <div class=\"publication_container\">\\n <div cla... 2019 \n",
"6 <div class=\"publication_container\">\\n <div cla... 2018 \n",
"7 <div class=\"publication_container\">\\n <div cla... 2017 \n",
"\n",
" auditors \\\n",
"1 [Auditor(name='Volker Voelcker', company='Pric... \n",
"3 [Auditor(name='Hubert Ahlers', company='Pricew... \n",
"5 [Auditor(name='Hubert Ahlers', company='Pricew... \n",
"6 [Auditor(name='Hubert Ahlers', company='Pricew... \n",
"7 [] \n",
"\n",
" financial_results \n",
"1 {'revenue': 46275.0, 'net_income': 1757.0, 'eb... \n",
"3 {'revenue': 47459.0, 'net_income': 1661.0, 'eb... \n",
"5 {'revenue': 45575.0, 'net_income': 1599.0, 'eb... \n",
"6 {'revenue': 43898.0, 'net_income': 2043.0, 'eb... \n",
"7 {} "
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (\n",
" Bundesanzeiger,\n",
")\n",
"\n",
"ba_wrapper = Bundesanzeiger()\n",
"# df_reports = ba_wrapper.get_information(\"Törmer Energy Solar 1 GmbH & Co. KG\", \"\")\n",
"# df_reports = ba_wrapper.get_information(\"Atos IT-Dienstleistung und Beratung GmbH\", \"\")\n",
"df_reports = ba_wrapper.get_information(\n",
" \"Stadtwerke Haltern am See Gesellschaft mit beschränkter Haftung\", \"\"\n",
")\n",
"df_reports.head()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Daten Extraktion"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"from io import StringIO"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"sample_report = df_reports.iloc[1].raw_report"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Aufsichtsrat"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"**TODO**"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Bilanz bzw. GuV"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( '2020', 'T€'),\n",
" ( '2019', 'T€'),\n",
" ( 'Veränderung', 'T€'),\n",
" ( 'Veränderung', '%')],\n",
" )\n",
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"2020 T€ float64\n",
"2019 T€ float64\n",
"Veränderung T€ int64\n",
" % int64\n",
"dtype: object\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( '2020', 'T€'),\n",
" ( '2020', '%'),\n",
" ( '2019', 'T€'),\n",
" ( '2019', '%'),\n",
" ( 'Veränderungen', 'T€'),\n",
" ( 'Veränderungen', '%')],\n",
" )\n",
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"2020 T€ float64\n",
" % int64\n",
"2019 T€ float64\n",
" % int64\n",
"Veränderungen T€ float64\n",
" % int64\n",
"dtype: object\n",
"MultiIndex([('Unnamed: 0_level_0', 'gerundet'),\n",
" ( '2020', 'T€'),\n",
" ( '2019', 'T€'),\n",
" ( 'Veränderung', 'T€'),\n",
" ( 'Veränderung', '%')],\n",
" )\n",
"Unnamed: 0_level_0 gerundet object\n",
"2020 T€ float64\n",
"2019 T€ float64\n",
"Veränderung T€ float64\n",
" % int64\n",
"dtype: object\n",
"MultiIndex([('Unnamed: 0_level_0', 'unkonsolidiert gerundet'),\n",
" ( '2020', 'T€'),\n",
" ( '2019', 'T€'),\n",
" ( 'Veränderung', 'T€'),\n",
" ( 'Veränderung', '%')],\n",
" )\n",
"Unnamed: 0_level_0 unkonsolidiert gerundet object\n",
"2020 T€ float64\n",
"2019 T€ float64\n",
"Veränderung T€ float64\n",
" % int64\n",
"dtype: object\n",
"MultiIndex([('Jahresüberschuss/Jahresfehlbetrag nach Betriebszweigen', ...),\n",
" ( '2020', ...),\n",
" ( '2019', ...),\n",
" ( 'Veränderung', ...),\n",
" ( 'Veränderung', ...)],\n",
" )\n",
"Jahresüberschuss/Jahresfehlbetrag nach Betriebszweigen Unnamed: 0_level_1 object\n",
"2020 T€ float64\n",
"2019 T€ float64\n",
"Veränderung T€ int64\n",
" % int64\n",
"dtype: object\n",
"Index(['Unnamed: 0', '2020 T€', '2019 T€'], dtype='object')\n",
"Unnamed: 0 object\n",
"2020 T€ float64\n",
"2019 T€ float64\n",
"dtype: object\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( '31. Dezember 2020', 'T€'),\n",
" ( '31. Dezember 2020', '%'),\n",
" ( '31. Dezember 2019', 'T€'),\n",
" ( '31. Dezember 2019', '%'),\n",
" ( 'Veränderung', 'T€')],\n",
" )\n",
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"31. Dezember 2020 T€ float64\n",
" % float64\n",
"31. Dezember 2019 T€ float64\n",
" % float64\n",
"Veränderung T€ float64\n",
"dtype: object\n",
"Index(['Investitionen (netto)', '2020 T€', '2019 T€', 'Veränderung T€'], dtype='object')\n",
"Investitionen (netto) object\n",
"2020 T€ float64\n",
"2019 T€ float64\n",
"Veränderung T€ float64\n",
"dtype: object\n",
"Index(['Unnamed: 0', '€', '31.12.2019 in T €'], dtype='object')\n",
"Unnamed: 0 object\n",
"€ object\n",
"31.12.2019 in T € object\n",
"dtype: object\n",
"Index(['Unnamed: 0', '€', '€.1', '31.12.2019 in T €'], dtype='object')\n",
"Unnamed: 0 object\n",
"€ object\n",
"€.1 object\n",
"31.12.2019 in T € float64\n",
"dtype: object\n",
"Index([0, 1], dtype='int64')\n",
"0 object\n",
"1 object\n",
"dtype: object\n",
"Index(['Beteiligung', 'Anteil', 'Eigenkapital der Beteiligungsgesellschaft',\n",
" 'Eigenkapital der Beteiligungsgesellschaft.1',\n",
" 'Jahresergebnis der Beteiligungsgesellschaft',\n",
" 'Jahresergebnis der Beteiligungsgesellschaft.1'],\n",
" dtype='object')\n",
"Beteiligung object\n",
"Anteil object\n",
"Eigenkapital der Beteiligungsgesellschaft object\n",
"Eigenkapital der Beteiligungsgesellschaft.1 object\n",
"Jahresergebnis der Beteiligungsgesellschaft object\n",
"Jahresergebnis der Beteiligungsgesellschaft.1 object\n",
"dtype: object\n",
"Index(['Beteiligung', 'Anteil', 'Eigenkapital der Beteiligungsgesellschaft',\n",
" 'Eigenkapital der Beteiligungsgesellschaft.1',\n",
" 'Jahresergebnis der Beteiligungsgesellschaft',\n",
" 'Jahresergebnis der Beteiligungsgesellschaft.1'],\n",
" dtype='object')\n",
"Beteiligung object\n",
"Anteil object\n",
"Eigenkapital der Beteiligungsgesellschaft object\n",
"Eigenkapital der Beteiligungsgesellschaft.1 object\n",
"Jahresergebnis der Beteiligungsgesellschaft object\n",
"Jahresergebnis der Beteiligungsgesellschaft.1 object\n",
"dtype: object\n",
"Index(['Unnamed: 0', '2020', '2019'], dtype='object')\n",
"Unnamed: 0 object\n",
"2020 object\n",
"2019 object\n",
"dtype: object\n",
"MultiIndex([( 'Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Unnamed: 1_level_0', 'Gesamt in T€'),\n",
" ('davon mit einer Restlaufzeit', 'bis zu 1 Jahr in T€'),\n",
" ('davon mit einer Restlaufzeit', 'mehr als 1 Jahr in T€'),\n",
" ('davon mit einer Restlaufzeit', 'davon über 5 Jahre in T€')],\n",
" )\n",
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"Unnamed: 1_level_0 Gesamt in T€ float64\n",
"davon mit einer Restlaufzeit bis zu 1 Jahr in T€ float64\n",
" mehr als 1 Jahr in T€ float64\n",
" davon über 5 Jahre in T€ float64\n",
"dtype: object\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( '2020', '€'),\n",
" ( '2019', '€'),\n",
" ( 'Veränderung', '€'),\n",
" ( 'Veränderung', '%')],\n",
" )\n",
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"2020 € object\n",
"2019 € object\n",
"Veränderung € object\n",
" % int64\n",
"dtype: object\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( '2020', '€'),\n",
" ( '2019', '€'),\n",
" ( 'Veränderung', '€'),\n",
" ( 'Veränderung', '%')],\n",
" )\n",
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"2020 € object\n",
"2019 € object\n",
"Veränderung € object\n",
" % int64\n",
"dtype: object\n",
"Index(['Unnamed: 0', '2020 T€', '2019 T€'], dtype='object')\n",
"Unnamed: 0 object\n",
"2020 T€ int64\n",
"2019 T€ int64\n",
"dtype: object\n",
"MultiIndex([( 'Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ('Anschaffungs- und Herstellungskosten', 'Stand am 01.01.2020 €'),\n",
" ('Anschaffungs- und Herstellungskosten', 'Zugang €'),\n",
" ('Anschaffungs- und Herstellungskosten', 'Abgang €'),\n",
" ('Anschaffungs- und Herstellungskosten', 'Umbuchung €'),\n",
" ('Anschaffungs- und Herstellungskosten', 'Stand am 31.12.2020 €')],\n",
" )\n",
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"Anschaffungs- und Herstellungskosten Stand am 01.01.2020 € object\n",
" Zugang € object\n",
" Abgang € object\n",
" Umbuchung € object\n",
" Stand am 31.12.2020 € object\n",
"dtype: object\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Abschreibungen', 'Stand am 01.01.2020 €'),\n",
" ( 'Abschreibungen', 'Zugang €'),\n",
" ( 'Abschreibungen', 'außerplanm. AfA'),\n",
" ( 'Abschreibungen', 'Abgang €'),\n",
" ( 'Abschreibungen', 'Umbuchung €'),\n",
" ( 'Abschreibungen', 'Stand am 31.12.2020 €')],\n",
" )\n",
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"Abschreibungen Stand am 01.01.2020 € object\n",
" Zugang € object\n",
" außerplanm. AfA float64\n",
" Abgang € object\n",
" Umbuchung € float64\n",
" Stand am 31.12.2020 € object\n",
"dtype: object\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Restbuchwerte', 'Stand am 31.12.2020 €'),\n",
" ( 'Restbuchwerte', 'Stand am 31.12.2019 €')],\n",
" )\n",
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"Restbuchwerte Stand am 31.12.2020 € object\n",
" Stand am 31.12.2019 € object\n",
"dtype: object\n",
"Index(['Unnamed: 0', 'Elektrizitätsverteilung', '31.12.2019 in T €',\n",
" 'Gasverteilung', '31.12.2019 in T €.1'],\n",
" dtype='object')\n",
"Unnamed: 0 object\n",
"Elektrizitätsverteilung object\n",
"31.12.2019 in T € object\n",
"Gasverteilung object\n",
"31.12.2019 in T €.1 object\n",
"dtype: object\n",
"Index(['Unnamed: 0', '€', '31.12.2019 in T €'], dtype='object')\n",
"Unnamed: 0 object\n",
"€ object\n",
"31.12.2019 in T € object\n",
"dtype: object\n",
"Index(['Unnamed: 0', 'Elektrizitätsverteilung €', '31.12.2019 in T €',\n",
" 'Gasverteilung €', '31.12.2019 in T €.1'],\n",
" dtype='object')\n",
"Unnamed: 0 object\n",
"Elektrizitätsverteilung € object\n",
"31.12.2019 in T € float64\n",
"Gasverteilung € object\n",
"31.12.2019 in T €.1 float64\n",
"dtype: object\n",
"Index(['Unnamed: 0', '€', 'Vorjahr in T €'], dtype='object')\n",
"Unnamed: 0 object\n",
"€ object\n",
"Vorjahr in T € float64\n",
"dtype: object\n",
"MultiIndex([('Verbindlichkeitenspiegel 2020 Elektrizitätsverteilung', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...)],\n",
" )\n",
"Verbindlichkeitenspiegel 2020 Elektrizitätsverteilung Unnamed: 0_level_1 object\n",
"davon mit einer Restlaufzeit Gesamt in T€ float64\n",
" bis zu 1 Jahr in T€ float64\n",
" über 1 Jahr in T€ float64\n",
" mehr als 5 Jahre in T€ float64\n",
"dtype: object\n",
"MultiIndex([('Verbindlichkeitenspiegel 2020 Gasverteilung', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...)],\n",
" )\n",
"Verbindlichkeitenspiegel 2020 Gasverteilung Unnamed: 0_level_1 object\n",
"davon mit einer Restlaufzeit Gesamt in T€ float64\n",
" bis zu 1 Jahr in T€ float64\n",
" über 1 Jahr in T€ float64\n",
" mehr als 5 Jahre in T€ float64\n",
"dtype: object\n",
"MultiIndex([('Verbindlichkeitenspiegel 2020 Intelligenter', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...)],\n",
" )\n",
"Verbindlichkeitenspiegel 2020 Intelligenter Messstellenbetrieb object\n",
"davon mit einer Restlaufzeit Gesamt in T€ float64\n",
" bis zu 1 Jahr in T€ float64\n",
" über 1 Jahr in T€ float64\n",
" mehr als 5 Jahre in T€ float64\n",
"dtype: object\n"
]
}
],
"source": [
"def parse_tables(report: str) -> list:\n",
" result = []\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
" df = pd.read_html(StringIO(str(table)), flavor=\"bs4\")[0]\n",
" print(df.columns)\n",
" print(df.dtypes)\n",
" result.append(df)\n",
" return result\n",
"\n",
"\n",
"tables = parse_tables(sample_report)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>Unnamed: 0_level_0</th>\n",
" <th colspan=\"2\" halign=\"left\">2020</th>\n",
" <th colspan=\"2\" halign=\"left\">2019</th>\n",
" <th colspan=\"2\" halign=\"left\">Veränderungen</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>Unnamed: 0_level_1</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Umsatzerlöse</td>\n",
" <td>47.459</td>\n",
" <td>978</td>\n",
" <td>45.575</td>\n",
" <td>970</td>\n",
" <td>1.884</td>\n",
" <td>41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Aktivierte Eigenleistungen</td>\n",
" <td>380.000</td>\n",
" <td>8</td>\n",
" <td>400.000</td>\n",
" <td>9</td>\n",
" <td>-20.000</td>\n",
" <td>-50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Sonstige betriebliche Erträge</td>\n",
" <td>687.000</td>\n",
" <td>14</td>\n",
" <td>991.000</td>\n",
" <td>21</td>\n",
" <td>-304.000</td>\n",
" <td>-307</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Betriebliche Erträge</td>\n",
" <td>48.526</td>\n",
" <td>1000</td>\n",
" <td>46.966</td>\n",
" <td>1000</td>\n",
" <td>1.560</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Materialaufwand</td>\n",
" <td>34.007</td>\n",
" <td>701</td>\n",
" <td>32.647</td>\n",
" <td>695</td>\n",
" <td>1.360</td>\n",
" <td>42</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0_level_0 2020 2019 Veränderungen \\\n",
" Unnamed: 0_level_1 T€ % T€ % T€ \n",
"0 Umsatzerlöse 47.459 978 45.575 970 1.884 \n",
"1 Aktivierte Eigenleistungen 380.000 8 400.000 9 -20.000 \n",
"2 Sonstige betriebliche Erträge 687.000 14 991.000 21 -304.000 \n",
"3 Betriebliche Erträge 48.526 1000 46.966 1000 1.560 \n",
"4 Materialaufwand 34.007 701 32.647 695 1.360 \n",
"\n",
" \n",
" % \n",
"0 41 \n",
"1 -50 \n",
"2 -307 \n",
"3 33 \n",
"4 42 "
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"current_table = tables[1]\n",
"current_table.head()"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"\n",
"def cleanse_string(value: str) -> str:\n",
" if value is not None and isinstance(value, str):\n",
" return re.sub(r\"(.+\\.).\", \"\", value)\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_15672\\152097142.py:2: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" current_table.iloc[index][0] = cleanse_string(row[0])\n",
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_15672\\152097142.py:2: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`\n",
" current_table.iloc[index][0] = cleanse_string(row[0])\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>Unnamed: 0_level_0</th>\n",
" <th colspan=\"2\" halign=\"left\">2020</th>\n",
" <th colspan=\"2\" halign=\"left\">2019</th>\n",
" <th colspan=\"2\" halign=\"left\">Veränderungen</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>Unnamed: 0_level_1</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Umsatzerlöse</td>\n",
" <td>47.459</td>\n",
" <td>978</td>\n",
" <td>45.575</td>\n",
" <td>970</td>\n",
" <td>1.884</td>\n",
" <td>41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Aktivierte Eigenleistungen</td>\n",
" <td>380.000</td>\n",
" <td>8</td>\n",
" <td>400.000</td>\n",
" <td>9</td>\n",
" <td>-20.000</td>\n",
" <td>-50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Sonstige betriebliche Erträge</td>\n",
" <td>687.000</td>\n",
" <td>14</td>\n",
" <td>991.000</td>\n",
" <td>21</td>\n",
" <td>-304.000</td>\n",
" <td>-307</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Betriebliche Erträge</td>\n",
" <td>48.526</td>\n",
" <td>1000</td>\n",
" <td>46.966</td>\n",
" <td>1000</td>\n",
" <td>1.560</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Materialaufwand</td>\n",
" <td>34.007</td>\n",
" <td>701</td>\n",
" <td>32.647</td>\n",
" <td>695</td>\n",
" <td>1.360</td>\n",
" <td>42</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0_level_0 2020 2019 Veränderungen \\\n",
" Unnamed: 0_level_1 T€ % T€ % T€ \n",
"0 Umsatzerlöse 47.459 978 45.575 970 1.884 \n",
"1 Aktivierte Eigenleistungen 380.000 8 400.000 9 -20.000 \n",
"2 Sonstige betriebliche Erträge 687.000 14 991.000 21 -304.000 \n",
"3 Betriebliche Erträge 48.526 1000 46.966 1000 1.560 \n",
"4 Materialaufwand 34.007 701 32.647 695 1.360 \n",
"\n",
" \n",
" % \n",
"0 41 \n",
"1 -50 \n",
"2 -307 \n",
"3 33 \n",
"4 42 "
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"for index, row in current_table.iterrows():\n",
" current_table.iloc[index][0] = cleanse_string(row[0])\n",
"current_table.head()"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"def parse_string_to_float(value) -> float:\n",
" try:\n",
" if value is None:\n",
" return None\n",
" if isinstance(value, float):\n",
" return value\n",
" return float(value.replace(\".\", \"\").replace(\",\", \".\"))\n",
" except Exception as e:\n",
" return None\n",
"\n",
"\n",
"def apply_factor(value, factor: float):\n",
" transformed_value = parse_string_to_float(value)\n",
" if transformed_value is None or isinstance(transformed_value, str):\n",
" return None\n",
" result = transformed_value * factor\n",
" # print(result)\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>Unnamed: 0_level_0</th>\n",
" <th colspan=\"2\" halign=\"left\">2020</th>\n",
" <th colspan=\"2\" halign=\"left\">2019</th>\n",
" <th colspan=\"2\" halign=\"left\">Veränderungen</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>Unnamed: 0_level_1</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Umsatzerlöse</td>\n",
" <td>47459.0</td>\n",
" <td>978</td>\n",
" <td>45575.0</td>\n",
" <td>970</td>\n",
" <td>1884.0</td>\n",
" <td>41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Aktivierte Eigenleistungen</td>\n",
" <td>380000.0</td>\n",
" <td>8</td>\n",
" <td>400000.0</td>\n",
" <td>9</td>\n",
" <td>-20000.0</td>\n",
" <td>-50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Sonstige betriebliche Erträge</td>\n",
" <td>687000.0</td>\n",
" <td>14</td>\n",
" <td>991000.0</td>\n",
" <td>21</td>\n",
" <td>-304000.0</td>\n",
" <td>-307</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Betriebliche Erträge</td>\n",
" <td>48526.0</td>\n",
" <td>1000</td>\n",
" <td>46966.0</td>\n",
" <td>1000</td>\n",
" <td>1560.0</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Materialaufwand</td>\n",
" <td>34007.0</td>\n",
" <td>701</td>\n",
" <td>32647.0</td>\n",
" <td>695</td>\n",
" <td>1360.0</td>\n",
" <td>42</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0_level_0 2020 2019 \\\n",
" Unnamed: 0_level_1 T€ % T€ % \n",
"0 Umsatzerlöse 47459.0 978 45575.0 970 \n",
"1 Aktivierte Eigenleistungen 380000.0 8 400000.0 9 \n",
"2 Sonstige betriebliche Erträge 687000.0 14 991000.0 21 \n",
"3 Betriebliche Erträge 48526.0 1000 46966.0 1000 \n",
"4 Materialaufwand 34007.0 701 32647.0 695 \n",
"\n",
" Veränderungen \n",
" T€ % \n",
"0 1884.0 41 \n",
"1 -20000.0 -50 \n",
"2 -304000.0 -307 \n",
"3 1560.0 33 \n",
"4 1360.0 42 "
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"converter = {\n",
" \"Mio€\": 1 * 10**6,\n",
" \"Mio\": 1 * 10**6,\n",
" \"T€\": 1 * 10**3,\n",
" \"TEUR\": 1 * 10**3,\n",
" \"EUR\": 1,\n",
" \"€\": 1,\n",
"}\n",
"\n",
"for column in current_table.columns:\n",
" if isinstance(column, tuple):\n",
" for c in column:\n",
" for x, factor in converter.items():\n",
" if x in c:\n",
" current_table[column] = current_table[column].apply(\n",
" lambda x: apply_factor(x, factor)\n",
" )\n",
" next\n",
" else:\n",
" for x, factor in converter.items():\n",
" parts = column.split(\" \")\n",
" for y in parts:\n",
" if re.match(x, y):\n",
" current_table[column] = current_table[column].apply(\n",
" lambda x: apply_factor(x, factor)\n",
" )\n",
" current_table.rename({column: parts[0]}, inplace=True, axis=1)\n",
" next\n",
" # print(current_table[column])\n",
"current_table.dropna(axis=0, how=\"all\", inplace=True)\n",
"current_table.dropna(axis=1, how=\"all\", inplace=True)\n",
"current_table.head()"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"2020 T€ float64\n",
" % int64\n",
"2019 T€ float64\n",
" % int64\n",
"Veränderungen T€ float64\n",
" % int64\n",
"dtype: object"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"current_table.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"# Remove columns hosting non-numerics; excl. first column hosting keys\n",
"columns_to_prune = []\n",
"for column_index, column_type in enumerate(current_table.dtypes[1:]):\n",
" if column_type in [\"object\", \"str\"]:\n",
" columns_to_prune.append(column_index + 1)\n",
"\n",
"current_table = current_table.drop(\n",
" current_table.columns[columns_to_prune], axis=\"columns\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>Unnamed: 0_level_0</th>\n",
" <th colspan=\"2\" halign=\"left\">2020</th>\n",
" <th colspan=\"2\" halign=\"left\">2019</th>\n",
" <th colspan=\"2\" halign=\"left\">Veränderungen</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>Unnamed: 0_level_1</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Umsatzerlöse</td>\n",
" <td>47459.0</td>\n",
" <td>978</td>\n",
" <td>45575.0</td>\n",
" <td>970</td>\n",
" <td>1884.0</td>\n",
" <td>41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Aktivierte Eigenleistungen</td>\n",
" <td>380000.0</td>\n",
" <td>8</td>\n",
" <td>400000.0</td>\n",
" <td>9</td>\n",
" <td>-20000.0</td>\n",
" <td>-50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Sonstige betriebliche Erträge</td>\n",
" <td>687000.0</td>\n",
" <td>14</td>\n",
" <td>991000.0</td>\n",
" <td>21</td>\n",
" <td>-304000.0</td>\n",
" <td>-307</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Betriebliche Erträge</td>\n",
" <td>48526.0</td>\n",
" <td>1000</td>\n",
" <td>46966.0</td>\n",
" <td>1000</td>\n",
" <td>1560.0</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Materialaufwand</td>\n",
" <td>34007.0</td>\n",
" <td>701</td>\n",
" <td>32647.0</td>\n",
" <td>695</td>\n",
" <td>1360.0</td>\n",
" <td>42</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Personalaufwand</td>\n",
" <td>6258.0</td>\n",
" <td>129</td>\n",
" <td>6222.0</td>\n",
" <td>132</td>\n",
" <td>36000.0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Abschreibungen</td>\n",
" <td>2239.0</td>\n",
" <td>46</td>\n",
" <td>2273.0</td>\n",
" <td>48</td>\n",
" <td>-34000.0</td>\n",
" <td>-15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Konzessionsabgabe</td>\n",
" <td>1331.0</td>\n",
" <td>27</td>\n",
" <td>1302.0</td>\n",
" <td>28</td>\n",
" <td>29000.0</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Übrige sonstige betriebliche Aufwendungen</td>\n",
" <td>2100.0</td>\n",
" <td>43</td>\n",
" <td>2066.0</td>\n",
" <td>44</td>\n",
" <td>34000.0</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Betriebliche Aufwendungen</td>\n",
" <td>45935.0</td>\n",
" <td>947</td>\n",
" <td>44510.0</td>\n",
" <td>948</td>\n",
" <td>1425.0</td>\n",
" <td>32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Ergebnis der betrieblichen Tätigkeit</td>\n",
" <td>2591.0</td>\n",
" <td>53</td>\n",
" <td>2456.0</td>\n",
" <td>52</td>\n",
" <td>135000.0</td>\n",
" <td>55</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Finanzergebnis (Ertrags-/Aufwandsaldo)</td>\n",
" <td>-13000.0</td>\n",
" <td>0</td>\n",
" <td>-99000.0</td>\n",
" <td>-2</td>\n",
" <td>86000.0</td>\n",
" <td>-869</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>sonstige Steuern</td>\n",
" <td>147000.0</td>\n",
" <td>3</td>\n",
" <td>164000.0</td>\n",
" <td>3</td>\n",
" <td>-17000.0</td>\n",
" <td>-104</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Neutraler Bereich</td>\n",
" <td>134000.0</td>\n",
" <td>3</td>\n",
" <td>65000.0</td>\n",
" <td>1</td>\n",
" <td>86000.0</td>\n",
" <td>1062</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Jahresüberschuss vor Ertragsteuern</td>\n",
" <td>2457.0</td>\n",
" <td>51</td>\n",
" <td>2391.0</td>\n",
" <td>51</td>\n",
" <td>66000.0</td>\n",
" <td>28</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Ertragsteuern</td>\n",
" <td>796000.0</td>\n",
" <td>16</td>\n",
" <td>792000.0</td>\n",
" <td>17</td>\n",
" <td>4000.0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Jahresüberschuss</td>\n",
" <td>1661.0</td>\n",
" <td>34</td>\n",
" <td>1599.0</td>\n",
" <td>34</td>\n",
" <td>62000.0</td>\n",
" <td>39</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0_level_0 2020 2019 \\\n",
" Unnamed: 0_level_1 T€ % T€ % \n",
"0 Umsatzerlöse 47459.0 978 45575.0 970 \n",
"1 Aktivierte Eigenleistungen 380000.0 8 400000.0 9 \n",
"2 Sonstige betriebliche Erträge 687000.0 14 991000.0 21 \n",
"3 Betriebliche Erträge 48526.0 1000 46966.0 1000 \n",
"4 Materialaufwand 34007.0 701 32647.0 695 \n",
"5 Personalaufwand 6258.0 129 6222.0 132 \n",
"6 Abschreibungen 2239.0 46 2273.0 48 \n",
"7 Konzessionsabgabe 1331.0 27 1302.0 28 \n",
"8 Übrige sonstige betriebliche Aufwendungen 2100.0 43 2066.0 44 \n",
"9 Betriebliche Aufwendungen 45935.0 947 44510.0 948 \n",
"10 Ergebnis der betrieblichen Tätigkeit 2591.0 53 2456.0 52 \n",
"11 Finanzergebnis (Ertrags-/Aufwandsaldo) -13000.0 0 -99000.0 -2 \n",
"12 sonstige Steuern 147000.0 3 164000.0 3 \n",
"13 Neutraler Bereich 134000.0 3 65000.0 1 \n",
"14 Jahresüberschuss vor Ertragsteuern 2457.0 51 2391.0 51 \n",
"15 Ertragsteuern 796000.0 16 792000.0 17 \n",
"16 Jahresüberschuss 1661.0 34 1599.0 34 \n",
"\n",
" Veränderungen \n",
" T€ % \n",
"0 1884.0 41 \n",
"1 -20000.0 -50 \n",
"2 -304000.0 -307 \n",
"3 1560.0 33 \n",
"4 1360.0 42 \n",
"5 36000.0 6 \n",
"6 -34000.0 -15 \n",
"7 29000.0 22 \n",
"8 34000.0 16 \n",
"9 1425.0 32 \n",
"10 135000.0 55 \n",
"11 86000.0 -869 \n",
"12 -17000.0 -104 \n",
"13 86000.0 1062 \n",
"14 66000.0 28 \n",
"15 4000.0 5 \n",
"16 62000.0 39 "
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Prune rows where first columns is None\n",
"import numpy as np\n",
"\n",
"current_table = current_table.replace(to_replace=\"None\", value=np.nan).dropna()\n",
"current_table"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_15672\\340569398.py:3: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" kpis[row[0]] = row[1]\n"
]
},
{
"data": {
"text/plain": [
"{'Umsatzerlöse': 47459.0,\n",
" 'Aktivierte Eigenleistungen': 380000.0,\n",
" 'Sonstige betriebliche Erträge': 687000.0,\n",
" 'Betriebliche Erträge': 48526.0,\n",
" 'Materialaufwand': 34007.0,\n",
" 'Personalaufwand': 6258.0,\n",
" 'Abschreibungen': 2239.0,\n",
" 'Konzessionsabgabe': 1331.0,\n",
" 'Übrige sonstige betriebliche Aufwendungen': 2100.0,\n",
" 'Betriebliche Aufwendungen': 45935.0,\n",
" 'Ergebnis der betrieblichen Tätigkeit': 2591.0,\n",
" 'Finanzergebnis (Ertrags-/Aufwandsaldo)': -13000.0,\n",
" 'sonstige Steuern': 147000.0,\n",
" 'Neutraler Bereich': 134000.0,\n",
" 'Jahresüberschuss vor Ertragsteuern': 2457.0,\n",
" 'Ertragsteuern': 796000.0,\n",
" 'Jahresüberschuss': 1661.0}"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kpis = {}\n",
"for _index, row in current_table.iterrows():\n",
" kpis[row[0]] = row[1]\n",
"kpis"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"\n",
"def get_bilanz(report: str) -> any:\n",
" result = {}\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for pos in [\"Aktiva\", \"Passiva\"]:\n",
" tag = soup.find(\"b\", string=re.compile(pos))\n",
" if tag:\n",
" pos_results = pd.read_html(\n",
" StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n",
" )[0]\n",
" result[pos] = pos_results\n",
" else:\n",
" result[pos] = pd.DataFrame([])\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Investitionen (netto)</th>\n",
" <th>2020 T€</th>\n",
" <th>2019 T€</th>\n",
" <th>Veränderung T€</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Stromversorgung</td>\n",
" <td>1.372</td>\n",
" <td>1.553</td>\n",
" <td>-181.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Gasversorgung</td>\n",
" <td>713.000</td>\n",
" <td>707.000</td>\n",
" <td>6.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>sonstige Aktivitäten</td>\n",
" <td>661.000</td>\n",
" <td>2.605</td>\n",
" <td>-1.944</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Insgesamt</td>\n",
" <td>2.746</td>\n",
" <td>4.865</td>\n",
" <td>-2.119</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Investitionen (netto) 2020 T€ 2019 T€ Veränderung T€\n",
"0 Stromversorgung 1.372 1.553 -181.000\n",
"1 Gasversorgung 713.000 707.000 6.000\n",
"2 sonstige Aktivitäten 661.000 2.605 -1.944\n",
"3 Insgesamt 2.746 4.865 -2.119"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bilanz = get_bilanz(sample_report)\n",
"bilanz[\"Passiva\"].head()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>Unnamed: 0_level_0</th>\n",
" <th colspan=\"2\" halign=\"left\">31. Dezember 2020</th>\n",
" <th colspan=\"2\" halign=\"left\">31. Dezember 2019</th>\n",
" <th>Veränderung</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>Unnamed: 0_level_1</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" <th>T€</th>\n",
" <th>%</th>\n",
" <th>T€</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Anlagevermögen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Sachanlagen</td>\n",
" <td>28.919</td>\n",
" <td>689.0</td>\n",
" <td>28.812</td>\n",
" <td>689.0</td>\n",
" <td>107.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Finanzanlagen</td>\n",
" <td>2.667</td>\n",
" <td>64.0</td>\n",
" <td>4.189</td>\n",
" <td>100.0</td>\n",
" <td>-1.522</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>31.586</td>\n",
" <td>753.0</td>\n",
" <td>33.001</td>\n",
" <td>789.0</td>\n",
" <td>-1.415</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Umlaufvermögen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0_level_0 31. Dezember 2020 31. Dezember 2019 \\\n",
" Unnamed: 0_level_1 T€ % T€ % \n",
"0 Anlagevermögen NaN NaN NaN NaN \n",
"1 Sachanlagen 28.919 689.0 28.812 689.0 \n",
"2 Finanzanlagen 2.667 64.0 4.189 100.0 \n",
"3 NaN 31.586 753.0 33.001 789.0 \n",
"4 Umlaufvermögen NaN NaN NaN NaN \n",
"\n",
" Veränderung \n",
" T€ \n",
"0 NaN \n",
"1 107.000 \n",
"2 -1.522 \n",
"3 -1.415 \n",
"4 NaN "
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bilanz[\"Aktiva\"].head()"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Investitionen (netto)</th>\n",
" <th>2020 T€</th>\n",
" <th>2019 T€</th>\n",
" <th>Veränderung T€</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Stromversorgung</td>\n",
" <td>1.372</td>\n",
" <td>1.553</td>\n",
" <td>-181.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Gasversorgung</td>\n",
" <td>713.000</td>\n",
" <td>707.000</td>\n",
" <td>6.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>sonstige Aktivitäten</td>\n",
" <td>661.000</td>\n",
" <td>2.605</td>\n",
" <td>-1.944</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Insgesamt</td>\n",
" <td>2.746</td>\n",
" <td>4.865</td>\n",
" <td>-2.119</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import display, HTML\n",
"\n",
"# Assuming that dataframes df1 and df2 are already defined:\n",
"display(HTML(bilanz[\"Passiva\"].to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( '2020', 'T€'),\n",
" ( '2019', 'T€'),\n",
" ( 'Veränderung', 'T€'),\n",
" ( 'Veränderung', '%')],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( '2020', 'T€'),\n",
" ( '2020', '%'),\n",
" ( '2019', 'T€'),\n",
" ( '2019', '%'),\n",
" ( 'Veränderungen', 'T€'),\n",
" ( 'Veränderungen', '%')],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'gerundet'),\n",
" ( '2020', 'T€'),\n",
" ( '2019', 'T€'),\n",
" ( 'Veränderung', 'T€'),\n",
" ( 'Veränderung', '%')],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'unkonsolidiert gerundet'),\n",
" ( '2020', 'T€'),\n",
" ( '2019', 'T€'),\n",
" ( 'Veränderung', 'T€'),\n",
" ( 'Veränderung', '%')],\n",
" )\n",
"MultiIndex([('Jahresüberschuss/Jahresfehlbetrag nach Betriebszweigen', ...),\n",
" ( '2020', ...),\n",
" ( '2019', ...),\n",
" ( 'Veränderung', ...),\n",
" ( 'Veränderung', ...)],\n",
" )\n",
"Index(['Unnamed: 0', '2020 T€', '2019 T€'], dtype='object')\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( '31. Dezember 2020', 'T€'),\n",
" ( '31. Dezember 2020', '%'),\n",
" ( '31. Dezember 2019', 'T€'),\n",
" ( '31. Dezember 2019', '%'),\n",
" ( 'Veränderung', 'T€')],\n",
" )\n",
"Index(['Investitionen (netto)', '2020 T€', '2019 T€', 'Veränderung T€'], dtype='object')\n",
"Index(['Unnamed: 0', '€', '31.12.2019 in T €'], dtype='object')\n",
"Index(['Unnamed: 0', '€', '€.1', '31.12.2019 in T €'], dtype='object')\n",
"Index([0, 1], dtype='int64')\n",
"Index(['Beteiligung', 'Anteil', 'Eigenkapital der Beteiligungsgesellschaft',\n",
" 'Eigenkapital der Beteiligungsgesellschaft.1',\n",
" 'Jahresergebnis der Beteiligungsgesellschaft',\n",
" 'Jahresergebnis der Beteiligungsgesellschaft.1'],\n",
" dtype='object')\n",
"Index(['Beteiligung', 'Anteil', 'Eigenkapital der Beteiligungsgesellschaft',\n",
" 'Eigenkapital der Beteiligungsgesellschaft.1',\n",
" 'Jahresergebnis der Beteiligungsgesellschaft',\n",
" 'Jahresergebnis der Beteiligungsgesellschaft.1'],\n",
" dtype='object')\n",
"Index(['Unnamed: 0', '2020', '2019'], dtype='object')\n",
"MultiIndex([( 'Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Unnamed: 1_level_0', 'Gesamt in T€'),\n",
" ('davon mit einer Restlaufzeit', 'bis zu 1 Jahr in T€'),\n",
" ('davon mit einer Restlaufzeit', 'mehr als 1 Jahr in T€'),\n",
" ('davon mit einer Restlaufzeit', 'davon über 5 Jahre in T€')],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( '2020', '€'),\n",
" ( '2019', '€'),\n",
" ( 'Veränderung', '€'),\n",
" ( 'Veränderung', '%')],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( '2020', '€'),\n",
" ( '2019', '€'),\n",
" ( 'Veränderung', '€'),\n",
" ( 'Veränderung', '%')],\n",
" )\n",
"Index(['Unnamed: 0', '2020 T€', '2019 T€'], dtype='object')\n",
"MultiIndex([( 'Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ('Anschaffungs- und Herstellungskosten', 'Stand am 01.01.2020 €'),\n",
" ('Anschaffungs- und Herstellungskosten', 'Zugang €'),\n",
" ('Anschaffungs- und Herstellungskosten', 'Abgang €'),\n",
" ('Anschaffungs- und Herstellungskosten', 'Umbuchung €'),\n",
" ('Anschaffungs- und Herstellungskosten', 'Stand am 31.12.2020 €')],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Abschreibungen', 'Stand am 01.01.2020 €'),\n",
" ( 'Abschreibungen', 'Zugang €'),\n",
" ( 'Abschreibungen', 'außerplanm. AfA'),\n",
" ( 'Abschreibungen', 'Abgang €'),\n",
" ( 'Abschreibungen', 'Umbuchung €'),\n",
" ( 'Abschreibungen', 'Stand am 31.12.2020 €')],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Restbuchwerte', 'Stand am 31.12.2020 €'),\n",
" ( 'Restbuchwerte', 'Stand am 31.12.2019 €')],\n",
" )\n",
"Index(['Unnamed: 0', 'Elektrizitätsverteilung', '31.12.2019 in T €',\n",
" 'Gasverteilung', '31.12.2019 in T €.1'],\n",
" dtype='object')\n",
"Index(['Unnamed: 0', '€', '31.12.2019 in T €'], dtype='object')\n",
"Index(['Unnamed: 0', 'Elektrizitätsverteilung €', '31.12.2019 in T €',\n",
" 'Gasverteilung €', '31.12.2019 in T €.1'],\n",
" dtype='object')\n",
"Index(['Unnamed: 0', '€', 'Vorjahr in T €'], dtype='object')\n",
"MultiIndex([('Verbindlichkeitenspiegel 2020 Elektrizitätsverteilung', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...)],\n",
" )\n",
"MultiIndex([('Verbindlichkeitenspiegel 2020 Gasverteilung', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...)],\n",
" )\n",
"MultiIndex([('Verbindlichkeitenspiegel 2020 Intelligenter', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...),\n",
" ( 'davon mit einer Restlaufzeit', ...)],\n",
" )\n"
]
}
],
"source": [
"def get_tables(raw_report: str) -> list:\n",
" soup = BeautifulSoup(raw_report, features=\"html.parser\")\n",
" tables = soup.find_all(\"table\", {\"class\": \"std_table\"})\n",
" dfs = []\n",
" for table in tables:\n",
" for df in pd.read_html(StringIO(str(table))):\n",
" dfs.append(df)\n",
" return dfs\n",
"\n",
"\n",
"for df in get_tables(sample_report):\n",
" print(df.columns)\n",
"\n",
"tables = get_tables(sample_report)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}