{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Daten Extraktion aus dem Bundesanzeiger" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Vorbereitung" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datecompanyraw_reportjahrauditorsfinancial_results
12022-10-21Stadtwerke Haltern am See Gesellschaft mit bes...<div class=\"publication_container\">\\n <div cla...2021[Auditor(name='Volker Voelcker', company='Pric...{'revenue': 46275.0, 'net_income': 1757.0, 'eb...
32021-10-12Stadtwerke Haltern am See Gesellschaft mit bes...<div class=\"publication_container\">\\n <div cla...2020[Auditor(name='Hubert Ahlers', company='Pricew...{'revenue': 47459.0, 'net_income': 1661.0, 'eb...
52020-12-03Stadtwerke Haltern am See Gesellschaft mit bes...<div class=\"publication_container\">\\n <div cla...2019[Auditor(name='Hubert Ahlers', company='Pricew...{'revenue': 45575.0, 'net_income': 1599.0, 'eb...
62020-01-09Stadtwerke Haltern am See Gesellschaft mit bes...<div class=\"publication_container\">\\n <div cla...2018[Auditor(name='Hubert Ahlers', company='Pricew...{'revenue': 43898.0, 'net_income': 2043.0, 'eb...
72019-10-10Stadtwerke Haltern am See Gesellschaft mit bes...<div class=\"publication_container\">\\n <div cla...2017[]{}
\n", "
" ], "text/plain": [ " date company \\\n", "1 2022-10-21 Stadtwerke Haltern am See Gesellschaft mit bes... \n", "3 2021-10-12 Stadtwerke Haltern am See Gesellschaft mit bes... \n", "5 2020-12-03 Stadtwerke Haltern am See Gesellschaft mit bes... \n", "6 2020-01-09 Stadtwerke Haltern am See Gesellschaft mit bes... \n", "7 2019-10-10 Stadtwerke Haltern am See Gesellschaft mit bes... \n", "\n", " raw_report jahr \\\n", "1
\\n
\\n
\\n
\\n
\\n
list:\n", " result = []\n", " soup = BeautifulSoup(report, features=\"html.parser\")\n", " for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n", " df = pd.read_html(StringIO(str(table)), flavor=\"bs4\")[0]\n", " print(df.columns)\n", " print(df.dtypes)\n", " result.append(df)\n", " return result\n", "\n", "\n", "tables = parse_tables(sample_report)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0_level_020202019Veränderungen
Unnamed: 0_level_1T€%T€%T€%
0Umsatzerlöse47.45997845.5759701.88441
1Aktivierte Eigenleistungen380.0008400.0009-20.000-50
2Sonstige betriebliche Erträge687.00014991.00021-304.000-307
3Betriebliche Erträge48.526100046.96610001.56033
4Materialaufwand34.00770132.6476951.36042
\n", "
" ], "text/plain": [ " Unnamed: 0_level_0 2020 2019 Veränderungen \\\n", " Unnamed: 0_level_1 T€ % T€ % T€ \n", "0 Umsatzerlöse 47.459 978 45.575 970 1.884 \n", "1 Aktivierte Eigenleistungen 380.000 8 400.000 9 -20.000 \n", "2 Sonstige betriebliche Erträge 687.000 14 991.000 21 -304.000 \n", "3 Betriebliche Erträge 48.526 1000 46.966 1000 1.560 \n", "4 Materialaufwand 34.007 701 32.647 695 1.360 \n", "\n", " \n", " % \n", "0 41 \n", "1 -50 \n", "2 -307 \n", "3 33 \n", "4 42 " ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "current_table = tables[1]\n", "current_table.head()" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "\n", "def cleanse_string(value: str) -> str:\n", " if value is not None and isinstance(value, str):\n", " return re.sub(r\"(.+\\.).\", \"\", value)\n", " return None" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_15672\\152097142.py:2: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", " current_table.iloc[index][0] = cleanse_string(row[0])\n", "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_15672\\152097142.py:2: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`\n", " current_table.iloc[index][0] = cleanse_string(row[0])\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0_level_020202019Veränderungen
Unnamed: 0_level_1T€%T€%T€%
0Umsatzerlöse47.45997845.5759701.88441
1Aktivierte Eigenleistungen380.0008400.0009-20.000-50
2Sonstige betriebliche Erträge687.00014991.00021-304.000-307
3Betriebliche Erträge48.526100046.96610001.56033
4Materialaufwand34.00770132.6476951.36042
\n", "
" ], "text/plain": [ " Unnamed: 0_level_0 2020 2019 Veränderungen \\\n", " Unnamed: 0_level_1 T€ % T€ % T€ \n", "0 Umsatzerlöse 47.459 978 45.575 970 1.884 \n", "1 Aktivierte Eigenleistungen 380.000 8 400.000 9 -20.000 \n", "2 Sonstige betriebliche Erträge 687.000 14 991.000 21 -304.000 \n", "3 Betriebliche Erträge 48.526 1000 46.966 1000 1.560 \n", "4 Materialaufwand 34.007 701 32.647 695 1.360 \n", "\n", " \n", " % \n", "0 41 \n", "1 -50 \n", "2 -307 \n", "3 33 \n", "4 42 " ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for index, row in current_table.iterrows():\n", " current_table.iloc[index][0] = cleanse_string(row[0])\n", "current_table.head()" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "def parse_string_to_float(value) -> float:\n", " try:\n", " if value is None:\n", " return None\n", " if isinstance(value, float):\n", " return value\n", " return float(value.replace(\".\", \"\").replace(\",\", \".\"))\n", " except Exception as e:\n", " return None\n", "\n", "\n", "def apply_factor(value, factor: float):\n", " transformed_value = parse_string_to_float(value)\n", " if transformed_value is None or isinstance(transformed_value, str):\n", " return None\n", " result = transformed_value * factor\n", " # print(result)\n", " return result" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0_level_020202019Veränderungen
Unnamed: 0_level_1T€%T€%T€%
0Umsatzerlöse47459.097845575.09701884.041
1Aktivierte Eigenleistungen380000.08400000.09-20000.0-50
2Sonstige betriebliche Erträge687000.014991000.021-304000.0-307
3Betriebliche Erträge48526.0100046966.010001560.033
4Materialaufwand34007.070132647.06951360.042
\n", "
" ], "text/plain": [ " Unnamed: 0_level_0 2020 2019 \\\n", " Unnamed: 0_level_1 T€ % T€ % \n", "0 Umsatzerlöse 47459.0 978 45575.0 970 \n", "1 Aktivierte Eigenleistungen 380000.0 8 400000.0 9 \n", "2 Sonstige betriebliche Erträge 687000.0 14 991000.0 21 \n", "3 Betriebliche Erträge 48526.0 1000 46966.0 1000 \n", "4 Materialaufwand 34007.0 701 32647.0 695 \n", "\n", " Veränderungen \n", " T€ % \n", "0 1884.0 41 \n", "1 -20000.0 -50 \n", "2 -304000.0 -307 \n", "3 1560.0 33 \n", "4 1360.0 42 " ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "converter = {\n", " \"Mio€\": 1 * 10**6,\n", " \"Mio\": 1 * 10**6,\n", " \"T€\": 1 * 10**3,\n", " \"TEUR\": 1 * 10**3,\n", " \"EUR\": 1,\n", " \"€\": 1,\n", "}\n", "\n", "for column in current_table.columns:\n", " if isinstance(column, tuple):\n", " for c in column:\n", " for x, factor in converter.items():\n", " if x in c:\n", " current_table[column] = current_table[column].apply(\n", " lambda x: apply_factor(x, factor)\n", " )\n", " next\n", " else:\n", " for x, factor in converter.items():\n", " parts = column.split(\" \")\n", " for y in parts:\n", " if re.match(x, y):\n", " current_table[column] = current_table[column].apply(\n", " lambda x: apply_factor(x, factor)\n", " )\n", " current_table.rename({column: parts[0]}, inplace=True, axis=1)\n", " next\n", " # print(current_table[column])\n", "current_table.dropna(axis=0, how=\"all\", inplace=True)\n", "current_table.dropna(axis=1, how=\"all\", inplace=True)\n", "current_table.head()" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Unnamed: 0_level_0 Unnamed: 0_level_1 object\n", "2020 T€ float64\n", " % int64\n", "2019 T€ float64\n", " % int64\n", "Veränderungen T€ float64\n", " % int64\n", "dtype: object" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "current_table.dtypes" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "# Remove columns hosting non-numerics; excl. first column hosting keys\n", "columns_to_prune = []\n", "for column_index, column_type in enumerate(current_table.dtypes[1:]):\n", " if column_type in [\"object\", \"str\"]:\n", " columns_to_prune.append(column_index + 1)\n", "\n", "current_table = current_table.drop(\n", " current_table.columns[columns_to_prune], axis=\"columns\"\n", ")" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0_level_020202019Veränderungen
Unnamed: 0_level_1T€%T€%T€%
0Umsatzerlöse47459.097845575.09701884.041
1Aktivierte Eigenleistungen380000.08400000.09-20000.0-50
2Sonstige betriebliche Erträge687000.014991000.021-304000.0-307
3Betriebliche Erträge48526.0100046966.010001560.033
4Materialaufwand34007.070132647.06951360.042
5Personalaufwand6258.01296222.013236000.06
6Abschreibungen2239.0462273.048-34000.0-15
7Konzessionsabgabe1331.0271302.02829000.022
8Übrige sonstige betriebliche Aufwendungen2100.0432066.04434000.016
9Betriebliche Aufwendungen45935.094744510.09481425.032
10Ergebnis der betrieblichen Tätigkeit2591.0532456.052135000.055
11Finanzergebnis (Ertrags-/Aufwandsaldo)-13000.00-99000.0-286000.0-869
12sonstige Steuern147000.03164000.03-17000.0-104
13Neutraler Bereich134000.0365000.0186000.01062
14Jahresüberschuss vor Ertragsteuern2457.0512391.05166000.028
15Ertragsteuern796000.016792000.0174000.05
16Jahresüberschuss1661.0341599.03462000.039
\n", "
" ], "text/plain": [ " Unnamed: 0_level_0 2020 2019 \\\n", " Unnamed: 0_level_1 T€ % T€ % \n", "0 Umsatzerlöse 47459.0 978 45575.0 970 \n", "1 Aktivierte Eigenleistungen 380000.0 8 400000.0 9 \n", "2 Sonstige betriebliche Erträge 687000.0 14 991000.0 21 \n", "3 Betriebliche Erträge 48526.0 1000 46966.0 1000 \n", "4 Materialaufwand 34007.0 701 32647.0 695 \n", "5 Personalaufwand 6258.0 129 6222.0 132 \n", "6 Abschreibungen 2239.0 46 2273.0 48 \n", "7 Konzessionsabgabe 1331.0 27 1302.0 28 \n", "8 Übrige sonstige betriebliche Aufwendungen 2100.0 43 2066.0 44 \n", "9 Betriebliche Aufwendungen 45935.0 947 44510.0 948 \n", "10 Ergebnis der betrieblichen Tätigkeit 2591.0 53 2456.0 52 \n", "11 Finanzergebnis (Ertrags-/Aufwandsaldo) -13000.0 0 -99000.0 -2 \n", "12 sonstige Steuern 147000.0 3 164000.0 3 \n", "13 Neutraler Bereich 134000.0 3 65000.0 1 \n", "14 Jahresüberschuss vor Ertragsteuern 2457.0 51 2391.0 51 \n", "15 Ertragsteuern 796000.0 16 792000.0 17 \n", "16 Jahresüberschuss 1661.0 34 1599.0 34 \n", "\n", " Veränderungen \n", " T€ % \n", "0 1884.0 41 \n", "1 -20000.0 -50 \n", "2 -304000.0 -307 \n", "3 1560.0 33 \n", "4 1360.0 42 \n", "5 36000.0 6 \n", "6 -34000.0 -15 \n", "7 29000.0 22 \n", "8 34000.0 16 \n", "9 1425.0 32 \n", "10 135000.0 55 \n", "11 86000.0 -869 \n", "12 -17000.0 -104 \n", "13 86000.0 1062 \n", "14 66000.0 28 \n", "15 4000.0 5 \n", "16 62000.0 39 " ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Prune rows where first columns is None\n", "import numpy as np\n", "\n", "current_table = current_table.replace(to_replace=\"None\", value=np.nan).dropna()\n", "current_table" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_15672\\340569398.py:3: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", " kpis[row[0]] = row[1]\n" ] }, { "data": { "text/plain": [ "{'Umsatzerlöse': 47459.0,\n", " 'Aktivierte Eigenleistungen': 380000.0,\n", " 'Sonstige betriebliche Erträge': 687000.0,\n", " 'Betriebliche Erträge': 48526.0,\n", " 'Materialaufwand': 34007.0,\n", " 'Personalaufwand': 6258.0,\n", " 'Abschreibungen': 2239.0,\n", " 'Konzessionsabgabe': 1331.0,\n", " 'Übrige sonstige betriebliche Aufwendungen': 2100.0,\n", " 'Betriebliche Aufwendungen': 45935.0,\n", " 'Ergebnis der betrieblichen Tätigkeit': 2591.0,\n", " 'Finanzergebnis (Ertrags-/Aufwandsaldo)': -13000.0,\n", " 'sonstige Steuern': 147000.0,\n", " 'Neutraler Bereich': 134000.0,\n", " 'Jahresüberschuss vor Ertragsteuern': 2457.0,\n", " 'Ertragsteuern': 796000.0,\n", " 'Jahresüberschuss': 1661.0}" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kpis = {}\n", "for _index, row in current_table.iterrows():\n", " kpis[row[0]] = row[1]\n", "kpis" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "\n", "def get_bilanz(report: str) -> any:\n", " result = {}\n", " soup = BeautifulSoup(report, features=\"html.parser\")\n", " for pos in [\"Aktiva\", \"Passiva\"]:\n", " tag = soup.find(\"b\", string=re.compile(pos))\n", " if tag:\n", " pos_results = pd.read_html(\n", " StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n", " )[0]\n", " result[pos] = pos_results\n", " else:\n", " result[pos] = pd.DataFrame([])\n", " return result" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Investitionen (netto)2020 T€2019 T€Veränderung T€
0Stromversorgung1.3721.553-181.000
1Gasversorgung713.000707.0006.000
2sonstige Aktivitäten661.0002.605-1.944
3Insgesamt2.7464.865-2.119
\n", "
" ], "text/plain": [ " Investitionen (netto) 2020 T€ 2019 T€ Veränderung T€\n", "0 Stromversorgung 1.372 1.553 -181.000\n", "1 Gasversorgung 713.000 707.000 6.000\n", "2 sonstige Aktivitäten 661.000 2.605 -1.944\n", "3 Insgesamt 2.746 4.865 -2.119" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bilanz = get_bilanz(sample_report)\n", "bilanz[\"Passiva\"].head()" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0_level_031. Dezember 202031. Dezember 2019Veränderung
Unnamed: 0_level_1T€%T€%T€
0AnlagevermögenNaNNaNNaNNaNNaN
1Sachanlagen28.919689.028.812689.0107.000
2Finanzanlagen2.66764.04.189100.0-1.522
3NaN31.586753.033.001789.0-1.415
4UmlaufvermögenNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " Unnamed: 0_level_0 31. Dezember 2020 31. Dezember 2019 \\\n", " Unnamed: 0_level_1 T€ % T€ % \n", "0 Anlagevermögen NaN NaN NaN NaN \n", "1 Sachanlagen 28.919 689.0 28.812 689.0 \n", "2 Finanzanlagen 2.667 64.0 4.189 100.0 \n", "3 NaN 31.586 753.0 33.001 789.0 \n", "4 Umlaufvermögen NaN NaN NaN NaN \n", "\n", " Veränderung \n", " T€ \n", "0 NaN \n", "1 107.000 \n", "2 -1.522 \n", "3 -1.415 \n", "4 NaN " ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bilanz[\"Aktiva\"].head()" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Investitionen (netto)2020 T€2019 T€Veränderung T€
0Stromversorgung1.3721.553-181.000
1Gasversorgung713.000707.0006.000
2sonstige Aktivitäten661.0002.605-1.944
3Insgesamt2.7464.865-2.119
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from IPython.display import display, HTML\n", "\n", "# Assuming that dataframes df1 and df2 are already defined:\n", "display(HTML(bilanz[\"Passiva\"].to_html()))" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", " ( '2020', 'T€'),\n", " ( '2019', 'T€'),\n", " ( 'Veränderung', 'T€'),\n", " ( 'Veränderung', '%')],\n", " )\n", "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", " ( '2020', 'T€'),\n", " ( '2020', '%'),\n", " ( '2019', 'T€'),\n", " ( '2019', '%'),\n", " ( 'Veränderungen', 'T€'),\n", " ( 'Veränderungen', '%')],\n", " )\n", "MultiIndex([('Unnamed: 0_level_0', 'gerundet'),\n", " ( '2020', 'T€'),\n", " ( '2019', 'T€'),\n", " ( 'Veränderung', 'T€'),\n", " ( 'Veränderung', '%')],\n", " )\n", "MultiIndex([('Unnamed: 0_level_0', 'unkonsolidiert gerundet'),\n", " ( '2020', 'T€'),\n", " ( '2019', 'T€'),\n", " ( 'Veränderung', 'T€'),\n", " ( 'Veränderung', '%')],\n", " )\n", "MultiIndex([('Jahresüberschuss/Jahresfehlbetrag nach Betriebszweigen', ...),\n", " ( '2020', ...),\n", " ( '2019', ...),\n", " ( 'Veränderung', ...),\n", " ( 'Veränderung', ...)],\n", " )\n", "Index(['Unnamed: 0', '2020 T€', '2019 T€'], dtype='object')\n", "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", " ( '31. Dezember 2020', 'T€'),\n", " ( '31. Dezember 2020', '%'),\n", " ( '31. Dezember 2019', 'T€'),\n", " ( '31. Dezember 2019', '%'),\n", " ( 'Veränderung', 'T€')],\n", " )\n", "Index(['Investitionen (netto)', '2020 T€', '2019 T€', 'Veränderung T€'], dtype='object')\n", "Index(['Unnamed: 0', '€', '31.12.2019 in T €'], dtype='object')\n", "Index(['Unnamed: 0', '€', '€.1', '31.12.2019 in T €'], dtype='object')\n", "Index([0, 1], dtype='int64')\n", "Index(['Beteiligung', 'Anteil', 'Eigenkapital der Beteiligungsgesellschaft',\n", " 'Eigenkapital der Beteiligungsgesellschaft.1',\n", " 'Jahresergebnis der Beteiligungsgesellschaft',\n", " 'Jahresergebnis der Beteiligungsgesellschaft.1'],\n", " dtype='object')\n", "Index(['Beteiligung', 'Anteil', 'Eigenkapital der Beteiligungsgesellschaft',\n", " 'Eigenkapital der Beteiligungsgesellschaft.1',\n", " 'Jahresergebnis der Beteiligungsgesellschaft',\n", " 'Jahresergebnis der Beteiligungsgesellschaft.1'],\n", " dtype='object')\n", "Index(['Unnamed: 0', '2020', '2019'], dtype='object')\n", "MultiIndex([( 'Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", " ( 'Unnamed: 1_level_0', 'Gesamt in T€'),\n", " ('davon mit einer Restlaufzeit', 'bis zu 1 Jahr in T€'),\n", " ('davon mit einer Restlaufzeit', 'mehr als 1 Jahr in T€'),\n", " ('davon mit einer Restlaufzeit', 'davon über 5 Jahre in T€')],\n", " )\n", "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", " ( '2020', '€'),\n", " ( '2019', '€'),\n", " ( 'Veränderung', '€'),\n", " ( 'Veränderung', '%')],\n", " )\n", "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", " ( '2020', '€'),\n", " ( '2019', '€'),\n", " ( 'Veränderung', '€'),\n", " ( 'Veränderung', '%')],\n", " )\n", "Index(['Unnamed: 0', '2020 T€', '2019 T€'], dtype='object')\n", "MultiIndex([( 'Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", " ('Anschaffungs- und Herstellungskosten', 'Stand am 01.01.2020 €'),\n", " ('Anschaffungs- und Herstellungskosten', 'Zugang €'),\n", " ('Anschaffungs- und Herstellungskosten', 'Abgang €'),\n", " ('Anschaffungs- und Herstellungskosten', 'Umbuchung €'),\n", " ('Anschaffungs- und Herstellungskosten', 'Stand am 31.12.2020 €')],\n", " )\n", "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", " ( 'Abschreibungen', 'Stand am 01.01.2020 €'),\n", " ( 'Abschreibungen', 'Zugang €'),\n", " ( 'Abschreibungen', 'außerplanm. AfA'),\n", " ( 'Abschreibungen', 'Abgang €'),\n", " ( 'Abschreibungen', 'Umbuchung €'),\n", " ( 'Abschreibungen', 'Stand am 31.12.2020 €')],\n", " )\n", "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", " ( 'Restbuchwerte', 'Stand am 31.12.2020 €'),\n", " ( 'Restbuchwerte', 'Stand am 31.12.2019 €')],\n", " )\n", "Index(['Unnamed: 0', 'Elektrizitätsverteilung', '31.12.2019 in T €',\n", " 'Gasverteilung', '31.12.2019 in T €.1'],\n", " dtype='object')\n", "Index(['Unnamed: 0', '€', '31.12.2019 in T €'], dtype='object')\n", "Index(['Unnamed: 0', 'Elektrizitätsverteilung €', '31.12.2019 in T €',\n", " 'Gasverteilung €', '31.12.2019 in T €.1'],\n", " dtype='object')\n", "Index(['Unnamed: 0', '€', 'Vorjahr in T €'], dtype='object')\n", "MultiIndex([('Verbindlichkeitenspiegel 2020 Elektrizitätsverteilung', ...),\n", " ( 'davon mit einer Restlaufzeit', ...),\n", " ( 'davon mit einer Restlaufzeit', ...),\n", " ( 'davon mit einer Restlaufzeit', ...),\n", " ( 'davon mit einer Restlaufzeit', ...)],\n", " )\n", "MultiIndex([('Verbindlichkeitenspiegel 2020 Gasverteilung', ...),\n", " ( 'davon mit einer Restlaufzeit', ...),\n", " ( 'davon mit einer Restlaufzeit', ...),\n", " ( 'davon mit einer Restlaufzeit', ...),\n", " ( 'davon mit einer Restlaufzeit', ...)],\n", " )\n", "MultiIndex([('Verbindlichkeitenspiegel 2020 Intelligenter', ...),\n", " ( 'davon mit einer Restlaufzeit', ...),\n", " ( 'davon mit einer Restlaufzeit', ...),\n", " ( 'davon mit einer Restlaufzeit', ...),\n", " ( 'davon mit einer Restlaufzeit', ...)],\n", " )\n" ] } ], "source": [ "def get_tables(raw_report: str) -> list:\n", " soup = BeautifulSoup(raw_report, features=\"html.parser\")\n", " tables = soup.find_all(\"table\", {\"class\": \"std_table\"})\n", " dfs = []\n", " for table in tables:\n", " for df in pd.read_html(StringIO(str(table))):\n", " dfs.append(df)\n", " return dfs\n", "\n", "\n", "for df in get_tables(sample_report):\n", " print(df.columns)\n", "\n", "tables = get_tables(sample_report)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }