From 815e08a8f1fbe4011b058422c9a32f71ce12b766 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Fri, 20 Oct 2023 15:06:34 +0200 Subject: [PATCH] =?UTF-8?q?checkpoint:=20Transform=20values=20to=20?= =?UTF-8?q?=E2=82=AC=20and=20normalize=20column=20names?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../API-tests/Bundesanzeiger/notebook.ipynb | 1113 ++++------------- 1 file changed, 254 insertions(+), 859 deletions(-) diff --git a/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb b/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb index e67c604..bb076c8 100644 --- a/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb +++ b/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 255, "metadata": {}, "outputs": [ { @@ -130,7 +130,7 @@ "7 {} " ] }, - "execution_count": 77, + "execution_count": 255, "metadata": {}, "output_type": "execute_result" } @@ -161,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 256, "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 257, "metadata": {}, "outputs": [], "source": [ @@ -204,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 258, "metadata": {}, "outputs": [ { @@ -507,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 259, "metadata": {}, "outputs": [ { @@ -532,17 +532,15 @@ " \n", " \n", " Unnamed: 0_level_0\n", - " 2020\n", - " 2019\n", - " Veränderungen\n", + " 2020\n", + " 2019\n", + " Veränderung\n", " \n", " \n", " \n", " Unnamed: 0_level_1\n", " T€\n", - " %\n", " T€\n", - " %\n", " T€\n", " %\n", " \n", @@ -550,89 +548,71 @@ " \n", " \n", " 0\n", - " Umsatzerlöse\n", - " 47.459\n", - " 978\n", - " 45.575\n", - " 970\n", - " 1.884\n", - " 41\n", + " Jahresüberschuss/Jahresfehlbetrag\n", + " 1.661\n", + " 1.599\n", + " 62\n", + " 39\n", " \n", " \n", " 1\n", - " Aktivierte Eigenleistungen\n", - " 380.000\n", - " 8\n", - " 400.000\n", - " 9\n", - " -20.000\n", - " -50\n", + " + Steuern vom Einkommen und vom Ertrag\n", + " 796.000\n", + " 792.000\n", + " 4\n", + " 5\n", " \n", " \n", " 2\n", - " Sonstige betriebliche Erträge\n", - " 687.000\n", - " 14\n", - " 991.000\n", - " 21\n", - " -304.000\n", - " -307\n", + " = EBT\n", + " 2.457\n", + " 2.391\n", + " 66\n", + " 28\n", " \n", " \n", " 3\n", - " Betriebliche Erträge\n", - " 48.526\n", - " 1000\n", - " 46.966\n", - " 1000\n", - " 1.560\n", - " 33\n", + " + Finanzergebnis\n", + " -13.000\n", + " -99.000\n", + " 86\n", + " -869\n", " \n", " \n", " 4\n", - " Materialaufwand\n", - " 34.007\n", - " 701\n", - " 32.647\n", - " 695\n", - " 1.360\n", - " 42\n", + " = EBIT\n", + " 2.444\n", + " 2.292\n", + " 152\n", + " 66\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Unnamed: 0_level_0 2020 2019 Veränderungen \\\n", - " Unnamed: 0_level_1 T€ % T€ % T€ \n", - "0 Umsatzerlöse 47.459 978 45.575 970 1.884 \n", - "1 Aktivierte Eigenleistungen 380.000 8 400.000 9 -20.000 \n", - "2 Sonstige betriebliche Erträge 687.000 14 991.000 21 -304.000 \n", - "3 Betriebliche Erträge 48.526 1000 46.966 1000 1.560 \n", - "4 Materialaufwand 34.007 701 32.647 695 1.360 \n", - "\n", - " \n", - " % \n", - "0 41 \n", - "1 -50 \n", - "2 -307 \n", - "3 33 \n", - "4 42 " + " Unnamed: 0_level_0 2020 2019 Veränderung \n", + " Unnamed: 0_level_1 T€ T€ T€ %\n", + "0 Jahresüberschuss/Jahresfehlbetrag 1.661 1.599 62 39\n", + "1 + Steuern vom Einkommen und vom Ertrag 796.000 792.000 4 5\n", + "2 = EBT 2.457 2.391 66 28\n", + "3 + Finanzergebnis -13.000 -99.000 86 -869\n", + "4 = EBIT 2.444 2.292 152 66" ] }, - "execution_count": 81, + "execution_count": 259, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "current_table = tables[1]\n", + "current_table = tables[0]\n", "current_table.head()" ] }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 260, "metadata": {}, "outputs": [], "source": [ @@ -647,16 +627,16 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 261, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_15672\\152097142.py:2: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_24300\\152097142.py:2: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", " current_table.iloc[index][0] = cleanse_string(row[0])\n", - "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_15672\\152097142.py:2: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`\n", + "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_24300\\152097142.py:2: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`\n", " current_table.iloc[index][0] = cleanse_string(row[0])\n" ] }, @@ -682,17 +662,15 @@ " \n", " \n", " Unnamed: 0_level_0\n", - " 2020\n", - " 2019\n", - " Veränderungen\n", + " 2020\n", + " 2019\n", + " Veränderung\n", " \n", " \n", " \n", " Unnamed: 0_level_1\n", " T€\n", - " %\n", " T€\n", - " %\n", " T€\n", " %\n", " \n", @@ -700,77 +678,59 @@ " \n", " \n", " 0\n", - " Umsatzerlöse\n", - " 47.459\n", - " 978\n", - " 45.575\n", - " 970\n", - " 1.884\n", - " 41\n", + " Jahresüberschuss/Jahresfehlbetrag\n", + " 1.661\n", + " 1.599\n", + " 62\n", + " 39\n", " \n", " \n", " 1\n", - " Aktivierte Eigenleistungen\n", - " 380.000\n", - " 8\n", - " 400.000\n", - " 9\n", - " -20.000\n", - " -50\n", + " + Steuern vom Einkommen und vom Ertrag\n", + " 796.000\n", + " 792.000\n", + " 4\n", + " 5\n", " \n", " \n", " 2\n", - " Sonstige betriebliche Erträge\n", - " 687.000\n", - " 14\n", - " 991.000\n", - " 21\n", - " -304.000\n", - " -307\n", + " = EBT\n", + " 2.457\n", + " 2.391\n", + " 66\n", + " 28\n", " \n", " \n", " 3\n", - " Betriebliche Erträge\n", - " 48.526\n", - " 1000\n", - " 46.966\n", - " 1000\n", - " 1.560\n", - " 33\n", + " + Finanzergebnis\n", + " -13.000\n", + " -99.000\n", + " 86\n", + " -869\n", " \n", " \n", " 4\n", - " Materialaufwand\n", - " 34.007\n", - " 701\n", - " 32.647\n", - " 695\n", - " 1.360\n", - " 42\n", + " = EBIT\n", + " 2.444\n", + " 2.292\n", + " 152\n", + " 66\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Unnamed: 0_level_0 2020 2019 Veränderungen \\\n", - " Unnamed: 0_level_1 T€ % T€ % T€ \n", - "0 Umsatzerlöse 47.459 978 45.575 970 1.884 \n", - "1 Aktivierte Eigenleistungen 380.000 8 400.000 9 -20.000 \n", - "2 Sonstige betriebliche Erträge 687.000 14 991.000 21 -304.000 \n", - "3 Betriebliche Erträge 48.526 1000 46.966 1000 1.560 \n", - "4 Materialaufwand 34.007 701 32.647 695 1.360 \n", - "\n", - " \n", - " % \n", - "0 41 \n", - "1 -50 \n", - "2 -307 \n", - "3 33 \n", - "4 42 " + " Unnamed: 0_level_0 2020 2019 Veränderung \n", + " Unnamed: 0_level_1 T€ T€ T€ %\n", + "0 Jahresüberschuss/Jahresfehlbetrag 1.661 1.599 62 39\n", + "1 + Steuern vom Einkommen und vom Ertrag 796.000 792.000 4 5\n", + "2 = EBT 2.457 2.391 66 28\n", + "3 + Finanzergebnis -13.000 -99.000 86 -869\n", + "4 = EBIT 2.444 2.292 152 66" ] }, - "execution_count": 83, + "execution_count": 261, "metadata": {}, "output_type": "execute_result" } @@ -783,7 +743,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 262, "metadata": {}, "outputs": [], "source": [ @@ -791,9 +751,9 @@ " try:\n", " if value is None:\n", " return None\n", - " if isinstance(value, float):\n", - " return value\n", - " return float(value.replace(\".\", \"\").replace(\",\", \".\"))\n", + " # if isinstance(value, float):\n", + " # return value\n", + " return float(str(value).replace(\".\", \"\").replace(\",\", \".\"))\n", " except Exception as e:\n", " return None\n", "\n", @@ -809,7 +769,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 263, "metadata": {}, "outputs": [ { @@ -834,17 +794,15 @@ " \n", " \n", " Unnamed: 0_level_0\n", - " 2020\n", - " 2019\n", - " Veränderungen\n", + " 2020\n", + " 2019\n", + " Veränderung\n", " \n", " \n", " \n", " Unnamed: 0_level_1\n", " T€\n", - " %\n", " T€\n", - " %\n", " T€\n", " %\n", " \n", @@ -852,77 +810,67 @@ " \n", " \n", " 0\n", - " Umsatzerlöse\n", - " 47459.0\n", - " 978\n", - " 45575.0\n", - " 970\n", - " 1884.0\n", - " 41\n", + " Jahresüberschuss/Jahresfehlbetrag\n", + " 1661000.0\n", + " 1599000.0\n", + " 62000.0\n", + " 39\n", " \n", " \n", " 1\n", - " Aktivierte Eigenleistungen\n", - " 380000.0\n", - " 8\n", - " 400000.0\n", - " 9\n", - " -20000.0\n", - " -50\n", + " + Steuern vom Einkommen und vom Ertrag\n", + " 7960000.0\n", + " 7920000.0\n", + " 4000.0\n", + " 5\n", " \n", " \n", " 2\n", - " Sonstige betriebliche Erträge\n", - " 687000.0\n", - " 14\n", - " 991000.0\n", - " 21\n", - " -304000.0\n", - " -307\n", + " = EBT\n", + " 2457000.0\n", + " 2391000.0\n", + " 66000.0\n", + " 28\n", " \n", " \n", " 3\n", - " Betriebliche Erträge\n", - " 48526.0\n", - " 1000\n", - " 46966.0\n", - " 1000\n", - " 1560.0\n", - " 33\n", + " + Finanzergebnis\n", + " -130000.0\n", + " -990000.0\n", + " 86000.0\n", + " -869\n", " \n", " \n", " 4\n", - " Materialaufwand\n", - " 34007.0\n", - " 701\n", - " 32647.0\n", - " 695\n", - " 1360.0\n", - " 42\n", + " = EBIT\n", + " 2444000.0\n", + " 2292000.0\n", + " 152000.0\n", + " 66\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Unnamed: 0_level_0 2020 2019 \\\n", - " Unnamed: 0_level_1 T€ % T€ % \n", - "0 Umsatzerlöse 47459.0 978 45575.0 970 \n", - "1 Aktivierte Eigenleistungen 380000.0 8 400000.0 9 \n", - "2 Sonstige betriebliche Erträge 687000.0 14 991000.0 21 \n", - "3 Betriebliche Erträge 48526.0 1000 46966.0 1000 \n", - "4 Materialaufwand 34007.0 701 32647.0 695 \n", + " Unnamed: 0_level_0 2020 2019 Veränderung \\\n", + " Unnamed: 0_level_1 T€ T€ T€ \n", + "0 Jahresüberschuss/Jahresfehlbetrag 1661000.0 1599000.0 62000.0 \n", + "1 + Steuern vom Einkommen und vom Ertrag 7960000.0 7920000.0 4000.0 \n", + "2 = EBT 2457000.0 2391000.0 66000.0 \n", + "3 + Finanzergebnis -130000.0 -990000.0 86000.0 \n", + "4 = EBIT 2444000.0 2292000.0 152000.0 \n", "\n", - " Veränderungen \n", - " T€ % \n", - "0 1884.0 41 \n", - "1 -20000.0 -50 \n", - "2 -304000.0 -307 \n", - "3 1560.0 33 \n", - "4 1360.0 42 " + " \n", + " % \n", + "0 39 \n", + "1 5 \n", + "2 28 \n", + "3 -869 \n", + "4 66 " ] }, - "execution_count": 85, + "execution_count": 263, "metadata": {}, "output_type": "execute_result" } @@ -937,7 +885,7 @@ " \"€\": 1,\n", "}\n", "\n", - "for column in current_table.columns:\n", + "for column in current_table.columns[1:]:\n", " if isinstance(column, tuple):\n", " for c in column:\n", " for x, factor in converter.items():\n", @@ -945,18 +893,18 @@ " current_table[column] = current_table[column].apply(\n", " lambda x: apply_factor(x, factor)\n", " )\n", - " next\n", + " break\n", " else:\n", " for x, factor in converter.items():\n", - " parts = column.split(\" \")\n", + " parts = str(column).split(\" \")\n", " for y in parts:\n", " if re.match(x, y):\n", " current_table[column] = current_table[column].apply(\n", " lambda x: apply_factor(x, factor)\n", " )\n", " current_table.rename({column: parts[0]}, inplace=True, axis=1)\n", - " next\n", - " # print(current_table[column])\n", + " break\n", + "\n", "current_table.dropna(axis=0, how=\"all\", inplace=True)\n", "current_table.dropna(axis=1, how=\"all\", inplace=True)\n", "current_table.head()" @@ -964,7 +912,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 264, "metadata": {}, "outputs": [ { @@ -972,15 +920,13 @@ "text/plain": [ "Unnamed: 0_level_0 Unnamed: 0_level_1 object\n", "2020 T€ float64\n", - " % int64\n", "2019 T€ float64\n", - " % int64\n", - "Veränderungen T€ float64\n", + "Veränderung T€ float64\n", " % int64\n", "dtype: object" ] }, - "execution_count": 86, + "execution_count": 264, "metadata": {}, "output_type": "execute_result" } @@ -991,7 +937,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 265, "metadata": {}, "outputs": [], "source": [ @@ -1008,7 +954,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 266, "metadata": {}, "outputs": [ { @@ -1033,17 +979,15 @@ " \n", " \n", " Unnamed: 0_level_0\n", - " 2020\n", - " 2019\n", - " Veränderungen\n", + " 2020\n", + " 2019\n", + " Veränderung\n", " \n", " \n", " \n", " Unnamed: 0_level_1\n", " T€\n", - " %\n", " T€\n", - " %\n", " T€\n", " %\n", " \n", @@ -1051,221 +995,87 @@ " \n", " \n", " 0\n", - " Umsatzerlöse\n", - " 47459.0\n", - " 978\n", - " 45575.0\n", - " 970\n", - " 1884.0\n", - " 41\n", + " Jahresüberschuss/Jahresfehlbetrag\n", + " 1661000.0\n", + " 1599000.0\n", + " 62000.0\n", + " 39\n", " \n", " \n", " 1\n", - " Aktivierte Eigenleistungen\n", - " 380000.0\n", - " 8\n", - " 400000.0\n", - " 9\n", - " -20000.0\n", - " -50\n", - " \n", - " \n", - " 2\n", - " Sonstige betriebliche Erträge\n", - " 687000.0\n", - " 14\n", - " 991000.0\n", - " 21\n", - " -304000.0\n", - " -307\n", - " \n", - " \n", - " 3\n", - " Betriebliche Erträge\n", - " 48526.0\n", - " 1000\n", - " 46966.0\n", - " 1000\n", - " 1560.0\n", - " 33\n", - " \n", - " \n", - " 4\n", - " Materialaufwand\n", - " 34007.0\n", - " 701\n", - " 32647.0\n", - " 695\n", - " 1360.0\n", - " 42\n", - " \n", - " \n", - " 5\n", - " Personalaufwand\n", - " 6258.0\n", - " 129\n", - " 6222.0\n", - " 132\n", - " 36000.0\n", - " 6\n", - " \n", - " \n", - " 6\n", - " Abschreibungen\n", - " 2239.0\n", - " 46\n", - " 2273.0\n", - " 48\n", - " -34000.0\n", - " -15\n", - " \n", - " \n", - " 7\n", - " Konzessionsabgabe\n", - " 1331.0\n", - " 27\n", - " 1302.0\n", - " 28\n", - " 29000.0\n", - " 22\n", - " \n", - " \n", - " 8\n", - " Übrige sonstige betriebliche Aufwendungen\n", - " 2100.0\n", - " 43\n", - " 2066.0\n", - " 44\n", - " 34000.0\n", - " 16\n", - " \n", - " \n", - " 9\n", - " Betriebliche Aufwendungen\n", - " 45935.0\n", - " 947\n", - " 44510.0\n", - " 948\n", - " 1425.0\n", - " 32\n", - " \n", - " \n", - " 10\n", - " Ergebnis der betrieblichen Tätigkeit\n", - " 2591.0\n", - " 53\n", - " 2456.0\n", - " 52\n", - " 135000.0\n", - " 55\n", - " \n", - " \n", - " 11\n", - " Finanzergebnis (Ertrags-/Aufwandsaldo)\n", - " -13000.0\n", - " 0\n", - " -99000.0\n", - " -2\n", - " 86000.0\n", - " -869\n", - " \n", - " \n", - " 12\n", - " sonstige Steuern\n", - " 147000.0\n", - " 3\n", - " 164000.0\n", - " 3\n", - " -17000.0\n", - " -104\n", - " \n", - " \n", - " 13\n", - " Neutraler Bereich\n", - " 134000.0\n", - " 3\n", - " 65000.0\n", - " 1\n", - " 86000.0\n", - " 1062\n", - " \n", - " \n", - " 14\n", - " Jahresüberschuss vor Ertragsteuern\n", - " 2457.0\n", - " 51\n", - " 2391.0\n", - " 51\n", - " 66000.0\n", - " 28\n", - " \n", - " \n", - " 15\n", - " Ertragsteuern\n", - " 796000.0\n", - " 16\n", - " 792000.0\n", - " 17\n", + " + Steuern vom Einkommen und vom Ertrag\n", + " 7960000.0\n", + " 7920000.0\n", " 4000.0\n", " 5\n", " \n", " \n", - " 16\n", - " Jahresüberschuss\n", - " 1661.0\n", - " 34\n", - " 1599.0\n", - " 34\n", - " 62000.0\n", - " 39\n", + " 2\n", + " = EBT\n", + " 2457000.0\n", + " 2391000.0\n", + " 66000.0\n", + " 28\n", + " \n", + " \n", + " 3\n", + " + Finanzergebnis\n", + " -130000.0\n", + " -990000.0\n", + " 86000.0\n", + " -869\n", + " \n", + " \n", + " 4\n", + " = EBIT\n", + " 2444000.0\n", + " 2292000.0\n", + " 152000.0\n", + " 66\n", + " \n", + " \n", + " 5\n", + " + Abschreibungsaufwand\n", + " 2239000.0\n", + " 2273000.0\n", + " -34000.0\n", + " -15\n", + " \n", + " \n", + " 6\n", + " = EBITDA\n", + " 4683000.0\n", + " 4565000.0\n", + " 118000.0\n", + " 26\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Unnamed: 0_level_0 2020 2019 \\\n", - " Unnamed: 0_level_1 T€ % T€ % \n", - "0 Umsatzerlöse 47459.0 978 45575.0 970 \n", - "1 Aktivierte Eigenleistungen 380000.0 8 400000.0 9 \n", - "2 Sonstige betriebliche Erträge 687000.0 14 991000.0 21 \n", - "3 Betriebliche Erträge 48526.0 1000 46966.0 1000 \n", - "4 Materialaufwand 34007.0 701 32647.0 695 \n", - "5 Personalaufwand 6258.0 129 6222.0 132 \n", - "6 Abschreibungen 2239.0 46 2273.0 48 \n", - "7 Konzessionsabgabe 1331.0 27 1302.0 28 \n", - "8 Übrige sonstige betriebliche Aufwendungen 2100.0 43 2066.0 44 \n", - "9 Betriebliche Aufwendungen 45935.0 947 44510.0 948 \n", - "10 Ergebnis der betrieblichen Tätigkeit 2591.0 53 2456.0 52 \n", - "11 Finanzergebnis (Ertrags-/Aufwandsaldo) -13000.0 0 -99000.0 -2 \n", - "12 sonstige Steuern 147000.0 3 164000.0 3 \n", - "13 Neutraler Bereich 134000.0 3 65000.0 1 \n", - "14 Jahresüberschuss vor Ertragsteuern 2457.0 51 2391.0 51 \n", - "15 Ertragsteuern 796000.0 16 792000.0 17 \n", - "16 Jahresüberschuss 1661.0 34 1599.0 34 \n", + " Unnamed: 0_level_0 2020 2019 Veränderung \\\n", + " Unnamed: 0_level_1 T€ T€ T€ \n", + "0 Jahresüberschuss/Jahresfehlbetrag 1661000.0 1599000.0 62000.0 \n", + "1 + Steuern vom Einkommen und vom Ertrag 7960000.0 7920000.0 4000.0 \n", + "2 = EBT 2457000.0 2391000.0 66000.0 \n", + "3 + Finanzergebnis -130000.0 -990000.0 86000.0 \n", + "4 = EBIT 2444000.0 2292000.0 152000.0 \n", + "5 + Abschreibungsaufwand 2239000.0 2273000.0 -34000.0 \n", + "6 = EBITDA 4683000.0 4565000.0 118000.0 \n", "\n", - " Veränderungen \n", - " T€ % \n", - "0 1884.0 41 \n", - "1 -20000.0 -50 \n", - "2 -304000.0 -307 \n", - "3 1560.0 33 \n", - "4 1360.0 42 \n", - "5 36000.0 6 \n", - "6 -34000.0 -15 \n", - "7 29000.0 22 \n", - "8 34000.0 16 \n", - "9 1425.0 32 \n", - "10 135000.0 55 \n", - "11 86000.0 -869 \n", - "12 -17000.0 -104 \n", - "13 86000.0 1062 \n", - "14 66000.0 28 \n", - "15 4000.0 5 \n", - "16 62000.0 39 " + " \n", + " % \n", + "0 39 \n", + "1 5 \n", + "2 28 \n", + "3 -869 \n", + "4 66 \n", + "5 -15 \n", + "6 26 " ] }, - "execution_count": 88, + "execution_count": 266, "metadata": {}, "output_type": "execute_result" } @@ -1280,488 +1090,73 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 267, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_15672\\340569398.py:3: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", - " kpis[row[0]] = row[1]\n" + "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_24300\\1758297134.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " row[0] = re.sub(exp, '', row[0]).strip()\n", + "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_24300\\1758297134.py:8: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`\n", + " row[0] = re.sub(exp, '', row[0]).strip()\n" + ] + } + ], + "source": [ + "exps = [r\"^[0-9a-zA-Z]+[\\.\\)] \", r\"[\\+\\=\\-\\_]\"]\n", + "for _index, row in current_table.iterrows():\n", + " for exp in exps:\n", + " # print(row[0])\n", + " row[0] = re.sub(exp, \"\", row[0]).strip()\n", + " # print(row[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 268, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_24300\\2923576447.py:8: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " name_cleansed = row[0]\n", + "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_24300\\2923576447.py:12: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " kpis[name_cleansed] = row[1]\n" ] }, { "data": { "text/plain": [ - "{'Umsatzerlöse': 47459.0,\n", - " 'Aktivierte Eigenleistungen': 380000.0,\n", - " 'Sonstige betriebliche Erträge': 687000.0,\n", - " 'Betriebliche Erträge': 48526.0,\n", - " 'Materialaufwand': 34007.0,\n", - " 'Personalaufwand': 6258.0,\n", - " 'Abschreibungen': 2239.0,\n", - " 'Konzessionsabgabe': 1331.0,\n", - " 'Übrige sonstige betriebliche Aufwendungen': 2100.0,\n", - " 'Betriebliche Aufwendungen': 45935.0,\n", - " 'Ergebnis der betrieblichen Tätigkeit': 2591.0,\n", - " 'Finanzergebnis (Ertrags-/Aufwandsaldo)': -13000.0,\n", - " 'sonstige Steuern': 147000.0,\n", - " 'Neutraler Bereich': 134000.0,\n", - " 'Jahresüberschuss vor Ertragsteuern': 2457.0,\n", - " 'Ertragsteuern': 796000.0,\n", - " 'Jahresüberschuss': 1661.0}" + "{'Jahresüberschuss/Jahresfehlbetrag': 1661000.0,\n", + " 'Steuern vom Einkommen und vom Ertrag': 7960000.0,\n", + " 'EBT': 2457000.0,\n", + " 'Finanzergebnis': -130000.0,\n", + " 'EBIT': 2444000.0,\n", + " 'Abschreibungsaufwand': 2239000.0,\n", + " 'EBITDA': 4683000.0}" ] }, - "execution_count": 89, + "execution_count": 268, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kpis = {}\n", + "\n", + "exps = [r\"^[0-9a-zA-Z]+[\\.\\)] \", r\"[\\+\\=\\-\\_]\"]\n", "for _index, row in current_table.iterrows():\n", - " kpis[row[0]] = row[1]\n", + " name_cleansed = row[0]\n", + " for exp in exps:\n", + " # print(row[0])\n", + " name_cleansed = re.sub(exp, \"\", name_cleansed).strip()\n", + " kpis[name_cleansed] = row[1]\n", "kpis" ] - }, - { - "cell_type": "code", - "execution_count": 90, - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "\n", - "\n", - "def get_bilanz(report: str) -> any:\n", - " result = {}\n", - " soup = BeautifulSoup(report, features=\"html.parser\")\n", - " for pos in [\"Aktiva\", \"Passiva\"]:\n", - " tag = soup.find(\"b\", string=re.compile(pos))\n", - " if tag:\n", - " pos_results = pd.read_html(\n", - " StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n", - " )[0]\n", - " result[pos] = pos_results\n", - " else:\n", - " result[pos] = pd.DataFrame([])\n", - " return result" - ] - }, - { - "cell_type": "code", - "execution_count": 91, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Investitionen (netto)2020 T€2019 T€Veränderung T€
0Stromversorgung1.3721.553-181.000
1Gasversorgung713.000707.0006.000
2sonstige Aktivitäten661.0002.605-1.944
3Insgesamt2.7464.865-2.119
\n", - "
" - ], - "text/plain": [ - " Investitionen (netto) 2020 T€ 2019 T€ Veränderung T€\n", - "0 Stromversorgung 1.372 1.553 -181.000\n", - "1 Gasversorgung 713.000 707.000 6.000\n", - "2 sonstige Aktivitäten 661.000 2.605 -1.944\n", - "3 Insgesamt 2.746 4.865 -2.119" - ] - }, - "execution_count": 91, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bilanz = get_bilanz(sample_report)\n", - "bilanz[\"Passiva\"].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 92, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0_level_031. Dezember 202031. Dezember 2019Veränderung
Unnamed: 0_level_1T€%T€%T€
0AnlagevermögenNaNNaNNaNNaNNaN
1Sachanlagen28.919689.028.812689.0107.000
2Finanzanlagen2.66764.04.189100.0-1.522
3NaN31.586753.033.001789.0-1.415
4UmlaufvermögenNaNNaNNaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0_level_0 31. Dezember 2020 31. Dezember 2019 \\\n", - " Unnamed: 0_level_1 T€ % T€ % \n", - "0 Anlagevermögen NaN NaN NaN NaN \n", - "1 Sachanlagen 28.919 689.0 28.812 689.0 \n", - "2 Finanzanlagen 2.667 64.0 4.189 100.0 \n", - "3 NaN 31.586 753.0 33.001 789.0 \n", - "4 Umlaufvermögen NaN NaN NaN NaN \n", - "\n", - " Veränderung \n", - " T€ \n", - "0 NaN \n", - "1 107.000 \n", - "2 -1.522 \n", - "3 -1.415 \n", - "4 NaN " - ] - }, - "execution_count": 92, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bilanz[\"Aktiva\"].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 93, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Investitionen (netto)2020 T€2019 T€Veränderung T€
0Stromversorgung1.3721.553-181.000
1Gasversorgung713.000707.0006.000
2sonstige Aktivitäten661.0002.605-1.944
3Insgesamt2.7464.865-2.119
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from IPython.display import display, HTML\n", - "\n", - "# Assuming that dataframes df1 and df2 are already defined:\n", - "display(HTML(bilanz[\"Passiva\"].to_html()))" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", - " ( '2020', 'T€'),\n", - " ( '2019', 'T€'),\n", - " ( 'Veränderung', 'T€'),\n", - " ( 'Veränderung', '%')],\n", - " )\n", - "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", - " ( '2020', 'T€'),\n", - " ( '2020', '%'),\n", - " ( '2019', 'T€'),\n", - " ( '2019', '%'),\n", - " ( 'Veränderungen', 'T€'),\n", - " ( 'Veränderungen', '%')],\n", - " )\n", - "MultiIndex([('Unnamed: 0_level_0', 'gerundet'),\n", - " ( '2020', 'T€'),\n", - " ( '2019', 'T€'),\n", - " ( 'Veränderung', 'T€'),\n", - " ( 'Veränderung', '%')],\n", - " )\n", - "MultiIndex([('Unnamed: 0_level_0', 'unkonsolidiert gerundet'),\n", - " ( '2020', 'T€'),\n", - " ( '2019', 'T€'),\n", - " ( 'Veränderung', 'T€'),\n", - " ( 'Veränderung', '%')],\n", - " )\n", - "MultiIndex([('Jahresüberschuss/Jahresfehlbetrag nach Betriebszweigen', ...),\n", - " ( '2020', ...),\n", - " ( '2019', ...),\n", - " ( 'Veränderung', ...),\n", - " ( 'Veränderung', ...)],\n", - " )\n", - "Index(['Unnamed: 0', '2020 T€', '2019 T€'], dtype='object')\n", - "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", - " ( '31. Dezember 2020', 'T€'),\n", - " ( '31. Dezember 2020', '%'),\n", - " ( '31. Dezember 2019', 'T€'),\n", - " ( '31. Dezember 2019', '%'),\n", - " ( 'Veränderung', 'T€')],\n", - " )\n", - "Index(['Investitionen (netto)', '2020 T€', '2019 T€', 'Veränderung T€'], dtype='object')\n", - "Index(['Unnamed: 0', '€', '31.12.2019 in T €'], dtype='object')\n", - "Index(['Unnamed: 0', '€', '€.1', '31.12.2019 in T €'], dtype='object')\n", - "Index([0, 1], dtype='int64')\n", - "Index(['Beteiligung', 'Anteil', 'Eigenkapital der Beteiligungsgesellschaft',\n", - " 'Eigenkapital der Beteiligungsgesellschaft.1',\n", - " 'Jahresergebnis der Beteiligungsgesellschaft',\n", - " 'Jahresergebnis der Beteiligungsgesellschaft.1'],\n", - " dtype='object')\n", - "Index(['Beteiligung', 'Anteil', 'Eigenkapital der Beteiligungsgesellschaft',\n", - " 'Eigenkapital der Beteiligungsgesellschaft.1',\n", - " 'Jahresergebnis der Beteiligungsgesellschaft',\n", - " 'Jahresergebnis der Beteiligungsgesellschaft.1'],\n", - " dtype='object')\n", - "Index(['Unnamed: 0', '2020', '2019'], dtype='object')\n", - "MultiIndex([( 'Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", - " ( 'Unnamed: 1_level_0', 'Gesamt in T€'),\n", - " ('davon mit einer Restlaufzeit', 'bis zu 1 Jahr in T€'),\n", - " ('davon mit einer Restlaufzeit', 'mehr als 1 Jahr in T€'),\n", - " ('davon mit einer Restlaufzeit', 'davon über 5 Jahre in T€')],\n", - " )\n", - "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", - " ( '2020', '€'),\n", - " ( '2019', '€'),\n", - " ( 'Veränderung', '€'),\n", - " ( 'Veränderung', '%')],\n", - " )\n", - "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", - " ( '2020', '€'),\n", - " ( '2019', '€'),\n", - " ( 'Veränderung', '€'),\n", - " ( 'Veränderung', '%')],\n", - " )\n", - "Index(['Unnamed: 0', '2020 T€', '2019 T€'], dtype='object')\n", - "MultiIndex([( 'Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", - " ('Anschaffungs- und Herstellungskosten', 'Stand am 01.01.2020 €'),\n", - " ('Anschaffungs- und Herstellungskosten', 'Zugang €'),\n", - " ('Anschaffungs- und Herstellungskosten', 'Abgang €'),\n", - " ('Anschaffungs- und Herstellungskosten', 'Umbuchung €'),\n", - " ('Anschaffungs- und Herstellungskosten', 'Stand am 31.12.2020 €')],\n", - " )\n", - "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", - " ( 'Abschreibungen', 'Stand am 01.01.2020 €'),\n", - " ( 'Abschreibungen', 'Zugang €'),\n", - " ( 'Abschreibungen', 'außerplanm. AfA'),\n", - " ( 'Abschreibungen', 'Abgang €'),\n", - " ( 'Abschreibungen', 'Umbuchung €'),\n", - " ( 'Abschreibungen', 'Stand am 31.12.2020 €')],\n", - " )\n", - "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", - " ( 'Restbuchwerte', 'Stand am 31.12.2020 €'),\n", - " ( 'Restbuchwerte', 'Stand am 31.12.2019 €')],\n", - " )\n", - "Index(['Unnamed: 0', 'Elektrizitätsverteilung', '31.12.2019 in T €',\n", - " 'Gasverteilung', '31.12.2019 in T €.1'],\n", - " dtype='object')\n", - "Index(['Unnamed: 0', '€', '31.12.2019 in T €'], dtype='object')\n", - "Index(['Unnamed: 0', 'Elektrizitätsverteilung €', '31.12.2019 in T €',\n", - " 'Gasverteilung €', '31.12.2019 in T €.1'],\n", - " dtype='object')\n", - "Index(['Unnamed: 0', '€', 'Vorjahr in T €'], dtype='object')\n", - "MultiIndex([('Verbindlichkeitenspiegel 2020 Elektrizitätsverteilung', ...),\n", - " ( 'davon mit einer Restlaufzeit', ...),\n", - " ( 'davon mit einer Restlaufzeit', ...),\n", - " ( 'davon mit einer Restlaufzeit', ...),\n", - " ( 'davon mit einer Restlaufzeit', ...)],\n", - " )\n", - "MultiIndex([('Verbindlichkeitenspiegel 2020 Gasverteilung', ...),\n", - " ( 'davon mit einer Restlaufzeit', ...),\n", - " ( 'davon mit einer Restlaufzeit', ...),\n", - " ( 'davon mit einer Restlaufzeit', ...),\n", - " ( 'davon mit einer Restlaufzeit', ...)],\n", - " )\n", - "MultiIndex([('Verbindlichkeitenspiegel 2020 Intelligenter', ...),\n", - " ( 'davon mit einer Restlaufzeit', ...),\n", - " ( 'davon mit einer Restlaufzeit', ...),\n", - " ( 'davon mit einer Restlaufzeit', ...),\n", - " ( 'davon mit einer Restlaufzeit', ...)],\n", - " )\n" - ] - } - ], - "source": [ - "def get_tables(raw_report: str) -> list:\n", - " soup = BeautifulSoup(raw_report, features=\"html.parser\")\n", - " tables = soup.find_all(\"table\", {\"class\": \"std_table\"})\n", - " dfs = []\n", - " for table in tables:\n", - " for df in pd.read_html(StringIO(str(table))):\n", - " dfs.append(df)\n", - " return dfs\n", - "\n", - "\n", - "for df in get_tables(sample_report):\n", - " print(df.columns)\n", - "\n", - "tables = get_tables(sample_report)" - ] } ], "metadata": {