mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-22 04:12:54 +02:00
1249 lines
43 KiB
Plaintext
1249 lines
43 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Daten Extraktion aus dem Bundesanzeiger"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Vorbereitung"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 64,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>date</th>\n",
|
|
" <th>company</th>\n",
|
|
" <th>raw_report</th>\n",
|
|
" <th>jahr</th>\n",
|
|
" <th>auditors</th>\n",
|
|
" <th>financial_results</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>2023-07-07</td>\n",
|
|
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2021</td>\n",
|
|
" <td>[]</td>\n",
|
|
" <td>{}</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>2023-05-10</td>\n",
|
|
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2021</td>\n",
|
|
" <td>[Auditor(name='Eckhard Lewe', company='Grant T...</td>\n",
|
|
" <td>{'equity': 23295.0, 'current_assets': 111516.0}</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>2022-03-25</td>\n",
|
|
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2020</td>\n",
|
|
" <td>[Auditor(name='Eckhard Lewe', company='Warth &...</td>\n",
|
|
" <td>{'equity': 23296.0, 'current_assets': 93901.0}</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>5</th>\n",
|
|
" <td>2021-03-11</td>\n",
|
|
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2019</td>\n",
|
|
" <td>[Auditor(name='Eckhard Lewe', company='Warth &...</td>\n",
|
|
" <td>{'net_income': 0.0, 'equity': 23296.0, 'curren...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>6</th>\n",
|
|
" <td>2020-03-24</td>\n",
|
|
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
" <td>2018</td>\n",
|
|
" <td>[Auditor(name='Ulrich Diersch', company='Warth...</td>\n",
|
|
" <td>{'net_income': 0.0, 'equity': 23296.0, 'curren...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" date company \\\n",
|
|
"0 2023-07-07 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
"2 2023-05-10 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
"4 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
"5 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
"6 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
"\n",
|
|
" raw_report jahr \\\n",
|
|
"0 <div class=\"publication_container\">\\n <div cla... 2021 \n",
|
|
"2 <div class=\"publication_container\">\\n <div cla... 2021 \n",
|
|
"4 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
|
"5 <div class=\"publication_container\">\\n <div cla... 2019 \n",
|
|
"6 <div class=\"publication_container\">\\n <div cla... 2018 \n",
|
|
"\n",
|
|
" auditors \\\n",
|
|
"0 [] \n",
|
|
"2 [Auditor(name='Eckhard Lewe', company='Grant T... \n",
|
|
"4 [Auditor(name='Eckhard Lewe', company='Warth &... \n",
|
|
"5 [Auditor(name='Eckhard Lewe', company='Warth &... \n",
|
|
"6 [Auditor(name='Ulrich Diersch', company='Warth... \n",
|
|
"\n",
|
|
" financial_results \n",
|
|
"0 {} \n",
|
|
"2 {'equity': 23295.0, 'current_assets': 111516.0} \n",
|
|
"4 {'equity': 23296.0, 'current_assets': 93901.0} \n",
|
|
"5 {'net_income': 0.0, 'equity': 23296.0, 'curren... \n",
|
|
"6 {'net_income': 0.0, 'equity': 23296.0, 'curren... "
|
|
]
|
|
},
|
|
"execution_count": 64,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (\n",
|
|
" Bundesanzeiger,\n",
|
|
")\n",
|
|
"\n",
|
|
"ba_wrapper = Bundesanzeiger()\n",
|
|
"# df_reports = ba_wrapper.get_information(\"Törmer Energy Solar 1 GmbH & Co. KG\", \"\")\n",
|
|
"df_reports = ba_wrapper.get_information(\"Atos IT-Dienstleistung und Beratung GmbH\", \"\")\n",
|
|
"df_reports.head()"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Daten Extraktion"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 65,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from bs4 import BeautifulSoup\n",
|
|
"from io import StringIO"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 68,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"sample_report = df_reports.iloc[1].raw_report"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Aufsichtsrat"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"**TODO**"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Bilanz bzw. GuV"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 163,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Index([0, 1], dtype='int64')\n",
|
|
"0 object\n",
|
|
"1 object\n",
|
|
"dtype: object\n",
|
|
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
|
"Unnamed: 0 object\n",
|
|
"Anhang object\n",
|
|
"31.12.2021 TEUR object\n",
|
|
"Vorjahr TEUR object\n",
|
|
"dtype: object\n",
|
|
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
|
"Unnamed: 0 object\n",
|
|
"Anhang object\n",
|
|
"2021 TEUR float64\n",
|
|
"Vorjahr TEUR float64\n",
|
|
"dtype: object\n",
|
|
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
|
|
" 'Vorjahr TEUR'],\n",
|
|
" dtype='object')\n",
|
|
"Aufgliederung nach Tätigkeitsbereichen object\n",
|
|
"2021 TEUR float64\n",
|
|
"Vorjahr TEUR float64\n",
|
|
"dtype: object\n",
|
|
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
|
"Aufgliederung nach Inland und Ausland object\n",
|
|
"2021 TEUR float64\n",
|
|
"Vorjahr TEUR float64\n",
|
|
"dtype: object\n",
|
|
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
|
"Unnamed: 0 object\n",
|
|
"31.12.2021 TEUR float64\n",
|
|
"Vorjahr TEUR float64\n",
|
|
"dtype: object\n",
|
|
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
|
"Unnamed: 0 object\n",
|
|
"31.12.2021 TEUR float64\n",
|
|
"Vorjahr TEUR float64\n",
|
|
"dtype: object\n",
|
|
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
|
|
"Unnamed: 0 object\n",
|
|
"31.12.2021 object\n",
|
|
"dtype: object\n",
|
|
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
|
"Unnamed: 0 object\n",
|
|
"TEUR float64\n",
|
|
"dtype: object\n",
|
|
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
|
"Unnamed: 0 object\n",
|
|
"TEUR float64\n",
|
|
"dtype: object\n",
|
|
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
|
"Unnamed: 0 object\n",
|
|
"TEUR float64\n",
|
|
"dtype: object\n",
|
|
"Index([0, 1, 2], dtype='int64')\n",
|
|
"0 object\n",
|
|
"1 object\n",
|
|
"2 int64\n",
|
|
"dtype: object\n",
|
|
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
|
"Unnamed: 0 object\n",
|
|
"TEUR int64\n",
|
|
"dtype: object\n",
|
|
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
|
"Unnamed: 0 object\n",
|
|
"31.12.2021 TEUR float64\n",
|
|
"Vorjahr TEUR float64\n",
|
|
"dtype: object\n",
|
|
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
|
"Unnamed: 0 object\n",
|
|
"2021 Anzahl MA int64\n",
|
|
"Vorjahr Anzahl MA int64\n",
|
|
"dtype: object\n",
|
|
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
|
|
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
|
|
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
|
|
" )\n",
|
|
"Art des Geschäfts Unnamed: 0_level_1 object\n",
|
|
"Art der Beziehung Gesellschafterin TEUR float64\n",
|
|
" Verbundene Unternehmen TEUR float64\n",
|
|
"dtype: object\n",
|
|
"Index([0, 1], dtype='int64')\n",
|
|
"0 object\n",
|
|
"1 object\n",
|
|
"dtype: object\n",
|
|
"MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
|
|
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
|
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
|
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
|
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
|
|
" )\n",
|
|
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
|
|
"Anschaffungs- oder Herstellungskosten Stand 01.01.2021 EUR object\n",
|
|
" Zugänge Umbuchung U EUR object\n",
|
|
" Abgänge Umbuchung EUR object\n",
|
|
" Stand 31.12.2021 EUR object\n",
|
|
"dtype: object\n",
|
|
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
|
|
" ( 'Abschreibungen', ...),\n",
|
|
" ( 'Abschreibungen', ...),\n",
|
|
" ( 'Abschreibungen', ...),\n",
|
|
" ( 'Abschreibungen', ...)],\n",
|
|
" )\n",
|
|
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
|
|
"Abschreibungen Stand 01.01.2021 EUR object\n",
|
|
" Abschreibungen des Geschäftsjahres U EUR object\n",
|
|
" Abgänge Umbuchung U EUR object\n",
|
|
" Stand 31.12.2021 EUR object\n",
|
|
"dtype: object\n",
|
|
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
|
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
|
|
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
|
|
" )\n",
|
|
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
|
|
"Buchwerte Stand 31.12.2021 EUR object\n",
|
|
" Stand 31.12.2020 EUR object\n",
|
|
"dtype: object\n",
|
|
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
|
|
" '2019'],\n",
|
|
" dtype='object')\n",
|
|
"Nichtfinanzieller Leistungsindikator object\n",
|
|
"Unnamed: 1 object\n",
|
|
"2021 int64\n",
|
|
"2020 int64\n",
|
|
"2019 int64\n",
|
|
"dtype: object\n",
|
|
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
|
|
" 'Veränderung TEUR'],\n",
|
|
" dtype='object')\n",
|
|
"Gewinn- und Verlustrechnung object\n",
|
|
"2021 TEUR float64\n",
|
|
"Vorjahr TEUR float64\n",
|
|
"Veränderung TEUR float64\n",
|
|
"dtype: object\n",
|
|
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n",
|
|
"Bilanz object\n",
|
|
"31.12.2021 TEUR float64\n",
|
|
"Vorjahr TEUR float64\n",
|
|
"Veränderung TEUR float64\n",
|
|
"dtype: object\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def parse_tables(report: str) -> list:\n",
|
|
" result = []\n",
|
|
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
|
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
|
|
" df = pd.read_html(StringIO(str(table)), flavor=\"bs4\")[0]\n",
|
|
" print(df.columns)\n",
|
|
" print(df.dtypes)\n",
|
|
" result.append(df)\n",
|
|
" return result\n",
|
|
"\n",
|
|
"\n",
|
|
"tables = parse_tables(sample_report)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 164,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Unnamed: 0</th>\n",
|
|
" <th>Anhang</th>\n",
|
|
" <th>31.12.2021 TEUR</th>\n",
|
|
" <th>Vorjahr TEUR</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>A. Anlagevermögen</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>I. Immaterielle Vermögensgegenstände</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>Entgeltlich erworbene Software</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>6</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>II. Sachanlagen</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>1. Grundstücke und Bauten</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>75</td>\n",
|
|
" <td>89</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n",
|
|
"0 A. Anlagevermögen NaN NaN NaN\n",
|
|
"1 I. Immaterielle Vermögensgegenstände NaN NaN NaN\n",
|
|
"2 Entgeltlich erworbene Software NaN 3 6\n",
|
|
"3 II. Sachanlagen NaN NaN NaN\n",
|
|
"4 1. Grundstücke und Bauten NaN 75 89"
|
|
]
|
|
},
|
|
"execution_count": 164,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"current_table = tables[1]\n",
|
|
"current_table.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 165,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import re\n",
|
|
"\n",
|
|
"\n",
|
|
"def cleanse_string(value: str) -> str:\n",
|
|
" print(value)\n",
|
|
" if value is not None and isinstance(value, str):\n",
|
|
" return re.sub(r\"(.+\\.).\", \"\", value)\n",
|
|
" return None"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 166,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"A. Anlagevermögen\n",
|
|
"I. Immaterielle Vermögensgegenstände\n",
|
|
"Entgeltlich erworbene Software\n",
|
|
"II. Sachanlagen\n",
|
|
"1. Grundstücke und Bauten\n",
|
|
"2. Technische Anlagen und Maschinen\n",
|
|
"3. Andere Anlagen, Betriebs- und Geschäftsausstattung\n",
|
|
"4. Geleistete Anzahlung und Anlagen im Bau\n",
|
|
"nan\n",
|
|
"III. Finanzanlagen\n",
|
|
"Sonstige Ausleihungen\n",
|
|
"nan\n",
|
|
"B. Umlaufvermögen\n",
|
|
"I. Vorräte\n",
|
|
"Waren\n",
|
|
"II. Forderungen und sonstige Vermögensgegenstände\n",
|
|
"1. Forderungen aus Lieferungen und Leistungen\n",
|
|
"2. Forderungen gegen verbundene Unternehmen\n",
|
|
"3. Sonstige Vermögensgegenstände\n",
|
|
"nan\n",
|
|
"nan\n",
|
|
"C. Rechnungsabgrenzungsposten\n",
|
|
"D. Aktiver Unterschiedsbetrag aus der Vermögensverrechnung\n",
|
|
"nan\n",
|
|
"Passiva\n",
|
|
"nan\n",
|
|
"A. Eigenkapital\n",
|
|
"I. Gezeichnetes Kapital\n",
|
|
"II. Kapitalrücklage\n",
|
|
"III. Gewinnrücklagen\n",
|
|
"Andere Gewinnrücklagen\n",
|
|
"IV. Gewinnvortrag\n",
|
|
"nan\n",
|
|
"B. Rückstellungen\n",
|
|
"1. Rückstellungen für Pensionen\n",
|
|
"2. Steuerrückstellungen\n",
|
|
"3. Sonstige Rückstellungen\n",
|
|
"nan\n",
|
|
"C. Verbindlichkeiten\n",
|
|
"1. Erhaltene Anzahlungen\n",
|
|
"2. Verbindlichkeiten aus Lieferungen und Leistungen\n",
|
|
"3. Verbindlichkeiten gegenüber verbundenen Unternehmen\n",
|
|
"4. Sonstige Verbindlichkeiten\n",
|
|
"nan\n",
|
|
"D. Rechnungsabgrenzungungsposten\n",
|
|
"nan\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_18940\\152097142.py:2: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
|
" current_table.iloc[index][0] = cleanse_string(row[0])\n",
|
|
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_18940\\152097142.py:2: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`\n",
|
|
" current_table.iloc[index][0] = cleanse_string(row[0])\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Unnamed: 0</th>\n",
|
|
" <th>Anhang</th>\n",
|
|
" <th>31.12.2021 TEUR</th>\n",
|
|
" <th>Vorjahr TEUR</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>Anlagevermögen</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>Immaterielle Vermögensgegenstände</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>Entgeltlich erworbene Software</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>6</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>Sachanlagen</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>Grundstücke und Bauten</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>75</td>\n",
|
|
" <td>89</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n",
|
|
"0 Anlagevermögen NaN NaN NaN\n",
|
|
"1 Immaterielle Vermögensgegenstände NaN NaN NaN\n",
|
|
"2 Entgeltlich erworbene Software NaN 3 6\n",
|
|
"3 Sachanlagen NaN NaN NaN\n",
|
|
"4 Grundstücke und Bauten NaN 75 89"
|
|
]
|
|
},
|
|
"execution_count": 166,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"for index, row in current_table.iterrows():\n",
|
|
" current_table.iloc[index][0] = cleanse_string(row[0])\n",
|
|
"current_table.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 167,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def parse_string_to_float(value) -> float:\n",
|
|
" try:\n",
|
|
" if value is None:\n",
|
|
" return None\n",
|
|
" if isinstance(value, float):\n",
|
|
" return value\n",
|
|
" return float(value.replace(\".\", \"\").replace(\",\", \".\"))\n",
|
|
" except Exception as e:\n",
|
|
" return None\n",
|
|
"\n",
|
|
"\n",
|
|
"def apply_factor(value, factor: float):\n",
|
|
" transformed_value = parse_string_to_float(value)\n",
|
|
" if transformed_value is None or isinstance(transformed_value, str):\n",
|
|
" return None\n",
|
|
" result = transformed_value * factor\n",
|
|
" # print(result)\n",
|
|
" return result"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 168,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Unnamed: 0</th>\n",
|
|
" <th>Anhang</th>\n",
|
|
" <th>31.12.2021</th>\n",
|
|
" <th>Vorjahr</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>Anlagevermögen</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>Immaterielle Vermögensgegenstände</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>Entgeltlich erworbene Software</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3000.0</td>\n",
|
|
" <td>6000.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>Sachanlagen</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>Grundstücke und Bauten</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>75000.0</td>\n",
|
|
" <td>89000.0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Unnamed: 0 Anhang 31.12.2021 Vorjahr\n",
|
|
"0 Anlagevermögen NaN NaN NaN\n",
|
|
"1 Immaterielle Vermögensgegenstände NaN NaN NaN\n",
|
|
"2 Entgeltlich erworbene Software NaN 3000.0 6000.0\n",
|
|
"3 Sachanlagen NaN NaN NaN\n",
|
|
"4 Grundstücke und Bauten NaN 75000.0 89000.0"
|
|
]
|
|
},
|
|
"execution_count": 168,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"converter = {\"T€\": 1 / 1000, \"TEUR\": 1000, \"EUR\": 1 / 1000, \"€\": 1}\n",
|
|
"\n",
|
|
"for column in current_table.columns:\n",
|
|
" if isinstance(column, tuple):\n",
|
|
" for c in column:\n",
|
|
" for x, factor in converter.items():\n",
|
|
" if x in c:\n",
|
|
" current_table[column] = current_table[column].apply(\n",
|
|
" lambda x: apply_factor(x, factor)\n",
|
|
" )\n",
|
|
" next\n",
|
|
" else:\n",
|
|
" for x, factor in converter.items():\n",
|
|
" parts = column.split(\" \")\n",
|
|
" for y in parts:\n",
|
|
" if re.match(x, y):\n",
|
|
" current_table[column] = current_table[column].apply(\n",
|
|
" lambda x: apply_factor(x, factor)\n",
|
|
" )\n",
|
|
" current_table.rename({column: parts[0]}, inplace=True, axis=1)\n",
|
|
" next\n",
|
|
" # print(current_table[column])\n",
|
|
"current_table.dropna(axis=0, how=\"all\", inplace=True)\n",
|
|
"current_table.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 169,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import re\n",
|
|
"\n",
|
|
"\n",
|
|
"def get_bilanz(report: str) -> any:\n",
|
|
" result = {}\n",
|
|
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
|
" for pos in [\"Aktiva\", \"Passiva\"]:\n",
|
|
" tag = soup.find(\"b\", string=re.compile(pos))\n",
|
|
" if tag:\n",
|
|
" pos_results = pd.read_html(\n",
|
|
" StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n",
|
|
" )[0]\n",
|
|
" result[pos] = pos_results\n",
|
|
" else:\n",
|
|
" result[pos] = pd.DataFrame([])\n",
|
|
" return result"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 170,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Unnamed: 0</th>\n",
|
|
" <th>Anhang</th>\n",
|
|
" <th>2021 TEUR</th>\n",
|
|
" <th>Vorjahr TEUR</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>1. Umsatzerlöse</td>\n",
|
|
" <td>(1)</td>\n",
|
|
" <td>66.767</td>\n",
|
|
" <td>69.819</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>2. Veränderung des Bestandes an unfertigen Lei...</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.000</td>\n",
|
|
" <td>-41.000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>3. Sonstige betriebliche Erträge</td>\n",
|
|
" <td>(2)</td>\n",
|
|
" <td>621.000</td>\n",
|
|
" <td>489.000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>4. Materialaufwand</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>a) Aufwendungen für bezogene Waren</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>-475.000</td>\n",
|
|
" <td>-1.220</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Unnamed: 0 Anhang 2021 TEUR \\\n",
|
|
"0 1. Umsatzerlöse (1) 66.767 \n",
|
|
"1 2. Veränderung des Bestandes an unfertigen Lei... NaN 0.000 \n",
|
|
"2 3. Sonstige betriebliche Erträge (2) 621.000 \n",
|
|
"3 4. Materialaufwand NaN NaN \n",
|
|
"4 a) Aufwendungen für bezogene Waren NaN -475.000 \n",
|
|
"\n",
|
|
" Vorjahr TEUR \n",
|
|
"0 69.819 \n",
|
|
"1 -41.000 \n",
|
|
"2 489.000 \n",
|
|
"3 NaN \n",
|
|
"4 -1.220 "
|
|
]
|
|
},
|
|
"execution_count": 170,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"bilanz = get_bilanz(sample_report)\n",
|
|
"bilanz[\"Passiva\"].head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 171,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Unnamed: 0</th>\n",
|
|
" <th>Anhang</th>\n",
|
|
" <th>31.12.2021 TEUR</th>\n",
|
|
" <th>Vorjahr TEUR</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>A. Anlagevermögen</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>I. Immaterielle Vermögensgegenstände</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>Entgeltlich erworbene Software</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>6</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>II. Sachanlagen</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>1. Grundstücke und Bauten</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>75</td>\n",
|
|
" <td>89</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n",
|
|
"0 A. Anlagevermögen NaN NaN NaN\n",
|
|
"1 I. Immaterielle Vermögensgegenstände NaN NaN NaN\n",
|
|
"2 Entgeltlich erworbene Software NaN 3 6\n",
|
|
"3 II. Sachanlagen NaN NaN NaN\n",
|
|
"4 1. Grundstücke und Bauten NaN 75 89"
|
|
]
|
|
},
|
|
"execution_count": 171,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"bilanz[\"Aktiva\"].head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 172,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Unnamed: 0</th>\n",
|
|
" <th>Anhang</th>\n",
|
|
" <th>2021 TEUR</th>\n",
|
|
" <th>Vorjahr TEUR</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>1. Umsatzerlöse</td>\n",
|
|
" <td>(1)</td>\n",
|
|
" <td>66.767</td>\n",
|
|
" <td>69.819</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>2. Veränderung des Bestandes an unfertigen Leistungen</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.000</td>\n",
|
|
" <td>-41.000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>3. Sonstige betriebliche Erträge</td>\n",
|
|
" <td>(2)</td>\n",
|
|
" <td>621.000</td>\n",
|
|
" <td>489.000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>4. Materialaufwand</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>a) Aufwendungen für bezogene Waren</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>-475.000</td>\n",
|
|
" <td>-1.220</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>5</th>\n",
|
|
" <td>b) Aufwendungen für bezogene Leistungen</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>-12.855</td>\n",
|
|
" <td>-12.457</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>6</th>\n",
|
|
" <td>5. Personalaufwand</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>a) Gehälter</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>-52.916</td>\n",
|
|
" <td>-45.242</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>b) Soziale Abgaben und Aufwendungen für Altersversorgung und für Unterstützung</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>-9.945</td>\n",
|
|
" <td>-9.999</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>9</th>\n",
|
|
" <td>davon für Altersversorgung: TEUR 1.817 (Vorjahr: TEUR 1.676)</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>10</th>\n",
|
|
" <td>6. Abschreibungen auf immaterielle Vermögensgegenstände des Anlagevermögens und Sachanlagen</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>-165.000</td>\n",
|
|
" <td>-201.000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>11</th>\n",
|
|
" <td>7. Sonstige betriebliche Aufwendungen</td>\n",
|
|
" <td>(3)</td>\n",
|
|
" <td>-4.968</td>\n",
|
|
" <td>-7.356</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>12</th>\n",
|
|
" <td>8. Zinsen und ähnliche Aufwendungen</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>-6.170</td>\n",
|
|
" <td>-10.748</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>13</th>\n",
|
|
" <td>davon aus der Aufzinsung von Rückstellungen: TEUR 6.116 (Vorjahr: TEUR 10.730)</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>14</th>\n",
|
|
" <td>9. Steuern vom Einkommen und vom Ertrag</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>35.000</td>\n",
|
|
" <td>0.000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>15</th>\n",
|
|
" <td>10. Ergebnis vor sonstigen Steuern und Verlustübernahme</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>-20.072</td>\n",
|
|
" <td>-16.956</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>16</th>\n",
|
|
" <td>11. Sonstige Steuern</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.000</td>\n",
|
|
" <td>-7.000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>17</th>\n",
|
|
" <td>12. Erträge aus Verlustübernahme</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>20.072</td>\n",
|
|
" <td>16.963</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>18</th>\n",
|
|
" <td>13. Jahresergebnis</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.000</td>\n",
|
|
" <td>0.000</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from IPython.display import display, HTML\n",
|
|
"\n",
|
|
"# Assuming that dataframes df1 and df2 are already defined:\n",
|
|
"display(HTML(bilanz[\"Passiva\"].to_html()))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 173,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Index([0, 1], dtype='int64')\n",
|
|
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
|
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
|
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
|
|
" 'Vorjahr TEUR'],\n",
|
|
" dtype='object')\n",
|
|
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
|
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
|
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
|
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
|
|
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
|
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
|
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
|
"Index([0, 1, 2], dtype='int64')\n",
|
|
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
|
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
|
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
|
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
|
|
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
|
|
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
|
|
" )\n",
|
|
"Index([0, 1], dtype='int64')\n",
|
|
"MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
|
|
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
|
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
|
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
|
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
|
|
" )\n",
|
|
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
|
|
" ( 'Abschreibungen', ...),\n",
|
|
" ( 'Abschreibungen', ...),\n",
|
|
" ( 'Abschreibungen', ...),\n",
|
|
" ( 'Abschreibungen', ...)],\n",
|
|
" )\n",
|
|
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
|
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
|
|
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
|
|
" )\n",
|
|
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
|
|
" '2019'],\n",
|
|
" dtype='object')\n",
|
|
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
|
|
" 'Veränderung TEUR'],\n",
|
|
" dtype='object')\n",
|
|
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def get_tables(raw_report: str) -> list:\n",
|
|
" soup = BeautifulSoup(raw_report, features=\"html.parser\")\n",
|
|
" tables = soup.find_all(\"table\", {\"class\": \"std_table\"})\n",
|
|
" dfs = []\n",
|
|
" for table in tables:\n",
|
|
" for df in pd.read_html(StringIO(str(table))):\n",
|
|
" dfs.append(df)\n",
|
|
" return dfs\n",
|
|
"\n",
|
|
"\n",
|
|
"for df in get_tables(sample_report):\n",
|
|
" print(df.columns)\n",
|
|
"\n",
|
|
"tables = get_tables(sample_report)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.3"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|