1249 lines
43 KiB
Plaintext

{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Daten Extraktion aus dem Bundesanzeiger"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Vorbereitung"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>company</th>\n",
" <th>raw_report</th>\n",
" <th>jahr</th>\n",
" <th>auditors</th>\n",
" <th>financial_results</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-07-07</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2021</td>\n",
" <td>[]</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-05-10</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2021</td>\n",
" <td>[Auditor(name='Eckhard Lewe', company='Grant T...</td>\n",
" <td>{'equity': 23295.0, 'current_assets': 111516.0}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2022-03-25</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2020</td>\n",
" <td>[Auditor(name='Eckhard Lewe', company='Warth &amp;...</td>\n",
" <td>{'equity': 23296.0, 'current_assets': 93901.0}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2021-03-11</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2019</td>\n",
" <td>[Auditor(name='Eckhard Lewe', company='Warth &amp;...</td>\n",
" <td>{'net_income': 0.0, 'equity': 23296.0, 'curren...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2020-03-24</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2018</td>\n",
" <td>[Auditor(name='Ulrich Diersch', company='Warth...</td>\n",
" <td>{'net_income': 0.0, 'equity': 23296.0, 'curren...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date company \\\n",
"0 2023-07-07 Atos IT-Dienstleistung und Beratung GmbH \n",
"2 2023-05-10 Atos IT-Dienstleistung und Beratung GmbH \n",
"4 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
"5 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
"6 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
"\n",
" raw_report jahr \\\n",
"0 <div class=\"publication_container\">\\n <div cla... 2021 \n",
"2 <div class=\"publication_container\">\\n <div cla... 2021 \n",
"4 <div class=\"publication_container\">\\n <div cla... 2020 \n",
"5 <div class=\"publication_container\">\\n <div cla... 2019 \n",
"6 <div class=\"publication_container\">\\n <div cla... 2018 \n",
"\n",
" auditors \\\n",
"0 [] \n",
"2 [Auditor(name='Eckhard Lewe', company='Grant T... \n",
"4 [Auditor(name='Eckhard Lewe', company='Warth &... \n",
"5 [Auditor(name='Eckhard Lewe', company='Warth &... \n",
"6 [Auditor(name='Ulrich Diersch', company='Warth... \n",
"\n",
" financial_results \n",
"0 {} \n",
"2 {'equity': 23295.0, 'current_assets': 111516.0} \n",
"4 {'equity': 23296.0, 'current_assets': 93901.0} \n",
"5 {'net_income': 0.0, 'equity': 23296.0, 'curren... \n",
"6 {'net_income': 0.0, 'equity': 23296.0, 'curren... "
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (\n",
" Bundesanzeiger,\n",
")\n",
"\n",
"ba_wrapper = Bundesanzeiger()\n",
"# df_reports = ba_wrapper.get_information(\"Törmer Energy Solar 1 GmbH & Co. KG\", \"\")\n",
"df_reports = ba_wrapper.get_information(\"Atos IT-Dienstleistung und Beratung GmbH\", \"\")\n",
"df_reports.head()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Daten Extraktion"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"from io import StringIO"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"sample_report = df_reports.iloc[1].raw_report"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Aufsichtsrat"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"**TODO**"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Bilanz bzw. GuV"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index([0, 1], dtype='int64')\n",
"0 object\n",
"1 object\n",
"dtype: object\n",
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Unnamed: 0 object\n",
"Anhang object\n",
"31.12.2021 TEUR object\n",
"Vorjahr TEUR object\n",
"dtype: object\n",
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Unnamed: 0 object\n",
"Anhang object\n",
"2021 TEUR float64\n",
"Vorjahr TEUR float64\n",
"dtype: object\n",
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
" 'Vorjahr TEUR'],\n",
" dtype='object')\n",
"Aufgliederung nach Tätigkeitsbereichen object\n",
"2021 TEUR float64\n",
"Vorjahr TEUR float64\n",
"dtype: object\n",
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Aufgliederung nach Inland und Ausland object\n",
"2021 TEUR float64\n",
"Vorjahr TEUR float64\n",
"dtype: object\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Unnamed: 0 object\n",
"31.12.2021 TEUR float64\n",
"Vorjahr TEUR float64\n",
"dtype: object\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Unnamed: 0 object\n",
"31.12.2021 TEUR float64\n",
"Vorjahr TEUR float64\n",
"dtype: object\n",
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
"Unnamed: 0 object\n",
"31.12.2021 object\n",
"dtype: object\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Unnamed: 0 object\n",
"TEUR float64\n",
"dtype: object\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Unnamed: 0 object\n",
"TEUR float64\n",
"dtype: object\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Unnamed: 0 object\n",
"TEUR float64\n",
"dtype: object\n",
"Index([0, 1, 2], dtype='int64')\n",
"0 object\n",
"1 object\n",
"2 int64\n",
"dtype: object\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Unnamed: 0 object\n",
"TEUR int64\n",
"dtype: object\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Unnamed: 0 object\n",
"31.12.2021 TEUR float64\n",
"Vorjahr TEUR float64\n",
"dtype: object\n",
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
"Unnamed: 0 object\n",
"2021 Anzahl MA int64\n",
"Vorjahr Anzahl MA int64\n",
"dtype: object\n",
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
" )\n",
"Art des Geschäfts Unnamed: 0_level_1 object\n",
"Art der Beziehung Gesellschafterin TEUR float64\n",
" Verbundene Unternehmen TEUR float64\n",
"dtype: object\n",
"Index([0, 1], dtype='int64')\n",
"0 object\n",
"1 object\n",
"dtype: object\n",
"MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
" )\n",
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"Anschaffungs- oder Herstellungskosten Stand 01.01.2021 EUR object\n",
" Zugänge Umbuchung U EUR object\n",
" Abgänge Umbuchung EUR object\n",
" Stand 31.12.2021 EUR object\n",
"dtype: object\n",
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...)],\n",
" )\n",
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"Abschreibungen Stand 01.01.2021 EUR object\n",
" Abschreibungen des Geschäftsjahres U EUR object\n",
" Abgänge Umbuchung U EUR object\n",
" Stand 31.12.2021 EUR object\n",
"dtype: object\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
" )\n",
"Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
"Buchwerte Stand 31.12.2021 EUR object\n",
" Stand 31.12.2020 EUR object\n",
"dtype: object\n",
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
" '2019'],\n",
" dtype='object')\n",
"Nichtfinanzieller Leistungsindikator object\n",
"Unnamed: 1 object\n",
"2021 int64\n",
"2020 int64\n",
"2019 int64\n",
"dtype: object\n",
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
" 'Veränderung TEUR'],\n",
" dtype='object')\n",
"Gewinn- und Verlustrechnung object\n",
"2021 TEUR float64\n",
"Vorjahr TEUR float64\n",
"Veränderung TEUR float64\n",
"dtype: object\n",
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n",
"Bilanz object\n",
"31.12.2021 TEUR float64\n",
"Vorjahr TEUR float64\n",
"Veränderung TEUR float64\n",
"dtype: object\n"
]
}
],
"source": [
"def parse_tables(report: str) -> list:\n",
" result = []\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
" df = pd.read_html(StringIO(str(table)), flavor=\"bs4\")[0]\n",
" print(df.columns)\n",
" print(df.dtypes)\n",
" result.append(df)\n",
" return result\n",
"\n",
"\n",
"tables = parse_tables(sample_report)"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Anhang</th>\n",
" <th>31.12.2021 TEUR</th>\n",
" <th>Vorjahr TEUR</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A. Anlagevermögen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>I. Immaterielle Vermögensgegenstände</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Entgeltlich erworbene Software</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>II. Sachanlagen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1. Grundstücke und Bauten</td>\n",
" <td>NaN</td>\n",
" <td>75</td>\n",
" <td>89</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n",
"0 A. Anlagevermögen NaN NaN NaN\n",
"1 I. Immaterielle Vermögensgegenstände NaN NaN NaN\n",
"2 Entgeltlich erworbene Software NaN 3 6\n",
"3 II. Sachanlagen NaN NaN NaN\n",
"4 1. Grundstücke und Bauten NaN 75 89"
]
},
"execution_count": 164,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"current_table = tables[1]\n",
"current_table.head()"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"\n",
"def cleanse_string(value: str) -> str:\n",
" print(value)\n",
" if value is not None and isinstance(value, str):\n",
" return re.sub(r\"(.+\\.).\", \"\", value)\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A. Anlagevermögen\n",
"I. Immaterielle Vermögensgegenstände\n",
"Entgeltlich erworbene Software\n",
"II. Sachanlagen\n",
"1. Grundstücke und Bauten\n",
"2. Technische Anlagen und Maschinen\n",
"3. Andere Anlagen, Betriebs- und Geschäftsausstattung\n",
"4. Geleistete Anzahlung und Anlagen im Bau\n",
"nan\n",
"III. Finanzanlagen\n",
"Sonstige Ausleihungen\n",
"nan\n",
"B. Umlaufvermögen\n",
"I. Vorräte\n",
"Waren\n",
"II. Forderungen und sonstige Vermögensgegenstände\n",
"1. Forderungen aus Lieferungen und Leistungen\n",
"2. Forderungen gegen verbundene Unternehmen\n",
"3. Sonstige Vermögensgegenstände\n",
"nan\n",
"nan\n",
"C. Rechnungsabgrenzungsposten\n",
"D. Aktiver Unterschiedsbetrag aus der Vermögensverrechnung\n",
"nan\n",
"Passiva\n",
"nan\n",
"A. Eigenkapital\n",
"I. Gezeichnetes Kapital\n",
"II. Kapitalrücklage\n",
"III. Gewinnrücklagen\n",
"Andere Gewinnrücklagen\n",
"IV. Gewinnvortrag\n",
"nan\n",
"B. Rückstellungen\n",
"1. Rückstellungen für Pensionen\n",
"2. Steuerrückstellungen\n",
"3. Sonstige Rückstellungen\n",
"nan\n",
"C. Verbindlichkeiten\n",
"1. Erhaltene Anzahlungen\n",
"2. Verbindlichkeiten aus Lieferungen und Leistungen\n",
"3. Verbindlichkeiten gegenüber verbundenen Unternehmen\n",
"4. Sonstige Verbindlichkeiten\n",
"nan\n",
"D. Rechnungsabgrenzungungsposten\n",
"nan\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_18940\\152097142.py:2: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" current_table.iloc[index][0] = cleanse_string(row[0])\n",
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_18940\\152097142.py:2: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`\n",
" current_table.iloc[index][0] = cleanse_string(row[0])\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Anhang</th>\n",
" <th>31.12.2021 TEUR</th>\n",
" <th>Vorjahr TEUR</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Anlagevermögen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Immaterielle Vermögensgegenstände</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Entgeltlich erworbene Software</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Sachanlagen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Grundstücke und Bauten</td>\n",
" <td>NaN</td>\n",
" <td>75</td>\n",
" <td>89</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n",
"0 Anlagevermögen NaN NaN NaN\n",
"1 Immaterielle Vermögensgegenstände NaN NaN NaN\n",
"2 Entgeltlich erworbene Software NaN 3 6\n",
"3 Sachanlagen NaN NaN NaN\n",
"4 Grundstücke und Bauten NaN 75 89"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"for index, row in current_table.iterrows():\n",
" current_table.iloc[index][0] = cleanse_string(row[0])\n",
"current_table.head()"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [],
"source": [
"def parse_string_to_float(value) -> float:\n",
" try:\n",
" if value is None:\n",
" return None\n",
" if isinstance(value, float):\n",
" return value\n",
" return float(value.replace(\".\", \"\").replace(\",\", \".\"))\n",
" except Exception as e:\n",
" return None\n",
"\n",
"\n",
"def apply_factor(value, factor: float):\n",
" transformed_value = parse_string_to_float(value)\n",
" if transformed_value is None or isinstance(transformed_value, str):\n",
" return None\n",
" result = transformed_value * factor\n",
" # print(result)\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Anhang</th>\n",
" <th>31.12.2021</th>\n",
" <th>Vorjahr</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Anlagevermögen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Immaterielle Vermögensgegenstände</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Entgeltlich erworbene Software</td>\n",
" <td>NaN</td>\n",
" <td>3000.0</td>\n",
" <td>6000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Sachanlagen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Grundstücke und Bauten</td>\n",
" <td>NaN</td>\n",
" <td>75000.0</td>\n",
" <td>89000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Anhang 31.12.2021 Vorjahr\n",
"0 Anlagevermögen NaN NaN NaN\n",
"1 Immaterielle Vermögensgegenstände NaN NaN NaN\n",
"2 Entgeltlich erworbene Software NaN 3000.0 6000.0\n",
"3 Sachanlagen NaN NaN NaN\n",
"4 Grundstücke und Bauten NaN 75000.0 89000.0"
]
},
"execution_count": 168,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"converter = {\"T€\": 1 / 1000, \"TEUR\": 1000, \"EUR\": 1 / 1000, \"€\": 1}\n",
"\n",
"for column in current_table.columns:\n",
" if isinstance(column, tuple):\n",
" for c in column:\n",
" for x, factor in converter.items():\n",
" if x in c:\n",
" current_table[column] = current_table[column].apply(\n",
" lambda x: apply_factor(x, factor)\n",
" )\n",
" next\n",
" else:\n",
" for x, factor in converter.items():\n",
" parts = column.split(\" \")\n",
" for y in parts:\n",
" if re.match(x, y):\n",
" current_table[column] = current_table[column].apply(\n",
" lambda x: apply_factor(x, factor)\n",
" )\n",
" current_table.rename({column: parts[0]}, inplace=True, axis=1)\n",
" next\n",
" # print(current_table[column])\n",
"current_table.dropna(axis=0, how=\"all\", inplace=True)\n",
"current_table.head()"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"\n",
"def get_bilanz(report: str) -> any:\n",
" result = {}\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for pos in [\"Aktiva\", \"Passiva\"]:\n",
" tag = soup.find(\"b\", string=re.compile(pos))\n",
" if tag:\n",
" pos_results = pd.read_html(\n",
" StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n",
" )[0]\n",
" result[pos] = pos_results\n",
" else:\n",
" result[pos] = pd.DataFrame([])\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Anhang</th>\n",
" <th>2021 TEUR</th>\n",
" <th>Vorjahr TEUR</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1. Umsatzerlöse</td>\n",
" <td>(1)</td>\n",
" <td>66.767</td>\n",
" <td>69.819</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2. Veränderung des Bestandes an unfertigen Lei...</td>\n",
" <td>NaN</td>\n",
" <td>0.000</td>\n",
" <td>-41.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3. Sonstige betriebliche Erträge</td>\n",
" <td>(2)</td>\n",
" <td>621.000</td>\n",
" <td>489.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4. Materialaufwand</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>a) Aufwendungen für bezogene Waren</td>\n",
" <td>NaN</td>\n",
" <td>-475.000</td>\n",
" <td>-1.220</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Anhang 2021 TEUR \\\n",
"0 1. Umsatzerlöse (1) 66.767 \n",
"1 2. Veränderung des Bestandes an unfertigen Lei... NaN 0.000 \n",
"2 3. Sonstige betriebliche Erträge (2) 621.000 \n",
"3 4. Materialaufwand NaN NaN \n",
"4 a) Aufwendungen für bezogene Waren NaN -475.000 \n",
"\n",
" Vorjahr TEUR \n",
"0 69.819 \n",
"1 -41.000 \n",
"2 489.000 \n",
"3 NaN \n",
"4 -1.220 "
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bilanz = get_bilanz(sample_report)\n",
"bilanz[\"Passiva\"].head()"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Anhang</th>\n",
" <th>31.12.2021 TEUR</th>\n",
" <th>Vorjahr TEUR</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A. Anlagevermögen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>I. Immaterielle Vermögensgegenstände</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Entgeltlich erworbene Software</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>II. Sachanlagen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1. Grundstücke und Bauten</td>\n",
" <td>NaN</td>\n",
" <td>75</td>\n",
" <td>89</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n",
"0 A. Anlagevermögen NaN NaN NaN\n",
"1 I. Immaterielle Vermögensgegenstände NaN NaN NaN\n",
"2 Entgeltlich erworbene Software NaN 3 6\n",
"3 II. Sachanlagen NaN NaN NaN\n",
"4 1. Grundstücke und Bauten NaN 75 89"
]
},
"execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bilanz[\"Aktiva\"].head()"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Anhang</th>\n",
" <th>2021 TEUR</th>\n",
" <th>Vorjahr TEUR</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1. Umsatzerlöse</td>\n",
" <td>(1)</td>\n",
" <td>66.767</td>\n",
" <td>69.819</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2. Veränderung des Bestandes an unfertigen Leistungen</td>\n",
" <td>NaN</td>\n",
" <td>0.000</td>\n",
" <td>-41.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3. Sonstige betriebliche Erträge</td>\n",
" <td>(2)</td>\n",
" <td>621.000</td>\n",
" <td>489.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4. Materialaufwand</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>a) Aufwendungen für bezogene Waren</td>\n",
" <td>NaN</td>\n",
" <td>-475.000</td>\n",
" <td>-1.220</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>b) Aufwendungen für bezogene Leistungen</td>\n",
" <td>NaN</td>\n",
" <td>-12.855</td>\n",
" <td>-12.457</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>5. Personalaufwand</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>a) Gehälter</td>\n",
" <td>NaN</td>\n",
" <td>-52.916</td>\n",
" <td>-45.242</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>b) Soziale Abgaben und Aufwendungen für Altersversorgung und für Unterstützung</td>\n",
" <td>NaN</td>\n",
" <td>-9.945</td>\n",
" <td>-9.999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>davon für Altersversorgung: TEUR 1.817 (Vorjahr: TEUR 1.676)</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>6. Abschreibungen auf immaterielle Vermögensgegenstände des Anlagevermögens und Sachanlagen</td>\n",
" <td>NaN</td>\n",
" <td>-165.000</td>\n",
" <td>-201.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>7. Sonstige betriebliche Aufwendungen</td>\n",
" <td>(3)</td>\n",
" <td>-4.968</td>\n",
" <td>-7.356</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>8. Zinsen und ähnliche Aufwendungen</td>\n",
" <td>NaN</td>\n",
" <td>-6.170</td>\n",
" <td>-10.748</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>davon aus der Aufzinsung von Rückstellungen: TEUR 6.116 (Vorjahr: TEUR 10.730)</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>9. Steuern vom Einkommen und vom Ertrag</td>\n",
" <td>NaN</td>\n",
" <td>35.000</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>10. Ergebnis vor sonstigen Steuern und Verlustübernahme</td>\n",
" <td>NaN</td>\n",
" <td>-20.072</td>\n",
" <td>-16.956</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>11. Sonstige Steuern</td>\n",
" <td>NaN</td>\n",
" <td>0.000</td>\n",
" <td>-7.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>12. Erträge aus Verlustübernahme</td>\n",
" <td>NaN</td>\n",
" <td>20.072</td>\n",
" <td>16.963</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>13. Jahresergebnis</td>\n",
" <td>NaN</td>\n",
" <td>0.000</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import display, HTML\n",
"\n",
"# Assuming that dataframes df1 and df2 are already defined:\n",
"display(HTML(bilanz[\"Passiva\"].to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index([0, 1], dtype='int64')\n",
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
" 'Vorjahr TEUR'],\n",
" dtype='object')\n",
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index([0, 1, 2], dtype='int64')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
" )\n",
"Index([0, 1], dtype='int64')\n",
"MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
" )\n",
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
" '2019'],\n",
" dtype='object')\n",
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
" 'Veränderung TEUR'],\n",
" dtype='object')\n",
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
]
}
],
"source": [
"def get_tables(raw_report: str) -> list:\n",
" soup = BeautifulSoup(raw_report, features=\"html.parser\")\n",
" tables = soup.find_all(\"table\", {\"class\": \"std_table\"})\n",
" dfs = []\n",
" for table in tables:\n",
" for df in pd.read_html(StringIO(str(table))):\n",
" dfs.append(df)\n",
" return dfs\n",
"\n",
"\n",
"for df in get_tables(sample_report):\n",
" print(df.columns)\n",
"\n",
"tables = get_tables(sample_report)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}