{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Daten Extraktion aus dem Bundesanzeiger" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Vorbereitung" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datecompanyraw_reportjahrauditorsfinancial_results
02023-07-07Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2021[]{}
22023-05-10Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2021[Auditor(name='Eckhard Lewe', company='Grant T...{'equity': 23295.0, 'current_assets': 111516.0}
42022-03-25Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2020[Auditor(name='Eckhard Lewe', company='Warth &...{'equity': 23296.0, 'current_assets': 93901.0}
52021-03-11Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2019[Auditor(name='Eckhard Lewe', company='Warth &...{'net_income': 0.0, 'equity': 23296.0, 'curren...
62020-03-24Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2018[Auditor(name='Ulrich Diersch', company='Warth...{'net_income': 0.0, 'equity': 23296.0, 'curren...
\n", "
" ], "text/plain": [ " date company \\\n", "0 2023-07-07 Atos IT-Dienstleistung und Beratung GmbH \n", "2 2023-05-10 Atos IT-Dienstleistung und Beratung GmbH \n", "4 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n", "5 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n", "6 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n", "\n", " raw_report jahr \\\n", "0
\\n
\\n
\\n
\\n
\\n
list:\n", " result = []\n", " soup = BeautifulSoup(report, features=\"html.parser\")\n", " for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n", " df = pd.read_html(StringIO(str(table)), flavor=\"bs4\")[0]\n", " print(df.columns)\n", " print(df.dtypes)\n", " result.append(df)\n", " return result\n", "\n", "\n", "tables = parse_tables(sample_report)" ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0Anhang31.12.2021 TEURVorjahr TEUR
0A. AnlagevermögenNaNNaNNaN
1I. Immaterielle VermögensgegenständeNaNNaNNaN
2Entgeltlich erworbene SoftwareNaN36
3II. SachanlagenNaNNaNNaN
41. Grundstücke und BautenNaN7589
\n", "
" ], "text/plain": [ " Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n", "0 A. Anlagevermögen NaN NaN NaN\n", "1 I. Immaterielle Vermögensgegenstände NaN NaN NaN\n", "2 Entgeltlich erworbene Software NaN 3 6\n", "3 II. Sachanlagen NaN NaN NaN\n", "4 1. Grundstücke und Bauten NaN 75 89" ] }, "execution_count": 164, "metadata": {}, "output_type": "execute_result" } ], "source": [ "current_table = tables[1]\n", "current_table.head()" ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "\n", "def cleanse_string(value: str) -> str:\n", " print(value)\n", " if value is not None and isinstance(value, str):\n", " return re.sub(r\"(.+\\.).\", \"\", value)\n", " return None" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A. Anlagevermögen\n", "I. Immaterielle Vermögensgegenstände\n", "Entgeltlich erworbene Software\n", "II. Sachanlagen\n", "1. Grundstücke und Bauten\n", "2. Technische Anlagen und Maschinen\n", "3. Andere Anlagen, Betriebs- und Geschäftsausstattung\n", "4. Geleistete Anzahlung und Anlagen im Bau\n", "nan\n", "III. Finanzanlagen\n", "Sonstige Ausleihungen\n", "nan\n", "B. Umlaufvermögen\n", "I. Vorräte\n", "Waren\n", "II. Forderungen und sonstige Vermögensgegenstände\n", "1. Forderungen aus Lieferungen und Leistungen\n", "2. Forderungen gegen verbundene Unternehmen\n", "3. Sonstige Vermögensgegenstände\n", "nan\n", "nan\n", "C. Rechnungsabgrenzungsposten\n", "D. Aktiver Unterschiedsbetrag aus der Vermögensverrechnung\n", "nan\n", "Passiva\n", "nan\n", "A. Eigenkapital\n", "I. Gezeichnetes Kapital\n", "II. Kapitalrücklage\n", "III. Gewinnrücklagen\n", "Andere Gewinnrücklagen\n", "IV. Gewinnvortrag\n", "nan\n", "B. Rückstellungen\n", "1. Rückstellungen für Pensionen\n", "2. Steuerrückstellungen\n", "3. Sonstige Rückstellungen\n", "nan\n", "C. Verbindlichkeiten\n", "1. Erhaltene Anzahlungen\n", "2. 
Verbindlichkeiten aus Lieferungen und Leistungen\n", "3. Verbindlichkeiten gegenüber verbundenen Unternehmen\n", "4. Sonstige Verbindlichkeiten\n", "nan\n", "D. Rechnungsabgrenzungungsposten\n", "nan\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_18940\\152097142.py:2: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", " current_table.iloc[index][0] = cleanse_string(row[0])\n", "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_18940\\152097142.py:2: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`\n", " current_table.iloc[index][0] = cleanse_string(row[0])\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0Anhang31.12.2021 TEURVorjahr TEUR
0AnlagevermögenNaNNaNNaN
1Immaterielle VermögensgegenständeNaNNaNNaN
2Entgeltlich erworbene SoftwareNaN36
3SachanlagenNaNNaNNaN
4Grundstücke und BautenNaN7589
\n", "
" ], "text/plain": [ " Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n", "0 Anlagevermögen NaN NaN NaN\n", "1 Immaterielle Vermögensgegenstände NaN NaN NaN\n", "2 Entgeltlich erworbene Software NaN 3 6\n", "3 Sachanlagen NaN NaN NaN\n", "4 Grundstücke und Bauten NaN 75 89" ] }, "execution_count": 166, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for index, row in current_table.iterrows():\n", " current_table.iloc[index][0] = cleanse_string(row[0])\n", "current_table.head()" ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [], "source": [ "def parse_string_to_float(value) -> float:\n", " try:\n", " if value is None:\n", " return None\n", " if isinstance(value, float):\n", " return value\n", " return float(value.replace(\".\", \"\").replace(\",\", \".\"))\n", " except Exception as e:\n", " return None\n", "\n", "\n", "def apply_factor(value, factor: float):\n", " transformed_value = parse_string_to_float(value)\n", " if transformed_value is None or isinstance(transformed_value, str):\n", " return None\n", " result = transformed_value * factor\n", " # print(result)\n", " return result" ] }, { "cell_type": "code", "execution_count": 168, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0Anhang31.12.2021Vorjahr
0AnlagevermögenNaNNaNNaN
1Immaterielle VermögensgegenständeNaNNaNNaN
2Entgeltlich erworbene SoftwareNaN3000.06000.0
3SachanlagenNaNNaNNaN
4Grundstücke und BautenNaN75000.089000.0
\n", "
" ], "text/plain": [ " Unnamed: 0 Anhang 31.12.2021 Vorjahr\n", "0 Anlagevermögen NaN NaN NaN\n", "1 Immaterielle Vermögensgegenstände NaN NaN NaN\n", "2 Entgeltlich erworbene Software NaN 3000.0 6000.0\n", "3 Sachanlagen NaN NaN NaN\n", "4 Grundstücke und Bauten NaN 75000.0 89000.0" ] }, "execution_count": 168, "metadata": {}, "output_type": "execute_result" } ], "source": [ "converter = {\"T€\": 1 / 1000, \"TEUR\": 1000, \"EUR\": 1 / 1000, \"€\": 1}\n", "\n", "for column in current_table.columns:\n", " if isinstance(column, tuple):\n", " for c in column:\n", " for x, factor in converter.items():\n", " if x in c:\n", " current_table[column] = current_table[column].apply(\n", " lambda x: apply_factor(x, factor)\n", " )\n", " next\n", " else:\n", " for x, factor in converter.items():\n", " parts = column.split(\" \")\n", " for y in parts:\n", " if re.match(x, y):\n", " current_table[column] = current_table[column].apply(\n", " lambda x: apply_factor(x, factor)\n", " )\n", " current_table.rename({column: parts[0]}, inplace=True, axis=1)\n", " next\n", " # print(current_table[column])\n", "current_table.dropna(axis=0, how=\"all\", inplace=True)\n", "current_table.head()" ] }, { "cell_type": "code", "execution_count": 169, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "\n", "def get_bilanz(report: str) -> any:\n", " result = {}\n", " soup = BeautifulSoup(report, features=\"html.parser\")\n", " for pos in [\"Aktiva\", \"Passiva\"]:\n", " tag = soup.find(\"b\", string=re.compile(pos))\n", " if tag:\n", " pos_results = pd.read_html(\n", " StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n", " )[0]\n", " result[pos] = pos_results\n", " else:\n", " result[pos] = pd.DataFrame([])\n", " return result" ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0Anhang2021 TEURVorjahr TEUR
01. Umsatzerlöse(1)66.76769.819
12. Veränderung des Bestandes an unfertigen Lei...NaN0.000-41.000
23. Sonstige betriebliche Erträge(2)621.000489.000
34. MaterialaufwandNaNNaNNaN
4a) Aufwendungen für bezogene WarenNaN-475.000-1.220
\n", "
" ], "text/plain": [ " Unnamed: 0 Anhang 2021 TEUR \\\n", "0 1. Umsatzerlöse (1) 66.767 \n", "1 2. Veränderung des Bestandes an unfertigen Lei... NaN 0.000 \n", "2 3. Sonstige betriebliche Erträge (2) 621.000 \n", "3 4. Materialaufwand NaN NaN \n", "4 a) Aufwendungen für bezogene Waren NaN -475.000 \n", "\n", " Vorjahr TEUR \n", "0 69.819 \n", "1 -41.000 \n", "2 489.000 \n", "3 NaN \n", "4 -1.220 " ] }, "execution_count": 170, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bilanz = get_bilanz(sample_report)\n", "bilanz[\"Passiva\"].head()" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0Anhang31.12.2021 TEURVorjahr TEUR
0A. AnlagevermögenNaNNaNNaN
1I. Immaterielle VermögensgegenständeNaNNaNNaN
2Entgeltlich erworbene SoftwareNaN36
3II. SachanlagenNaNNaNNaN
41. Grundstücke und BautenNaN7589
\n", "
" ], "text/plain": [ " Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n", "0 A. Anlagevermögen NaN NaN NaN\n", "1 I. Immaterielle Vermögensgegenstände NaN NaN NaN\n", "2 Entgeltlich erworbene Software NaN 3 6\n", "3 II. Sachanlagen NaN NaN NaN\n", "4 1. Grundstücke und Bauten NaN 75 89" ] }, "execution_count": 171, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bilanz[\"Aktiva\"].head()" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0Anhang2021 TEURVorjahr TEUR
01. Umsatzerlöse(1)66.76769.819
12. Veränderung des Bestandes an unfertigen LeistungenNaN0.000-41.000
23. Sonstige betriebliche Erträge(2)621.000489.000
34. MaterialaufwandNaNNaNNaN
4a) Aufwendungen für bezogene WarenNaN-475.000-1.220
5b) Aufwendungen für bezogene LeistungenNaN-12.855-12.457
65. PersonalaufwandNaNNaNNaN
7a) GehälterNaN-52.916-45.242
8b) Soziale Abgaben und Aufwendungen für Altersversorgung und für UnterstützungNaN-9.945-9.999
9davon für Altersversorgung: TEUR 1.817 (Vorjahr: TEUR 1.676)NaNNaNNaN
106. Abschreibungen auf immaterielle Vermögensgegenstände des Anlagevermögens und SachanlagenNaN-165.000-201.000
117. Sonstige betriebliche Aufwendungen(3)-4.968-7.356
128. Zinsen und ähnliche AufwendungenNaN-6.170-10.748
13davon aus der Aufzinsung von Rückstellungen: TEUR 6.116 (Vorjahr: TEUR 10.730)NaNNaNNaN
149. Steuern vom Einkommen und vom ErtragNaN35.0000.000
1510. Ergebnis vor sonstigen Steuern und VerlustübernahmeNaN-20.072-16.956
1611. Sonstige SteuernNaN0.000-7.000
1712. Erträge aus VerlustübernahmeNaN20.07216.963
1813. JahresergebnisNaN0.0000.000
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from IPython.display import display, HTML\n", "\n", "# Assuming that dataframes df1 and df2 are already defined:\n", "display(HTML(bilanz[\"Passiva\"].to_html()))" ] }, { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index([0, 1], dtype='int64')\n", "Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", "Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", "Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n", " 'Vorjahr TEUR'],\n", " dtype='object')\n", "Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", "Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n", "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", "Index([0, 1, 2], dtype='int64')\n", "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", "Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n", "MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n", " ('Art der Beziehung', 'Gesellschafterin TEUR'),\n", " ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n", " )\n", "Index([0, 1], dtype='int64')\n", "MultiIndex([( 'Unnamed: 0_level_0', ...),\n", " ('Anschaffungs- oder Herstellungskosten', ...),\n", " ('Anschaffungs- oder Herstellungskosten', ...),\n", " ('Anschaffungs- oder Herstellungskosten', ...),\n", " ('Anschaffungs- oder Herstellungskosten', ...)],\n", " )\n", "MultiIndex([('Unnamed: 0_level_0', ...),\n", " ( 'Abschreibungen', ...),\n", " ( 
# %%
def get_tables(raw_report: str) -> list:
    """Parse every ``std_table`` of a report into DataFrames.

    Args:
        raw_report: Raw HTML of a single publication.

    Returns:
        List with all frames that ``pd.read_html`` yields for the
        ``<table class="std_table">`` elements, in document order.
    """
    parsed_report = BeautifulSoup(raw_report, features="html.parser")
    frames = []
    for table_node in parsed_report.find_all("table", {"class": "std_table"}):
        # read_html returns a list of frames for the given markup.
        frames.extend(pd.read_html(StringIO(str(table_node))))
    return frames


# Print the header of every table to get an overview of the available layouts.
for df in get_tables(sample_report):
    print(df.columns)

tables = get_tables(sample_report)