{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Daten Extraktion aus dem Bundesanzeiger" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Vorbereitung" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from deutschland.bundesanzeiger import Bundesanzeiger" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_keys(['c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n" ] } ], "source": [ "ba = Bundesanzeiger()\n", "reports = ba.get_reports(\n", " \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n", ") # \"Atos IT-Dienstleistung und Beratung GmbH\")\n", "print(reports.keys())" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "report_contents = []\n", "for key in reports.keys():\n", " report_contents.append(reports[key])" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datenamecompanyreportraw_report
02023-05-25Jahresabschluss zum Geschäftsjahr vom 01.01.20...Volkswagen Economy Service Erdle Bernhard Erdl...\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...<div class=\"publication_container\">\\n <div cla...
12023-05-24Jahresabschluss zum Geschäftsjahr vom 01.01.20...Volkswagen Economy Service Erdle Bernhard Erdl...\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...<div class=\"publication_container\">\\n <div cla...
\n", "
" ], "text/plain": [ " date name \\\n", "0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", "1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", "\n", " company \\\n", "0 Volkswagen Economy Service Erdle Bernhard Erdl... \n", "1 Volkswagen Economy Service Erdle Bernhard Erdl... \n", "\n", " report \\\n", "0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", "1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", "\n", " raw_report \n", "0
\\n
\\n
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datenamecompanyreportraw_reporttype
02023-05-25Jahresabschluss zum Geschäftsjahr vom 01.01.20...Volkswagen Economy Service Erdle Bernhard Erdl...\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...<div class=\"publication_container\">\\n <div cla...Jahresabschluss
12023-05-24Jahresabschluss zum Geschäftsjahr vom 01.01.20...Volkswagen Economy Service Erdle Bernhard Erdl...\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...<div class=\"publication_container\">\\n <div cla...Jahresabschluss
\n", "
" ], "text/plain": [ " date name \\\n", "0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", "1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", "\n", " company \\\n", "0 Volkswagen Economy Service Erdle Bernhard Erdl... \n", "1 Volkswagen Economy Service Erdle Bernhard Erdl... \n", "\n", " report \\\n", "0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", "1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", "\n", " raw_report type \n", "0
\\n
\\n
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datecompanyraw_reportjahr
02023-05-25Volkswagen Economy Service Erdle Bernhard Erdl...<div class=\"publication_container\">\\n <div cla...2020
12023-05-24Volkswagen Economy Service Erdle Bernhard Erdl...<div class=\"publication_container\">\\n <div cla...2019
\n", "
" ], "text/plain": [ " date company \\\n", "0 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n", "1 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n", "\n", " raw_report jahr \n", "0
\\n
\\n
str:\n", " soup = BeautifulSoup(report, features=\"html.parser\")\n", " temp = soup.find_all(\"b\")\n", " for elem in temp:\n", " br = elem.findChildren(\"br\")\n", " if len(br) > 0:\n", " return elem.text.split(\"\\n\")[1].strip()\n", " return None\n", "\n", "\n", "def extract_auditors(report: str) -> list:\n", " auditor_company = extract_auditor_company(report)\n", " auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n", " hits = re.findall(auditor_regex, report)\n", " return [\n", " Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n", " for hit in hits\n", " ]" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "extract_auditors(sample_report)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Aufsichtsrat" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "**TODO**" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Bilanz bzw. GuV" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def extract_kpis(report_content) -> dict:\n", " \"\"\"\n", " Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n", " Extracts Key Performance Indicators (KPIs) from the financial reports.\n", " Args:\n", " reports (dict): A dictionary containing the financial reports with their hash as keys and report details as values.\n", " Returns:\n", " dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.\n", " \"\"\"\n", "\n", " kpis = {}\n", "\n", " # Define KPI patterns to search for\n", " kpi_patterns = {\n", " \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n", " }\n", "\n", " report_kpis = {}\n", " for kpi, pattern in kpi_patterns.items():\n", " match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n", " if match:\n", " value = match.group(1)\n", "\n", " # Clean and validate the extracted number\n", " try:\n", " if not value: # Check if value is empty\n", " cleaned_value = None\n", " else:\n", " multiplier = 1\n", " if value[-1].lower() == \"m\":\n", " value = value[:-1]\n", " multiplier = 1_000_000\n", " elif value[-1].lower() == \"b\":\n", " value = value[:-1]\n", " multiplier = 1_000_000_000\n", "\n", " # Remove commas after checking for multipliers\n", " value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n", " cleaned_value = float(value) * multiplier\n", " except ValueError:\n", " cleaned_value = None\n", "\n", " if cleaned_value is not None:\n", " report_kpis[kpi] = cleaned_value\n", " return report_kpis\n", "\n", "\n", "extract_kpis(\n", " BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n", ")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "with open(\"./temp.txt\", \"w\") as file:\n", " file.write(\n", " BeautifulSoup(sample_report, features=\"html.parser\")\n", " .get_text()\n", " .replace(\"\\n\", \" \")\n", " )" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n", " ('Aktiva', '31.12.2020 EUR'),\n", " ('Aktiva', '31.12.2019 EUR')],\n", " )\n", "Aktiva Unnamed: 0_level_1 object\n", " 31.12.2020 EUR object\n", " 31.12.2019 EUR object\n", "dtype: object\n", "MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n", " ('Passiva', '31.12.2020 EUR'),\n", " ('Passiva', '31.12.2019 EUR')],\n", " )\n", "Passiva Unnamed: 0_level_1 object\n", " 31.12.2020 EUR object\n", " 31.12.2019 EUR object\n", "dtype: object\n", "Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n", "Angaben zur Identifikation der Gesellschaft laut Registergericht object\n", "Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n", "dtype: object\n" ] }, { "data": { "text/plain": [ "{}" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def parse_tables(report: str) -> list:\n", " result = {}\n", " soup = BeautifulSoup(report, features=\"html.parser\")\n", " for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n", " df = pd.read_html(StringIO(str(table)))[0]\n", " print(df.columns)\n", " print(df.dtypes)\n", " return result\n", "\n", "\n", "parse_tables(sample_report)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'Passiva'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m 11\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m 14\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m---> 15\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n", "\u001b[1;31mKeyError\u001b[0m: 'Passiva'" ] } ], "source": [ "def get_bilanz(report: str) -> any:\n", " result = {}\n", " soup = BeautifulSoup(report, features=\"html.parser\")\n", " for pos in [\"Aktiva\", \"Passiva\"]:\n", " tag = soup.find(\"b\", string=re.compile(pos))\n", " if tag:\n", " pos_results = pd.read_html(\n", " StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n", " )[0]\n", " result[pos] = pos_results\n", " return result\n", "\n", "\n", "bilanz = get_bilanz(sample_report)\n", "bilanz[\"Passiva\"].head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Int64Index([0, 1], dtype='int64')\n", "Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", "Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", "Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n", " 'Vorjahr TEUR'],\n", " dtype='object')\n", "Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", "Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n", "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", "Int64Index([0, 1, 2], dtype='int64')\n", "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", "Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n", "MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n", " ('Art der Beziehung', 'Gesellschafterin TEUR'),\n", " ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n", " )\n", "Int64Index([0, 1], dtype='int64')\n", "MultiIndex([( 'Unnamed: 0_level_0', ...),\n", " ('Anschaffungs- oder Herstellungskosten', ...),\n", " ('Anschaffungs- oder Herstellungskosten', ...),\n", " ('Anschaffungs- oder Herstellungskosten', ...),\n", " ('Anschaffungs- oder Herstellungskosten', ...)],\n", " )\n", "MultiIndex([('Unnamed: 0_level_0', ...),\n", " ( 'Abschreibungen', ...),\n", " ( 'Abschreibungen', ...),\n", " ( 'Abschreibungen', ...),\n", " ( 'Abschreibungen', ...)],\n", " )\n", "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", " ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n", " ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n", " )\n", "Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n", " '2019'],\n", " dtype='object')\n", "Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n", " 'Veränderung TEUR'],\n", " dtype='object')\n", "Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n" ] } ], "source": [ "def get_tables(raw_report: str) -> list:\n", " soup = BeautifulSoup(raw_report, features=\"html.parser\")\n", " tables = soup.find_all(\"table\", {\"class\": \"std_table\"})\n", " dfs = []\n", " for table in tables:\n", " for df in pd.read_html(StringIO(str(table))):\n", " dfs.append(df)\n", " return dfs\n", "\n", "\n", "for df in get_tables(sample_report):\n", " print(df.columns)\n", "\n", "tables = get_tables(sample_report)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }