{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Daten Extraktion aus dem Bundesanzeiger" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Vorbereitung" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datecompanyraw_reportjahrauditors
02023-07-07Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2021[]
22023-05-10Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2021[Auditor(name='Eckhard Lewe', company='Grant T...
42022-03-25Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2020[Auditor(name='Eckhard Lewe', company='Warth &...
52021-03-11Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2019[Auditor(name='Eckhard Lewe', company='Warth &...
62020-03-24Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2018[Auditor(name='Ulrich Diersch', company='Warth...
\n", "
" ], "text/plain": [ " date company \\\n", "0 2023-07-07 Atos IT-Dienstleistung und Beratung GmbH \n", "2 2023-05-10 Atos IT-Dienstleistung und Beratung GmbH \n", "4 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n", "5 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n", "6 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n", "\n", " raw_report jahr \\\n", "0
\\n
\\n
\\n
\\n
\\n
dict:\n", " \"\"\"\n", " Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n", " Extracts Key Performance Indicators (KPIs) from the financial reports.\n", " Args:\n", " reports (dict): A dictionary containing the financial reports with their hash as keys and report details as values.\n", " Returns:\n", " dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.\n", " \"\"\"\n", "\n", " kpis = {}\n", "\n", " # Define KPI patterns to search for\n", " kpi_patterns = {\n", " \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n", " \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n", " }\n", "\n", " report_kpis = {}\n", " for kpi, pattern in kpi_patterns.items():\n", " match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n", " if match:\n", " value = match.group(1)\n", "\n", " # Clean and validate the extracted number\n", " try:\n", " if not value: # Check if value is empty\n", " cleaned_value = None\n", " else:\n", " multiplier = 1\n", " if value[-1].lower() == \"m\":\n", " value = value[:-1]\n", " multiplier = 1_000_000\n", " elif value[-1].lower() == \"b\":\n", " value = value[:-1]\n", " multiplier = 1_000_000_000\n", "\n", " # Remove commas after checking for multipliers\n", " value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n", " cleaned_value = float(value) * multiplier\n", " except ValueError:\n", " cleaned_value = None\n", "\n", " if cleaned_value is not None:\n", " report_kpis[kpi] = cleaned_value\n", " return report_kpis\n", "\n", "\n", "extract_kpis(\n", " BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n", ")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "with open(\"./temp.txt\", \"w\") as file:\n", " file.write(\n", " BeautifulSoup(sample_report, features=\"html.parser\")\n", " .get_text()\n", " .replace(\"\\n\", \" \")\n", " )" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n", " ('Aktiva', '31.12.2021 EUR'),\n", " ('Aktiva', '31.12.2020 EUR')],\n", " )\n", "Aktiva Unnamed: 0_level_1 object\n", " 31.12.2021 EUR object\n", " 31.12.2020 EUR object\n", "dtype: object\n", "MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n", " ('Passiva', '31.12.2021 EUR'),\n", " ('Passiva', '31.12.2020 EUR')],\n", " )\n", "Passiva Unnamed: 0_level_1 object\n", " 31.12.2021 EUR object\n", " 31.12.2020 EUR object\n", "dtype: object\n", "Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n", "Angaben zur Identifikation der Gesellschaft laut Registergericht object\n", "Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n", "dtype: object\n", "MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),\n", " ( 'Betrag', 'EUR')],\n", " )\n", "Kreditentwicklung Unnamed: 0_level_1 object\n", "Betrag EUR object\n", "dtype: object\n" ] }, { "data": { "text/plain": [ "{}" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def parse_tables(report: str) -> list:\n", " result = {}\n", " soup = BeautifulSoup(report, features=\"html.parser\")\n", " for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n", " df = pd.read_html(StringIO(str(table)))[0]\n", " print(df.columns)\n", " print(df.dtypes)\n", " return result\n", "\n", "\n", "parse_tables(sample_report)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: []\n", "Index: []" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_bilanz(report: str) -> any:\n", " result = {}\n", " soup = BeautifulSoup(report, features=\"html.parser\")\n", " for pos in [\"Aktiva\", \"Passiva\"]:\n", " tag = soup.find(\"b\", string=re.compile(pos))\n", " if tag:\n", " pos_results = pd.read_html(\n", " StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n", " )[0]\n", " result[pos] = pos_results\n", " else:\n", " result[pos] = pd.DataFrame([])\n", " return result\n", "\n", "\n", "bilanz = get_bilanz(sample_report)\n", "bilanz[\"Passiva\"].head()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n", " ('Aktiva', '31.12.2021 EUR'),\n", " ('Aktiva', '31.12.2020 EUR')],\n", " )\n", "MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n", " ('Passiva', '31.12.2021 EUR'),\n", " ('Passiva', '31.12.2020 EUR')],\n", " )\n", "Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n", "MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),\n", " ( 'Betrag', 'EUR')],\n", " )\n" ] } ], "source": [ "def get_tables(raw_report: str) -> list:\n", " soup = BeautifulSoup(raw_report, features=\"html.parser\")\n", " tables = soup.find_all(\"table\", {\"class\": \"std_table\"})\n", " dfs = []\n", " for table in tables:\n", " for df in pd.read_html(StringIO(str(table))):\n", " dfs.append(df)\n", " return dfs\n", "\n", "\n", "for df in get_tables(sample_report):\n", " print(df.columns)\n", "\n", "tables = get_tables(sample_report)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }