diff --git a/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb b/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb index f67b7e7..cfcf1c6 100644 --- a/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb +++ b/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 64, "metadata": {}, "outputs": [ { @@ -47,6 +47,7 @@ " raw_report\n", " jahr\n", " auditors\n", + " financial_results\n", " \n", " \n", " \n", @@ -57,6 +58,7 @@ " <div class=\"publication_container\">\\n <div cla...\n", " 2021\n", " []\n", + " {}\n", " \n", " \n", " 2\n", @@ -65,6 +67,7 @@ " <div class=\"publication_container\">\\n <div cla...\n", " 2021\n", " [Auditor(name='Eckhard Lewe', company='Grant T...\n", + " {'equity': 23295.0, 'current_assets': 111516.0}\n", " \n", " \n", " 4\n", @@ -73,6 +76,7 @@ " <div class=\"publication_container\">\\n <div cla...\n", " 2020\n", " [Auditor(name='Eckhard Lewe', company='Warth &...\n", + " {'equity': 23296.0, 'current_assets': 93901.0}\n", " \n", " \n", " 5\n", @@ -81,6 +85,7 @@ " <div class=\"publication_container\">\\n <div cla...\n", " 2019\n", " [Auditor(name='Eckhard Lewe', company='Warth &...\n", + " {'net_income': 0.0, 'equity': 23296.0, 'curren...\n", " \n", " \n", " 6\n", @@ -89,6 +94,7 @@ " <div class=\"publication_container\">\\n <div cla...\n", " 2018\n", " [Auditor(name='Ulrich Diersch', company='Warth...\n", + " {'net_income': 0.0, 'equity': 23296.0, 'curren...\n", " \n", " \n", "\n", @@ -109,15 +115,22 @@ "5
\\n
\\n
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datecompanyraw_reportjahr
02023-07-11Volkswagen Economy Service Erdle Bernhard Erdl...<div class=\"publication_container\">\\n <div cla...2021
12023-05-25Volkswagen Economy Service Erdle Bernhard Erdl...<div class=\"publication_container\">\\n <div cla...2020
22023-05-24Volkswagen Economy Service Erdle Bernhard Erdl...<div class=\"publication_container\">\\n <div cla...2019
\n", - "
" - ], - "text/plain": [ - " date company \\\n", - "0 2023-07-11 Volkswagen Economy Service Erdle Bernhard Erdl... \n", - "1 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n", - "2 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n", - "\n", - " raw_report jahr \n", - "0
\\n
\\n
\\n
str:\n", - " soup = BeautifulSoup(report, features=\"html.parser\")\n", - " temp = soup.find_all(\"b\")\n", - " for elem in temp:\n", - " br = elem.findChildren(\"br\")\n", - " if len(br) > 0:\n", - " return elem.text.split(\"\\n\")[1].strip()\n", - " return None\n", - "\n", - "\n", - "def extract_auditors(report: str) -> list:\n", - " auditor_company = extract_auditor_company(report)\n", - " auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n", - " hits = re.findall(auditor_regex, report)\n", - " return [\n", - " Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n", - " for hit in hits\n", - " ]" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "extract_auditors(sample_report)" + "sample_report = df_reports.iloc[1].raw_report" ] }, { @@ -330,169 +201,174 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'net_income': 100238.5, 'equity': 165322.34, 'current_assets': 435344.07}" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def extract_kpis(report_content) -> dict:\n", - " \"\"\"\n", - " Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n", - " Extracts Key Performance Indicators (KPIs) from the financial reports.\n", - " Args:\n", - " reports (dict): A dictionary containing the financial reports with their hash as keys and report details as values.\n", - " Returns:\n", - " dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.\n", - " \"\"\"\n", - "\n", - " kpis = {}\n", - "\n", - " # Define KPI patterns to search for\n", - " kpi_patterns = {\n", - " \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n", - " }\n", - "\n", - " report_kpis = {}\n", - " for kpi, pattern in kpi_patterns.items():\n", - " match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n", - " if match:\n", - " value = match.group(1)\n", - "\n", - " # Clean and validate the extracted number\n", - " try:\n", - " if not value: # Check if value is empty\n", - " cleaned_value = None\n", - " else:\n", - " multiplier = 1\n", - " if value[-1].lower() == \"m\":\n", - " value = value[:-1]\n", - " multiplier = 1_000_000\n", - " elif value[-1].lower() == \"b\":\n", - " value = value[:-1]\n", - " multiplier = 1_000_000_000\n", - "\n", - " # Remove commas after checking for multipliers\n", - " value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n", - " cleaned_value = float(value) * multiplier\n", - " except ValueError:\n", - " cleaned_value = None\n", - "\n", - " if cleaned_value is not None:\n", - " report_kpis[kpi] = cleaned_value\n", - " return report_kpis\n", - "\n", - "\n", - "extract_kpis(\n", - " BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "with open(\"./temp.txt\", \"w\") as file:\n", - " file.write(\n", - " BeautifulSoup(sample_report, features=\"html.parser\")\n", - " .get_text()\n", - " .replace(\"\\n\", \" \")\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 16, + "execution_count": 163, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n", - " ('Aktiva', '31.12.2021 EUR'),\n", - " ('Aktiva', '31.12.2020 EUR')],\n", - " )\n", - "Aktiva Unnamed: 0_level_1 object\n", - " 31.12.2021 EUR object\n", - " 31.12.2020 EUR object\n", + "Index([0, 1], dtype='int64')\n", + "0 object\n", + "1 object\n", "dtype: object\n", - "MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n", - " ('Passiva', '31.12.2021 EUR'),\n", - " ('Passiva', '31.12.2020 EUR')],\n", - " )\n", - "Passiva Unnamed: 0_level_1 object\n", - " 31.12.2021 EUR object\n", - " 31.12.2020 EUR object\n", + "Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Unnamed: 0 object\n", + "Anhang object\n", + "31.12.2021 TEUR object\n", + "Vorjahr TEUR object\n", "dtype: object\n", - "Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n", - "Angaben zur Identifikation der Gesellschaft laut Registergericht object\n", - "Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n", + "Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Unnamed: 0 object\n", + "Anhang object\n", + "2021 TEUR float64\n", + "Vorjahr TEUR float64\n", "dtype: object\n", - "MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),\n", - " ( 'Betrag', 'EUR')],\n", + "Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n", + " 'Vorjahr TEUR'],\n", + " dtype='object')\n", + "Aufgliederung nach Tätigkeitsbereichen object\n", + "2021 TEUR float64\n", + "Vorjahr TEUR float64\n", + "dtype: object\n", + "Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Aufgliederung nach Inland und Ausland object\n", + "2021 TEUR float64\n", + "Vorjahr TEUR float64\n", + "dtype: object\n", + "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Unnamed: 0 object\n", + "31.12.2021 TEUR float64\n", + "Vorjahr TEUR float64\n", + "dtype: object\n", + "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Unnamed: 0 object\n", + "31.12.2021 TEUR float64\n", + "Vorjahr TEUR float64\n", + "dtype: object\n", + "Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n", + "Unnamed: 0 object\n", + "31.12.2021 object\n", + "dtype: object\n", + "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", + "Unnamed: 0 object\n", + "TEUR float64\n", + "dtype: object\n", + "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", + "Unnamed: 0 object\n", + "TEUR float64\n", + "dtype: object\n", + "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", + "Unnamed: 0 object\n", + "TEUR float64\n", + "dtype: object\n", + "Index([0, 1, 2], dtype='int64')\n", + "0 object\n", + "1 object\n", + "2 int64\n", + "dtype: object\n", + "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", + "Unnamed: 0 object\n", + "TEUR int64\n", + "dtype: object\n", + "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Unnamed: 0 object\n", + "31.12.2021 TEUR float64\n", + "Vorjahr TEUR float64\n", + "dtype: object\n", + "Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n", + "Unnamed: 0 object\n", + "2021 Anzahl MA int64\n", + "Vorjahr Anzahl MA int64\n", + "dtype: object\n", + "MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n", + " ('Art der Beziehung', 'Gesellschafterin TEUR'),\n", + " ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n", " )\n", - "Kreditentwicklung Unnamed: 0_level_1 object\n", - "Betrag EUR object\n", + "Art des Geschäfts Unnamed: 0_level_1 object\n", + "Art der Beziehung Gesellschafterin TEUR float64\n", + " Verbundene Unternehmen TEUR float64\n", + "dtype: object\n", + "Index([0, 1], dtype='int64')\n", + "0 object\n", + "1 object\n", + "dtype: object\n", + "MultiIndex([( 'Unnamed: 0_level_0', ...),\n", + " ('Anschaffungs- oder Herstellungskosten', ...),\n", + " ('Anschaffungs- oder Herstellungskosten', ...),\n", + " ('Anschaffungs- oder Herstellungskosten', ...),\n", + " ('Anschaffungs- oder Herstellungskosten', ...)],\n", + " )\n", + "Unnamed: 0_level_0 Unnamed: 0_level_1 object\n", + "Anschaffungs- oder Herstellungskosten Stand 01.01.2021 EUR object\n", + " Zugänge Umbuchung U EUR object\n", + " Abgänge Umbuchung EUR object\n", + " Stand 31.12.2021 EUR object\n", + "dtype: object\n", + "MultiIndex([('Unnamed: 0_level_0', ...),\n", + " ( 'Abschreibungen', ...),\n", + " ( 'Abschreibungen', ...),\n", + " ( 'Abschreibungen', ...),\n", + " ( 'Abschreibungen', ...)],\n", + " )\n", + "Unnamed: 0_level_0 Unnamed: 0_level_1 object\n", + "Abschreibungen Stand 01.01.2021 EUR object\n", + " Abschreibungen des Geschäftsjahres U EUR object\n", + " Abgänge Umbuchung U EUR object\n", + " Stand 31.12.2021 EUR object\n", + "dtype: object\n", + "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", + " ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n", + " ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n", + " )\n", + "Unnamed: 0_level_0 Unnamed: 0_level_1 object\n", + "Buchwerte Stand 31.12.2021 EUR object\n", + " Stand 31.12.2020 EUR object\n", + "dtype: object\n", + "Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n", + " '2019'],\n", + " dtype='object')\n", + "Nichtfinanzieller Leistungsindikator object\n", + "Unnamed: 1 object\n", + "2021 int64\n", + "2020 int64\n", + "2019 int64\n", + "dtype: object\n", + "Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n", + " 'Veränderung TEUR'],\n", + " dtype='object')\n", + "Gewinn- und Verlustrechnung object\n", + "2021 TEUR float64\n", + "Vorjahr TEUR float64\n", + "Veränderung TEUR float64\n", + "dtype: object\n", + "Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n", + "Bilanz object\n", + "31.12.2021 TEUR float64\n", + "Vorjahr TEUR float64\n", + "Veränderung TEUR float64\n", "dtype: object\n" ] - }, - { - "data": { - "text/plain": [ - "{}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "def parse_tables(report: str) -> list:\n", - " result = {}\n", + " result = []\n", " soup = BeautifulSoup(report, features=\"html.parser\")\n", " for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n", - " df = pd.read_html(StringIO(str(table)))[0]\n", + " df = pd.read_html(StringIO(str(table)), flavor=\"bs4\")[0]\n", " print(df.columns)\n", " print(df.dtypes)\n", + " result.append(df)\n", " return result\n", "\n", "\n", - "parse_tables(sample_report)" + "tables = parse_tables(sample_report)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 164, "metadata": {}, "outputs": [ { @@ -516,25 +392,389 @@ " \n", " \n", " \n", + " Unnamed: 0\n", + " Anhang\n", + " 31.12.2021 TEUR\n", + " Vorjahr TEUR\n", " \n", " \n", " \n", + " \n", + " 0\n", + " A. Anlagevermögen\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 1\n", + " I. Immaterielle Vermögensgegenstände\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 2\n", + " Entgeltlich erworbene Software\n", + " NaN\n", + " 3\n", + " 6\n", + " \n", + " \n", + " 3\n", + " II. Sachanlagen\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 4\n", + " 1. Grundstücke und Bauten\n", + " NaN\n", + " 75\n", + " 89\n", + " \n", " \n", "\n", "
" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" + " Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n", + "0 A. Anlagevermögen NaN NaN NaN\n", + "1 I. Immaterielle Vermögensgegenstände NaN NaN NaN\n", + "2 Entgeltlich erworbene Software NaN 3 6\n", + "3 II. Sachanlagen NaN NaN NaN\n", + "4 1. Grundstücke und Bauten NaN 75 89" ] }, - "execution_count": 22, + "execution_count": 164, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "current_table = tables[1]\n", + "current_table.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "\n", + "def cleanse_string(value: str) -> str:\n", + " print(value)\n", + " if value is not None and isinstance(value, str):\n", + " return re.sub(r\"(.+\\.).\", \"\", value)\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A. Anlagevermögen\n", + "I. Immaterielle Vermögensgegenstände\n", + "Entgeltlich erworbene Software\n", + "II. Sachanlagen\n", + "1. Grundstücke und Bauten\n", + "2. Technische Anlagen und Maschinen\n", + "3. Andere Anlagen, Betriebs- und Geschäftsausstattung\n", + "4. Geleistete Anzahlung und Anlagen im Bau\n", + "nan\n", + "III. Finanzanlagen\n", + "Sonstige Ausleihungen\n", + "nan\n", + "B. Umlaufvermögen\n", + "I. Vorräte\n", + "Waren\n", + "II. Forderungen und sonstige Vermögensgegenstände\n", + "1. Forderungen aus Lieferungen und Leistungen\n", + "2. Forderungen gegen verbundene Unternehmen\n", + "3. Sonstige Vermögensgegenstände\n", + "nan\n", + "nan\n", + "C. Rechnungsabgrenzungsposten\n", + "D. Aktiver Unterschiedsbetrag aus der Vermögensverrechnung\n", + "nan\n", + "Passiva\n", + "nan\n", + "A. Eigenkapital\n", + "I. Gezeichnetes Kapital\n", + "II. Kapitalrücklage\n", + "III. Gewinnrücklagen\n", + "Andere Gewinnrücklagen\n", + "IV. Gewinnvortrag\n", + "nan\n", + "B. Rückstellungen\n", + "1. Rückstellungen für Pensionen\n", + "2. Steuerrückstellungen\n", + "3. Sonstige Rückstellungen\n", + "nan\n", + "C. Verbindlichkeiten\n", + "1. Erhaltene Anzahlungen\n", + "2. Verbindlichkeiten aus Lieferungen und Leistungen\n", + "3. Verbindlichkeiten gegenüber verbundenen Unternehmen\n", + "4. Sonstige Verbindlichkeiten\n", + "nan\n", + "D. Rechnungsabgrenzungungsposten\n", + "nan\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_18940\\152097142.py:2: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " current_table.iloc[index][0] = cleanse_string(row[0])\n", + "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_18940\\152097142.py:2: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`\n", + " current_table.iloc[index][0] = cleanse_string(row[0])\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0Anhang31.12.2021 TEURVorjahr TEUR
0AnlagevermögenNaNNaNNaN
1Immaterielle VermögensgegenständeNaNNaNNaN
2Entgeltlich erworbene SoftwareNaN36
3SachanlagenNaNNaNNaN
4Grundstücke und BautenNaN7589
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n", + "0 Anlagevermögen NaN NaN NaN\n", + "1 Immaterielle Vermögensgegenstände NaN NaN NaN\n", + "2 Entgeltlich erworbene Software NaN 3 6\n", + "3 Sachanlagen NaN NaN NaN\n", + "4 Grundstücke und Bauten NaN 75 89" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for index, row in current_table.iterrows():\n", + " current_table.iloc[index][0] = cleanse_string(row[0])\n", + "current_table.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_string_to_float(value) -> float:\n", + " try:\n", + " if value is None:\n", + " return None\n", + " if isinstance(value, float):\n", + " return value\n", + " return float(value.replace(\".\", \"\").replace(\",\", \".\"))\n", + " except Exception as e:\n", + " return None\n", + "\n", + "\n", + "def apply_factor(value, factor: float):\n", + " transformed_value = parse_string_to_float(value)\n", + " if transformed_value is None or isinstance(transformed_value, str):\n", + " return None\n", + " result = transformed_value * factor\n", + " # print(result)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0Anhang31.12.2021Vorjahr
0AnlagevermögenNaNNaNNaN
1Immaterielle VermögensgegenständeNaNNaNNaN
2Entgeltlich erworbene SoftwareNaN3000.06000.0
3SachanlagenNaNNaNNaN
4Grundstücke und BautenNaN75000.089000.0
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Anhang 31.12.2021 Vorjahr\n", + "0 Anlagevermögen NaN NaN NaN\n", + "1 Immaterielle Vermögensgegenstände NaN NaN NaN\n", + "2 Entgeltlich erworbene Software NaN 3000.0 6000.0\n", + "3 Sachanlagen NaN NaN NaN\n", + "4 Grundstücke und Bauten NaN 75000.0 89000.0" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "converter = {\"T€\": 1 / 1000, \"TEUR\": 1000, \"EUR\": 1 / 1000, \"€\": 1}\n", + "\n", + "for column in current_table.columns:\n", + " if isinstance(column, tuple):\n", + " for c in column:\n", + " for x, factor in converter.items():\n", + " if x in c:\n", + " current_table[column] = current_table[column].apply(\n", + " lambda x: apply_factor(x, factor)\n", + " )\n", + " next\n", + " else:\n", + " for x, factor in converter.items():\n", + " parts = column.split(\" \")\n", + " for y in parts:\n", + " if re.match(x, y):\n", + " current_table[column] = current_table[column].apply(\n", + " lambda x: apply_factor(x, factor)\n", + " )\n", + " current_table.rename({column: parts[0]}, inplace=True, axis=1)\n", + " next\n", + " # print(current_table[column])\n", + "current_table.dropna(axis=0, how=\"all\", inplace=True)\n", + "current_table.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "\n", "def get_bilanz(report: str) -> any:\n", " result = {}\n", " soup = BeautifulSoup(report, features=\"html.parser\")\n", @@ -547,34 +787,421 @@ " result[pos] = pos_results\n", " else:\n", " result[pos] = pd.DataFrame([])\n", - " return result\n", - "\n", - "\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0Anhang2021 TEURVorjahr TEUR
01. Umsatzerlöse(1)66.76769.819
12. Veränderung des Bestandes an unfertigen Lei...NaN0.000-41.000
23. Sonstige betriebliche Erträge(2)621.000489.000
34. MaterialaufwandNaNNaNNaN
4a) Aufwendungen für bezogene WarenNaN-475.000-1.220
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Anhang 2021 TEUR \\\n", + "0 1. Umsatzerlöse (1) 66.767 \n", + "1 2. Veränderung des Bestandes an unfertigen Lei... NaN 0.000 \n", + "2 3. Sonstige betriebliche Erträge (2) 621.000 \n", + "3 4. Materialaufwand NaN NaN \n", + "4 a) Aufwendungen für bezogene Waren NaN -475.000 \n", + "\n", + " Vorjahr TEUR \n", + "0 69.819 \n", + "1 -41.000 \n", + "2 489.000 \n", + "3 NaN \n", + "4 -1.220 " + ] + }, + "execution_count": 170, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ "bilanz = get_bilanz(sample_report)\n", "bilanz[\"Passiva\"].head()" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 171, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0Anhang31.12.2021 TEURVorjahr TEUR
0A. AnlagevermögenNaNNaNNaN
1I. Immaterielle VermögensgegenständeNaNNaNNaN
2Entgeltlich erworbene SoftwareNaN36
3II. SachanlagenNaNNaNNaN
41. Grundstücke und BautenNaN7589
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n", + "0 A. Anlagevermögen NaN NaN NaN\n", + "1 I. Immaterielle Vermögensgegenstände NaN NaN NaN\n", + "2 Entgeltlich erworbene Software NaN 3 6\n", + "3 II. Sachanlagen NaN NaN NaN\n", + "4 1. Grundstücke und Bauten NaN 75 89" + ] + }, + "execution_count": 171, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bilanz[\"Aktiva\"].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0Anhang2021 TEURVorjahr TEUR
01. Umsatzerlöse(1)66.76769.819
12. Veränderung des Bestandes an unfertigen LeistungenNaN0.000-41.000
23. Sonstige betriebliche Erträge(2)621.000489.000
34. MaterialaufwandNaNNaNNaN
4a) Aufwendungen für bezogene WarenNaN-475.000-1.220
5b) Aufwendungen für bezogene LeistungenNaN-12.855-12.457
65. PersonalaufwandNaNNaNNaN
7a) GehälterNaN-52.916-45.242
8b) Soziale Abgaben und Aufwendungen für Altersversorgung und für UnterstützungNaN-9.945-9.999
9davon für Altersversorgung: TEUR 1.817 (Vorjahr: TEUR 1.676)NaNNaNNaN
106. Abschreibungen auf immaterielle Vermögensgegenstände des Anlagevermögens und SachanlagenNaN-165.000-201.000
117. Sonstige betriebliche Aufwendungen(3)-4.968-7.356
128. Zinsen und ähnliche AufwendungenNaN-6.170-10.748
13davon aus der Aufzinsung von Rückstellungen: TEUR 6.116 (Vorjahr: TEUR 10.730)NaNNaNNaN
149. Steuern vom Einkommen und vom ErtragNaN35.0000.000
1510. Ergebnis vor sonstigen Steuern und VerlustübernahmeNaN-20.072-16.956
1611. Sonstige SteuernNaN0.000-7.000
1712. Erträge aus VerlustübernahmeNaN20.07216.963
1813. JahresergebnisNaN0.0000.000
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import display, HTML\n", + "\n", + "# Assuming that dataframes df1 and df2 are already defined:\n", + "display(HTML(bilanz[\"Passiva\"].to_html()))" + ] + }, + { + "cell_type": "code", + "execution_count": 173, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n", - " ('Aktiva', '31.12.2021 EUR'),\n", - " ('Aktiva', '31.12.2020 EUR')],\n", + "Index([0, 1], dtype='int64')\n", + "Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n", + " 'Vorjahr TEUR'],\n", + " dtype='object')\n", + "Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n", + "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", + "Index([0, 1, 2], dtype='int64')\n", + "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n", + "MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n", + " ('Art der Beziehung', 'Gesellschafterin TEUR'),\n", + " ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n", " )\n", - "MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n", - " ('Passiva', '31.12.2021 EUR'),\n", - " ('Passiva', '31.12.2020 EUR')],\n", + "Index([0, 1], dtype='int64')\n", + "MultiIndex([( 'Unnamed: 0_level_0', ...),\n", + " ('Anschaffungs- oder Herstellungskosten', ...),\n", + " ('Anschaffungs- oder Herstellungskosten', ...),\n", + " ('Anschaffungs- oder Herstellungskosten', ...),\n", + " ('Anschaffungs- oder Herstellungskosten', ...)],\n", " )\n", - "Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n", - "MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),\n", - " ( 'Betrag', 'EUR')],\n", - " )\n" + "MultiIndex([('Unnamed: 0_level_0', ...),\n", + " ( 'Abschreibungen', ...),\n", + " ( 'Abschreibungen', ...),\n", + " ( 'Abschreibungen', ...),\n", + " ( 'Abschreibungen', ...)],\n", + " )\n", + "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", + " ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n", + " ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n", + " )\n", + "Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n", + " '2019'],\n", + " dtype='object')\n", + "Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n", + " 'Veränderung TEUR'],\n", + " dtype='object')\n", + "Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n" ] } ], diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py index b19398d..36dcdbc 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py @@ -54,7 +54,8 @@ class Bundesanzeiger: ) # Remove irrelevant columns - return df_data.drop(["raw_report"], axis=1) + # return df_data.drop(["raw_report"], axis=1) + return df_data @staticmethod def filter_reports(df_reports: pd.DataFrame) -> pd.DataFrame: