\\n
str:\n",
- " soup = BeautifulSoup(report, features=\"html.parser\")\n",
- " temp = soup.find_all(\"b\")\n",
- " for elem in temp:\n",
- " br = elem.findChildren(\"br\")\n",
- " if len(br) > 0:\n",
- " return elem.text.split(\"\\n\")[1].strip()\n",
- " return None\n",
- "\n",
- "\n",
- "def extract_auditors(report: str) -> list:\n",
- " auditor_company = extract_auditor_company(report)\n",
- " auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
- " hits = re.findall(auditor_regex, report)\n",
- " return [\n",
- " Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
- " for hit in hits\n",
- " ]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[]"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "extract_auditors(sample_report)"
+ "sample_report = df_reports.iloc[1].raw_report"
]
},
{
@@ -330,169 +201,174 @@
},
{
"cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'net_income': 100238.5, 'equity': 165322.34, 'current_assets': 435344.07}"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "def extract_kpis(report_content) -> dict:\n",
- " \"\"\"\n",
- " Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
- " Extracts Key Performance Indicators (KPIs) from the financial reports.\n",
- " Args:\n",
- " reports (dict): A dictionary containing the financial reports with their hash as keys and report details as values.\n",
- " Returns:\n",
- " dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.\n",
- " \"\"\"\n",
- "\n",
- " kpis = {}\n",
- "\n",
- " # Define KPI patterns to search for\n",
- " kpi_patterns = {\n",
- " \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
- " }\n",
- "\n",
- " report_kpis = {}\n",
- " for kpi, pattern in kpi_patterns.items():\n",
- " match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
- " if match:\n",
- " value = match.group(1)\n",
- "\n",
- " # Clean and validate the extracted number\n",
- " try:\n",
- " if not value: # Check if value is empty\n",
- " cleaned_value = None\n",
- " else:\n",
- " multiplier = 1\n",
- " if value[-1].lower() == \"m\":\n",
- " value = value[:-1]\n",
- " multiplier = 1_000_000\n",
- " elif value[-1].lower() == \"b\":\n",
- " value = value[:-1]\n",
- " multiplier = 1_000_000_000\n",
- "\n",
- " # Remove commas after checking for multipliers\n",
- " value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
- " cleaned_value = float(value) * multiplier\n",
- " except ValueError:\n",
- " cleaned_value = None\n",
- "\n",
- " if cleaned_value is not None:\n",
- " report_kpis[kpi] = cleaned_value\n",
- " return report_kpis\n",
- "\n",
- "\n",
- "extract_kpis(\n",
- " BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "\n",
- "with open(\"./temp.txt\", \"w\") as file:\n",
- " file.write(\n",
- " BeautifulSoup(sample_report, features=\"html.parser\")\n",
- " .get_text()\n",
- " .replace(\"\\n\", \" \")\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
+ "execution_count": 163,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
- " ('Aktiva', '31.12.2021 EUR'),\n",
- " ('Aktiva', '31.12.2020 EUR')],\n",
- " )\n",
- "Aktiva Unnamed: 0_level_1 object\n",
- " 31.12.2021 EUR object\n",
- " 31.12.2020 EUR object\n",
+ "Index([0, 1], dtype='int64')\n",
+ "0 object\n",
+ "1 object\n",
"dtype: object\n",
- "MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
- " ('Passiva', '31.12.2021 EUR'),\n",
- " ('Passiva', '31.12.2020 EUR')],\n",
- " )\n",
- "Passiva Unnamed: 0_level_1 object\n",
- " 31.12.2021 EUR object\n",
- " 31.12.2020 EUR object\n",
+ "Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Unnamed: 0 object\n",
+ "Anhang object\n",
+ "31.12.2021 TEUR object\n",
+ "Vorjahr TEUR object\n",
"dtype: object\n",
- "Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
- "Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
- "Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
+ "Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Unnamed: 0 object\n",
+ "Anhang object\n",
+ "2021 TEUR float64\n",
+ "Vorjahr TEUR float64\n",
"dtype: object\n",
- "MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),\n",
- " ( 'Betrag', 'EUR')],\n",
+ "Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
+ " 'Vorjahr TEUR'],\n",
+ " dtype='object')\n",
+ "Aufgliederung nach Tätigkeitsbereichen object\n",
+ "2021 TEUR float64\n",
+ "Vorjahr TEUR float64\n",
+ "dtype: object\n",
+ "Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Aufgliederung nach Inland und Ausland object\n",
+ "2021 TEUR float64\n",
+ "Vorjahr TEUR float64\n",
+ "dtype: object\n",
+ "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Unnamed: 0 object\n",
+ "31.12.2021 TEUR float64\n",
+ "Vorjahr TEUR float64\n",
+ "dtype: object\n",
+ "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Unnamed: 0 object\n",
+ "31.12.2021 TEUR float64\n",
+ "Vorjahr TEUR float64\n",
+ "dtype: object\n",
+ "Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
+ "Unnamed: 0 object\n",
+ "31.12.2021 object\n",
+ "dtype: object\n",
+ "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
+ "Unnamed: 0 object\n",
+ "TEUR float64\n",
+ "dtype: object\n",
+ "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
+ "Unnamed: 0 object\n",
+ "TEUR float64\n",
+ "dtype: object\n",
+ "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
+ "Unnamed: 0 object\n",
+ "TEUR float64\n",
+ "dtype: object\n",
+ "Index([0, 1, 2], dtype='int64')\n",
+ "0 object\n",
+ "1 object\n",
+ "2 int64\n",
+ "dtype: object\n",
+ "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
+ "Unnamed: 0 object\n",
+ "TEUR int64\n",
+ "dtype: object\n",
+ "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Unnamed: 0 object\n",
+ "31.12.2021 TEUR float64\n",
+ "Vorjahr TEUR float64\n",
+ "dtype: object\n",
+ "Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
+ "Unnamed: 0 object\n",
+ "2021 Anzahl MA int64\n",
+ "Vorjahr Anzahl MA int64\n",
+ "dtype: object\n",
+ "MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
+ " ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
+ " ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
" )\n",
- "Kreditentwicklung Unnamed: 0_level_1 object\n",
- "Betrag EUR object\n",
+ "Art des Geschäfts Unnamed: 0_level_1 object\n",
+ "Art der Beziehung Gesellschafterin TEUR float64\n",
+ " Verbundene Unternehmen TEUR float64\n",
+ "dtype: object\n",
+ "Index([0, 1], dtype='int64')\n",
+ "0 object\n",
+ "1 object\n",
+ "dtype: object\n",
+ "MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
+ " ('Anschaffungs- oder Herstellungskosten', ...),\n",
+ " ('Anschaffungs- oder Herstellungskosten', ...),\n",
+ " ('Anschaffungs- oder Herstellungskosten', ...),\n",
+ " ('Anschaffungs- oder Herstellungskosten', ...)],\n",
+ " )\n",
+ "Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
+ "Anschaffungs- oder Herstellungskosten Stand 01.01.2021 EUR object\n",
+ " Zugänge Umbuchung U EUR object\n",
+ " Abgänge Umbuchung EUR object\n",
+ " Stand 31.12.2021 EUR object\n",
+ "dtype: object\n",
+ "MultiIndex([('Unnamed: 0_level_0', ...),\n",
+ " ( 'Abschreibungen', ...),\n",
+ " ( 'Abschreibungen', ...),\n",
+ " ( 'Abschreibungen', ...),\n",
+ " ( 'Abschreibungen', ...)],\n",
+ " )\n",
+ "Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
+ "Abschreibungen Stand 01.01.2021 EUR object\n",
+ " Abschreibungen des Geschäftsjahres U EUR object\n",
+ " Abgänge Umbuchung U EUR object\n",
+ " Stand 31.12.2021 EUR object\n",
+ "dtype: object\n",
+ "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
+ " ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
+ " ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
+ " )\n",
+ "Unnamed: 0_level_0 Unnamed: 0_level_1 object\n",
+ "Buchwerte Stand 31.12.2021 EUR object\n",
+ " Stand 31.12.2020 EUR object\n",
+ "dtype: object\n",
+ "Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
+ " '2019'],\n",
+ " dtype='object')\n",
+ "Nichtfinanzieller Leistungsindikator object\n",
+ "Unnamed: 1 object\n",
+ "2021 int64\n",
+ "2020 int64\n",
+ "2019 int64\n",
+ "dtype: object\n",
+ "Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
+ " 'Veränderung TEUR'],\n",
+ " dtype='object')\n",
+ "Gewinn- und Verlustrechnung object\n",
+ "2021 TEUR float64\n",
+ "Vorjahr TEUR float64\n",
+ "Veränderung TEUR float64\n",
+ "dtype: object\n",
+ "Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n",
+ "Bilanz object\n",
+ "31.12.2021 TEUR float64\n",
+ "Vorjahr TEUR float64\n",
+ "Veränderung TEUR float64\n",
"dtype: object\n"
]
- },
- {
- "data": {
- "text/plain": [
- "{}"
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
}
],
"source": [
"def parse_tables(report: str) -> list:\n",
- " result = {}\n",
+ " result = []\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
- " df = pd.read_html(StringIO(str(table)))[0]\n",
+ " df = pd.read_html(StringIO(str(table)), flavor=\"bs4\")[0]\n",
" print(df.columns)\n",
" print(df.dtypes)\n",
+ " result.append(df)\n",
" return result\n",
"\n",
"\n",
- "parse_tables(sample_report)"
+ "tables = parse_tables(sample_report)"
]
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 164,
"metadata": {},
"outputs": [
{
@@ -516,25 +392,389 @@
" \n",
" \n",
" | \n",
+ " Unnamed: 0 | \n",
+ " Anhang | \n",
+ " 31.12.2021 TEUR | \n",
+ " Vorjahr TEUR | \n",
"
\n",
" \n",
"
\n",
+ " \n",
+ " 0 | \n",
+ " A. Anlagevermögen | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " I. Immaterielle Vermögensgegenstände | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Entgeltlich erworbene Software | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " II. Sachanlagen | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1. Grundstücke und Bauten | \n",
+ " NaN | \n",
+ " 75 | \n",
+ " 89 | \n",
+ "
\n",
" \n",
"\n",
""
],
"text/plain": [
- "Empty DataFrame\n",
- "Columns: []\n",
- "Index: []"
+ " Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n",
+ "0 A. Anlagevermögen NaN NaN NaN\n",
+ "1 I. Immaterielle Vermögensgegenstände NaN NaN NaN\n",
+ "2 Entgeltlich erworbene Software NaN 3 6\n",
+ "3 II. Sachanlagen NaN NaN NaN\n",
+ "4 1. Grundstücke und Bauten NaN 75 89"
]
},
- "execution_count": 22,
+ "execution_count": 164,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
+ "current_table = tables[1]\n",
+ "current_table.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 165,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "\n",
+ "\n",
+ "def cleanse_string(value: str) -> str:\n",
+ " print(value)\n",
+ " if value is not None and isinstance(value, str):\n",
+ " return re.sub(r\"(.+\\.).\", \"\", value)\n",
+ " return None"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 166,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "A. Anlagevermögen\n",
+ "I. Immaterielle Vermögensgegenstände\n",
+ "Entgeltlich erworbene Software\n",
+ "II. Sachanlagen\n",
+ "1. Grundstücke und Bauten\n",
+ "2. Technische Anlagen und Maschinen\n",
+ "3. Andere Anlagen, Betriebs- und Geschäftsausstattung\n",
+ "4. Geleistete Anzahlung und Anlagen im Bau\n",
+ "nan\n",
+ "III. Finanzanlagen\n",
+ "Sonstige Ausleihungen\n",
+ "nan\n",
+ "B. Umlaufvermögen\n",
+ "I. Vorräte\n",
+ "Waren\n",
+ "II. Forderungen und sonstige Vermögensgegenstände\n",
+ "1. Forderungen aus Lieferungen und Leistungen\n",
+ "2. Forderungen gegen verbundene Unternehmen\n",
+ "3. Sonstige Vermögensgegenstände\n",
+ "nan\n",
+ "nan\n",
+ "C. Rechnungsabgrenzungsposten\n",
+ "D. Aktiver Unterschiedsbetrag aus der Vermögensverrechnung\n",
+ "nan\n",
+ "Passiva\n",
+ "nan\n",
+ "A. Eigenkapital\n",
+ "I. Gezeichnetes Kapital\n",
+ "II. Kapitalrücklage\n",
+ "III. Gewinnrücklagen\n",
+ "Andere Gewinnrücklagen\n",
+ "IV. Gewinnvortrag\n",
+ "nan\n",
+ "B. Rückstellungen\n",
+ "1. Rückstellungen für Pensionen\n",
+ "2. Steuerrückstellungen\n",
+ "3. Sonstige Rückstellungen\n",
+ "nan\n",
+ "C. Verbindlichkeiten\n",
+ "1. Erhaltene Anzahlungen\n",
+ "2. Verbindlichkeiten aus Lieferungen und Leistungen\n",
+ "3. Verbindlichkeiten gegenüber verbundenen Unternehmen\n",
+ "4. Sonstige Verbindlichkeiten\n",
+ "nan\n",
+ "D. Rechnungsabgrenzungungsposten\n",
+ "nan\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_18940\\152097142.py:2: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+ " current_table.iloc[index][0] = cleanse_string(row[0])\n",
+ "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_18940\\152097142.py:2: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`\n",
+ " current_table.iloc[index][0] = cleanse_string(row[0])\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " Anhang | \n",
+ " 31.12.2021 TEUR | \n",
+ " Vorjahr TEUR | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Anlagevermögen | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Immaterielle Vermögensgegenstände | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Entgeltlich erworbene Software | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Sachanlagen | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Grundstücke und Bauten | \n",
+ " NaN | \n",
+ " 75 | \n",
+ " 89 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n",
+ "0 Anlagevermögen NaN NaN NaN\n",
+ "1 Immaterielle Vermögensgegenstände NaN NaN NaN\n",
+ "2 Entgeltlich erworbene Software NaN 3 6\n",
+ "3 Sachanlagen NaN NaN NaN\n",
+ "4 Grundstücke und Bauten NaN 75 89"
+ ]
+ },
+ "execution_count": 166,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "for index, row in current_table.iterrows():\n",
+ " current_table.iloc[index][0] = cleanse_string(row[0])\n",
+ "current_table.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 167,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def parse_string_to_float(value) -> float:\n",
+ " try:\n",
+ " if value is None:\n",
+ " return None\n",
+ " if isinstance(value, float):\n",
+ " return value\n",
+ " return float(value.replace(\".\", \"\").replace(\",\", \".\"))\n",
+ " except Exception as e:\n",
+ " return None\n",
+ "\n",
+ "\n",
+ "def apply_factor(value, factor: float):\n",
+ " transformed_value = parse_string_to_float(value)\n",
+ " if transformed_value is None or isinstance(transformed_value, str):\n",
+ " return None\n",
+ " result = transformed_value * factor\n",
+ " # print(result)\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 168,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " Anhang | \n",
+ " 31.12.2021 | \n",
+ " Vorjahr | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Anlagevermögen | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Immaterielle Vermögensgegenstände | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Entgeltlich erworbene Software | \n",
+ " NaN | \n",
+ " 3000.0 | \n",
+ " 6000.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Sachanlagen | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Grundstücke und Bauten | \n",
+ " NaN | \n",
+ " 75000.0 | \n",
+ " 89000.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 Anhang 31.12.2021 Vorjahr\n",
+ "0 Anlagevermögen NaN NaN NaN\n",
+ "1 Immaterielle Vermögensgegenstände NaN NaN NaN\n",
+ "2 Entgeltlich erworbene Software NaN 3000.0 6000.0\n",
+ "3 Sachanlagen NaN NaN NaN\n",
+ "4 Grundstücke und Bauten NaN 75000.0 89000.0"
+ ]
+ },
+ "execution_count": 168,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "converter = {\"T€\": 1 / 1000, \"TEUR\": 1000, \"EUR\": 1 / 1000, \"€\": 1}\n",
+ "\n",
+ "for column in current_table.columns:\n",
+ " if isinstance(column, tuple):\n",
+ " for c in column:\n",
+ " for x, factor in converter.items():\n",
+ " if x in c:\n",
+ " current_table[column] = current_table[column].apply(\n",
+ " lambda x: apply_factor(x, factor)\n",
+ " )\n",
+ " next\n",
+ " else:\n",
+ " for x, factor in converter.items():\n",
+ " parts = column.split(\" \")\n",
+ " for y in parts:\n",
+ " if re.match(x, y):\n",
+ " current_table[column] = current_table[column].apply(\n",
+ " lambda x: apply_factor(x, factor)\n",
+ " )\n",
+ " current_table.rename({column: parts[0]}, inplace=True, axis=1)\n",
+ " next\n",
+ " # print(current_table[column])\n",
+ "current_table.dropna(axis=0, how=\"all\", inplace=True)\n",
+ "current_table.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 169,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "\n",
+ "\n",
"def get_bilanz(report: str) -> any:\n",
" result = {}\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
@@ -547,34 +787,421 @@
" result[pos] = pos_results\n",
" else:\n",
" result[pos] = pd.DataFrame([])\n",
- " return result\n",
- "\n",
- "\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 170,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " Anhang | \n",
+ " 2021 TEUR | \n",
+ " Vorjahr TEUR | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1. Umsatzerlöse | \n",
+ " (1) | \n",
+ " 66.767 | \n",
+ " 69.819 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2. Veränderung des Bestandes an unfertigen Lei... | \n",
+ " NaN | \n",
+ " 0.000 | \n",
+ " -41.000 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3. Sonstige betriebliche Erträge | \n",
+ " (2) | \n",
+ " 621.000 | \n",
+ " 489.000 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4. Materialaufwand | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " a) Aufwendungen für bezogene Waren | \n",
+ " NaN | \n",
+ " -475.000 | \n",
+ " -1.220 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 Anhang 2021 TEUR \\\n",
+ "0 1. Umsatzerlöse (1) 66.767 \n",
+ "1 2. Veränderung des Bestandes an unfertigen Lei... NaN 0.000 \n",
+ "2 3. Sonstige betriebliche Erträge (2) 621.000 \n",
+ "3 4. Materialaufwand NaN NaN \n",
+ "4 a) Aufwendungen für bezogene Waren NaN -475.000 \n",
+ "\n",
+ " Vorjahr TEUR \n",
+ "0 69.819 \n",
+ "1 -41.000 \n",
+ "2 489.000 \n",
+ "3 NaN \n",
+ "4 -1.220 "
+ ]
+ },
+ "execution_count": 170,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
"bilanz = get_bilanz(sample_report)\n",
"bilanz[\"Passiva\"].head()"
]
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 171,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " Anhang | \n",
+ " 31.12.2021 TEUR | \n",
+ " Vorjahr TEUR | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " A. Anlagevermögen | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " I. Immaterielle Vermögensgegenstände | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Entgeltlich erworbene Software | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " II. Sachanlagen | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1. Grundstücke und Bauten | \n",
+ " NaN | \n",
+ " 75 | \n",
+ " 89 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n",
+ "0 A. Anlagevermögen NaN NaN NaN\n",
+ "1 I. Immaterielle Vermögensgegenstände NaN NaN NaN\n",
+ "2 Entgeltlich erworbene Software NaN 3 6\n",
+ "3 II. Sachanlagen NaN NaN NaN\n",
+ "4 1. Grundstücke und Bauten NaN 75 89"
+ ]
+ },
+ "execution_count": 171,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bilanz[\"Aktiva\"].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 172,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " Anhang | \n",
+ " 2021 TEUR | \n",
+ " Vorjahr TEUR | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1. Umsatzerlöse | \n",
+ " (1) | \n",
+ " 66.767 | \n",
+ " 69.819 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2. Veränderung des Bestandes an unfertigen Leistungen | \n",
+ " NaN | \n",
+ " 0.000 | \n",
+ " -41.000 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3. Sonstige betriebliche Erträge | \n",
+ " (2) | \n",
+ " 621.000 | \n",
+ " 489.000 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4. Materialaufwand | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " a) Aufwendungen für bezogene Waren | \n",
+ " NaN | \n",
+ " -475.000 | \n",
+ " -1.220 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " b) Aufwendungen für bezogene Leistungen | \n",
+ " NaN | \n",
+ " -12.855 | \n",
+ " -12.457 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 5. Personalaufwand | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " a) Gehälter | \n",
+ " NaN | \n",
+ " -52.916 | \n",
+ " -45.242 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " b) Soziale Abgaben und Aufwendungen für Altersversorgung und für Unterstützung | \n",
+ " NaN | \n",
+ " -9.945 | \n",
+ " -9.999 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " davon für Altersversorgung: TEUR 1.817 (Vorjahr: TEUR 1.676) | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 6. Abschreibungen auf immaterielle Vermögensgegenstände des Anlagevermögens und Sachanlagen | \n",
+ " NaN | \n",
+ " -165.000 | \n",
+ " -201.000 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 7. Sonstige betriebliche Aufwendungen | \n",
+ " (3) | \n",
+ " -4.968 | \n",
+ " -7.356 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 8. Zinsen und ähnliche Aufwendungen | \n",
+ " NaN | \n",
+ " -6.170 | \n",
+ " -10.748 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " davon aus der Aufzinsung von Rückstellungen: TEUR 6.116 (Vorjahr: TEUR 10.730) | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 9. Steuern vom Einkommen und vom Ertrag | \n",
+ " NaN | \n",
+ " 35.000 | \n",
+ " 0.000 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 10. Ergebnis vor sonstigen Steuern und Verlustübernahme | \n",
+ " NaN | \n",
+ " -20.072 | \n",
+ " -16.956 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 11. Sonstige Steuern | \n",
+ " NaN | \n",
+ " 0.000 | \n",
+ " -7.000 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 12. Erträge aus Verlustübernahme | \n",
+ " NaN | \n",
+ " 20.072 | \n",
+ " 16.963 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 13. Jahresergebnis | \n",
+ " NaN | \n",
+ " 0.000 | \n",
+ " 0.000 | \n",
+ "
\n",
+ " \n",
+ "
"
+ ],
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from IPython.display import display, HTML\n",
+ "\n",
+ "# Render the complete Passiva table of the parsed bilanz as an HTML table:\n",
+ "display(HTML(bilanz[\"Passiva\"].to_html()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 173,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
- " ('Aktiva', '31.12.2021 EUR'),\n",
- " ('Aktiva', '31.12.2020 EUR')],\n",
+ "Index([0, 1], dtype='int64')\n",
+ "Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
+ " 'Vorjahr TEUR'],\n",
+ " dtype='object')\n",
+ "Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
+ "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
+ "Index([0, 1, 2], dtype='int64')\n",
+ "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
+ "MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
+ " ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
+ " ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
" )\n",
- "MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
- " ('Passiva', '31.12.2021 EUR'),\n",
- " ('Passiva', '31.12.2020 EUR')],\n",
+ "Index([0, 1], dtype='int64')\n",
+ "MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
+ " ('Anschaffungs- oder Herstellungskosten', ...),\n",
+ " ('Anschaffungs- oder Herstellungskosten', ...),\n",
+ " ('Anschaffungs- oder Herstellungskosten', ...),\n",
+ " ('Anschaffungs- oder Herstellungskosten', ...)],\n",
" )\n",
- "Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
- "MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),\n",
- " ( 'Betrag', 'EUR')],\n",
- " )\n"
+ "MultiIndex([('Unnamed: 0_level_0', ...),\n",
+ " ( 'Abschreibungen', ...),\n",
+ " ( 'Abschreibungen', ...),\n",
+ " ( 'Abschreibungen', ...),\n",
+ " ( 'Abschreibungen', ...)],\n",
+ " )\n",
+ "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
+ " ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
+ " ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
+ " )\n",
+ "Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
+ " '2019'],\n",
+ " dtype='object')\n",
+ "Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
+ " 'Veränderung TEUR'],\n",
+ " dtype='object')\n",
+ "Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
]
}
],
diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
index b19398d..36dcdbc 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
@@ -54,7 +54,8 @@ class Bundesanzeiger:
)
# Remove irrelevant columns
- return df_data.drop(["raw_report"], axis=1)
+ # return df_data.drop(["raw_report"], axis=1)
+ return df_data
@staticmethod
def filter_reports(df_reports: pd.DataFrame) -> pd.DataFrame: