list:\n",
" result = []\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
" df = pd.read_html(StringIO(str(table)), flavor=\"bs4\")[0]\n",
" print(df.columns)\n",
" print(df.dtypes)\n",
" result.append(df)\n",
" return result\n",
"\n",
"\n",
"tables = parse_tables(sample_report)"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Anhang | \n",
" 31.12.2021 TEUR | \n",
" Vorjahr TEUR | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A. Anlagevermögen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" I. Immaterielle Vermögensgegenstände | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" Entgeltlich erworbene Software | \n",
" NaN | \n",
" 3 | \n",
" 6 | \n",
"
\n",
" \n",
" 3 | \n",
" II. Sachanlagen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 1. Grundstücke und Bauten | \n",
" NaN | \n",
" 75 | \n",
" 89 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n",
"0 A. Anlagevermögen NaN NaN NaN\n",
"1 I. Immaterielle Vermögensgegenstände NaN NaN NaN\n",
"2 Entgeltlich erworbene Software NaN 3 6\n",
"3 II. Sachanlagen NaN NaN NaN\n",
"4 1. Grundstücke und Bauten NaN 75 89"
]
},
"execution_count": 164,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"current_table = tables[1]\n",
"current_table.head()"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"\n",
"def cleanse_string(value: str) -> str:\n",
" print(value)\n",
" if value is not None and isinstance(value, str):\n",
" return re.sub(r\"(.+\\.).\", \"\", value)\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A. Anlagevermögen\n",
"I. Immaterielle Vermögensgegenstände\n",
"Entgeltlich erworbene Software\n",
"II. Sachanlagen\n",
"1. Grundstücke und Bauten\n",
"2. Technische Anlagen und Maschinen\n",
"3. Andere Anlagen, Betriebs- und Geschäftsausstattung\n",
"4. Geleistete Anzahlung und Anlagen im Bau\n",
"nan\n",
"III. Finanzanlagen\n",
"Sonstige Ausleihungen\n",
"nan\n",
"B. Umlaufvermögen\n",
"I. Vorräte\n",
"Waren\n",
"II. Forderungen und sonstige Vermögensgegenstände\n",
"1. Forderungen aus Lieferungen und Leistungen\n",
"2. Forderungen gegen verbundene Unternehmen\n",
"3. Sonstige Vermögensgegenstände\n",
"nan\n",
"nan\n",
"C. Rechnungsabgrenzungsposten\n",
"D. Aktiver Unterschiedsbetrag aus der Vermögensverrechnung\n",
"nan\n",
"Passiva\n",
"nan\n",
"A. Eigenkapital\n",
"I. Gezeichnetes Kapital\n",
"II. Kapitalrücklage\n",
"III. Gewinnrücklagen\n",
"Andere Gewinnrücklagen\n",
"IV. Gewinnvortrag\n",
"nan\n",
"B. Rückstellungen\n",
"1. Rückstellungen für Pensionen\n",
"2. Steuerrückstellungen\n",
"3. Sonstige Rückstellungen\n",
"nan\n",
"C. Verbindlichkeiten\n",
"1. Erhaltene Anzahlungen\n",
"2. Verbindlichkeiten aus Lieferungen und Leistungen\n",
"3. Verbindlichkeiten gegenüber verbundenen Unternehmen\n",
"4. Sonstige Verbindlichkeiten\n",
"nan\n",
"D. Rechnungsabgrenzungungsposten\n",
"nan\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_18940\\152097142.py:2: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" current_table.iloc[index][0] = cleanse_string(row[0])\n",
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_18940\\152097142.py:2: FutureWarning: Series.__setitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To set a value by position, use `ser.iloc[pos] = value`\n",
" current_table.iloc[index][0] = cleanse_string(row[0])\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Anhang | \n",
" 31.12.2021 TEUR | \n",
" Vorjahr TEUR | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Anlagevermögen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" Immaterielle Vermögensgegenstände | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" Entgeltlich erworbene Software | \n",
" NaN | \n",
" 3 | \n",
" 6 | \n",
"
\n",
" \n",
" 3 | \n",
" Sachanlagen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" Grundstücke und Bauten | \n",
" NaN | \n",
" 75 | \n",
" 89 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n",
"0 Anlagevermögen NaN NaN NaN\n",
"1 Immaterielle Vermögensgegenstände NaN NaN NaN\n",
"2 Entgeltlich erworbene Software NaN 3 6\n",
"3 Sachanlagen NaN NaN NaN\n",
"4 Grundstücke und Bauten NaN 75 89"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"for index, row in current_table.iterrows():\n",
" current_table.iloc[index][0] = cleanse_string(row[0])\n",
"current_table.head()"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [],
"source": [
"def parse_string_to_float(value) -> float:\n",
" try:\n",
" if value is None:\n",
" return None\n",
" if isinstance(value, float):\n",
" return value\n",
" return float(value.replace(\".\", \"\").replace(\",\", \".\"))\n",
" except Exception as e:\n",
" return None\n",
"\n",
"\n",
"def apply_factor(value, factor: float):\n",
" transformed_value = parse_string_to_float(value)\n",
" if transformed_value is None or isinstance(transformed_value, str):\n",
" return None\n",
" result = transformed_value * factor\n",
" # print(result)\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Anhang | \n",
" 31.12.2021 | \n",
" Vorjahr | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Anlagevermögen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" Immaterielle Vermögensgegenstände | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" Entgeltlich erworbene Software | \n",
" NaN | \n",
" 3000.0 | \n",
" 6000.0 | \n",
"
\n",
" \n",
" 3 | \n",
" Sachanlagen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" Grundstücke und Bauten | \n",
" NaN | \n",
" 75000.0 | \n",
" 89000.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 Anhang 31.12.2021 Vorjahr\n",
"0 Anlagevermögen NaN NaN NaN\n",
"1 Immaterielle Vermögensgegenstände NaN NaN NaN\n",
"2 Entgeltlich erworbene Software NaN 3000.0 6000.0\n",
"3 Sachanlagen NaN NaN NaN\n",
"4 Grundstücke und Bauten NaN 75000.0 89000.0"
]
},
"execution_count": 168,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"converter = {\"T€\": 1 / 1000, \"TEUR\": 1000, \"EUR\": 1 / 1000, \"€\": 1}\n",
"\n",
"for column in current_table.columns:\n",
" if isinstance(column, tuple):\n",
" for c in column:\n",
" for x, factor in converter.items():\n",
" if x in c:\n",
" current_table[column] = current_table[column].apply(\n",
" lambda x: apply_factor(x, factor)\n",
" )\n",
" next\n",
" else:\n",
" for x, factor in converter.items():\n",
" parts = column.split(\" \")\n",
" for y in parts:\n",
" if re.match(x, y):\n",
" current_table[column] = current_table[column].apply(\n",
" lambda x: apply_factor(x, factor)\n",
" )\n",
" current_table.rename({column: parts[0]}, inplace=True, axis=1)\n",
" next\n",
" # print(current_table[column])\n",
"current_table.dropna(axis=0, how=\"all\", inplace=True)\n",
"current_table.head()"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"\n",
"def get_bilanz(report: str) -> any:\n",
" result = {}\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for pos in [\"Aktiva\", \"Passiva\"]:\n",
" tag = soup.find(\"b\", string=re.compile(pos))\n",
" if tag:\n",
" pos_results = pd.read_html(\n",
" StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n",
" )[0]\n",
" result[pos] = pos_results\n",
" else:\n",
" result[pos] = pd.DataFrame([])\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Anhang | \n",
" 2021 TEUR | \n",
" Vorjahr TEUR | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1. Umsatzerlöse | \n",
" (1) | \n",
" 66.767 | \n",
" 69.819 | \n",
"
\n",
" \n",
" 1 | \n",
" 2. Veränderung des Bestandes an unfertigen Lei... | \n",
" NaN | \n",
" 0.000 | \n",
" -41.000 | \n",
"
\n",
" \n",
" 2 | \n",
" 3. Sonstige betriebliche Erträge | \n",
" (2) | \n",
" 621.000 | \n",
" 489.000 | \n",
"
\n",
" \n",
" 3 | \n",
" 4. Materialaufwand | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" a) Aufwendungen für bezogene Waren | \n",
" NaN | \n",
" -475.000 | \n",
" -1.220 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 Anhang 2021 TEUR \\\n",
"0 1. Umsatzerlöse (1) 66.767 \n",
"1 2. Veränderung des Bestandes an unfertigen Lei... NaN 0.000 \n",
"2 3. Sonstige betriebliche Erträge (2) 621.000 \n",
"3 4. Materialaufwand NaN NaN \n",
"4 a) Aufwendungen für bezogene Waren NaN -475.000 \n",
"\n",
" Vorjahr TEUR \n",
"0 69.819 \n",
"1 -41.000 \n",
"2 489.000 \n",
"3 NaN \n",
"4 -1.220 "
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bilanz = get_bilanz(sample_report)\n",
"bilanz[\"Passiva\"].head()"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Anhang | \n",
" 31.12.2021 TEUR | \n",
" Vorjahr TEUR | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A. Anlagevermögen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" I. Immaterielle Vermögensgegenstände | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" Entgeltlich erworbene Software | \n",
" NaN | \n",
" 3 | \n",
" 6 | \n",
"
\n",
" \n",
" 3 | \n",
" II. Sachanlagen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 1. Grundstücke und Bauten | \n",
" NaN | \n",
" 75 | \n",
" 89 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 Anhang 31.12.2021 TEUR Vorjahr TEUR\n",
"0 A. Anlagevermögen NaN NaN NaN\n",
"1 I. Immaterielle Vermögensgegenstände NaN NaN NaN\n",
"2 Entgeltlich erworbene Software NaN 3 6\n",
"3 II. Sachanlagen NaN NaN NaN\n",
"4 1. Grundstücke und Bauten NaN 75 89"
]
},
"execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bilanz[\"Aktiva\"].head()"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Anhang | \n",
" 2021 TEUR | \n",
" Vorjahr TEUR | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1. Umsatzerlöse | \n",
" (1) | \n",
" 66.767 | \n",
" 69.819 | \n",
"
\n",
" \n",
" 1 | \n",
" 2. Veränderung des Bestandes an unfertigen Leistungen | \n",
" NaN | \n",
" 0.000 | \n",
" -41.000 | \n",
"
\n",
" \n",
" 2 | \n",
" 3. Sonstige betriebliche Erträge | \n",
" (2) | \n",
" 621.000 | \n",
" 489.000 | \n",
"
\n",
" \n",
" 3 | \n",
" 4. Materialaufwand | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" a) Aufwendungen für bezogene Waren | \n",
" NaN | \n",
" -475.000 | \n",
" -1.220 | \n",
"
\n",
" \n",
" 5 | \n",
" b) Aufwendungen für bezogene Leistungen | \n",
" NaN | \n",
" -12.855 | \n",
" -12.457 | \n",
"
\n",
" \n",
" 6 | \n",
" 5. Personalaufwand | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 7 | \n",
" a) Gehälter | \n",
" NaN | \n",
" -52.916 | \n",
" -45.242 | \n",
"
\n",
" \n",
" 8 | \n",
" b) Soziale Abgaben und Aufwendungen für Altersversorgung und für Unterstützung | \n",
" NaN | \n",
" -9.945 | \n",
" -9.999 | \n",
"
\n",
" \n",
" 9 | \n",
" davon für Altersversorgung: TEUR 1.817 (Vorjahr: TEUR 1.676) | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 10 | \n",
" 6. Abschreibungen auf immaterielle Vermögensgegenstände des Anlagevermögens und Sachanlagen | \n",
" NaN | \n",
" -165.000 | \n",
" -201.000 | \n",
"
\n",
" \n",
" 11 | \n",
" 7. Sonstige betriebliche Aufwendungen | \n",
" (3) | \n",
" -4.968 | \n",
" -7.356 | \n",
"
\n",
" \n",
" 12 | \n",
" 8. Zinsen und ähnliche Aufwendungen | \n",
" NaN | \n",
" -6.170 | \n",
" -10.748 | \n",
"
\n",
" \n",
" 13 | \n",
" davon aus der Aufzinsung von Rückstellungen: TEUR 6.116 (Vorjahr: TEUR 10.730) | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 14 | \n",
" 9. Steuern vom Einkommen und vom Ertrag | \n",
" NaN | \n",
" 35.000 | \n",
" 0.000 | \n",
"
\n",
" \n",
" 15 | \n",
" 10. Ergebnis vor sonstigen Steuern und Verlustübernahme | \n",
" NaN | \n",
" -20.072 | \n",
" -16.956 | \n",
"
\n",
" \n",
" 16 | \n",
" 11. Sonstige Steuern | \n",
" NaN | \n",
" 0.000 | \n",
" -7.000 | \n",
"
\n",
" \n",
" 17 | \n",
" 12. Erträge aus Verlustübernahme | \n",
" NaN | \n",
" 20.072 | \n",
" 16.963 | \n",
"
\n",
" \n",
" 18 | \n",
" 13. Jahresergebnis | \n",
" NaN | \n",
" 0.000 | \n",
" 0.000 | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import display, HTML\n",
"\n",
"# Assuming that dataframes df1 and df2 are already defined:\n",
"display(HTML(bilanz[\"Passiva\"].to_html()))"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index([0, 1], dtype='int64')\n",
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
" 'Vorjahr TEUR'],\n",
" dtype='object')\n",
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index([0, 1, 2], dtype='int64')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
" )\n",
"Index([0, 1], dtype='int64')\n",
"MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
" )\n",
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
" '2019'],\n",
" dtype='object')\n",
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
" 'Veränderung TEUR'],\n",
" dtype='object')\n",
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
]
}
],
"source": [
"def get_tables(raw_report: str) -> list:\n",
" soup = BeautifulSoup(raw_report, features=\"html.parser\")\n",
" tables = soup.find_all(\"table\", {\"class\": \"std_table\"})\n",
" dfs = []\n",
" for table in tables:\n",
" for df in pd.read_html(StringIO(str(table))):\n",
" dfs.append(df)\n",
" return dfs\n",
"\n",
"\n",
"for df in get_tables(sample_report):\n",
" print(df.columns)\n",
"\n",
"tables = get_tables(sample_report)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}