list:\n",
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
" hits = re.findall(auditor_regex, report)\n",
" return [hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip() for hit in hits]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Eckhard Lewe', 'Renate Hermsdorf']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_auditors(sample_report)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def extract_auditor_company(report: str) -> str | None:\n",
"    \"\"\"Return the auditing company named in the report, if one is found.\n",
"\n",
"    Scans the ``<b>`` elements of the report HTML and uses the first one that\n",
"    contains a ``<br>``; the second line of its text is returned, stripped.\n",
"\n",
"    Args:\n",
"        report: Raw HTML of the report.\n",
"\n",
"    Returns:\n",
"        The stripped second text line of the first qualifying element, or\n",
"        ``None`` when no ``<b>`` element qualifies.\n",
"    \"\"\"\n",
"    soup = BeautifulSoup(report, features=\"html.parser\")\n",
"    for bold in soup.find_all(\"b\"):\n",
"        if bold.find(\"br\") is None:\n",
"            continue\n",
"        lines = bold.text.split(\"\\n\")\n",
"        # A <b> holding a <br> but no line break in its text has no second\n",
"        # line; move on instead of raising IndexError.\n",
"        if len(lines) > 1:\n",
"            return lines[1].strip()\n",
"    return None"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Warth & Klein Grant Thornton AG'"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_auditor_company(sample_report)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Aufsichtsrat"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"**TODO**"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Bilanz bzw. GuV"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Anhang | \n",
" 2020 TEUR | \n",
" Vorjahr TEUR | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1. Umsatzerlöse | \n",
" (1) | \n",
" 69.819 | \n",
" 77.429 | \n",
"
\n",
" \n",
" 1 | \n",
" 2. Veränderung des Bestandes an unfertigen Lei... | \n",
" NaN | \n",
" -41.000 | \n",
" -66.000 | \n",
"
\n",
" \n",
" 2 | \n",
" 3. Sonstige betriebliche Erträge | \n",
" (2) | \n",
" 489.000 | \n",
" 1.816 | \n",
"
\n",
" \n",
" 3 | \n",
" 4. Materialaufwand | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" a) Aufwendungen für bezogene Waren | \n",
" NaN | \n",
" -1.220 | \n",
" -3.003 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 Anhang 2020 TEUR \\\n",
"0 1. Umsatzerlöse (1) 69.819 \n",
"1 2. Veränderung des Bestandes an unfertigen Lei... NaN -41.000 \n",
"2 3. Sonstige betriebliche Erträge (2) 489.000 \n",
"3 4. Materialaufwand NaN NaN \n",
"4 a) Aufwendungen für bezogene Waren NaN -1.220 \n",
"\n",
" Vorjahr TEUR \n",
"0 77.429 \n",
"1 -66.000 \n",
"2 1.816 \n",
"3 NaN \n",
"4 -3.003 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_bilanz(report: str) -> dict[str, pd.DataFrame]:\n",
"    \"\"\"Collect the table that follows the \"Aktiva\" and \"Passiva\" headings.\n",
"\n",
"    For each of the two labels, the first ``<b>`` element whose string matches\n",
"    the label is located, and the next ``<table class=\"std_table\">`` after it\n",
"    is parsed with ``pandas.read_html``.\n",
"\n",
"    Args:\n",
"        report: Raw HTML of the report.\n",
"\n",
"    Returns:\n",
"        Mapping from label to the parsed table. A label is left out when its\n",
"        heading, or a table following that heading, is missing.\n",
"    \"\"\"\n",
"    result = {}\n",
"    soup = BeautifulSoup(report, features=\"html.parser\")\n",
"    for pos in (\"Aktiva\", \"Passiva\"):\n",
"        tag = soup.find(\"b\", string=re.compile(pos))\n",
"        if tag is None:\n",
"            continue\n",
"        table = tag.find_next(\"table\", {\"class\": \"std_table\"})\n",
"        # Without this guard the text \"None\" would be handed to read_html,\n",
"        # which contains no table markup to parse.\n",
"        if table is None:\n",
"            continue\n",
"        result[pos] = pd.read_html(StringIO(str(table)))[0]\n",
"    return result\n",
"\n",
"\n",
"# NOTE(review): the stored output of this cell lists income-statement rows\n",
"# (\"1. Umsatzerlöse\", ...) under \"Passiva\" - confirm that the table found\n",
"# after the heading is the intended one.\n",
"bilanz = get_bilanz(sample_report)\n",
"bilanz[\"Passiva\"].head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Gesellschafterbeschluss', 'Shareholder Resolution'], dtype='object')\n",
"Index(['Unnamed: 0', 'Anhang', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'Anhang', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2020 TEUR',\n",
" 'Vorjahr TEUR'],\n",
" dtype='object')\n",
"Index(['Aufgliederung nach Inland und Ausland', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2020'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Int64Index([0, 1, 2], dtype='int64')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '2020 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
" )\n",
"Int64Index([0, 1], dtype='int64')\n",
"MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Abschreibungen', 'Stand 01.01.2020 EUR'),\n",
" ( 'Abschreibungen', 'Abschreibungen des Geschäftsjahres EUR'),\n",
" ( 'Abschreibungen', 'Abgänge Umbuchung EUR'),\n",
" ( 'Abschreibungen', 'Stand 31.12.2020 EUR')],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Buchwerte', 'Stand 31.12.2020 EUR'),\n",
" ( 'Buchwerte', 'Stand 31.12.2019 EUR')],\n",
" )\n",
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2020', '2019',\n",
" '2018'],\n",
" dtype='object')\n",
"Index(['Gewinn- und Verlustrechnung', '2020 TEUR', 'Vorjahr TEUR',\n",
" 'Veränderung TEUR'],\n",
" dtype='object')\n",
"Index(['Bilanz', '31.12.2020 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n",
"Index(['Bericht des Aufsichtsrats', 'Report of the Supervisory Board'], dtype='object')\n"
]
}
],
"source": [
"def get_tables(raw_report: str) -> list[pd.DataFrame]:\n",
"    \"\"\"Parse every ``<table class=\"std_table\">`` of the report into DataFrames.\n",
"\n",
"    Args:\n",
"        raw_report: Raw HTML of the report.\n",
"\n",
"    Returns:\n",
"        All DataFrames produced by ``pandas.read_html``, in the order in which\n",
"        the tables are found.\n",
"    \"\"\"\n",
"    soup = BeautifulSoup(raw_report, features=\"html.parser\")\n",
"    dfs = []\n",
"    for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
"        dfs.extend(pd.read_html(StringIO(str(table))))\n",
"    return dfs\n",
"\n",
"\n",
"# Parse the report once and reuse the result for the printout below.\n",
"tables = get_tables(sample_report)\n",
"for df in tables:\n",
"    print(df.columns)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}