mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-22 11:32:53 +02:00
772 lines
28 KiB
Plaintext
772 lines
28 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Datenextraktion aus dem Bundesanzeiger"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"In order to run this notebook, download the `deutschland` library source code from [TrisNol/deutschland](https://github.com/TrisNol/deutschland/tree/feat/bundesanzeiger-raw-report) and place it in the `Jupyter/API-tests/Bundesanzeiger/deutschland` directory. Since the PR adding the required features to the main repo has not been completed yet (see: [PR](https://github.com/bundesAPI/deutschland/pull/88)), we have to include it in another way..."
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Vorbereitung"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\Users\\trist\\AppData\\Roaming\\Python\\Python310\\site-packages\\requests\\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (5.1.0)/charset_normalizer (2.0.12) doesn't match a supported version!\n",
|
||
" warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from deutschland.bundesanzeiger import Bundesanzeiger"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"dict_keys(['040860c00ef9020cfb6db2f58a163256', '9eb401c5af2f0289b8207233bf852b81', 'b14d979ea9f42367d413589050bd04e5', 'ecb8c1011456ea0d40f87e850fc216bf', '3a7c6c1f1d1b89bf5ceb165f0ee88053', '03b2e6aac2f2da2c0c5e8f23de9caec4', 'a5f8dc87fa797e7d2f8fb88c49a23c36', '9a3a8a3e84290ee650cbccf32323b3d7', '6c0fcc20a58aaa18a9d13f35a51e3996', 'bf276d441c339e787e22385d2b69b277', '90a79d28f3c11a2122d2827d2bf6acda', '88c785ce3b3c580dcc285661c7790cca', 'd3064baa8246c3ed02e30b5038200edc', '5bf92eed2808b484c005409764b825b7', 'fece6303c991a280850be1900ff78f8f', '26b0624c60cdbf647f3d45f4917ec6ea', '9f98bee55f598908cca60b6a47e5d49d', '99267bb7474e6d1d5d9e091ba5ef3ee8', '102738ef4b91408ed043d84fe785b50b', '94711f3e509518d073e1760d97550347'])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"ba = Bundesanzeiger()\n",
|
||
"reports = ba.get_reports(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
||
"print(reports.keys())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"report_contents = []\n",
|
||
"for key in reports.keys():\n",
|
||
" report_contents.append(reports[key])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>company</th>\n",
|
||
" <th>report</th>\n",
|
||
" <th>raw_report</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2023-03-17</td>\n",
|
||
" <td>Aufsichtsrat</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2022-03-25</td>\n",
|
||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2021-03-11</td>\n",
|
||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2020-03-24</td>\n",
|
||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2018-12-11</td>\n",
|
||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" date name \\\n",
|
||
"0 2023-03-17 Aufsichtsrat \n",
|
||
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||
"\n",
|
||
" company \\\n",
|
||
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"\n",
|
||
" report \\\n",
|
||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
|
||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
|
||
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||
"\n",
|
||
" raw_report \n",
|
||
"0 <div class=\"publication_container\">\\n <div cla... \n",
|
||
"1 <div class=\"publication_container\">\\n <div cla... \n",
|
||
"2 <div class=\"publication_container\">\\n <div cla... \n",
|
||
"3 <div class=\"publication_container\">\\n <div cla... \n",
|
||
"4 <div class=\"publication_container\">\\n <div cla... "
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df_reports = pd.DataFrame(report_contents)\n",
|
||
"df_reports.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>name</th>\n",
|
||
" <th>company</th>\n",
|
||
" <th>report</th>\n",
|
||
" <th>raw_report</th>\n",
|
||
" <th>type</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>2023-03-17</td>\n",
|
||
" <td>Aufsichtsrat</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>Aufsichtsrat</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2022-03-25</td>\n",
|
||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>Jahresabschluss</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2021-03-11</td>\n",
|
||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>Jahresabschluss</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2020-03-24</td>\n",
|
||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>Jahresabschluss</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2018-12-11</td>\n",
|
||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>Jahresabschluss</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" date name \\\n",
|
||
"0 2023-03-17 Aufsichtsrat \n",
|
||
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||
"\n",
|
||
" company \\\n",
|
||
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"\n",
|
||
" report \\\n",
|
||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
|
||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
|
||
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||
"\n",
|
||
" raw_report type \n",
|
||
"0 <div class=\"publication_container\">\\n <div cla... Aufsichtsrat \n",
|
||
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||
"2 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||
"3 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||
"4 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df_reports[\"type\"] = df_reports.name.apply(lambda name: name.split(\" \")[0])\n",
|
||
"df_reports.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_6460\\963182859.py:2: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" df_jahresabschluss['jahr'] = df_jahresabschluss.name.apply(\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>date</th>\n",
|
||
" <th>company</th>\n",
|
||
" <th>raw_report</th>\n",
|
||
" <th>jahr</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2022-03-25</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>2020</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>2021-03-11</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>2019</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2020-03-24</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>2018</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>2018-12-11</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>2017</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>2018-01-03</td>\n",
|
||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||
" <td>2016</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" date company \\\n",
|
||
"1 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"2 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"3 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"4 2018-12-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"6 2018-01-03 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||
"\n",
|
||
" raw_report jahr \n",
|
||
"1 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||
"2 <div class=\"publication_container\">\\n <div cla... 2019 \n",
|
||
"3 <div class=\"publication_container\">\\n <div cla... 2018 \n",
|
||
"4 <div class=\"publication_container\">\\n <div cla... 2017 \n",
|
||
"6 <div class=\"publication_container\">\\n <div cla... 2016 "
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df_jahresabschluss = df_reports.loc[df_reports.type == \"Jahresabschluss\"]\n",
|
||
"df_jahresabschluss[\"jahr\"] = df_jahresabschluss.name.apply(\n",
|
||
" lambda name: name.split(\" \")[-1].split(\".\")[-1]\n",
|
||
")\n",
|
||
"df_jahresabschluss = df_jahresabschluss.drop([\"name\", \"report\", \"type\"], axis=1)\n",
|
||
"df_jahresabschluss.head()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Datenextraktion"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from bs4 import BeautifulSoup\n",
|
||
"from io import StringIO"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"sample_report = df_jahresabschluss.iloc[0].raw_report"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Wirtschaftsprüfer"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import re\n",
|
||
"\n",
|
||
"\n",
|
||
"def extract_auditors(report: str) -> list:\n",
|
||
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
|
||
" hits = re.findall(auditor_regex, report)\n",
|
||
" return [hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip() for hit in hits]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['Eckhard Lewe', 'Renate Hermsdorf']"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"extract_auditors(sample_report)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def extract_auditor_company(report: str) -> str:\n",
|
||
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||
" temp = soup.find_all(\"b\")\n",
|
||
" for elem in temp:\n",
|
||
" br = elem.findChildren(\"br\")\n",
|
||
" if len(br) > 0:\n",
|
||
" return elem.text.split(\"\\n\")[1].strip()\n",
|
||
" return None"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Warth & Klein Grant Thornton AG'"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"extract_auditor_company(sample_report)"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Aufsichtsrat"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"**TODO**"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Bilanz bzw. GuV"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Unnamed: 0</th>\n",
|
||
" <th>Anhang</th>\n",
|
||
" <th>2020 TEUR</th>\n",
|
||
" <th>Vorjahr TEUR</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1. Umsatzerlöse</td>\n",
|
||
" <td>(1)</td>\n",
|
||
" <td>69.819</td>\n",
|
||
" <td>77.429</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>2. Veränderung des Bestandes an unfertigen Lei...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-41.000</td>\n",
|
||
" <td>-66.000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>3. Sonstige betriebliche Erträge</td>\n",
|
||
" <td>(2)</td>\n",
|
||
" <td>489.000</td>\n",
|
||
" <td>1.816</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>4. Materialaufwand</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>a) Aufwendungen für bezogene Waren</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-1.220</td>\n",
|
||
" <td>-3.003</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Unnamed: 0 Anhang 2020 TEUR \\\n",
|
||
"0 1. Umsatzerlöse (1) 69.819 \n",
|
||
"1 2. Veränderung des Bestandes an unfertigen Lei... NaN -41.000 \n",
|
||
"2 3. Sonstige betriebliche Erträge (2) 489.000 \n",
|
||
"3 4. Materialaufwand NaN NaN \n",
|
||
"4 a) Aufwendungen für bezogene Waren NaN -1.220 \n",
|
||
"\n",
|
||
" Vorjahr TEUR \n",
|
||
"0 77.429 \n",
|
||
"1 -66.000 \n",
|
||
"2 1.816 \n",
|
||
"3 NaN \n",
|
||
"4 -3.003 "
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"def get_bilanz(report: str) -> any:\n",
|
||
" result = {}\n",
|
||
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||
" for pos in [\"Aktiva\", \"Passiva\"]:\n",
|
||
" tag = soup.find(\"b\", string=re.compile(pos))\n",
|
||
" if tag:\n",
|
||
" pos_results = pd.read_html(\n",
|
||
" StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n",
|
||
" )[0]\n",
|
||
" result[pos] = pos_results\n",
|
||
" return result\n",
|
||
"\n",
|
||
"\n",
|
||
"bilanz = get_bilanz(sample_report)\n",
|
||
"bilanz[\"Passiva\"].head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['Gesellschafterbeschluss', 'Shareholder Resolution'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', 'Anhang', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', 'Anhang', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2020 TEUR',\n",
|
||
" 'Vorjahr TEUR'],\n",
|
||
" dtype='object')\n",
|
||
"Index(['Aufgliederung nach Inland und Ausland', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', '31.12.2020'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||
"Int64Index([0, 1, 2], dtype='int64')\n",
|
||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||
"Index(['Unnamed: 0', '2020 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
||
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
|
||
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
|
||
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
|
||
" )\n",
|
||
"Int64Index([0, 1], dtype='int64')\n",
|
||
"MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
|
||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
|
||
" )\n",
|
||
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
||
" ( 'Abschreibungen', 'Stand 01.01.2020 EUR'),\n",
|
||
" ( 'Abschreibungen', 'Abschreibungen des Geschäftsjahres EUR'),\n",
|
||
" ( 'Abschreibungen', 'Abgänge Umbuchung EUR'),\n",
|
||
" ( 'Abschreibungen', 'Stand 31.12.2020 EUR')],\n",
|
||
" )\n",
|
||
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
||
" ( 'Buchwerte', 'Stand 31.12.2020 EUR'),\n",
|
||
" ( 'Buchwerte', 'Stand 31.12.2019 EUR')],\n",
|
||
" )\n",
|
||
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2020', '2019',\n",
|
||
" '2018'],\n",
|
||
" dtype='object')\n",
|
||
"Index(['Gewinn- und Verlustrechnung', '2020 TEUR', 'Vorjahr TEUR',\n",
|
||
" 'Veränderung TEUR'],\n",
|
||
" dtype='object')\n",
|
||
"Index(['Bilanz', '31.12.2020 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n",
|
||
"Index(['Bericht des Aufsichtsrats', 'Report of the Supervisory Board'], dtype='object')\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"def get_tables(raw_report: str) -> list:\n",
|
||
" soup = BeautifulSoup(raw_report, features=\"html.parser\")\n",
|
||
" tables = soup.find_all(\"table\", {\"class\": \"std_table\"})\n",
|
||
" dfs = []\n",
|
||
" for table in tables:\n",
|
||
" for df in pd.read_html(StringIO(str(table))):\n",
|
||
" dfs.append(df)\n",
|
||
" return dfs\n",
|
||
"\n",
|
||
"\n",
|
||
"for df in get_tables(sample_report):\n",
|
||
" print(df.columns)\n",
|
||
"\n",
|
||
"tables = get_tables(sample_report)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.7"
|
||
},
|
||
"orig_nbformat": 4
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|