mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-05-13 13:18:46 +02:00
Bundesanzeiger preparation, Handelsblatt RSS feed export
This commit is contained in:
parent
37fb1b1da3
commit
421b1e8c87
@ -8,14 +8,6 @@
|
||||
"# Daten Extraktion aus dem Bundesanzeiger"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In order to run this notebook, download the `deutschland` library source code from [TrisNol/deutschland](https://github.com/TrisNol/deutschland/tree/feat/bundesanzeiger-raw-report) and place it in the `Jupyter/API-tests/Bundesanzeiger/deutschland` directory. Since the PR adding the required features to the main repo has not been completed yet (see: [PR](https://github.com/bundesAPI/deutschland/pull/88)), we have to include it in another way..."
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
@ -26,18 +18,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\trist\\AppData\\Roaming\\Python\\Python310\\site-packages\\requests\\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (5.1.0)/charset_normalizer (2.0.12) doesn't match a supported version!\n",
|
||||
" warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from deutschland.bundesanzeiger import Bundesanzeiger"
|
||||
@ -45,26 +28,28 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"dict_keys(['040860c00ef9020cfb6db2f58a163256', '9eb401c5af2f0289b8207233bf852b81', 'b14d979ea9f42367d413589050bd04e5', 'ecb8c1011456ea0d40f87e850fc216bf', '3a7c6c1f1d1b89bf5ceb165f0ee88053', '03b2e6aac2f2da2c0c5e8f23de9caec4', 'a5f8dc87fa797e7d2f8fb88c49a23c36', '9a3a8a3e84290ee650cbccf32323b3d7', '6c0fcc20a58aaa18a9d13f35a51e3996', 'bf276d441c339e787e22385d2b69b277', '90a79d28f3c11a2122d2827d2bf6acda', '88c785ce3b3c580dcc285661c7790cca', 'd3064baa8246c3ed02e30b5038200edc', '5bf92eed2808b484c005409764b825b7', 'fece6303c991a280850be1900ff78f8f', '26b0624c60cdbf647f3d45f4917ec6ea', '9f98bee55f598908cca60b6a47e5d49d', '99267bb7474e6d1d5d9e091ba5ef3ee8', '102738ef4b91408ed043d84fe785b50b', '94711f3e509518d073e1760d97550347'])\n"
|
||||
"dict_keys(['c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ba = Bundesanzeiger()\n",
|
||||
"reports = ba.get_reports(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
||||
"reports = ba.get_reports(\n",
|
||||
" \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n",
|
||||
") # \"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
||||
"print(reports.keys())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -75,7 +60,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -109,42 +94,18 @@
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-03-17</td>\n",
|
||||
" <td>Aufsichtsrat</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2022-03-25</td>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2021-03-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2020-03-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2018-12-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
@ -153,35 +114,23 @@
|
||||
],
|
||||
"text/plain": [
|
||||
" date name \\\n",
|
||||
"0 2023-03-17 Aufsichtsrat \n",
|
||||
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"\n",
|
||||
" company \\\n",
|
||||
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
" company \\\n",
|
||||
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" report \\\n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
|
||||
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"\n",
|
||||
" raw_report \n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"3 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"4 <div class=\"publication_container\">\\n <div cla... "
|
||||
"1 <div class=\"publication_container\">\\n <div cla... "
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -193,7 +142,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -228,46 +177,19 @@
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-03-17</td>\n",
|
||||
" <td>Aufsichtsrat</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Aufsichtsrat</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2022-03-25</td>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2021-03-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2020-03-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2018-12-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
@ -277,35 +199,23 @@
|
||||
],
|
||||
"text/plain": [
|
||||
" date name \\\n",
|
||||
"0 2023-03-17 Aufsichtsrat \n",
|
||||
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"\n",
|
||||
" company \\\n",
|
||||
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
" company \\\n",
|
||||
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" report \\\n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
|
||||
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"\n",
|
||||
" raw_report type \n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... Aufsichtsrat \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"3 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"4 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
|
||||
"0 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -317,21 +227,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_6460\\963182859.py:2: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" df_jahresabschluss['jahr'] = df_jahresabschluss.name.apply(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
@ -361,61 +259,34 @@
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2022-03-25</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2020</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2021-03-11</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2019</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2020-03-24</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2018</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2018-12-11</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2017</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>2018-01-03</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2016</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" date company \\\n",
|
||||
"1 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"2 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"3 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"4 2018-12-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"6 2018-01-03 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
" date company \\\n",
|
||||
"0 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" raw_report jahr \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... 2019 \n",
|
||||
"3 <div class=\"publication_container\">\\n <div cla... 2018 \n",
|
||||
"4 <div class=\"publication_container\">\\n <div cla... 2017 \n",
|
||||
"6 <div class=\"publication_container\">\\n <div cla... 2016 "
|
||||
"0 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... 2019 "
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -439,7 +310,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 38,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -449,11 +320,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 39,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample_report = df_jahresabschluss.iloc[0].raw_report"
|
||||
"sample_report = df_jahresabschluss.iloc[0].raw_report\n",
|
||||
"sample_report_content = df_jahresabschluss.iloc[0].raw_report"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -466,45 +338,20 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"from dataclasses import dataclass\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dataclass\n",
|
||||
"class Auditor:\n",
|
||||
" name: str\n",
|
||||
" company: str\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def extract_auditors(report: str) -> list:\n",
|
||||
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
|
||||
" hits = re.findall(auditor_regex, report)\n",
|
||||
" return [hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip() for hit in hits]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Eckhard Lewe', 'Renate Hermsdorf']"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"extract_auditors(sample_report)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_auditor_company(report: str) -> str:\n",
|
||||
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||||
" temp = soup.find_all(\"b\")\n",
|
||||
@ -512,27 +359,37 @@
|
||||
" br = elem.findChildren(\"br\")\n",
|
||||
" if len(br) > 0:\n",
|
||||
" return elem.text.split(\"\\n\")[1].strip()\n",
|
||||
" return None"
|
||||
" return None\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def extract_auditors(report: str) -> list:\n",
|
||||
" auditor_company = extract_auditor_company(report)\n",
|
||||
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
|
||||
" hits = re.findall(auditor_regex, report)\n",
|
||||
" return [\n",
|
||||
" Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
|
||||
" for hit in hits\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Warth & Klein Grant Thornton AG'"
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"extract_auditor_company(sample_report)"
|
||||
"extract_auditors(sample_report)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -561,97 +418,177 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Unnamed: 0</th>\n",
|
||||
" <th>Anhang</th>\n",
|
||||
" <th>2020 TEUR</th>\n",
|
||||
" <th>Vorjahr TEUR</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1. Umsatzerlöse</td>\n",
|
||||
" <td>(1)</td>\n",
|
||||
" <td>69.819</td>\n",
|
||||
" <td>77.429</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2. Veränderung des Bestandes an unfertigen Lei...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>-41.000</td>\n",
|
||||
" <td>-66.000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3. Sonstige betriebliche Erträge</td>\n",
|
||||
" <td>(2)</td>\n",
|
||||
" <td>489.000</td>\n",
|
||||
" <td>1.816</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>4. Materialaufwand</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>a) Aufwendungen für bezogene Waren</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>-1.220</td>\n",
|
||||
" <td>-3.003</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Unnamed: 0 Anhang 2020 TEUR \\\n",
|
||||
"0 1. Umsatzerlöse (1) 69.819 \n",
|
||||
"1 2. Veränderung des Bestandes an unfertigen Lei... NaN -41.000 \n",
|
||||
"2 3. Sonstige betriebliche Erträge (2) 489.000 \n",
|
||||
"3 4. Materialaufwand NaN NaN \n",
|
||||
"4 a) Aufwendungen für bezogene Waren NaN -1.220 \n",
|
||||
"\n",
|
||||
" Vorjahr TEUR \n",
|
||||
"0 77.429 \n",
|
||||
"1 -66.000 \n",
|
||||
"2 1.816 \n",
|
||||
"3 NaN \n",
|
||||
"4 -3.003 "
|
||||
"{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def extract_kpis(report_content) -> dict:\n",
|
||||
" \"\"\"\n",
|
||||
" Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
|
||||
" Extracts Key Performance Indicators (KPIs) from the financial reports.\n",
|
||||
" Args:\n",
|
||||
"        report_content (str): Plain text of a single financial report to search for KPIs.\n",
|
||||
" Returns:\n",
|
||||
"        dict: A dictionary mapping the name of each KPI found in the report to its parsed numeric value.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" kpis = {}\n",
|
||||
"\n",
|
||||
" # Define KPI patterns to search for\n",
|
||||
" kpi_patterns = {\n",
|
||||
" \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" report_kpis = {}\n",
|
||||
" for kpi, pattern in kpi_patterns.items():\n",
|
||||
" match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
|
||||
" if match:\n",
|
||||
" value = match.group(1)\n",
|
||||
"\n",
|
||||
" # Clean and validate the extracted number\n",
|
||||
" try:\n",
|
||||
" if not value: # Check if value is empty\n",
|
||||
" cleaned_value = None\n",
|
||||
" else:\n",
|
||||
" multiplier = 1\n",
|
||||
" if value[-1].lower() == \"m\":\n",
|
||||
" value = value[:-1]\n",
|
||||
" multiplier = 1_000_000\n",
|
||||
" elif value[-1].lower() == \"b\":\n",
|
||||
" value = value[:-1]\n",
|
||||
" multiplier = 1_000_000_000\n",
|
||||
"\n",
|
||||
"                    # After checking for multipliers, convert the German number format: drop '.' thousands separators and turn the ',' decimal separator into '.'\n",
|
||||
" value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
|
||||
" cleaned_value = float(value) * multiplier\n",
|
||||
" except ValueError:\n",
|
||||
" cleaned_value = None\n",
|
||||
"\n",
|
||||
" if cleaned_value is not None:\n",
|
||||
" report_kpis[kpi] = cleaned_value\n",
|
||||
" return report_kpis\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"extract_kpis(\n",
|
||||
" BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"with open(\"./temp.txt\", \"w\") as file:\n",
|
||||
" file.write(\n",
|
||||
" BeautifulSoup(sample_report, features=\"html.parser\")\n",
|
||||
" .get_text()\n",
|
||||
" .replace(\"\\n\", \" \")\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Aktiva', '31.12.2020 EUR'),\n",
|
||||
" ('Aktiva', '31.12.2019 EUR')],\n",
|
||||
" )\n",
|
||||
"Aktiva Unnamed: 0_level_1 object\n",
|
||||
" 31.12.2020 EUR object\n",
|
||||
" 31.12.2019 EUR object\n",
|
||||
"dtype: object\n",
|
||||
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Passiva', '31.12.2020 EUR'),\n",
|
||||
" ('Passiva', '31.12.2019 EUR')],\n",
|
||||
" )\n",
|
||||
"Passiva Unnamed: 0_level_1 object\n",
|
||||
" 31.12.2020 EUR object\n",
|
||||
" 31.12.2019 EUR object\n",
|
||||
"dtype: object\n",
|
||||
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
|
||||
"Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
|
||||
"Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
|
||||
"dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{}"
|
||||
]
|
||||
},
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def parse_tables(report: str) -> list:\n",
|
||||
" result = {}\n",
|
||||
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||||
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
|
||||
" df = pd.read_html(StringIO(str(table)))[0]\n",
|
||||
" print(df.columns)\n",
|
||||
" print(df.dtypes)\n",
|
||||
" return result\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"parse_tables(sample_report)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "KeyError",
|
||||
"evalue": "'Passiva'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n",
|
||||
"\u001b[1;31mKeyError\u001b[0m: 'Passiva'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def get_bilanz(report: str) -> any:\n",
|
||||
" result = {}\n",
|
||||
@ -672,30 +609,30 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['Gesellschafterbeschluss', 'Shareholder Resolution'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2020 TEUR',\n",
|
||||
"Int64Index([0, 1], dtype='int64')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
|
||||
" 'Vorjahr TEUR'],\n",
|
||||
" dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Inland und Ausland', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2020'], dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Int64Index([0, 1, 2], dtype='int64')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '2020 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
||||
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
|
||||
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
|
||||
@ -707,24 +644,23 @@
|
||||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||||
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
|
||||
" )\n",
|
||||
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
||||
" ( 'Abschreibungen', 'Stand 01.01.2020 EUR'),\n",
|
||||
" ( 'Abschreibungen', 'Abschreibungen des Geschäftsjahres EUR'),\n",
|
||||
" ( 'Abschreibungen', 'Abgänge Umbuchung EUR'),\n",
|
||||
" ( 'Abschreibungen', 'Stand 31.12.2020 EUR')],\n",
|
||||
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
|
||||
" ( 'Abschreibungen', ...),\n",
|
||||
" ( 'Abschreibungen', ...),\n",
|
||||
" ( 'Abschreibungen', ...),\n",
|
||||
" ( 'Abschreibungen', ...)],\n",
|
||||
" )\n",
|
||||
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2020 EUR'),\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2019 EUR')],\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
|
||||
" )\n",
|
||||
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2020', '2019',\n",
|
||||
" '2018'],\n",
|
||||
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
|
||||
" '2019'],\n",
|
||||
" dtype='object')\n",
|
||||
"Index(['Gewinn- und Verlustrechnung', '2020 TEUR', 'Vorjahr TEUR',\n",
|
||||
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
|
||||
" 'Veränderung TEUR'],\n",
|
||||
" dtype='object')\n",
|
||||
"Index(['Bilanz', '31.12.2020 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n",
|
||||
"Index(['Bericht des Aufsichtsrats', 'Report of the Supervisory Board'], dtype='object')\n"
|
||||
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -596,6 +596,262 @@
|
||||
"source": [
|
||||
"service.get_by_id(\"abc\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Handelsblatt RSS Feed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import xmltodict\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class HandelsblattRSS:\n",
|
||||
" def __init__(self):\n",
|
||||
" self.base_url = \"https://www.handelsblatt.com/contentexport/feed\"\n",
|
||||
"\n",
|
||||
" def get_news_for_category(self, category: str = \"unternehmen\") -> dict:\n",
|
||||
" url = f\"{self.base_url}/{category}\"\n",
|
||||
" result = requests.get(url=url)\n",
|
||||
" if result.status_code == 200:\n",
|
||||
" return xmltodict.parse(result.text)[\"rss\"][\"channel\"][\"item\"]\n",
|
||||
" return None\n",
|
||||
"\n",
|
||||
" def get_news_details_text(self, url: str) -> dict:\n",
|
||||
" content = requests.get(url)\n",
|
||||
" soup = BeautifulSoup(content.text, features=\"html.parser\")\n",
|
||||
"\n",
|
||||
" return \" \".join(\n",
|
||||
" [elem.text.replace(\"\\n\", \" \") for elem in soup.find_all(\"p\")][:]\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"handelsblatt = HandelsblattRSS()\n",
|
||||
"\n",
|
||||
"items = handelsblatt.get_news_for_category()\n",
|
||||
"\n",
|
||||
"from utils.mongodb.mongo import MongoConnector, MongoNewsService\n",
|
||||
"\n",
|
||||
"connector = MongoConnector(\n",
|
||||
" hostname=\"trisnol.tech\",\n",
|
||||
" database=\"transparenzregister\",\n",
|
||||
" username=\"root\",\n",
|
||||
" password=\"pR0R0v2e2\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"service = MongoNewsService(connector)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'2023-06-27T09:20:32+0200'"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"d = items[0][\"pubDate\"]\n",
|
||||
"datetime.strptime(d, \"%a, %d %b %Y %H:%M:%S %z\").strftime(\"%Y-%m-%dT%H:%M:%S%z\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 50/50 [01:04<00:00, 1.30s/it]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>id</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>date</th>\n",
|
||||
" <th>source_url</th>\n",
|
||||
" <th>text</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>https://www.handelsblatt.com/29227224.html</td>\n",
|
||||
" <td>Dieselskandal: Ex-Audi-Chef Rupert Stadler zu ...</td>\n",
|
||||
" <td>2023-06-27T09:20:32+0200</td>\n",
|
||||
" <td>https://www.handelsblatt.com/unternehmen/indus...</td>\n",
|
||||
" <td>Der frühere Audi-Chef wurde wegen Betrugs zu e...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>https://www.handelsblatt.com/29226410.html</td>\n",
|
||||
" <td>Luftfahrt: Größer, reichweitenstärker – aber n...</td>\n",
|
||||
" <td>2023-06-27T16:28:53+0200</td>\n",
|
||||
" <td>https://www.handelsblatt.com/unternehmen/hande...</td>\n",
|
||||
" <td>Honda Aircraft arbeitet an einem Privatflugzeu...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>https://www.handelsblatt.com/29226522.html</td>\n",
|
||||
" <td>Asien: Deutsche Unternehmen wetten auf den Ind...</td>\n",
|
||||
" <td>2023-06-27T00:30:00+0200</td>\n",
|
||||
" <td>https://www.handelsblatt.com/politik/internati...</td>\n",
|
||||
" <td>Unternehmen gehen von einer positiven wirtscha...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>https://www.handelsblatt.com/29228524.html</td>\n",
|
||||
" <td>Elektromobilität: US-Elektroautohersteller Lor...</td>\n",
|
||||
" <td>2023-06-27T18:45:29+0200</td>\n",
|
||||
" <td>https://www.handelsblatt.com/unternehmen/indus...</td>\n",
|
||||
" <td>Das Start-up plante die Massenproduktion mit e...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>https://www.handelsblatt.com/29228272.html</td>\n",
|
||||
" <td>US-Konzern: „Gewaltige Komplexität“ – BGH prüf...</td>\n",
|
||||
" <td>2023-06-27T16:23:03+0200</td>\n",
|
||||
" <td>https://www.handelsblatt.com/unternehmen/hande...</td>\n",
|
||||
" <td>Das Kartellamt stufte den US-Konzern vergangen...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" id \\\n",
|
||||
"0 https://www.handelsblatt.com/29227224.html \n",
|
||||
"1 https://www.handelsblatt.com/29226410.html \n",
|
||||
"2 https://www.handelsblatt.com/29226522.html \n",
|
||||
"3 https://www.handelsblatt.com/29228524.html \n",
|
||||
"4 https://www.handelsblatt.com/29228272.html \n",
|
||||
"\n",
|
||||
" title \\\n",
|
||||
"0 Dieselskandal: Ex-Audi-Chef Rupert Stadler zu ... \n",
|
||||
"1 Luftfahrt: Größer, reichweitenstärker – aber n... \n",
|
||||
"2 Asien: Deutsche Unternehmen wetten auf den Ind... \n",
|
||||
"3 Elektromobilität: US-Elektroautohersteller Lor... \n",
|
||||
"4 US-Konzern: „Gewaltige Komplexität“ – BGH prüf... \n",
|
||||
"\n",
|
||||
" date \\\n",
|
||||
"0 2023-06-27T09:20:32+0200 \n",
|
||||
"1 2023-06-27T16:28:53+0200 \n",
|
||||
"2 2023-06-27T00:30:00+0200 \n",
|
||||
"3 2023-06-27T18:45:29+0200 \n",
|
||||
"4 2023-06-27T16:23:03+0200 \n",
|
||||
"\n",
|
||||
" source_url \\\n",
|
||||
"0 https://www.handelsblatt.com/unternehmen/indus... \n",
|
||||
"1 https://www.handelsblatt.com/unternehmen/hande... \n",
|
||||
"2 https://www.handelsblatt.com/politik/internati... \n",
|
||||
"3 https://www.handelsblatt.com/unternehmen/indus... \n",
|
||||
"4 https://www.handelsblatt.com/unternehmen/hande... \n",
|
||||
"\n",
|
||||
" text \n",
|
||||
"0 Der frühere Audi-Chef wurde wegen Betrugs zu e... \n",
|
||||
"1 Honda Aircraft arbeitet an einem Privatflugzeu... \n",
|
||||
"2 Unternehmen gehen von einer positiven wirtscha... \n",
|
||||
"3 Das Start-up plante die Massenproduktion mit e... \n",
|
||||
"4 Das Kartellamt stufte den US-Konzern vergangen... "
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from datetime import datetime\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"news = []\n",
|
||||
"for news_article in tqdm(items):\n",
|
||||
" info = {\n",
|
||||
" \"id\": news_article[\"guid\"],\n",
|
||||
" \"title\": news_article[\"title\"],\n",
|
||||
" \"date\": datetime.strptime(\n",
|
||||
" news_article[\"pubDate\"], \"%a, %d %b %Y %H:%M:%S %z\"\n",
|
||||
" ).strftime(\"%Y-%m-%dT%H:%M:%S%z\"),\n",
|
||||
" \"source_url\": news_article[\"link\"],\n",
|
||||
" \"text\": handelsblatt.get_news_details_text(news_article[\"link\"]),\n",
|
||||
" }\n",
|
||||
" news.append(info)\n",
|
||||
"\n",
|
||||
"df = pd.DataFrame(news)\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 50/50 [00:00<00:00, 81.98it/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from models.News import News\n",
|
||||
"\n",
|
||||
"for article in tqdm(news):\n",
|
||||
" news_article = News(**article)\n",
|
||||
" if service.get_by_id(news_article.id) is None:\n",
|
||||
" service.insert(news_article)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
Loading…
x
Reference in New Issue
Block a user