mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-05-12 19:20:00 +02:00
Merge pull request #39 from fhswf/feature/data-extraktion
Feature/data extraktion
This commit is contained in:
commit
ebedf7c630
@ -12,7 +12,7 @@ repos:
|
||||
- id: check-xml
|
||||
- id: check-ast
|
||||
- id: check-added-large-files
|
||||
args: [--enforce-all]
|
||||
args: [--enforce-all --maxkb=50000]
|
||||
- id: name-tests-test
|
||||
- id: detect-private-key
|
||||
- id: check-case-conflict
|
||||
|
3
.vscode/settings.json
vendored
Normal file
3
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
{
|
||||
"files.eol": "\n"
|
||||
}
|
@ -8,14 +8,6 @@
|
||||
"# Daten Extraktion aus dem Bundesanzeiger"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In order to run this notebooks, download the `deutschland` library source code from: [TrisNol/deutschland](https://github.com/TrisNol/deutschland/tree/feat/bundesanzeiger-raw-report) and place it in the `Jupyter/API-tests/Bundesanzeiger/deutschland` directory. Since the PR adding the required features to the main repo has not been completet as of of yet (see: [PR](https://github.com/bundesAPI/deutschland/pull/88)) we have to include it in another way..."
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
@ -26,18 +18,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\trist\\AppData\\Roaming\\Python\\Python310\\site-packages\\requests\\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (5.1.0)/charset_normalizer (2.0.12) doesn't match a supported version!\n",
|
||||
" warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from deutschland.bundesanzeiger import Bundesanzeiger"
|
||||
@ -45,26 +28,28 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"dict_keys(['040860c00ef9020cfb6db2f58a163256', '9eb401c5af2f0289b8207233bf852b81', 'b14d979ea9f42367d413589050bd04e5', 'ecb8c1011456ea0d40f87e850fc216bf', '3a7c6c1f1d1b89bf5ceb165f0ee88053', '03b2e6aac2f2da2c0c5e8f23de9caec4', 'a5f8dc87fa797e7d2f8fb88c49a23c36', '9a3a8a3e84290ee650cbccf32323b3d7', '6c0fcc20a58aaa18a9d13f35a51e3996', 'bf276d441c339e787e22385d2b69b277', '90a79d28f3c11a2122d2827d2bf6acda', '88c785ce3b3c580dcc285661c7790cca', 'd3064baa8246c3ed02e30b5038200edc', '5bf92eed2808b484c005409764b825b7', 'fece6303c991a280850be1900ff78f8f', '26b0624c60cdbf647f3d45f4917ec6ea', '9f98bee55f598908cca60b6a47e5d49d', '99267bb7474e6d1d5d9e091ba5ef3ee8', '102738ef4b91408ed043d84fe785b50b', '94711f3e509518d073e1760d97550347'])\n"
|
||||
"dict_keys(['c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ba = Bundesanzeiger()\n",
|
||||
"reports = ba.get_reports(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
||||
"reports = ba.get_reports(\n",
|
||||
" \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n",
|
||||
") # \"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
||||
"print(reports.keys())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -75,7 +60,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -109,42 +94,18 @@
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-03-17</td>\n",
|
||||
" <td>Aufsichtsrat</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2022-03-25</td>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2021-03-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2020-03-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2018-12-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
@ -153,35 +114,23 @@
|
||||
],
|
||||
"text/plain": [
|
||||
" date name \\\n",
|
||||
"0 2023-03-17 Aufsichtsrat \n",
|
||||
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"\n",
|
||||
" company \\\n",
|
||||
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
" company \\\n",
|
||||
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" report \\\n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
|
||||
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"\n",
|
||||
" raw_report \n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"3 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"4 <div class=\"publication_container\">\\n <div cla... "
|
||||
"1 <div class=\"publication_container\">\\n <div cla... "
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -193,7 +142,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -228,46 +177,19 @@
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-03-17</td>\n",
|
||||
" <td>Aufsichtsrat</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Aufsichtsrat</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2022-03-25</td>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2021-03-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2020-03-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2018-12-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
@ -277,35 +199,23 @@
|
||||
],
|
||||
"text/plain": [
|
||||
" date name \\\n",
|
||||
"0 2023-03-17 Aufsichtsrat \n",
|
||||
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"\n",
|
||||
" company \\\n",
|
||||
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
" company \\\n",
|
||||
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" report \\\n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
|
||||
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"\n",
|
||||
" raw_report type \n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... Aufsichtsrat \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"3 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"4 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
|
||||
"0 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -317,21 +227,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_6460\\963182859.py:2: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" df_jahresabschluss['jahr'] = df_jahresabschluss.name.apply(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
@ -361,61 +259,34 @@
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2022-03-25</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2020</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2021-03-11</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2019</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2020-03-24</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2018</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2018-12-11</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2017</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>2018-01-03</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2016</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" date company \\\n",
|
||||
"1 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"2 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"3 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"4 2018-12-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"6 2018-01-03 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
" date company \\\n",
|
||||
"0 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" raw_report jahr \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... 2019 \n",
|
||||
"3 <div class=\"publication_container\">\\n <div cla... 2018 \n",
|
||||
"4 <div class=\"publication_container\">\\n <div cla... 2017 \n",
|
||||
"6 <div class=\"publication_container\">\\n <div cla... 2016 "
|
||||
"0 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... 2019 "
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -439,7 +310,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 38,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -449,11 +320,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 39,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample_report = df_jahresabschluss.iloc[0].raw_report"
|
||||
"sample_report = df_jahresabschluss.iloc[0].raw_report\n",
|
||||
"sample_report_content = df_jahresabschluss.iloc[0].raw_report"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -466,45 +338,20 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"from dataclasses import dataclass\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dataclass\n",
|
||||
"class Auditor:\n",
|
||||
" name: str\n",
|
||||
" company: str\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def extract_auditors(report: str) -> list:\n",
|
||||
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
|
||||
" hits = re.findall(auditor_regex, report)\n",
|
||||
" return [hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip() for hit in hits]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Eckhard Lewe', 'Renate Hermsdorf']"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"extract_auditors(sample_report)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_auditor_company(report: str) -> str:\n",
|
||||
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||||
" temp = soup.find_all(\"b\")\n",
|
||||
@ -512,27 +359,37 @@
|
||||
" br = elem.findChildren(\"br\")\n",
|
||||
" if len(br) > 0:\n",
|
||||
" return elem.text.split(\"\\n\")[1].strip()\n",
|
||||
" return None"
|
||||
" return None\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def extract_auditors(report: str) -> list:\n",
|
||||
" auditor_company = extract_auditor_company(report)\n",
|
||||
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
|
||||
" hits = re.findall(auditor_regex, report)\n",
|
||||
" return [\n",
|
||||
" Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
|
||||
" for hit in hits\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Warth & Klein Grant Thornton AG'"
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"extract_auditor_company(sample_report)"
|
||||
"extract_auditors(sample_report)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -561,97 +418,177 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Unnamed: 0</th>\n",
|
||||
" <th>Anhang</th>\n",
|
||||
" <th>2020 TEUR</th>\n",
|
||||
" <th>Vorjahr TEUR</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1. Umsatzerlöse</td>\n",
|
||||
" <td>(1)</td>\n",
|
||||
" <td>69.819</td>\n",
|
||||
" <td>77.429</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2. Veränderung des Bestandes an unfertigen Lei...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>-41.000</td>\n",
|
||||
" <td>-66.000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3. Sonstige betriebliche Erträge</td>\n",
|
||||
" <td>(2)</td>\n",
|
||||
" <td>489.000</td>\n",
|
||||
" <td>1.816</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>4. Materialaufwand</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>a) Aufwendungen für bezogene Waren</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>-1.220</td>\n",
|
||||
" <td>-3.003</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Unnamed: 0 Anhang 2020 TEUR \\\n",
|
||||
"0 1. Umsatzerlöse (1) 69.819 \n",
|
||||
"1 2. Veränderung des Bestandes an unfertigen Lei... NaN -41.000 \n",
|
||||
"2 3. Sonstige betriebliche Erträge (2) 489.000 \n",
|
||||
"3 4. Materialaufwand NaN NaN \n",
|
||||
"4 a) Aufwendungen für bezogene Waren NaN -1.220 \n",
|
||||
"\n",
|
||||
" Vorjahr TEUR \n",
|
||||
"0 77.429 \n",
|
||||
"1 -66.000 \n",
|
||||
"2 1.816 \n",
|
||||
"3 NaN \n",
|
||||
"4 -3.003 "
|
||||
"{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def extract_kpis(report_content) -> dict:\n",
|
||||
" \"\"\"\n",
|
||||
" Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
|
||||
" Extracts Key Performance Indicators (KPIs) from the financial reports.\n",
|
||||
" Args:\n",
|
||||
" reports (dict): A dictionary containing the financial reports with their hash as keys and report details as values.\n",
|
||||
" Returns:\n",
|
||||
" dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" kpis = {}\n",
|
||||
"\n",
|
||||
" # Define KPI patterns to search for\n",
|
||||
" kpi_patterns = {\n",
|
||||
" \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" report_kpis = {}\n",
|
||||
" for kpi, pattern in kpi_patterns.items():\n",
|
||||
" match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
|
||||
" if match:\n",
|
||||
" value = match.group(1)\n",
|
||||
"\n",
|
||||
" # Clean and validate the extracted number\n",
|
||||
" try:\n",
|
||||
" if not value: # Check if value is empty\n",
|
||||
" cleaned_value = None\n",
|
||||
" else:\n",
|
||||
" multiplier = 1\n",
|
||||
" if value[-1].lower() == \"m\":\n",
|
||||
" value = value[:-1]\n",
|
||||
" multiplier = 1_000_000\n",
|
||||
" elif value[-1].lower() == \"b\":\n",
|
||||
" value = value[:-1]\n",
|
||||
" multiplier = 1_000_000_000\n",
|
||||
"\n",
|
||||
" # Remove commas after checking for multipliers\n",
|
||||
" value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
|
||||
" cleaned_value = float(value) * multiplier\n",
|
||||
" except ValueError:\n",
|
||||
" cleaned_value = None\n",
|
||||
"\n",
|
||||
" if cleaned_value is not None:\n",
|
||||
" report_kpis[kpi] = cleaned_value\n",
|
||||
" return report_kpis\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"extract_kpis(\n",
|
||||
" BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"with open(\"./temp.txt\", \"w\") as file:\n",
|
||||
" file.write(\n",
|
||||
" BeautifulSoup(sample_report, features=\"html.parser\")\n",
|
||||
" .get_text()\n",
|
||||
" .replace(\"\\n\", \" \")\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Aktiva', '31.12.2020 EUR'),\n",
|
||||
" ('Aktiva', '31.12.2019 EUR')],\n",
|
||||
" )\n",
|
||||
"Aktiva Unnamed: 0_level_1 object\n",
|
||||
" 31.12.2020 EUR object\n",
|
||||
" 31.12.2019 EUR object\n",
|
||||
"dtype: object\n",
|
||||
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Passiva', '31.12.2020 EUR'),\n",
|
||||
" ('Passiva', '31.12.2019 EUR')],\n",
|
||||
" )\n",
|
||||
"Passiva Unnamed: 0_level_1 object\n",
|
||||
" 31.12.2020 EUR object\n",
|
||||
" 31.12.2019 EUR object\n",
|
||||
"dtype: object\n",
|
||||
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
|
||||
"Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
|
||||
"Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
|
||||
"dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{}"
|
||||
]
|
||||
},
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def parse_tables(report: str) -> list:\n",
|
||||
" result = {}\n",
|
||||
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||||
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
|
||||
" df = pd.read_html(StringIO(str(table)))[0]\n",
|
||||
" print(df.columns)\n",
|
||||
" print(df.dtypes)\n",
|
||||
" return result\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"parse_tables(sample_report)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "KeyError",
|
||||
"evalue": "'Passiva'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n",
|
||||
"\u001b[1;31mKeyError\u001b[0m: 'Passiva'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def get_bilanz(report: str) -> any:\n",
|
||||
" result = {}\n",
|
||||
@ -672,30 +609,30 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['Gesellschafterbeschluss', 'Shareholder Resolution'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2020 TEUR',\n",
|
||||
"Int64Index([0, 1], dtype='int64')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
|
||||
" 'Vorjahr TEUR'],\n",
|
||||
" dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Inland und Ausland', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2020'], dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Int64Index([0, 1, 2], dtype='int64')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '2020 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
||||
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
|
||||
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
|
||||
@ -707,24 +644,23 @@
|
||||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||||
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
|
||||
" )\n",
|
||||
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
||||
" ( 'Abschreibungen', 'Stand 01.01.2020 EUR'),\n",
|
||||
" ( 'Abschreibungen', 'Abschreibungen des Geschäftsjahres EUR'),\n",
|
||||
" ( 'Abschreibungen', 'Abgänge Umbuchung EUR'),\n",
|
||||
" ( 'Abschreibungen', 'Stand 31.12.2020 EUR')],\n",
|
||||
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
|
||||
" ( 'Abschreibungen', ...),\n",
|
||||
" ( 'Abschreibungen', ...),\n",
|
||||
" ( 'Abschreibungen', ...),\n",
|
||||
" ( 'Abschreibungen', ...)],\n",
|
||||
" )\n",
|
||||
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2020 EUR'),\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2019 EUR')],\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
|
||||
" )\n",
|
||||
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2020', '2019',\n",
|
||||
" '2018'],\n",
|
||||
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
|
||||
" '2019'],\n",
|
||||
" dtype='object')\n",
|
||||
"Index(['Gewinn- und Verlustrechnung', '2020 TEUR', 'Vorjahr TEUR',\n",
|
||||
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
|
||||
" 'Veränderung TEUR'],\n",
|
||||
" dtype='object')\n",
|
||||
"Index(['Bilanz', '31.12.2020 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n",
|
||||
"Index(['Bericht des Aufsichtsrats', 'Report of the Supervisory Board'], dtype='object')\n"
|
||||
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
1
Jupyter/API-tests/News/.gitignore
vendored
Normal file
1
Jupyter/API-tests/News/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
data/
|
879
Jupyter/API-tests/News/notebook.ipynb
Normal file
879
Jupyter/API-tests/News/notebook.ipynb
Normal file
File diff suppressed because one or more lines are too long
1
Jupyter/API-tests/News/requirements.txt
Normal file
1
Jupyter/API-tests/News/requirements.txt
Normal file
@ -0,0 +1 @@
|
||||
pymongo
|
1
Jupyter/API-tests/Unternehmensregister/.gitignore
vendored
Normal file
1
Jupyter/API-tests/Unternehmensregister/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
data/*
|
192
Jupyter/API-tests/Unternehmensregister/main.py
Normal file
192
Jupyter/API-tests/Unternehmensregister/main.py
Normal file
@ -0,0 +1,192 @@
|
||||
"""Unternehmensregister Scraping."""
|
||||
import glob
import logging
import multiprocessing
import os
from collections.abc import Callable
from pathlib import Path

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
|
||||
def scrape(query: str, download_dir: list[str]) -> None:
    """Fetch results from Unternehmensregister for given query.

    Drives a headless Chrome session through the Unternehmensregister search,
    downloads the "SI" register document for every company found on the
    result pages and stores the XML files in *download_dir*.

    Args:
        query (str): Search Query (RegEx supported)
        download_dir (list[str]): Directory to place output files in, given
            as path segments relative to the current working directory.
    """
    download_path = os.path.join(str(Path.cwd()), *download_dir)
    options = webdriver.ChromeOptions()
    # Configure Chrome to download files silently into download_path.
    preferences = {
        "profile.default_content_settings.popups": 0,
        "safebrowsing.enabled": True,
        "download": {
            "directory_upgrade": True,
            "prompt_for_download": False,
            "extensions_to_open": "",
            "default_directory": download_path,
        },
    }
    options.add_argument("--headless=new")
    options.add_experimental_option("prefs", preferences)

    driver = webdriver.Chrome(options=options)

    driver.get("https://www.unternehmensregister.de/ureg/")
    # Accept Cookies
    driver.find_elements(
        By.XPATH, '//button[text()="Nur technisch notwendige Cookies akzeptieren"]'
    )[0].click()
    # Enter search query
    driver.find_elements(By.ID, "globalSearchForm:extendedResearchCompanyName")[
        0
    ].send_keys(query)
    # Trigger search
    driver.find_elements(By.ID, "globalSearchForm:btnExecuteSearchOld")[0].click()
    # Wait for results: a successful search redirects away from the start page.
    wait = WebDriverWait(driver, 15)
    wait.until(
        lambda driver: driver.current_url != "https://www.unternehmensregister.de/ureg/"
    )

    num_pages = int(
        driver.find_element(By.XPATH, '//*[@class="page_count"]').text.split(" ")[0]
    )

    # Companies whose SI document was already downloaded (duplicates can
    # appear across result pages).
    processed_companies: list[str] = []

    for _ in tqdm(range(num_pages)):
        # Find all "Registerinformationen"
        companies_tab = driver.find_elements(
            By.LINK_TEXT, "Registerinformationen des Registergerichts"
        )
        company_names = [
            elem.text
            for elem in driver.find_elements(
                By.XPATH, '//div[@class="company_result"]/span/b'
            )
        ]
        for index, company_link in enumerate(companies_tab):
            company_name = company_names[index]
            if company_name in processed_companies:
                continue
            # Go to intermediary page
            company_link.click()
            # Trigger next redirect
            driver.find_element(By.LINK_TEXT, "Registerinformationen anzeigen").click()
            # Trigger SI download
            driver.find_element(By.LINK_TEXT, "SI").click()
            # Show shopping cart
            wait.until(
                ec.visibility_of_element_located(
                    (By.LINK_TEXT, "Dokumentenkorb ansehen")
                )
            )
            driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
            # Get document
            elems = driver.find_elements(By.TAG_NAME, "input")
            elems[-2].click()

            wait.until(
                ec.visibility_of_element_located((By.ID, "paymentFormOverview:btnNext"))
            )
            driver.find_element(By.ID, "paymentFormOverview:btnNext").click()

            wait.until(
                ec.visibility_of_element_located((By.LINK_TEXT, "Zum Dokumentenkorb"))
            )
            driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()

            # Snapshot the file count so the wait below can detect the new
            # download appearing.
            num_files = get_num_files(download_path)
            driver.find_element(By.CLASS_NAME, "download-wrapper").click()

            try:
                wait.until(wait_for_download_condition(download_path, num_files))
                file_name = "".join(e for e in company_name if e.isalnum()) + ".xml"
                rename_latest_file(
                    download_path,
                    file_name,
                )
                processed_companies.append(company_name)
            except Exception:
                # Log the full traceback (not just a message) so scraping
                # failures can be diagnosed afterwards.
                logger.exception("Exception caught in Scraping")
            finally:
                # Walk back through the six pages visited per company to
                # return to the results list.
                for _ in range(6):
                    driver.back()
        driver.find_element(By.XPATH, '//*[@class="fas fa-angle-right"]').click()
    driver.close()
|
||||
|
||||
|
||||
def wait_for_download_condition(
    path: str, num_files: int, pattern: str = "*.xml"
) -> Callable[[object], bool]:
    """Build a Selenium wait condition monitoring the number of files in a dir.

    The original implementation returned the boolean directly, but the caller
    passes the *result* of this function to ``WebDriverWait.until``, which
    expects a callable taking the driver. Returning a condition callable makes
    the wait actually poll until a new file appears.

    Args:
        path (str): Directory path
        num_files (int): Current number of files
        pattern (str, optional): File pattern. Defaults to "*.xml".

    Returns:
        Callable[[object], bool]: Condition that is true once the number of
            matching files exceeds *num_files*.
    """

    def _condition(_driver: object) -> bool:
        # True once at least one additional matching file has appeared.
        return len(glob.glob1(path, pattern)) > num_files

    return _condition
|
||||
|
||||
|
||||
def get_num_files(path: str, pattern: str = "*.xml") -> int:
    """Count the files in *path* whose basenames match *pattern*.

    Args:
        path (str): Directory to scan
        pattern (str, optional): File pattern. Defaults to "*.xml".

    Returns:
        int: Number of files matching pattern
    """
    matching_names = glob.glob1(path, pattern)
    return len(matching_names)
|
||||
|
||||
|
||||
def rename_latest_file(path: str, filename: str, pattern: str = "*.xml") -> None:
    """Rename the most recently created matching file in *path* to *filename*.

    Args:
        path (str): Dir to check
        filename (str): New name of the file
        pattern (str, optional): File pattern. Defaults to "*.xml".

    Raises:
        FileNotFoundError: If no file in *path* matches *pattern* (clearer
            than the bare ValueError ``max()`` would otherwise raise).
    """
    list_of_files = [os.path.join(path, file) for file in glob.glob1(path, pattern)]
    if not list_of_files:
        raise FileNotFoundError(f"No files matching {pattern!r} found in {path!r}")
    # ctime identifies the file created by the most recent download.
    latest_download = max(list_of_files, key=os.path.getctime)
    os.rename(latest_download, os.path.join(path, filename))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Main procedure: scrape register documents for the top-100 German
    # companies (Statista study 42887) in parallel.
    import pandas as pd

    df_relevant_companies = pd.read_excel(
        "./data/study_id42887_top-100-unternehmen-deutschland.xlsx",
        sheet_name="Toplist",
        skiprows=1,
    )
    # Drop rows without a company name (empty trailing rows in the sheet).
    df_relevant_companies = df_relevant_companies[df_relevant_companies["Name"].notna()]

    batch_size = 5
    params = [
        (query, ["data", "Unternehmensregister", "scraping", query.strip()])
        for query in df_relevant_companies.Name
    ]
    # The context manager joins and cleans up the workers even if a scrape
    # raises, replacing the manual close()/join() calls.
    with multiprocessing.Pool(processes=batch_size) as pool:
        pool.starmap(scrape, params)
|
4322
Jupyter/API-tests/Unternehmensregister/notebook.ipynb
Normal file
4322
Jupyter/API-tests/Unternehmensregister/notebook.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
10
Jupyter/API-tests/Unternehmensregister/requirements.txt
Normal file
10
Jupyter/API-tests/Unternehmensregister/requirements.txt
Normal file
@ -0,0 +1,10 @@
|
||||
ocrmypdf
|
||||
pytesseract
|
||||
opencv-python
|
||||
pdf2image
|
||||
bs4
|
||||
selenium
|
||||
xmltodict
|
||||
tqdm
|
||||
openpyxl
|
||||
pandas
|
28
Jupyter/API-tests/docker-compose.yml
Normal file
28
Jupyter/API-tests/docker-compose.yml
Normal file
@ -0,0 +1,28 @@
|
||||
version: '3.8'
|
||||
services:
|
||||
mongodb:
|
||||
image: mongo:6.0.6
|
||||
container_name: mongodb
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
MONGO_INITDB_ROOT_USERNAME: root
|
||||
MONGO_INITDB_ROOT_PASSWORD: pR0R0v2e2
|
||||
MONGO_INITDB_DATABASE: transparenzregister
|
||||
ports:
|
||||
- 27017:27017
|
||||
volumes:
|
||||
- mongodb_data:/data/db
|
||||
|
||||
mongo-express:
|
||||
image: mongo-express:1.0.0-alpha
|
||||
container_name: mongo-express
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- 8081:8081
|
||||
environment:
|
||||
ME_CONFIG_MONGODB_SERVER: mongodb
|
||||
ME_CONFIG_MONGODB_ADMINUSERNAME: root
|
||||
ME_CONFIG_MONGODB_ADMINPASSWORD: pR0R0v2e2
|
||||
|
||||
volumes:
|
||||
mongodb_data:
|
@ -1,35 +1,35 @@
|
||||
---
|
||||
title: "Automatisierte Daten Extraktion aus Internetquellen als Grundlage für die Analyse von Kapitalgesellschaften"
|
||||
author: "Nolde, Tristan Norbert"
|
||||
date: "2023-05-06"
|
||||
---
|
||||
|
||||
# Abstract: Automatisierte Daten Extraktion aus Internetquellen als Grundlage für die Analyse von Kapitalgesellschaften
|
||||
|
||||
## Gliederung
|
||||
1. Einleitung (Zielsetzung/Problemstellung, Vorgehen)
|
||||
2. Web Scraping/Crawling
|
||||
1. Definition und Theorie
|
||||
2. Technologien
|
||||
3. Umsetzung
|
||||
3. RSS Feeds
|
||||
1. Definition und Theorie
|
||||
2. Technologien
|
||||
3. Umsetzung
|
||||
4. APIs
|
||||
1. Definition und Theorie
|
||||
2. Technologien
|
||||
3. Umsetzung
|
||||
5. Rechtliche Rahmenbedingungen
|
||||
6. Vergleich der Lösungsansätze
|
||||
7. Zusammenfassung
|
||||
|
||||
## Inhalt
|
||||
|
||||
In Zeiten von Big Data und AI stellen Daten und ihre Verfügbarkeit zunehmend eines der wichtigsten Wirtschaftsgüter dar. Als solches können sie auch eingesetzt werden, um Kapitalgesellschaften (eine Subklasse von Unternehmen) anhand verschiedener Kennzahlen wie der Mitarbeiterzahl oder dem Jahresgewinn zu analysieren. Obwohl solche Daten zu Genüge in Zeitungsartikeln, Newslettern oder dedizierten Aktienanalysen zu finden sind, so gestaltet sich eine automatisierte Extraktion dieser Daten aufgrund verschiedener Formate sowie weiterer Restriktionen schwierig.
|
||||
|
||||
Daher sollen im Rahmen dieser Seminararbeit verschiedene Wege betrachtet werden, die eben diese Daten erheben und zur Verfügung stellen können. Zu den nennenswerten Quellen gehören: Der Bundesanzeiger, RSS Feeds, Nachrichten APIs. Ziel ist es, aus diesen Quellen wertvolle Informationen bezogen auf den wirtschaftlichen Erfolg einer Kapitalgesellschaft sowie aktueller Nachrichten zu extrahieren und in ein einheitliches Format zu überführen.
|
||||
|
||||
Neben dem technischen Einsatz von Web Scraping/Crawling, um Informationen aus Webseiten zu gewinnen, sowie dem Abfragen verfügbarer APIs soll auch der rechtliche Aspekt dieses Vorgehens Berücksichtigung finden, um die Rechtmäßigkeit zu bewerten.
|
||||
|
||||
Abschließend wird der Einsatz der verschiedenen Technologien an den Faktoren: Flexibilität, Simplizität, Verfügbarkeit und Rechtmäßigkeit, ein Fazit gezogen sowie ein Ausblick des weiteren Einsatzes gegeben.
|
||||
---
|
||||
title: "Automatisierte Daten Extraktion aus Internetquellen als Grundlage für die Analyse von Kapitalgesellschaften"
|
||||
author: "Nolde, Tristan Norbert"
|
||||
date: "2023-05-06"
|
||||
---
|
||||
|
||||
# Abstract: Automatisierte Daten Extraktion aus Internetquellen als Grundlage für die Analyse von Kapitalgesellschaften
|
||||
|
||||
## Gliederung
|
||||
1. Einleitung (Zielsetzung/Problemstellung, Vorgehen)
|
||||
2. Web Scraping/Crawling
|
||||
1. Definition und Theorie
|
||||
2. Technologien
|
||||
3. Umsetzung
|
||||
3. RSS Feeds
|
||||
1. Definition und Theorie
|
||||
2. Technologien
|
||||
3. Umsetzung
|
||||
4. APIs
|
||||
1. Definition und Theorie
|
||||
2. Technologien
|
||||
3. Umsetzung
|
||||
5. Rechtliche Rahmenbedingungen
|
||||
6. Vergleich der Lösungsansätze
|
||||
7. Zusammenfassung
|
||||
|
||||
## Inhalt
|
||||
|
||||
In Zeiten von Big Data und AI stellen Daten und ihre Verfügbarkeit zunehmend eines der wichtigsten Wirtschaftsgüter dar. Als solches können sie auch eingesetzt werden, um Kapitalgesellschaften (eine Subklasse von Unternehmen) anhand verschiedener Kennzahlen wie der Mitarbeiterzahl oder dem Jahresgewinn zu analysieren. Obwohl solche Daten zu Genüge in Zeitungsartikeln, Newslettern oder dedizierten Aktienanalysen zu finden sind, so gestaltet sich eine automatisierte Extraktion dieser Daten aufgrund verschiedener Formate sowie weiterer Restriktionen schwierig.
|
||||
|
||||
Daher sollen im Rahmen dieser Seminararbeit verschiedene Wege betrachtet werden, die eben diese Daten erheben und zur Verfügung stellen können. Zu den nennenswerten Quellen gehören: Der Bundesanzeiger, RSS Feeds, Nachrichten APIs. Ziel ist es, aus diesen Quellen wertvolle Informationen bezogen auf den wirtschaftlichen Erfolg einer Kapitalgesellschaft sowie aktueller Nachrichten zu extrahieren und in ein einheitliches Format zu überführen.
|
||||
|
||||
Neben dem technischen Einsatz von Web Scraping/Crawling, um Informationen aus Webseiten zu gewinnen, sowie dem Abfragen verfügbarer APIs soll auch der rechtliche Aspekt dieses Vorgehens Berücksichtigung finden, um die Rechtmäßigkeit zu bewerten.
|
||||
|
||||
Abschließend wird der Einsatz der verschiedenen Technologien an den Faktoren: Flexibilität, Simplizität, Verfügbarkeit und Rechtmäßigkeit, ein Fazit gezogen sowie ein Ausblick des weiteren Einsatzes gegeben.
|
Binary file not shown.
7616
poetry.lock
generated
7616
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
220
pyproject.toml
220
pyproject.toml
@ -1,107 +1,113 @@
|
||||
[build-system]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
requires = ["poetry-core"]
|
||||
|
||||
[tool.mypy]
|
||||
disallow_untyped_defs = true
|
||||
follow_imports = "silent"
|
||||
python_version = "3.11"
|
||||
warn_redundant_casts = true
|
||||
warn_unused_ignores = true
|
||||
|
||||
[tool.black]
|
||||
target-version = ["py311"]
|
||||
|
||||
[tool.coverage.run]
|
||||
branch = true
|
||||
dynamic_context = "test_function"
|
||||
relative_files = true
|
||||
source = ["src"]
|
||||
|
||||
[tool.poetry]
|
||||
authors = ["AKI Projektgruppe 23"]
|
||||
description = "A project analysing the german transparenzregister and other data sources to find shared business interests and shared personal and other links for lots of companies."
|
||||
name = "aki-prj23-transparenzregister"
|
||||
packages = [{include = "aki_prj23_transparenzregister", from = "src"}]
|
||||
readme = "README.md"
|
||||
version = "0.1.0"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
loguru = "^0.7.0"
|
||||
matplotlib = "^3.7.1"
|
||||
plotly = "^5.14.1"
|
||||
python = "^3.11"
|
||||
seaborn = "^0.12.2"
|
||||
tqdm = "^4.65.0"
|
||||
|
||||
[tool.poetry.group.develop.dependencies]
|
||||
black = {extras = ["jupyter"], version = "^23.3.0"}
|
||||
jupyterlab = "^4.0.0"
|
||||
nbconvert = "^7.4.0"
|
||||
pre-commit = "^3.3.2"
|
||||
rise = "^5.7.1"
|
||||
|
||||
[tool.poetry.group.doc.dependencies]
|
||||
jupyter = "^1.0.0"
|
||||
myst-parser = "^1.0.0"
|
||||
nbsphinx = "^0.9.2"
|
||||
sphinx = "^6.0.0"
|
||||
sphinx-copybutton = "^0.5.2"
|
||||
sphinx-rtd-theme = "^1.2.1"
|
||||
sphinx_autodoc_typehints = "*"
|
||||
sphinxcontrib-mermaid = "^0.9.2"
|
||||
sphinxcontrib-napoleon = "^0.7"
|
||||
|
||||
[tool.poetry.group.lint.dependencies]
|
||||
black = "^23.3.0"
|
||||
mypy = "^1.3.0"
|
||||
pandas-stubs = "^2.0.1.230501"
|
||||
ruff = "^0.0.270"
|
||||
types-requests = "^2.31.0.1"
|
||||
|
||||
[tool.poetry.group.test.dependencies]
|
||||
pytest = "^7.3.1"
|
||||
pytest-clarity = "^1.0.1"
|
||||
pytest-cov = "^4.1.0"
|
||||
pytest-mock = "^3.10.0"
|
||||
pytest-repeat = "^0.9.1"
|
||||
|
||||
[tool.ruff]
|
||||
exclude = [
|
||||
".bzr",
|
||||
".direnv",
|
||||
".eggs",
|
||||
".git",
|
||||
".git-rewrite",
|
||||
".hg",
|
||||
".mypy_cache",
|
||||
".nox",
|
||||
".pants.d",
|
||||
".pytype",
|
||||
".ruff_cache",
|
||||
".svn",
|
||||
".tox",
|
||||
".venv",
|
||||
"__pypackages__",
|
||||
"_build",
|
||||
"buck-out",
|
||||
"build",
|
||||
"dist",
|
||||
"node_modules",
|
||||
"venv"
|
||||
]
|
||||
# Never enforce `E501` (line length violations).
|
||||
ignore = ["E501"]
|
||||
line-length = 88
|
||||
# Enable flake8-bugbear (`B`) rules.
|
||||
select = ["E", "F", "B", "I", "S", "RSE", "RET", "SLF", "SIM", "TID", "PD", "PL", "PLE", "PLR", "PLW", "NPY", "UP", "D", "N", "A", "C4", "T20", "PT"]
|
||||
src = ["src"]
|
||||
target-version = "py311"
|
||||
# Avoid trying to fix flake8-bugbear (`B`) violations.
|
||||
unfixable = ["B"]
|
||||
|
||||
[tool.ruff.per-file-ignores]
|
||||
"tests/*.py" = ["S101"]
|
||||
|
||||
[tool.ruff.pydocstyle]
|
||||
convention = "google"
|
||||
[build-system]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
requires = ["poetry-core"]
|
||||
|
||||
[tool.mypy]
|
||||
disallow_untyped_defs = true
|
||||
follow_imports = "silent"
|
||||
python_version = "3.11"
|
||||
warn_redundant_casts = true
|
||||
warn_unused_ignores = true
|
||||
|
||||
[tool.black]
|
||||
target-version = ["py311"]
|
||||
|
||||
[tool.coverage.run]
|
||||
branch = true
|
||||
dynamic_context = "test_function"
|
||||
relative_files = true
|
||||
source = ["src"]
|
||||
|
||||
[tool.poetry]
|
||||
authors = ["AKI Projektgruppe 23"]
|
||||
description = "A project analysing the german transparenzregister and other data sources to find shared business interests and shared personal and other links for lots of companies."
|
||||
name = "aki-prj23-transparenzregister"
|
||||
packages = [{include = "aki_prj23_transparenzregister", from = "src"}]
|
||||
readme = "README.md"
|
||||
version = "0.1.0"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
loguru = "^0.7.0"
|
||||
matplotlib = "^3.7.1"
|
||||
plotly = "^5.14.1"
|
||||
pymongo = "^4.4.1"
|
||||
python = "^3.11"
|
||||
seaborn = "^0.12.2"
|
||||
selenium = "^4.10.0"
|
||||
tqdm = "^4.65.0"
|
||||
types-tqdm = "^4.65.0"
|
||||
|
||||
[tool.poetry.group.develop.dependencies]
|
||||
black = {extras = ["jupyter"], version = "^23.3.0"}
|
||||
jupyterlab = "^4.0.0"
|
||||
nbconvert = "^7.4.0"
|
||||
pre-commit = "^3.3.2"
|
||||
rise = "^5.7.1"
|
||||
|
||||
[tool.poetry.group.doc.dependencies]
|
||||
jupyter = "^1.0.0"
|
||||
myst-parser = "^1.0.0"
|
||||
nbsphinx = "^0.9.2"
|
||||
sphinx = "^6.0.0"
|
||||
sphinx-copybutton = "^0.5.2"
|
||||
sphinx-rtd-theme = "^1.2.1"
|
||||
sphinx_autodoc_typehints = "*"
|
||||
sphinxcontrib-mermaid = "^0.9.2"
|
||||
sphinxcontrib-napoleon = "^0.7"
|
||||
|
||||
[tool.poetry.group.lint.dependencies]
|
||||
black = "^23.3.0"
|
||||
mypy = "^1.3.0"
|
||||
pandas-stubs = "^2.0.1.230501"
|
||||
ruff = "^0.0.270"
|
||||
types-requests = "^2.31.0.1"
|
||||
|
||||
[tool.poetry.group.test.dependencies]
|
||||
pytest = "^7.3.1"
|
||||
pytest-clarity = "^1.0.1"
|
||||
pytest-cov = "^4.1.0"
|
||||
pytest-mock = "^3.10.0"
|
||||
pytest-repeat = "^0.9.1"
|
||||
|
||||
[tool.ruff]
|
||||
exclude = [
|
||||
".bzr",
|
||||
".direnv",
|
||||
".eggs",
|
||||
".git",
|
||||
".git-rewrite",
|
||||
".hg",
|
||||
".mypy_cache",
|
||||
".nox",
|
||||
".pants.d",
|
||||
".pytype",
|
||||
".ruff_cache",
|
||||
".svn",
|
||||
".tox",
|
||||
".venv",
|
||||
"__pypackages__",
|
||||
"_build",
|
||||
"buck-out",
|
||||
"build",
|
||||
"dist",
|
||||
"node_modules",
|
||||
"venv"
|
||||
]
|
||||
# Never enforce `E501` (line length violations).
|
||||
ignore = ["E501"]
|
||||
line-length = 88
|
||||
# Enable flake8-bugbear (`B`) rules.
|
||||
select = ["E", "F", "B", "I", "S", "RSE", "RET", "SLF", "SIM", "TID", "PD", "PL", "PLE", "PLR", "PLW", "NPY", "UP", "D", "N", "A", "C4", "T20", "PT"]
|
||||
src = ["src"]
|
||||
target-version = "py311"
|
||||
# Avoid trying to fix flake8-bugbear (`B`) violations.
|
||||
unfixable = ["B"]
|
||||
|
||||
[tool.ruff.flake8-builtins]
|
||||
builtins-ignorelist = ["id"]
|
||||
|
||||
[tool.ruff.per-file-ignores]
|
||||
"tests/*.py" = ["S101", "D100", "D101", "D107", "D103"]
|
||||
|
||||
[tool.ruff.pydocstyle]
|
||||
convention = "google"
|
||||
|
1
src/aki_prj23_transparenzregister/models/__init__.py
Normal file
1
src/aki_prj23_transparenzregister/models/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Model classes."""
|
68
src/aki_prj23_transparenzregister/models/company.py
Normal file
68
src/aki_prj23_transparenzregister/models/company.py
Normal file
@ -0,0 +1,68 @@
|
||||
"""Company model."""
|
||||
from abc import ABC
|
||||
from dataclasses import asdict, dataclass
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class RelationshipRoleEnum(Enum):
    """Role a related party plays with respect to a company."""

    # NOTE(review): the empty string value looks unintentional — confirm
    # whether STAKEHOLDER should carry a non-empty identifier.
    STAKEHOLDER = ""
    ORGANISATION = "ORGANISATION"
|
||||
|
||||
|
||||
@dataclass
class CompanyID:
    """Identifier of a company in the German commercial register."""

    # Registering district court ("Registergericht").
    district_court: str
    # Register number ("HR-Nummer") assigned by that court.
    hr_number: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class Location:
|
||||
"""_summary_."""
|
||||
|
||||
city: str
|
||||
street: str | None = None
|
||||
house_number: str | None = None
|
||||
zip_code: str | None = None
|
||||
|
||||
|
||||
@dataclass
class CompanyRelationship(ABC):
    """Abstract base for a relationship between a company and another party."""

    # Capacity in which the party is related to the company.
    role: RelationshipRoleEnum
    # Address of the related party.
    location: Location
|
||||
|
||||
|
||||
@dataclass
class Company:
    """Aggregated company record extracted from the Unternehmensregister."""

    id: CompanyID
    location: Location
    name: str
    # NOTE(review): stored as a plain string — presumably an ISO date; confirm.
    last_update: str
    relationships: list[CompanyRelationship]

    def to_dict(self) -> dict:
        """Serialize the company, including nested dataclasses, to a dict.

        Returns:
            dict: Recursive dictionary representation (via dataclasses.asdict).
        """
        return asdict(self)
|
25
src/aki_prj23_transparenzregister/models/news.py
Normal file
25
src/aki_prj23_transparenzregister/models/news.py
Normal file
@ -0,0 +1,25 @@
|
||||
"""News mnodel."""
|
||||
from dataclasses import asdict, dataclass
|
||||
|
||||
|
||||
@dataclass
class News:
    """A single news article tracked by the project."""

    id: str
    title: str
    # NOTE(review): stored as a plain string — presumably an ISO date; confirm.
    date: str
    text: str
    source_url: str

    def to_dict(self) -> dict:
        """Serialize the article to a plain dictionary.

        Returns:
            dict: Dictionary representation (via dataclasses.asdict).
        """
        return asdict(self)
|
1
src/aki_prj23_transparenzregister/utils/__init__.py
Normal file
1
src/aki_prj23_transparenzregister/utils/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Util classes and services."""
|
@ -0,0 +1,49 @@
|
||||
"""CompanyMongoService."""
|
||||
from aki_prj23_transparenzregister.models.company import Company, CompanyID
|
||||
from aki_prj23_transparenzregister.utils.mongo import MongoConnector
|
||||
|
||||
|
||||
class CompanyMongoService:
    """CRUD access to the ``companies`` collection of a MongoDB instance."""

    def __init__(self, connector: MongoConnector):
        """Bind the service to the ``companies`` collection.

        Args:
            connector (MongoConnector): Established MongoDB connection wrapper.
        """
        self.collection = connector.database["companies"]

    def get_all(self) -> list[Company]:
        """Fetch all stored company documents.

        Returns:
            list[Company]: Raw documents from the collection.
        """
        result = self.collection.find()
        return list(result)

    def get_by_id(self, id: CompanyID) -> Company | None:
        """Look up a single company by its register identifier.

        Args:
            id (CompanyID): District court / HR number pair.

        Returns:
            Company | None: The matching document, or None if absent or
                ambiguous (more than one match).
        """
        from dataclasses import asdict

        # Documents are stored via Company.to_dict(), so the "id" field is a
        # plain dict — the dataclass must be converted before querying, or the
        # filter would never match (and pymongo cannot encode dataclasses).
        result = list(self.collection.find({"id": asdict(id)}))
        if len(result) == 1:
            return result[0]
        return None

    def insert(self, company: Company):
        """Insert a single company document.

        Args:
            company (Company): Entry to persist.

        Returns:
            pymongo InsertOneResult for the new document.
        """
        return self.collection.insert_one(company.to_dict())
|
47
src/aki_prj23_transparenzregister/utils/mongo.py
Normal file
47
src/aki_prj23_transparenzregister/utils/mongo.py
Normal file
@ -0,0 +1,47 @@
|
||||
"""Mongo Wrapper."""
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pymongo
|
||||
|
||||
|
||||
@dataclass
|
||||
class MongoConnection:
|
||||
"""_summary_."""
|
||||
|
||||
hostname: str
|
||||
database: str
|
||||
port: int | None
|
||||
username: str | None
|
||||
password: str | None
|
||||
|
||||
def get_conn_string(self) -> str:
|
||||
"""Transforms the information of the object to a MongoDB connection string.
|
||||
|
||||
Returns:
|
||||
str: Connection string
|
||||
"""
|
||||
if self.username is not None and self.password is not None:
|
||||
connection_string = (
|
||||
f"mongodb+srv://{self.username}:{self.password}@{self.hostname}"
|
||||
)
|
||||
else:
|
||||
connection_string = f"mongodb+srv://{self.hostname}"
|
||||
if self.port is not None:
|
||||
connection_string += f":{self.port}"
|
||||
connection_string = connection_string.replace("mongodb+srv", "mongodb")
|
||||
return connection_string
|
||||
|
||||
|
||||
class MongoConnector:
    """Wrapper for establishing a connection to a MongoDB instance."""

    def __init__(self, connection: MongoConnection):
        """Create the client and select the configured database.

        Args:
            connection (MongoConnection): Wrapper for connection string
        """
        self.client: pymongo.MongoClient = pymongo.MongoClient(
            connection.get_conn_string()
        )
        self.database = self.client[connection.database]
|
@ -0,0 +1,94 @@
|
||||
"""MongoNewsService."""
|
||||
from aki_prj23_transparenzregister.models.news import News
|
||||
from aki_prj23_transparenzregister.utils.mongo import MongoConnector
|
||||
|
||||
|
||||
class MongoNewsService:
    """CRUD access to the ``news`` collection of a MongoDB instance."""

    def __init__(self, connector: MongoConnector):
        """Bind the service to the ``news`` collection.

        Args:
            connector (MongoConnector): Established MongoDB connection wrapper.
        """
        self.collection = connector.database["news"]

    def get_all(self) -> list[News]:
        """Fetch all stored news articles.

        Returns:
            list[News]: Decoded entries from the collection.
        """
        result = self.collection.find()
        return [MongoEntryTransformer.transform_outgoing(elem) for elem in result]

    def get_by_id(self, id: str) -> News | None:
        """Look up a single article by its id (stored as Mongo ``_id``).

        Args:
            id (str): Article identifier.

        Returns:
            News | None: The decoded article, or None if absent or ambiguous.
        """
        result = list(self.collection.find({"_id": id}))
        if len(result) == 1:
            return MongoEntryTransformer.transform_outgoing(result[0])
        return None

    def insert(self, news: News):
        """Insert a single news document.

        Args:
            news (News): Entry to persist.

        Returns:
            pymongo InsertOneResult for the new document.
        """
        return self.collection.insert_one(MongoEntryTransformer.transform_ingoing(news))
|
||||
|
||||
|
||||
class MongoEntryTransformer:
    """Translates between News objects and their MongoDB document form."""

    @staticmethod
    def transform_ingoing(news: News) -> dict:
        """Convert a News object to a dictionary compatible with a MongoDB entry.

        Args:
            news (News): News object to be transformed

        Returns:
            dict: Transformed data with added _id field
        """
        transport_object = news.to_dict()
        # MongoDB uses "_id" as the primary key, so promote the model's "id".
        transport_object["_id"] = news.id
        del transport_object["id"]
        return transport_object

    @staticmethod
    def transform_outgoing(data: dict) -> News:
        """Reverse the transform_ingoing method.

        Args:
            data (dict): dict from the MongoDB to be transformed

        Returns:
            News: News entry based on MongoDB document
        """
        return News(
            id=data["_id"],
            title=data["title"],
            date=data["date"],
            text=data["text"],
            source_url=data["source_url"],
        )
|
35
tests/models/company_test.py
Normal file
35
tests/models/company_test.py
Normal file
@ -0,0 +1,35 @@
|
||||
"""Test Models.company."""
|
||||
|
||||
|
||||
from aki_prj23_transparenzregister.models.company import Company, CompanyID, Location
|
||||
|
||||
|
||||
def test_to_dict() -> None:
    """Checks that Company.to_dict serializes id, location and all flat fields."""
    company_id = CompanyID("The Shire", "420")
    location = Location(
        city="Insmouth", house_number="19", street="Harbor", zip_code="1890"
    )
    company = Company(
        id=company_id,
        last_update="Tomorrow",
        location=location,
        name="BLANK GmbH",
        relationships=[],
    )

    expected = {
        "id": {
            "district_court": company_id.district_court,
            "hr_number": company_id.hr_number,
        },
        "last_update": company.last_update,
        "location": {
            "city": location.city,
            "house_number": location.house_number,
            "street": location.street,
            "zip_code": location.zip_code,
        },
        "name": "BLANK GmbH",
        "relationships": [],
    }
    assert company.to_dict() == expected
|
23
tests/models/news_test.py
Normal file
23
tests/models/news_test.py
Normal file
@ -0,0 +1,23 @@
|
||||
"""Test Models.nesws."""
|
||||
|
||||
|
||||
from aki_prj23_transparenzregister.models.news import News
|
||||
|
||||
|
||||
def test_to_dict() -> None:
|
||||
"""Tests if the version tag is entered."""
|
||||
news = News(
|
||||
"4711",
|
||||
"Economy collapses",
|
||||
"2042",
|
||||
"Toilet paper prices rising",
|
||||
"https://www.google.com",
|
||||
)
|
||||
|
||||
assert news.to_dict() == {
|
||||
"id": news.id,
|
||||
"title": news.title,
|
||||
"date": news.date,
|
||||
"text": news.text,
|
||||
"source_url": news.source_url,
|
||||
}
|
103
tests/utils/company_mongo_service_test.py
Normal file
103
tests/utils/company_mongo_service_test.py
Normal file
@ -0,0 +1,103 @@
|
||||
"""Test utils.company_mongo_service."""
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from aki_prj23_transparenzregister.models.company import Company
|
||||
from aki_prj23_transparenzregister.utils.company_mongo_service import (
|
||||
CompanyMongoService,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
def mock_mongo_connector(mocker) -> Mock:
    """Mock MongoConnector class.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Mocked MongoConnector
    """
    connector = Mock()
    mocker.patch(
        "aki_prj23_transparenzregister.utils.mongo.MongoConnector",
        return_value=connector,
    )
    return connector
|
||||
|
||||
|
||||
@pytest.fixture()
def mock_collection() -> Mock:
    """Mock mongo collection.

    Returns:
        Mock: Mock object
    """
    collection = Mock()
    return collection
|
||||
|
||||
|
||||
def test_init(mock_mongo_connector, mock_collection):
    """Test CompanyMongoService constructor.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    database = {"companies": mock_collection}
    mock_mongo_connector.database = database

    service = CompanyMongoService(mock_mongo_connector)

    assert service.collection == mock_collection
|
||||
|
||||
|
||||
def test_get_all(mock_mongo_connector, mock_collection):
    """Test CompanyMongoService get_all method.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    expected = [{"id": "42"}]
    mock_collection.find.return_value = expected
    mock_mongo_connector.database = {"companies": mock_collection}

    service = CompanyMongoService(mock_mongo_connector)

    assert service.get_all() == expected
|
||||
|
||||
|
||||
def test_by_id_no_result(mock_mongo_connector, mock_collection):
    """Test CompanyMongoService get_by_id with no result.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_collection.find.return_value = []
    mock_mongo_connector.database = {"companies": mock_collection}

    service = CompanyMongoService(mock_mongo_connector)

    assert service.get_by_id("Does not exist") is None
|
||||
|
||||
|
||||
def test_by_id_result(mock_mongo_connector, mock_collection):
    """Test CompanyMongoService get_by_id with result.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"companies": mock_collection}
    service = CompanyMongoService(mock_mongo_connector)
    # Fixed typo in the fixture key: "vaue" -> "value" (key name is arbitrary
    # test data, so behavior under test is unchanged).
    mock_entry = {"id": "Does exist", "value": 42}
    mock_collection.find.return_value = [mock_entry]
    assert service.get_by_id("Does exist") == mock_entry
|
||||
|
||||
|
||||
def test_insert(mock_mongo_connector, mock_collection):
    """Test CompanyMongoService insert method.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    expected = 42
    mock_collection.insert_one.return_value = expected
    mock_mongo_connector.database = {"companies": mock_collection}

    service = CompanyMongoService(mock_mongo_connector)

    assert service.insert(Company(None, None, "", "", [])) == expected
|
26
tests/utils/mongo_test.py
Normal file
26
tests/utils/mongo_test.py
Normal file
@ -0,0 +1,26 @@
|
||||
from unittest.mock import patch
|
||||
|
||||
from aki_prj23_transparenzregister.utils.mongo import MongoConnection, MongoConnector
|
||||
|
||||
|
||||
def test_get_conn_string_no_credentials():
    """A host + port without credentials yields a plain mongodb:// URI."""
    conn = MongoConnection("localhost", "", 27017, None, None)
    assert conn.get_conn_string() == "mongodb://localhost:27017"
|
||||
|
||||
|
||||
def test_get_conn_string_no_port_but_credentials():
    """Credentials without a port yield a mongodb+srv:// URI with user:password."""
    conn = MongoConnection("localhost", "", None, "admin", "password")
    assert conn.get_conn_string() == "mongodb+srv://admin:password@localhost"
|
||||
|
||||
|
||||
def test_get_conn_simple():
    """Host only (no port, no credentials) yields a bare mongodb+srv:// URI."""
    conn = MongoConnection("localhost", "", None, None, None)
    assert conn.get_conn_string() == "mongodb+srv://localhost"
|
||||
|
||||
|
||||
def test_mongo_connector():
    """MongoConnector exposes the database named in the MongoConnection."""
    with patch("pymongo.MongoClient") as mock_mongo_client:
        expected_result = 42
        # The mocked client behaves like a mapping of database names.
        mock_mongo_client.return_value = {"db": expected_result}
        temp = MongoConnector(MongoConnection("localhost", "db", None, None, None))
        assert temp.database == expected_result
|
115
tests/utils/news_mongo_service_test.py
Normal file
115
tests/utils/news_mongo_service_test.py
Normal file
@ -0,0 +1,115 @@
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from aki_prj23_transparenzregister.models.news import News
|
||||
from aki_prj23_transparenzregister.utils.news_mongo_service import (
|
||||
MongoEntryTransformer,
|
||||
MongoNewsService,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
def mock_mongo_connector(mocker) -> Mock:
    """Mock MongoConnector class.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Mocked MongoConnector
    """
    connector = Mock()
    mocker.patch(
        "aki_prj23_transparenzregister.utils.mongo.MongoConnector",
        return_value=connector,
    )
    return connector
|
||||
|
||||
|
||||
@pytest.fixture()
def mock_collection() -> Mock:
    """Mock mongo collection.

    Returns:
        Mock: Mock object
    """
    collection = Mock()
    return collection
|
||||
|
||||
|
||||
def test_init(mock_mongo_connector, mock_collection):
    """Test MongoNewsService constructor.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"news": mock_collection}
    service = MongoNewsService(mock_mongo_connector)
    assert service.collection == mock_collection
|
||||
|
||||
|
||||
def test_get_all(mock_mongo_connector, mock_collection):
    """MongoNewsService.get_all returns an empty list for an empty collection."""
    mock_mongo_connector.database = {"news": mock_collection}
    service = MongoNewsService(mock_mongo_connector)

    mock_collection.find.return_value = []
    assert service.get_all() == []
|
||||
|
||||
|
||||
def test_get_by_id_with_result(mock_mongo_connector, mock_collection):
    """MongoNewsService.get_by_id returns the transformed single match."""
    mock_mongo_connector.database = {"news": mock_collection}
    service = MongoNewsService(mock_mongo_connector)

    # The transformer is patched so only the service's lookup logic is tested.
    with patch(
        "aki_prj23_transparenzregister.utils.news_mongo_service.MongoEntryTransformer.transform_outgoing"
    ) as mock_out:
        mock_collection.find.return_value = [{}]
        mock_out.return_value = {}
        assert service.get_by_id("foadh") == {}
|
||||
|
||||
|
||||
def test_get_by_id_no_result(mock_mongo_connector, mock_collection):
    """MongoNewsService.get_by_id returns None when nothing matches."""
    mock_mongo_connector.database = {"news": mock_collection}
    service = MongoNewsService(mock_mongo_connector)

    mock_collection.find.return_value = []
    assert service.get_by_id("foadh") is None
|
||||
|
||||
|
||||
def test_insert(mock_mongo_connector, mock_collection):
    """MongoNewsService.insert forwards the transformed entry to insert_one."""
    mock_mongo_connector.database = {"news": mock_collection}
    service = MongoNewsService(mock_mongo_connector)

    # The transformer is patched so only the service's insert call is tested.
    with patch(
        "aki_prj23_transparenzregister.utils.news_mongo_service.MongoEntryTransformer.transform_ingoing"
    ) as mock_in:
        mock_collection.insert_one.return_value = {}
        mock_in.return_value = {}
        assert service.insert({}) == {}
|
||||
|
||||
|
||||
def test_transform_ingoing():
    """The ingoing transform moves the "id" field into MongoDB's "_id"."""
    news = News("42", None, None, None, None)
    result = MongoEntryTransformer.transform_ingoing(news)
    assert result["_id"] == "42"
    assert "id" not in result
|
||||
|
||||
|
||||
def test_transform_outgoing():
    """The outgoing transform maps "_id" back to the News "id" field."""
    data = {
        "_id": "4711",
        "title": "Hello",
        "date": "Today",
        "text": "World",
        "source_url": "chat.openai.com",
    }
    # Direct keyword arguments instead of unpacking a literal dict (same
    # constructor call, clearer and lint-clean).
    expected_result = News(
        id="4711",
        title="Hello",
        date="Today",
        text="World",
        source_url="chat.openai.com",
    )
    assert MongoEntryTransformer.transform_outgoing(data) == expected_result
|
Loading…
x
Reference in New Issue
Block a user