Merge pull request #39 from fhswf/feature/data-extraktion
Feature/data extraktion
commit ebedf7c630
.pre-commit-config.yaml
@ -12,7 +12,7 @@ repos:
  - id: check-xml
  - id: check-ast
  - id: check-added-large-files
-   args: [--enforce-all]
+   args: [--enforce-all --maxkb=50000]
  - id: name-tests-test
  - id: detect-private-key
  - id: check-case-conflict
.vscode/settings.json (vendored, new file, 3 lines)
@ -0,0 +1,3 @@
+{
+  "files.eol": "\n"
+}
@ -8,14 +8,6 @@
|
|||||||
"# Daten Extraktion aus dem Bundesanzeiger"
|
"# Daten Extraktion aus dem Bundesanzeiger"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"attachments": {},
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"In order to run this notebooks, download the `deutschland` library source code from: [TrisNol/deutschland](https://github.com/TrisNol/deutschland/tree/feat/bundesanzeiger-raw-report) and place it in the `Jupyter/API-tests/Bundesanzeiger/deutschland` directory. Since the PR adding the required features to the main repo has not been completet as of of yet (see: [PR](https://github.com/bundesAPI/deutschland/pull/88)) we have to include it in another way..."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"attachments": {},
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
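Note: the markdown cell removed in the hunk above described vendoring the patched `deutschland` fork (TrisNol/deutschland, branch feat/bundesanzeiger-raw-report) next to the notebook while PR #88 was still open. As a minimal sketch outside the diff: when the notebook runs from `Jupyter/API-tests/Bundesanzeiger`, a `deutschland/` folder placed there is picked up ahead of any installed release; from elsewhere, the folder's parent can be put on `sys.path` first. Paths below are assumptions, not part of the commit.

```python
import sys
from pathlib import Path

# Assumed location of the vendored package, as described in the removed cell.
vendored_parent = Path("Jupyter/API-tests/Bundesanzeiger")
if (vendored_parent / "deutschland").exists():
    # Put the parent directory first so the local copy wins over any pip install.
    sys.path.insert(0, str(vendored_parent))

from deutschland.bundesanzeiger import Bundesanzeiger  # now resolves to the vendored copy
```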
@ -26,18 +18,9 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 32,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"C:\\Users\\trist\\AppData\\Roaming\\Python\\Python310\\site-packages\\requests\\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (5.1.0)/charset_normalizer (2.0.12) doesn't match a supported version!\n",
|
|
||||||
" warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"from deutschland.bundesanzeiger import Bundesanzeiger"
|
"from deutschland.bundesanzeiger import Bundesanzeiger"
|
||||||
@ -45,26 +28,28 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 33,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"dict_keys(['040860c00ef9020cfb6db2f58a163256', '9eb401c5af2f0289b8207233bf852b81', 'b14d979ea9f42367d413589050bd04e5', 'ecb8c1011456ea0d40f87e850fc216bf', '3a7c6c1f1d1b89bf5ceb165f0ee88053', '03b2e6aac2f2da2c0c5e8f23de9caec4', 'a5f8dc87fa797e7d2f8fb88c49a23c36', '9a3a8a3e84290ee650cbccf32323b3d7', '6c0fcc20a58aaa18a9d13f35a51e3996', 'bf276d441c339e787e22385d2b69b277', '90a79d28f3c11a2122d2827d2bf6acda', '88c785ce3b3c580dcc285661c7790cca', 'd3064baa8246c3ed02e30b5038200edc', '5bf92eed2808b484c005409764b825b7', 'fece6303c991a280850be1900ff78f8f', '26b0624c60cdbf647f3d45f4917ec6ea', '9f98bee55f598908cca60b6a47e5d49d', '99267bb7474e6d1d5d9e091ba5ef3ee8', '102738ef4b91408ed043d84fe785b50b', '94711f3e509518d073e1760d97550347'])\n"
|
"dict_keys(['c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"ba = Bundesanzeiger()\n",
|
"ba = Bundesanzeiger()\n",
|
||||||
"reports = ba.get_reports(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
"reports = ba.get_reports(\n",
|
||||||
|
" \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n",
|
||||||
|
") # \"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
||||||
"print(reports.keys())"
|
"print(reports.keys())"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 34,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
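For orientation: the cell above queries the Bundesanzeiger for one company and `get_reports` returns a dict keyed by a report hash. The elided cell that follows turns this into the DataFrame used below; a hedged sketch of that step, assuming each value carries the `date`/`name`/`company`/`report`/`raw_report` fields visible in the rendered outputs further down:

```python
import pandas as pd
from deutschland.bundesanzeiger import Bundesanzeiger

ba = Bundesanzeiger()
# Returns {report_hash: {"date": ..., "name": ..., "company": ...,
#                        "report": ..., "raw_report": ...}, ...}
reports = ba.get_reports("Volkswagen Economy Service Erdle Bernhard Erdle GmbH")

# Flatten the per-hash dict into a table for the filtering steps that follow.
df_reports = pd.DataFrame(reports.values())
print(df_reports[["date", "name", "company"]].head())
```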
@ -75,7 +60,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 35,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -109,42 +94,18 @@
|
|||||||
" <tbody>\n",
|
" <tbody>\n",
|
||||||
" <tr>\n",
|
" <tr>\n",
|
||||||
" <th>0</th>\n",
|
" <th>0</th>\n",
|
||||||
" <td>2023-03-17</td>\n",
|
" <td>2023-05-25</td>\n",
|
||||||
" <td>Aufsichtsrat</td>\n",
|
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
|
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||||
" </tr>\n",
|
" </tr>\n",
|
||||||
" <tr>\n",
|
" <tr>\n",
|
||||||
" <th>1</th>\n",
|
" <th>1</th>\n",
|
||||||
" <td>2022-03-25</td>\n",
|
" <td>2023-05-24</td>\n",
|
||||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>2</th>\n",
|
|
||||||
" <td>2021-03-11</td>\n",
|
|
||||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
||||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>3</th>\n",
|
|
||||||
" <td>2020-03-24</td>\n",
|
|
||||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
||||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
|
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>4</th>\n",
|
|
||||||
" <td>2018-12-11</td>\n",
|
|
||||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
||||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||||
" </tr>\n",
|
" </tr>\n",
|
||||||
" </tbody>\n",
|
" </tbody>\n",
|
||||||
@ -153,35 +114,23 @@
|
|||||||
],
|
],
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
" date name \\\n",
|
" date name \\\n",
|
||||||
"0 2023-03-17 Aufsichtsrat \n",
|
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||||
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||||
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
|
||||||
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
|
||||||
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" company \\\n",
|
" company \\\n",
|
||||||
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
|
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||||
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
|
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||||
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
||||||
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
||||||
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" report \\\n",
|
" report \\\n",
|
||||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
|
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||||
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
|
||||||
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
|
|
||||||
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" raw_report \n",
|
" raw_report \n",
|
||||||
"0 <div class=\"publication_container\">\\n <div cla... \n",
|
"0 <div class=\"publication_container\">\\n <div cla... \n",
|
||||||
"1 <div class=\"publication_container\">\\n <div cla... \n",
|
"1 <div class=\"publication_container\">\\n <div cla... "
|
||||||
"2 <div class=\"publication_container\">\\n <div cla... \n",
|
|
||||||
"3 <div class=\"publication_container\">\\n <div cla... \n",
|
|
||||||
"4 <div class=\"publication_container\">\\n <div cla... "
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 8,
|
"execution_count": 35,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -193,7 +142,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 36,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -228,46 +177,19 @@
|
|||||||
" <tbody>\n",
|
" <tbody>\n",
|
||||||
" <tr>\n",
|
" <tr>\n",
|
||||||
" <th>0</th>\n",
|
" <th>0</th>\n",
|
||||||
" <td>2023-03-17</td>\n",
|
" <td>2023-05-25</td>\n",
|
||||||
" <td>Aufsichtsrat</td>\n",
|
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
|
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||||
" <td>Aufsichtsrat</td>\n",
|
" <td>Jahresabschluss</td>\n",
|
||||||
" </tr>\n",
|
" </tr>\n",
|
||||||
" <tr>\n",
|
" <tr>\n",
|
||||||
" <th>1</th>\n",
|
" <th>1</th>\n",
|
||||||
" <td>2022-03-25</td>\n",
|
" <td>2023-05-24</td>\n",
|
||||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
||||||
" <td>Jahresabschluss</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>2</th>\n",
|
|
||||||
" <td>2021-03-11</td>\n",
|
|
||||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
||||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
||||||
" <td>Jahresabschluss</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>3</th>\n",
|
|
||||||
" <td>2020-03-24</td>\n",
|
|
||||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
||||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
|
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
||||||
" <td>Jahresabschluss</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>4</th>\n",
|
|
||||||
" <td>2018-12-11</td>\n",
|
|
||||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
||||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||||
" <td>Jahresabschluss</td>\n",
|
" <td>Jahresabschluss</td>\n",
|
||||||
" </tr>\n",
|
" </tr>\n",
|
||||||
@ -277,35 +199,23 @@
|
|||||||
],
|
],
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
" date name \\\n",
|
" date name \\\n",
|
||||||
"0 2023-03-17 Aufsichtsrat \n",
|
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||||
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||||
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
|
||||||
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
|
||||||
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" company \\\n",
|
" company \\\n",
|
||||||
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
|
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||||
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
|
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||||
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
||||||
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
||||||
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" report \\\n",
|
" report \\\n",
|
||||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
|
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||||
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
|
||||||
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
|
|
||||||
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" raw_report type \n",
|
" raw_report type \n",
|
||||||
"0 <div class=\"publication_container\">\\n <div cla... Aufsichtsrat \n",
|
"0 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||||
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
|
||||||
"2 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
|
||||||
"3 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
|
||||||
"4 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 9,
|
"execution_count": 36,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -317,21 +227,9 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": 37,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_6460\\963182859.py:2: SettingWithCopyWarning: \n",
|
|
||||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
||||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
||||||
"\n",
|
|
||||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
||||||
" df_jahresabschluss['jahr'] = df_jahresabschluss.name.apply(\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/html": [
|
"text/html": [
|
||||||
@ -361,61 +259,34 @@
|
|||||||
" </thead>\n",
|
" </thead>\n",
|
||||||
" <tbody>\n",
|
" <tbody>\n",
|
||||||
" <tr>\n",
|
" <tr>\n",
|
||||||
" <th>1</th>\n",
|
" <th>0</th>\n",
|
||||||
" <td>2022-03-25</td>\n",
|
" <td>2023-05-25</td>\n",
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||||
" <td>2020</td>\n",
|
" <td>2020</td>\n",
|
||||||
" </tr>\n",
|
" </tr>\n",
|
||||||
" <tr>\n",
|
" <tr>\n",
|
||||||
" <th>2</th>\n",
|
" <th>1</th>\n",
|
||||||
" <td>2021-03-11</td>\n",
|
" <td>2023-05-24</td>\n",
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||||
" <td>2019</td>\n",
|
" <td>2019</td>\n",
|
||||||
" </tr>\n",
|
" </tr>\n",
|
||||||
" <tr>\n",
|
|
||||||
" <th>3</th>\n",
|
|
||||||
" <td>2020-03-24</td>\n",
|
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
||||||
" <td>2018</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>4</th>\n",
|
|
||||||
" <td>2018-12-11</td>\n",
|
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
||||||
" <td>2017</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>6</th>\n",
|
|
||||||
" <td>2018-01-03</td>\n",
|
|
||||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
|
||||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
|
||||||
" <td>2016</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
" </tbody>\n",
|
||||||
"</table>\n",
|
"</table>\n",
|
||||||
"</div>"
|
"</div>"
|
||||||
],
|
],
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
" date company \\\n",
|
" date company \\\n",
|
||||||
"1 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
|
"0 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||||
"2 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
"1 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||||
"3 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
||||||
"4 2018-12-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
||||||
"6 2018-01-03 Atos IT-Dienstleistung und Beratung GmbH \n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" raw_report jahr \n",
|
" raw_report jahr \n",
|
||||||
"1 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
"0 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||||||
"2 <div class=\"publication_container\">\\n <div cla... 2019 \n",
|
"1 <div class=\"publication_container\">\\n <div cla... 2019 "
|
||||||
"3 <div class=\"publication_container\">\\n <div cla... 2018 \n",
|
|
||||||
"4 <div class=\"publication_container\">\\n <div cla... 2017 \n",
|
|
||||||
"6 <div class=\"publication_container\">\\n <div cla... 2016 "
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 10,
|
"execution_count": 37,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -439,7 +310,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": 38,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -449,11 +320,12 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": 39,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"sample_report = df_jahresabschluss.iloc[0].raw_report"
|
"sample_report = df_jahresabschluss.iloc[0].raw_report\n",
|
||||||
|
"sample_report_content = df_jahresabschluss.iloc[0].raw_report"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -466,45 +338,20 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": 40,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import re\n",
|
"import re\n",
|
||||||
|
"from dataclasses import dataclass\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"@dataclass\n",
|
||||||
|
"class Auditor:\n",
|
||||||
|
" name: str\n",
|
||||||
|
" company: str\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"def extract_auditors(report: str) -> list:\n",
|
|
||||||
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
|
|
||||||
" hits = re.findall(auditor_regex, report)\n",
|
|
||||||
" return [hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip() for hit in hits]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 15,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['Eckhard Lewe', 'Renate Hermsdorf']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 15,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"extract_auditors(sample_report)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 16,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def extract_auditor_company(report: str) -> str:\n",
|
"def extract_auditor_company(report: str) -> str:\n",
|
||||||
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||||||
" temp = soup.find_all(\"b\")\n",
|
" temp = soup.find_all(\"b\")\n",
|
||||||
@ -512,27 +359,37 @@
|
|||||||
" br = elem.findChildren(\"br\")\n",
|
" br = elem.findChildren(\"br\")\n",
|
||||||
" if len(br) > 0:\n",
|
" if len(br) > 0:\n",
|
||||||
" return elem.text.split(\"\\n\")[1].strip()\n",
|
" return elem.text.split(\"\\n\")[1].strip()\n",
|
||||||
" return None"
|
" return None\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def extract_auditors(report: str) -> list:\n",
|
||||||
|
" auditor_company = extract_auditor_company(report)\n",
|
||||||
|
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
|
||||||
|
" hits = re.findall(auditor_regex, report)\n",
|
||||||
|
" return [\n",
|
||||||
|
" Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
|
||||||
|
" for hit in hits\n",
|
||||||
|
" ]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 17,
|
"execution_count": 41,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"'Warth & Klein Grant Thornton AG'"
|
"[]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 17,
|
"execution_count": 41,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"extract_auditor_company(sample_report)"
|
"extract_auditors(sample_report)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
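To see what the `", Wirtschaftsprüfer"` regex used above actually matches, here is a self-contained check of the string-returning variant (the form shown on the removed side of this hunk, which produced `['Eckhard Lewe', 'Renate Hermsdorf']`). The input snippet and names are invented:

```python
import re


def extract_auditor_names(report: str) -> list[str]:
    # Same pattern as in the cell above: free text ending in ", Wirtschaftsprüfer".
    auditor_regex = r"[a-z A-Z,.'-]+, Wirtschaftsprüfer"
    hits = re.findall(auditor_regex, report)
    return [hit.replace(", Wirtschaftsprüfer", "").lstrip() for hit in hits]


# Synthetic snippet, not taken from a real filing.
text = "Max Mustermann, Wirtschaftsprüfer Erika Beispiel, Wirtschaftsprüfer"
print(extract_auditor_names(text))  # ['Max Mustermann', 'Erika Beispiel']
```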
@ -561,97 +418,177 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 18,
|
"execution_count": 42,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>Unnamed: 0</th>\n",
|
|
||||||
" <th>Anhang</th>\n",
|
|
||||||
" <th>2020 TEUR</th>\n",
|
|
||||||
" <th>Vorjahr TEUR</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>0</th>\n",
|
|
||||||
" <td>1. Umsatzerlöse</td>\n",
|
|
||||||
" <td>(1)</td>\n",
|
|
||||||
" <td>69.819</td>\n",
|
|
||||||
" <td>77.429</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>1</th>\n",
|
|
||||||
" <td>2. Veränderung des Bestandes an unfertigen Lei...</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>-41.000</td>\n",
|
|
||||||
" <td>-66.000</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>2</th>\n",
|
|
||||||
" <td>3. Sonstige betriebliche Erträge</td>\n",
|
|
||||||
" <td>(2)</td>\n",
|
|
||||||
" <td>489.000</td>\n",
|
|
||||||
" <td>1.816</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>3</th>\n",
|
|
||||||
" <td>4. Materialaufwand</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>4</th>\n",
|
|
||||||
" <td>a) Aufwendungen für bezogene Waren</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>-1.220</td>\n",
|
|
||||||
" <td>-3.003</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
" Unnamed: 0 Anhang 2020 TEUR \\\n",
|
"{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}"
|
||||||
"0 1. Umsatzerlöse (1) 69.819 \n",
|
|
||||||
"1 2. Veränderung des Bestandes an unfertigen Lei... NaN -41.000 \n",
|
|
||||||
"2 3. Sonstige betriebliche Erträge (2) 489.000 \n",
|
|
||||||
"3 4. Materialaufwand NaN NaN \n",
|
|
||||||
"4 a) Aufwendungen für bezogene Waren NaN -1.220 \n",
|
|
||||||
"\n",
|
|
||||||
" Vorjahr TEUR \n",
|
|
||||||
"0 77.429 \n",
|
|
||||||
"1 -66.000 \n",
|
|
||||||
"2 1.816 \n",
|
|
||||||
"3 NaN \n",
|
|
||||||
"4 -3.003 "
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 18,
|
"execution_count": 42,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
"source": [
|
||||||
|
"def extract_kpis(report_content) -> dict:\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
|
||||||
|
" Extracts Key Performance Indicators (KPIs) from the financial reports.\n",
|
||||||
|
" Args:\n",
|
||||||
|
" reports (dict): A dictionary containing the financial reports with their hash as keys and report details as values.\n",
|
||||||
|
" Returns:\n",
|
||||||
|
" dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
" kpis = {}\n",
|
||||||
|
"\n",
|
||||||
|
" # Define KPI patterns to search for\n",
|
||||||
|
" kpi_patterns = {\n",
|
||||||
|
" \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" report_kpis = {}\n",
|
||||||
|
" for kpi, pattern in kpi_patterns.items():\n",
|
||||||
|
" match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
|
||||||
|
" if match:\n",
|
||||||
|
" value = match.group(1)\n",
|
||||||
|
"\n",
|
||||||
|
" # Clean and validate the extracted number\n",
|
||||||
|
" try:\n",
|
||||||
|
" if not value: # Check if value is empty\n",
|
||||||
|
" cleaned_value = None\n",
|
||||||
|
" else:\n",
|
||||||
|
" multiplier = 1\n",
|
||||||
|
" if value[-1].lower() == \"m\":\n",
|
||||||
|
" value = value[:-1]\n",
|
||||||
|
" multiplier = 1_000_000\n",
|
||||||
|
" elif value[-1].lower() == \"b\":\n",
|
||||||
|
" value = value[:-1]\n",
|
||||||
|
" multiplier = 1_000_000_000\n",
|
||||||
|
"\n",
|
||||||
|
" # Remove commas after checking for multipliers\n",
|
||||||
|
" value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
|
||||||
|
" cleaned_value = float(value) * multiplier\n",
|
||||||
|
" except ValueError:\n",
|
||||||
|
" cleaned_value = None\n",
|
||||||
|
"\n",
|
||||||
|
" if cleaned_value is not None:\n",
|
||||||
|
" report_kpis[kpi] = cleaned_value\n",
|
||||||
|
" return report_kpis\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"extract_kpis(\n",
|
||||||
|
" BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
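The cleaning branch inside `extract_kpis` above converts German-formatted numbers (dots as thousands separators, comma as decimal separator) plus optional m/b suffixes into floats. A small stand-alone sketch of just that step, with invented inputs:

```python
def clean_kpi_value(value: str) -> float | None:
    """Mirror of the number-cleaning branch in extract_kpis above (illustrative only)."""
    if not value:
        return None
    multiplier = 1
    if value[-1].lower() == "m":
        value, multiplier = value[:-1], 1_000_000
    elif value[-1].lower() == "b":
        value, multiplier = value[:-1], 1_000_000_000
    try:
        # German notation: "." groups thousands, "," separates decimals.
        return float(value.replace(".", "").replace(",", ".").strip()) * multiplier
    except ValueError:
        return None


print(clean_kpi_value("23.484,67"))  # 23484.67 (matches the net_income value above)
print(clean_kpi_value("1,5m"))       # 1500000.0
print(clean_kpi_value("n/a"))        # None
```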
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 43,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"\n",
|
||||||
|
"with open(\"./temp.txt\", \"w\") as file:\n",
|
||||||
|
" file.write(\n",
|
||||||
|
" BeautifulSoup(sample_report, features=\"html.parser\")\n",
|
||||||
|
" .get_text()\n",
|
||||||
|
" .replace(\"\\n\", \" \")\n",
|
||||||
|
" )"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 46,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
|
||||||
|
" ('Aktiva', '31.12.2020 EUR'),\n",
|
||||||
|
" ('Aktiva', '31.12.2019 EUR')],\n",
|
||||||
|
" )\n",
|
||||||
|
"Aktiva Unnamed: 0_level_1 object\n",
|
||||||
|
" 31.12.2020 EUR object\n",
|
||||||
|
" 31.12.2019 EUR object\n",
|
||||||
|
"dtype: object\n",
|
||||||
|
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
|
||||||
|
" ('Passiva', '31.12.2020 EUR'),\n",
|
||||||
|
" ('Passiva', '31.12.2019 EUR')],\n",
|
||||||
|
" )\n",
|
||||||
|
"Passiva Unnamed: 0_level_1 object\n",
|
||||||
|
" 31.12.2020 EUR object\n",
|
||||||
|
" 31.12.2019 EUR object\n",
|
||||||
|
"dtype: object\n",
|
||||||
|
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
|
||||||
|
"Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
|
||||||
|
"Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
|
||||||
|
"dtype: object\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 46,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"def parse_tables(report: str) -> list:\n",
|
||||||
|
" result = {}\n",
|
||||||
|
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||||||
|
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
|
||||||
|
" df = pd.read_html(StringIO(str(table)))[0]\n",
|
||||||
|
" print(df.columns)\n",
|
||||||
|
" print(df.dtypes)\n",
|
||||||
|
" return result\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"parse_tables(sample_report)"
|
||||||
|
]
|
||||||
|
},
|
||||||
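`parse_tables` above leans on `pandas.read_html` to turn each `<table class="std_table">` into a DataFrame and currently only prints the column index and dtypes. A self-contained miniature of that call on an invented balance-sheet fragment (the HTML is made up, only the class name and the read_html/StringIO usage come from the cell above):

```python
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

# Invented fragment mimicking the "std_table" structure the notebook iterates over.
html = """
<table class="std_table">
  <tr><th>Aktiva</th><th>31.12.2020 EUR</th></tr>
  <tr><td>Umlaufvermögen</td><td>357.613,61</td></tr>
</table>
"""

soup = BeautifulSoup(html, features="html.parser")
for table in soup.find_all("table", {"class": "std_table"}):
    df = pd.read_html(StringIO(str(table)))[0]
    print(df.columns.tolist())  # ['Aktiva', '31.12.2020 EUR']
    print(df.dtypes)
```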
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 45,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "KeyError",
|
||||||
|
"evalue": "'Passiva'",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n",
|
||||||
|
"\u001b[1;31mKeyError\u001b[0m: 'Passiva'"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"def get_bilanz(report: str) -> any:\n",
|
"def get_bilanz(report: str) -> any:\n",
|
||||||
" result = {}\n",
|
" result = {}\n",
|
||||||
@ -672,30 +609,30 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 19,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"Index(['Gesellschafterbeschluss', 'Shareholder Resolution'], dtype='object')\n",
|
"Int64Index([0, 1], dtype='int64')\n",
|
||||||
"Index(['Unnamed: 0', 'Anhang', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||||
"Index(['Unnamed: 0', 'Anhang', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||||
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2020 TEUR',\n",
|
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
|
||||||
" 'Vorjahr TEUR'],\n",
|
" 'Vorjahr TEUR'],\n",
|
||||||
" dtype='object')\n",
|
" dtype='object')\n",
|
||||||
"Index(['Aufgliederung nach Inland und Ausland', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||||
"Index(['Unnamed: 0', '31.12.2020'], dtype='object')\n",
|
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
|
||||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||||
"Int64Index([0, 1, 2], dtype='int64')\n",
|
"Int64Index([0, 1, 2], dtype='int64')\n",
|
||||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||||
"Index(['Unnamed: 0', '2020 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
||||||
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
|
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
|
||||||
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
|
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
|
||||||
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
|
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
|
||||||
@ -707,24 +644,23 @@
|
|||||||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||||||
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
|
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
|
||||||
" ( 'Abschreibungen', 'Stand 01.01.2020 EUR'),\n",
|
" ( 'Abschreibungen', ...),\n",
|
||||||
" ( 'Abschreibungen', 'Abschreibungen des Geschäftsjahres EUR'),\n",
|
" ( 'Abschreibungen', ...),\n",
|
||||||
" ( 'Abschreibungen', 'Abgänge Umbuchung EUR'),\n",
|
" ( 'Abschreibungen', ...),\n",
|
||||||
" ( 'Abschreibungen', 'Stand 31.12.2020 EUR')],\n",
|
" ( 'Abschreibungen', ...)],\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
||||||
" ( 'Buchwerte', 'Stand 31.12.2020 EUR'),\n",
|
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
|
||||||
" ( 'Buchwerte', 'Stand 31.12.2019 EUR')],\n",
|
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2020', '2019',\n",
|
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
|
||||||
" '2018'],\n",
|
" '2019'],\n",
|
||||||
" dtype='object')\n",
|
" dtype='object')\n",
|
||||||
"Index(['Gewinn- und Verlustrechnung', '2020 TEUR', 'Vorjahr TEUR',\n",
|
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
|
||||||
" 'Veränderung TEUR'],\n",
|
" 'Veränderung TEUR'],\n",
|
||||||
" dtype='object')\n",
|
" dtype='object')\n",
|
||||||
"Index(['Bilanz', '31.12.2020 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n",
|
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
|
||||||
"Index(['Bericht des Aufsichtsrats', 'Report of the Supervisory Board'], dtype='object')\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
1
Jupyter/API-tests/News/.gitignore
vendored
Normal file
1
Jupyter/API-tests/News/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
data/
|
879
Jupyter/API-tests/News/notebook.ipynb
Normal file
879
Jupyter/API-tests/News/notebook.ipynb
Normal file
File diff suppressed because one or more lines are too long
1
Jupyter/API-tests/News/requirements.txt
Normal file
1
Jupyter/API-tests/News/requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
pymongo
|
1
Jupyter/API-tests/Unternehmensregister/.gitignore
vendored
Normal file
1
Jupyter/API-tests/Unternehmensregister/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
data/*
|
192
Jupyter/API-tests/Unternehmensregister/main.py
Normal file
192
Jupyter/API-tests/Unternehmensregister/main.py
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
"""Unternehmensregister Scraping."""
|
||||||
|
import glob
|
||||||
|
import logging
|
||||||
|
import multiprocessing
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support import expected_conditions as ec
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
logger = logging.getLogger()
|
||||||
|
|
||||||
|
|
||||||
|
def scrape(query: str, download_dir: list[str]):
|
||||||
|
"""Fetch results from Unternehmensregister for given query.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
query (str): Search Query (RegEx supported)
|
||||||
|
download_dir (list[str]): Directory to place output files in
|
||||||
|
"""
|
||||||
|
download_path = os.path.join(str(Path.cwd()), *download_dir)
|
||||||
|
options = webdriver.ChromeOptions()
|
||||||
|
preferences = {
|
||||||
|
"profile.default_content_settings.popups": 0,
|
||||||
|
"safebrowsing.enabled": True,
|
||||||
|
"download": {
|
||||||
|
"directory_upgrade": True,
|
||||||
|
"prompt_for_download": False,
|
||||||
|
"extensions_to_open": "",
|
||||||
|
"default_directory": download_path,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
options.add_argument("--headless=new")
|
||||||
|
options.add_experimental_option("prefs", preferences)
|
||||||
|
|
||||||
|
driver = webdriver.Chrome(options=options)
|
||||||
|
|
||||||
|
driver.get("https://www.unternehmensregister.de/ureg/")
|
||||||
|
# Accept Cookies
|
||||||
|
driver.find_elements(
|
||||||
|
By.XPATH, '//button[text()="Nur technisch notwendige Cookies akzeptieren"]'
|
||||||
|
)[0].click()
|
||||||
|
# Enter search query
|
||||||
|
driver.find_elements(By.ID, "globalSearchForm:extendedResearchCompanyName")[
|
||||||
|
0
|
||||||
|
].send_keys(query)
|
||||||
|
# Trigger search
|
||||||
|
driver.find_elements(By.ID, "globalSearchForm:btnExecuteSearchOld")[0].click()
|
||||||
|
# Wait for results
|
||||||
|
wait = WebDriverWait(driver, 15)
|
||||||
|
wait.until(
|
||||||
|
lambda driver: driver.current_url != "https://www.unternehmensregister.de/ureg/"
|
||||||
|
)
|
||||||
|
|
||||||
|
num_pages = int(
|
||||||
|
driver.find_element(By.XPATH, '//*[@class="page_count"]').text.split(" ")[0]
|
||||||
|
)
|
||||||
|
|
||||||
|
processed_companies = []
|
||||||
|
|
||||||
|
for _ in tqdm(range(num_pages)):
|
||||||
|
# Find all "Registerinformationen"
|
||||||
|
companies_tab = driver.find_elements(
|
||||||
|
By.LINK_TEXT, "Registerinformationen des Registergerichts"
|
||||||
|
)
|
||||||
|
company_names = [
|
||||||
|
elem.text
|
||||||
|
for elem in driver.find_elements(
|
||||||
|
By.XPATH, '//div[@class="company_result"]/span/b'
|
||||||
|
)
|
||||||
|
]
|
||||||
|
for index, company_link in enumerate(companies_tab):
|
||||||
|
company_name = company_names[index]
|
||||||
|
if company_name in processed_companies:
|
||||||
|
continue
|
||||||
|
# Go to intermediary page
|
||||||
|
company_link.click()
|
||||||
|
# Trigger next redirect
|
||||||
|
driver.find_element(By.LINK_TEXT, "Registerinformationen anzeigen").click()
|
||||||
|
# Trigger SI download
|
||||||
|
driver.find_element(By.LINK_TEXT, "SI").click()
|
||||||
|
# Show shopping cart
|
||||||
|
wait.until(
|
||||||
|
ec.visibility_of_element_located(
|
||||||
|
(By.LINK_TEXT, "Dokumentenkorb ansehen")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
|
||||||
|
# Get document
|
||||||
|
elems = driver.find_elements(By.TAG_NAME, "input")
|
||||||
|
elems[-2].click()
|
||||||
|
|
||||||
|
wait.until(
|
||||||
|
ec.visibility_of_element_located((By.ID, "paymentFormOverview:btnNext"))
|
||||||
|
)
|
||||||
|
driver.find_element(By.ID, "paymentFormOverview:btnNext").click()
|
||||||
|
|
||||||
|
wait.until(
|
||||||
|
ec.visibility_of_element_located((By.LINK_TEXT, "Zum Dokumentenkorb"))
|
||||||
|
)
|
||||||
|
driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()
|
||||||
|
|
||||||
|
num_files = get_num_files(download_path)
|
||||||
|
driver.find_element(By.CLASS_NAME, "download-wrapper").click()
|
||||||
|
|
||||||
|
try:
|
||||||
|
wait.until(wait_for_download_condition(download_path, num_files))
|
||||||
|
file_name = "".join(e for e in company_name if e.isalnum()) + ".xml"
|
||||||
|
rename_latest_file(
|
||||||
|
download_path,
|
||||||
|
file_name,
|
||||||
|
)
|
||||||
|
processed_companies.append(company_name)
|
||||||
|
except Exception:
|
||||||
|
logger.warning("Exception caught in Scraping")
|
||||||
|
finally:
|
||||||
|
for _ in range(6):
|
||||||
|
driver.back()
|
||||||
|
driver.find_element(By.XPATH, '//*[@class="fas fa-angle-right"]').click()
|
||||||
|
driver.close()
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_download_condition(
|
||||||
|
path: str, num_files: int, pattern: str = "*.xml"
|
||||||
|
) -> bool:
|
||||||
|
"""Selenium wait condition monitoring number of files in a dir.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): Directory path
|
||||||
|
num_files (int): Current number of file
|
||||||
|
pattern (str, optional): File pattern. Defaults to "*.xml".
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: Current num file exceeded
|
||||||
|
"""
|
||||||
|
return len(glob.glob1(path, pattern)) > num_files
|
||||||
|
|
||||||
|
|
||||||
|
def get_num_files(path: str, pattern: str = "*.xml") -> int:
|
||||||
|
"""Get number of files in directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): Directory to scan
|
||||||
|
pattern (str, optional): File pattern. Defaults to "*.xml".
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
int: Number of files matching pattern
|
||||||
|
"""
|
||||||
|
return len(glob.glob1(path, pattern))
|
||||||
|
|
||||||
|
|
||||||
|
def rename_latest_file(path: str, filename: str, pattern: str = "*.xml"):
|
||||||
|
"""Rename file in dir with latest change date.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): Dir to check
|
||||||
|
filename (str): Name of file
|
||||||
|
pattern (str, optional): File pattern. Defaults to "*.xml".
|
||||||
|
"""
|
||||||
|
list_of_files = [os.path.join(path, file) for file in glob.glob1(path, pattern)]
|
||||||
|
latest_download = max(list_of_files, key=os.path.getctime)
|
||||||
|
os.rename(latest_download, os.path.join(path, filename))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
"""Main procedure"""
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
df_relevant_companies = pd.read_excel(
|
||||||
|
"./data/study_id42887_top-100-unternehmen-deutschland.xlsx",
|
||||||
|
sheet_name="Toplist",
|
||||||
|
skiprows=1,
|
||||||
|
)
|
||||||
|
df_relevant_companies = df_relevant_companies[df_relevant_companies["Name"].notna()]
|
||||||
|
|
||||||
|
batch_size = 5
|
||||||
|
pool = multiprocessing.Pool(processes=batch_size)
|
||||||
|
params = [
|
||||||
|
(query, ["data", "Unternehmensregister", "scraping", query.strip()])
|
||||||
|
for query in df_relevant_companies.Name
|
||||||
|
]
|
||||||
|
# Map the process_handler function to the parameter list using the Pool
|
||||||
|
pool.starmap(scrape, params)
|
||||||
|
|
||||||
|
# Close the Pool to prevent any more tasks from being submitted
|
||||||
|
pool.close()
|
||||||
|
|
||||||
|
# Wait for all the processes to complete
|
||||||
|
pool.join()
|
4322
Jupyter/API-tests/Unternehmensregister/notebook.ipynb
Normal file
4322
Jupyter/API-tests/Unternehmensregister/notebook.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
10
Jupyter/API-tests/Unternehmensregister/requirements.txt
Normal file
10
Jupyter/API-tests/Unternehmensregister/requirements.txt
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
ocrmypdf
|
||||||
|
pytesseract
|
||||||
|
opencv-python
|
||||||
|
pdf2image
|
||||||
|
bs4
|
||||||
|
selenium
|
||||||
|
xmltodict
|
||||||
|
tqdm
|
||||||
|
openpyxl
|
||||||
|
pandas
|
28
Jupyter/API-tests/docker-compose.yml
Normal file
28
Jupyter/API-tests/docker-compose.yml
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
version: '3.8'
|
||||||
|
services:
|
||||||
|
mongodb:
|
||||||
|
image: mongo:6.0.6
|
||||||
|
container_name: mongodb
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
MONGO_INITDB_ROOT_USERNAME: root
|
||||||
|
MONGO_INITDB_ROOT_PASSWORD: pR0R0v2e2
|
||||||
|
MONGO_INITDB_DATABASE: transparenzregister
|
||||||
|
ports:
|
||||||
|
- 27017:27017
|
||||||
|
volumes:
|
||||||
|
- mongodb_data:/data/db
|
||||||
|
|
||||||
|
mongo-express:
|
||||||
|
image: mongo-express:1.0.0-alpha
|
||||||
|
container_name: mongo-express
|
||||||
|
restart: unless-stopped
|
||||||
|
ports:
|
||||||
|
- 8081:8081
|
||||||
|
environment:
|
||||||
|
ME_CONFIG_MONGODB_SERVER: mongodb
|
||||||
|
ME_CONFIG_MONGODB_ADMINUSERNAME: root
|
||||||
|
ME_CONFIG_MONGODB_ADMINPASSWORD: pR0R0v2e2
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
mongodb_data:
|
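With the compose file above running (`docker compose up -d`), a quick way to confirm the MongoDB container is reachable is a direct pymongo ping. Host, port, and credentials are the ones declared above; the database name comes from MONGO_INITDB_DATABASE. This is a local smoke test, not part of the commit:

```python
from pymongo import MongoClient

# Credentials and port exactly as declared in the docker-compose.yml above.
client = MongoClient("mongodb://root:pR0R0v2e2@localhost:27017/")
client.admin.command("ping")  # raises if the container is not reachable

db = client["transparenzregister"]
print(db.list_collection_names())
```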
Binary file not shown.
1028
poetry.lock
generated
1028
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -30,9 +30,12 @@ version = "0.1.0"
|
|||||||
loguru = "^0.7.0"
|
loguru = "^0.7.0"
|
||||||
matplotlib = "^3.7.1"
|
matplotlib = "^3.7.1"
|
||||||
plotly = "^5.14.1"
|
plotly = "^5.14.1"
|
||||||
|
pymongo = "^4.4.1"
|
||||||
python = "^3.11"
|
python = "^3.11"
|
||||||
seaborn = "^0.12.2"
|
seaborn = "^0.12.2"
|
||||||
|
selenium = "^4.10.0"
|
||||||
tqdm = "^4.65.0"
|
tqdm = "^4.65.0"
|
||||||
|
types-tqdm = "^4.65.0"
|
||||||
|
|
||||||
[tool.poetry.group.develop.dependencies]
|
[tool.poetry.group.develop.dependencies]
|
||||||
black = {extras = ["jupyter"], version = "^23.3.0"}
|
black = {extras = ["jupyter"], version = "^23.3.0"}
|
||||||
@ -100,8 +103,11 @@ target-version = "py311"
|
|||||||
# Avoid trying to fix flake8-bugbear (`B`) violations.
|
# Avoid trying to fix flake8-bugbear (`B`) violations.
|
||||||
unfixable = ["B"]
|
unfixable = ["B"]
|
||||||
|
|
||||||
|
[tool.ruff.flake8-builtins]
|
||||||
|
builtins-ignorelist = ["id"]
|
||||||
|
|
||||||
[tool.ruff.per-file-ignores]
|
[tool.ruff.per-file-ignores]
|
||||||
"tests/*.py" = ["S101"]
|
"tests/*.py" = ["S101", "D100", "D101", "D107", "D103"]
|
||||||
|
|
||||||
[tool.ruff.pydocstyle]
|
[tool.ruff.pydocstyle]
|
||||||
convention = "google"
|
convention = "google"
|
||||||
|
1
src/aki_prj23_transparenzregister/models/__init__.py
Normal file
1
src/aki_prj23_transparenzregister/models/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
"""Model classes."""
|
68
src/aki_prj23_transparenzregister/models/company.py
Normal file
68
src/aki_prj23_transparenzregister/models/company.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
"""Company model."""
|
||||||
|
from abc import ABC
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class RelationshipRoleEnum(Enum):
|
||||||
|
"""_summary_.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
Enum (_type_): _description_
|
||||||
|
"""
|
||||||
|
|
||||||
|
STAKEHOLDER = ""
|
||||||
|
ORGANISATION = "ORGANISATION"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class CompanyID:
|
||||||
|
"""_summary_."""
|
||||||
|
|
||||||
|
district_court: str
|
||||||
|
hr_number: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Location:
|
||||||
|
"""_summary_."""
|
||||||
|
|
||||||
|
city: str
|
||||||
|
street: str | None = None
|
||||||
|
house_number: str | None = None
|
||||||
|
zip_code: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class CompanyRelationship(ABC):
|
||||||
|
"""_summary_.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ABC (_type_): _description_
|
||||||
|
"""
|
||||||
|
|
||||||
|
role: RelationshipRoleEnum
|
||||||
|
location: Location
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Company:
|
||||||
|
"""_summary_.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
_type_: _description_
|
||||||
|
"""
|
||||||
|
|
||||||
|
id: CompanyID
|
||||||
|
location: Location
|
||||||
|
name: str
|
||||||
|
last_update: str
|
||||||
|
relationships: list[CompanyRelationship]
|
||||||
|
|
||||||
|
def to_dict(self) -> dict:
|
||||||
|
"""_summary_.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict: _description_
|
||||||
|
"""
|
||||||
|
return asdict(self)
|
25
src/aki_prj23_transparenzregister/models/news.py
Normal file
25
src/aki_prj23_transparenzregister/models/news.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
"""News mnodel."""
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class News:
|
||||||
|
"""_summary_.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
_type_: _description_
|
||||||
|
"""
|
||||||
|
|
||||||
|
id: str
|
||||||
|
title: str
|
||||||
|
date: str
|
||||||
|
text: str
|
||||||
|
source_url: str
|
||||||
|
|
||||||
|
def to_dict(self) -> dict:
|
||||||
|
"""_summary_.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict: _description_
|
||||||
|
"""
|
||||||
|
return asdict(self)
|
1
src/aki_prj23_transparenzregister/utils/__init__.py
Normal file
1
src/aki_prj23_transparenzregister/utils/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
"""Util classes and services."""
|
@ -0,0 +1,49 @@
"""CompanyMongoService."""
from aki_prj23_transparenzregister.models.company import Company, CompanyID
from aki_prj23_transparenzregister.utils.mongo import MongoConnector


class CompanyMongoService:
    """Wrapper for the MongoDB "companies" collection."""

    def __init__(self, connector: MongoConnector):
        """Bind the service to the "companies" collection.

        Args:
            connector (MongoConnector): Connection wrapper for the MongoDB instance
        """
        self.collection = connector.database["companies"]

    def get_all(self) -> list[Company]:
        """Read all company documents from the collection.

        Returns:
            list[Company]: All stored companies
        """
        result = self.collection.find()
        return list(result)

    def get_by_id(self, id: CompanyID) -> Company | None:
        """Read a single company by its id.

        Args:
            id (CompanyID): Identifier of the company

        Returns:
            Company | None: Matching document, or None if there is no unique match
        """
        result = list(self.collection.find({"id": id}))
        if len(result) == 1:
            return result[0]
        return None

    def insert(self, company: Company):
        """Insert a new company document.

        Args:
            company (Company): Company to be stored

        Returns:
            InsertOneResult: Result of the insert operation
        """
        return self.collection.insert_one(company.to_dict())
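A minimal usage sketch of the service, assuming a reachable MongoDB instance; host, database name and company data below are made up:

from aki_prj23_transparenzregister.models.company import Company, CompanyID, Location
from aki_prj23_transparenzregister.utils.company_mongo_service import CompanyMongoService
from aki_prj23_transparenzregister.utils.mongo import MongoConnection, MongoConnector

# Placeholder connection details - adjust to the target environment.
connection = MongoConnection("localhost", "transparenzregister", 27017, None, None)
service = CompanyMongoService(MongoConnector(connection))

company = Company(
    id=CompanyID("Amtsgericht Hagen", "HRB 12345"),
    location=Location(city="Hagen"),
    name="Example GmbH",
    last_update="2023-05-01",
    relationships=[],
)
service.insert(company)
print(len(service.get_all()))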
47
src/aki_prj23_transparenzregister/utils/mongo.py
Normal file
@ -0,0 +1,47 @@
"""Mongo Wrapper."""
from dataclasses import dataclass

import pymongo


@dataclass
class MongoConnection:
    """Connection data for a MongoDB instance."""

    hostname: str
    database: str
    port: int | None
    username: str | None
    password: str | None

    def get_conn_string(self) -> str:
        """Transforms the information of the object to a MongoDB connection string.

        Returns:
            str: Connection string
        """
        if self.username is not None and self.password is not None:
            connection_string = (
                f"mongodb+srv://{self.username}:{self.password}@{self.hostname}"
            )
        else:
            connection_string = f"mongodb+srv://{self.hostname}"
        if self.port is not None:
            connection_string += f":{self.port}"
            connection_string = connection_string.replace("mongodb+srv", "mongodb")
        return connection_string


class MongoConnector:
    """Wrapper for establishing a connection to a MongoDB instance."""

    def __init__(self, connection: MongoConnection):
        """Open a client connection to the configured MongoDB instance.

        Args:
            connection (MongoConnection): Wrapper for connection string
        """
        self.client: pymongo.MongoClient = pymongo.MongoClient(
            connection.get_conn_string()
        )
        self.database = self.client[connection.database]
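The branching in get_conn_string can be illustrated with a short sketch that mirrors the unit tests further below; host names and credentials are invented:

from aki_prj23_transparenzregister.utils.mongo import MongoConnection

# With a port and without credentials the scheme is rewritten to plain "mongodb://".
print(MongoConnection("localhost", "db", 27017, None, None).get_conn_string())
# -> mongodb://localhost:27017

# With credentials and without a port the "mongodb+srv://" scheme is kept.
print(MongoConnection("cluster0.example.net", "db", None, "admin", "secret").get_conn_string())
# -> mongodb+srv://admin:secret@cluster0.example.net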
@ -0,0 +1,94 @@
"""MongoNewsService."""
from aki_prj23_transparenzregister.models.news import News
from aki_prj23_transparenzregister.utils.mongo import MongoConnector


class MongoNewsService:
    """Wrapper for the MongoDB "news" collection."""

    def __init__(self, connector: MongoConnector):
        """Bind the service to the "news" collection.

        Args:
            connector (MongoConnector): Connection wrapper for the MongoDB instance
        """
        self.collection = connector.database["news"]

    def get_all(self) -> list[News]:
        """Read all news documents from the collection.

        Returns:
            list[News]: All stored news entries
        """
        result = self.collection.find()
        return [MongoEntryTransformer.transform_outgoing(elem) for elem in result]

    def get_by_id(self, id: str) -> News | None:
        """Read a single news entry by its id.

        Args:
            id (str): Identifier of the news entry

        Returns:
            News | None: Matching entry, or None if there is no unique match
        """
        result = list(self.collection.find({"_id": id}))
        if len(result) == 1:
            return MongoEntryTransformer.transform_outgoing(result[0])
        return None

    def insert(self, news: News):
        """Insert a new news document.

        Args:
            news (News): News entry to be stored

        Returns:
            InsertOneResult: Result of the insert operation
        """
        return self.collection.insert_one(MongoEntryTransformer.transform_ingoing(news))


class MongoEntryTransformer:
    """Converts News objects to and from MongoDB documents."""

    @staticmethod
    def transform_ingoing(news: News) -> dict:
        """Convert a News object to a dictionary compatible with a MongoDB entry.

        Args:
            news (News): News object to be transformed

        Returns:
            dict: Transformed data with added _id field
        """
        transport_object = news.to_dict()
        transport_object["_id"] = news.id
        del transport_object["id"]
        return transport_object

    @staticmethod
    def transform_outgoing(data: dict) -> News:
        """Reverse the transform_ingoing method.

        Args:
            data (dict): dict from the MongoDB to be transformed

        Returns:
            News: News entry based on MongoDB document
        """
        return News(
            id=data["_id"],
            title=data["title"],
            date=data["date"],
            text=data["text"],
            source_url=data["source_url"],
        )
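A short sketch of the _id round trip performed by MongoEntryTransformer; the news values are invented:

from aki_prj23_transparenzregister.models.news import News
from aki_prj23_transparenzregister.utils.news_mongo_service import MongoEntryTransformer

news = News("4711", "Example title", "2023-05-01", "Example text", "https://example.com")

document = MongoEntryTransformer.transform_ingoing(news)
assert document["_id"] == "4711" and "id" not in document  # MongoDB documents key on "_id"

# transform_outgoing restores the original dataclass.
assert MongoEntryTransformer.transform_outgoing(document) == news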
35
tests/models/company_test.py
Normal file
@ -0,0 +1,35 @@
"""Test Models.company."""

from aki_prj23_transparenzregister.models.company import Company, CompanyID, Location


def test_to_dict() -> None:
    """Tests the conversion of a Company to a dict."""
    company_id = CompanyID("The Shire", "420")
    location = Location(
        city="Insmouth", house_number="19", street="Harbor", zip_code="1890"
    )
    company = Company(
        id=company_id,
        last_update="Tomorrow",
        location=location,
        name="BLANK GmbH",
        relationships=[],
    )

    assert company.to_dict() == {
        "id": {
            "district_court": company_id.district_court,
            "hr_number": company_id.hr_number,
        },
        "last_update": company.last_update,
        "location": {
            "city": location.city,
            "house_number": location.house_number,
            "street": location.street,
            "zip_code": location.zip_code,
        },
        "name": "BLANK GmbH",
        "relationships": [],
    }
23
tests/models/news_test.py
Normal file
@ -0,0 +1,23 @@
"""Test Models.news."""

from aki_prj23_transparenzregister.models.news import News


def test_to_dict() -> None:
    """Tests the conversion of a News object to a dict."""
    news = News(
        "4711",
        "Economy collapses",
        "2042",
        "Toilet paper prices rising",
        "https://www.google.com",
    )

    assert news.to_dict() == {
        "id": news.id,
        "title": news.title,
        "date": news.date,
        "text": news.text,
        "source_url": news.source_url,
    }
103
tests/utils/company_mongo_service_test.py
Normal file
@ -0,0 +1,103 @@
"""Test utils.company_mongo_service."""
from unittest.mock import Mock

import pytest

from aki_prj23_transparenzregister.models.company import Company
from aki_prj23_transparenzregister.utils.company_mongo_service import (
    CompanyMongoService,
)


@pytest.fixture()
def mock_mongo_connector(mocker) -> Mock:
    """Mock MongoConnector class.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Mocked MongoConnector
    """
    mock = Mock()
    mocker.patch(
        "aki_prj23_transparenzregister.utils.mongo.MongoConnector", return_value=mock
    )
    return mock


@pytest.fixture()
def mock_collection() -> Mock:
    """Mock mongo collection.

    Returns:
        Mock: Mock object
    """
    return Mock()


def test_init(mock_mongo_connector, mock_collection):
    """Test CompanyMongoService constructor.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"companies": mock_collection}
    service = CompanyMongoService(mock_mongo_connector)
    assert service.collection == mock_collection


def test_get_all(mock_mongo_connector, mock_collection):
    """Test CompanyMongoService get_all method.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"companies": mock_collection}
    service = CompanyMongoService(mock_mongo_connector)
    mock_result = [{"id": "42"}]
    mock_collection.find.return_value = mock_result
    assert service.get_all() == mock_result


def test_by_id_no_result(mock_mongo_connector, mock_collection):
    """Test CompanyMongoService get_by_id with no result.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"companies": mock_collection}
    service = CompanyMongoService(mock_mongo_connector)
    mock_collection.find.return_value = []
    assert service.get_by_id("Does not exist") is None


def test_by_id_result(mock_mongo_connector, mock_collection):
    """Test CompanyMongoService get_by_id with result.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"companies": mock_collection}
    service = CompanyMongoService(mock_mongo_connector)
    mock_entry = {"id": "Does exist", "value": 42}
    mock_collection.find.return_value = [mock_entry]
    assert service.get_by_id("Does exist") == mock_entry


def test_insert(mock_mongo_connector, mock_collection):
    """Test CompanyMongoService insert method.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"companies": mock_collection}
    service = CompanyMongoService(mock_mongo_connector)
    mock_result = 42
    mock_collection.insert_one.return_value = mock_result
    assert service.insert(Company(None, None, "", "", [])) == mock_result
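The fixtures above rely on the mocker fixture provided by the pytest-mock plugin. The same stubbing idea also works standalone, since the service only touches connector.database["companies"]; a minimal sketch with invented values:

from unittest.mock import Mock

from aki_prj23_transparenzregister.utils.company_mongo_service import CompanyMongoService

# A plain Mock with a dict-backed "database" attribute is enough to exercise the service offline.
fake_connector = Mock()
fake_collection = Mock()
fake_connector.database = {"companies": fake_collection}

fake_collection.find.return_value = [{"id": "42"}]
service = CompanyMongoService(fake_connector)
assert service.get_all() == [{"id": "42"}]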
26
tests/utils/mongo_test.py
Normal file
@ -0,0 +1,26 @@
from unittest.mock import patch

from aki_prj23_transparenzregister.utils.mongo import MongoConnection, MongoConnector


def test_get_conn_string_no_credentials():
    conn = MongoConnection("localhost", "", 27017, None, None)
    assert conn.get_conn_string() == "mongodb://localhost:27017"


def test_get_conn_string_no_port_but_credentials():
    conn = MongoConnection("localhost", "", None, "admin", "password")
    assert conn.get_conn_string() == "mongodb+srv://admin:password@localhost"


def test_get_conn_simple():
    conn = MongoConnection("localhost", "", None, None, None)
    assert conn.get_conn_string() == "mongodb+srv://localhost"


def test_mongo_connector():
    with patch("pymongo.MongoClient") as mock_mongo_client:
        expected_result = 42
        mock_mongo_client.return_value = {"db": expected_result}
        temp = MongoConnector(MongoConnection("localhost", "db", None, None, None))
        assert temp.database == expected_result
115
tests/utils/news_mongo_service_test.py
Normal file
@ -0,0 +1,115 @@
from unittest.mock import Mock, patch

import pytest

from aki_prj23_transparenzregister.models.news import News
from aki_prj23_transparenzregister.utils.news_mongo_service import (
    MongoEntryTransformer,
    MongoNewsService,
)


@pytest.fixture()
def mock_mongo_connector(mocker) -> Mock:
    """Mock MongoConnector class.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Mocked MongoConnector
    """
    mock = Mock()
    mocker.patch(
        "aki_prj23_transparenzregister.utils.mongo.MongoConnector", return_value=mock
    )
    return mock


@pytest.fixture()
def mock_collection() -> Mock:
    """Mock mongo collection.

    Returns:
        Mock: Mock object
    """
    return Mock()


def test_init(mock_mongo_connector, mock_collection):
    """Test MongoNewsService constructor.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"news": mock_collection}
    service = MongoNewsService(mock_mongo_connector)
    assert service.collection == mock_collection


def test_get_all(mock_mongo_connector, mock_collection):
    mock_mongo_connector.database = {"news": mock_collection}
    service = MongoNewsService(mock_mongo_connector)

    mock_collection.find.return_value = []
    assert service.get_all() == []


def test_get_by_id_with_result(mock_mongo_connector, mock_collection):
    mock_mongo_connector.database = {"news": mock_collection}
    service = MongoNewsService(mock_mongo_connector)

    with patch(
        "aki_prj23_transparenzregister.utils.news_mongo_service.MongoEntryTransformer.transform_outgoing"
    ) as mock_out:
        mock_collection.find.return_value = [{}]
        mock_out.return_value = {}
        assert service.get_by_id("foadh") == {}


def test_get_by_id_no_result(mock_mongo_connector, mock_collection):
    mock_mongo_connector.database = {"news": mock_collection}
    service = MongoNewsService(mock_mongo_connector)

    mock_collection.find.return_value = []
    assert service.get_by_id("foadh") is None


def test_insert(mock_mongo_connector, mock_collection):
    mock_mongo_connector.database = {"news": mock_collection}
    service = MongoNewsService(mock_mongo_connector)

    with patch(
        "aki_prj23_transparenzregister.utils.news_mongo_service.MongoEntryTransformer.transform_ingoing"
    ) as mock_in:
        mock_collection.insert_one.return_value = {}
        mock_in.return_value = {}
        assert service.insert({}) == {}


def test_transform_ingoing():
    news = News("42", None, None, None, None)
    result = MongoEntryTransformer.transform_ingoing(news)
    assert result["_id"] == "42"
    assert "id" not in result


def test_transform_outgoing():
    data = {
        "_id": "4711",
        "title": "Hello",
        "date": "Today",
        "text": "World",
        "source_url": "chat.openai.com",
    }
    expected_result = News(
        **{
            "id": "4711",
            "title": "Hello",
            "date": "Today",
            "text": "World",
            "source_url": "chat.openai.com",
        }
    )
    assert MongoEntryTransformer.transform_outgoing(data) == expected_result