mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-05-12 19:10:01 +02:00
Merge pull request #39 from fhswf/feature/data-extraktion
Feature/data extraktion
This commit is contained in:
commit
ebedf7c630
@ -12,7 +12,7 @@ repos:
|
||||
- id: check-xml
|
||||
- id: check-ast
|
||||
- id: check-added-large-files
|
||||
args: [--enforce-all]
|
||||
args: [--enforce-all, --maxkb=50000]  # one list item per CLI flag; a single space-joined item is passed as one argv entry
|
||||
- id: name-tests-test
|
||||
- id: detect-private-key
|
||||
- id: check-case-conflict
|
||||
|
3
.vscode/settings.json
vendored
Normal file
3
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
{
|
||||
"files.eol": "\n"
|
||||
}
|
@ -8,14 +8,6 @@
|
||||
"# Daten Extraktion aus dem Bundesanzeiger"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In order to run this notebook, download the `deutschland` library source code from [TrisNol/deutschland](https://github.com/TrisNol/deutschland/tree/feat/bundesanzeiger-raw-report) and place it in the `Jupyter/API-tests/Bundesanzeiger/deutschland` directory. Since the PR adding the required features to the main repo has not been completed yet (see: [PR](https://github.com/bundesAPI/deutschland/pull/88)), we have to include it in another way..."
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
@ -26,18 +18,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\trist\\AppData\\Roaming\\Python\\Python310\\site-packages\\requests\\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (5.1.0)/charset_normalizer (2.0.12) doesn't match a supported version!\n",
|
||||
" warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from deutschland.bundesanzeiger import Bundesanzeiger"
|
||||
@ -45,26 +28,28 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"dict_keys(['040860c00ef9020cfb6db2f58a163256', '9eb401c5af2f0289b8207233bf852b81', 'b14d979ea9f42367d413589050bd04e5', 'ecb8c1011456ea0d40f87e850fc216bf', '3a7c6c1f1d1b89bf5ceb165f0ee88053', '03b2e6aac2f2da2c0c5e8f23de9caec4', 'a5f8dc87fa797e7d2f8fb88c49a23c36', '9a3a8a3e84290ee650cbccf32323b3d7', '6c0fcc20a58aaa18a9d13f35a51e3996', 'bf276d441c339e787e22385d2b69b277', '90a79d28f3c11a2122d2827d2bf6acda', '88c785ce3b3c580dcc285661c7790cca', 'd3064baa8246c3ed02e30b5038200edc', '5bf92eed2808b484c005409764b825b7', 'fece6303c991a280850be1900ff78f8f', '26b0624c60cdbf647f3d45f4917ec6ea', '9f98bee55f598908cca60b6a47e5d49d', '99267bb7474e6d1d5d9e091ba5ef3ee8', '102738ef4b91408ed043d84fe785b50b', '94711f3e509518d073e1760d97550347'])\n"
|
||||
"dict_keys(['c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ba = Bundesanzeiger()\n",
|
||||
"reports = ba.get_reports(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
||||
"reports = ba.get_reports(\n",
|
||||
" \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n",
|
||||
") # \"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
||||
"print(reports.keys())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -75,7 +60,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -109,42 +94,18 @@
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-03-17</td>\n",
|
||||
" <td>Aufsichtsrat</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2022-03-25</td>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2021-03-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2020-03-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2018-12-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
@ -153,35 +114,23 @@
|
||||
],
|
||||
"text/plain": [
|
||||
" date name \\\n",
|
||||
"0 2023-03-17 Aufsichtsrat \n",
|
||||
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"\n",
|
||||
" company \\\n",
|
||||
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" report \\\n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
|
||||
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"\n",
|
||||
" raw_report \n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"3 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"4 <div class=\"publication_container\">\\n <div cla... "
|
||||
"1 <div class=\"publication_container\">\\n <div cla... "
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -193,7 +142,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -228,46 +177,19 @@
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-03-17</td>\n",
|
||||
" <td>Aufsichtsrat</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Aufsichtsrat</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2022-03-25</td>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2021-03-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2020-03-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2018-12-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
@ -277,35 +199,23 @@
|
||||
],
|
||||
"text/plain": [
|
||||
" date name \\\n",
|
||||
"0 2023-03-17 Aufsichtsrat \n",
|
||||
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"\n",
|
||||
" company \\\n",
|
||||
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" report \\\n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
|
||||
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"\n",
|
||||
" raw_report type \n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... Aufsichtsrat \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"3 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"4 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
|
||||
"0 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -317,21 +227,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_6460\\963182859.py:2: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" df_jahresabschluss['jahr'] = df_jahresabschluss.name.apply(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
@ -361,61 +259,34 @@
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2022-03-25</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2020</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2021-03-11</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2019</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2020-03-24</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2018</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2018-12-11</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2017</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>2018-01-03</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2016</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" date company \\\n",
|
||||
"1 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"2 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"3 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"4 2018-12-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"6 2018-01-03 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"0 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" raw_report jahr \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... 2019 \n",
|
||||
"3 <div class=\"publication_container\">\\n <div cla... 2018 \n",
|
||||
"4 <div class=\"publication_container\">\\n <div cla... 2017 \n",
|
||||
"6 <div class=\"publication_container\">\\n <div cla... 2016 "
|
||||
"0 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... 2019 "
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -439,7 +310,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 38,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -449,11 +320,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 39,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample_report = df_jahresabschluss.iloc[0].raw_report"
|
||||
"sample_report = df_jahresabschluss.iloc[0].raw_report\n",
|
||||
"sample_report_content = df_jahresabschluss.iloc[0].raw_report"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -466,45 +338,20 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"from dataclasses import dataclass\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dataclass\n",
|
||||
"class Auditor:\n",
|
||||
" name: str\n",
|
||||
" company: str\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def extract_auditors(report: str) -> list:\n",
|
||||
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
|
||||
" hits = re.findall(auditor_regex, report)\n",
|
||||
" return [hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip() for hit in hits]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Eckhard Lewe', 'Renate Hermsdorf']"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"extract_auditors(sample_report)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_auditor_company(report: str) -> str:\n",
|
||||
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||||
" temp = soup.find_all(\"b\")\n",
|
||||
@ -512,27 +359,37 @@
|
||||
" br = elem.findChildren(\"br\")\n",
|
||||
" if len(br) > 0:\n",
|
||||
" return elem.text.split(\"\\n\")[1].strip()\n",
|
||||
" return None"
|
||||
" return None\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def extract_auditors(report: str) -> list:\n",
|
||||
" auditor_company = extract_auditor_company(report)\n",
|
||||
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
|
||||
" hits = re.findall(auditor_regex, report)\n",
|
||||
" return [\n",
|
||||
" Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
|
||||
" for hit in hits\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Warth & Klein Grant Thornton AG'"
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"extract_auditor_company(sample_report)"
|
||||
"extract_auditors(sample_report)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -561,97 +418,177 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Unnamed: 0</th>\n",
|
||||
" <th>Anhang</th>\n",
|
||||
" <th>2020 TEUR</th>\n",
|
||||
" <th>Vorjahr TEUR</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1. Umsatzerlöse</td>\n",
|
||||
" <td>(1)</td>\n",
|
||||
" <td>69.819</td>\n",
|
||||
" <td>77.429</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2. Veränderung des Bestandes an unfertigen Lei...</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>-41.000</td>\n",
|
||||
" <td>-66.000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3. Sonstige betriebliche Erträge</td>\n",
|
||||
" <td>(2)</td>\n",
|
||||
" <td>489.000</td>\n",
|
||||
" <td>1.816</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>4. Materialaufwand</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>a) Aufwendungen für bezogene Waren</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>-1.220</td>\n",
|
||||
" <td>-3.003</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Unnamed: 0 Anhang 2020 TEUR \\\n",
|
||||
"0 1. Umsatzerlöse (1) 69.819 \n",
|
||||
"1 2. Veränderung des Bestandes an unfertigen Lei... NaN -41.000 \n",
|
||||
"2 3. Sonstige betriebliche Erträge (2) 489.000 \n",
|
||||
"3 4. Materialaufwand NaN NaN \n",
|
||||
"4 a) Aufwendungen für bezogene Waren NaN -1.220 \n",
|
||||
"\n",
|
||||
" Vorjahr TEUR \n",
|
||||
"0 77.429 \n",
|
||||
"1 -66.000 \n",
|
||||
"2 1.816 \n",
|
||||
"3 NaN \n",
|
||||
"4 -3.003 "
|
||||
"{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def extract_kpis(report_content) -> dict:\n",
|
||||
" \"\"\"\n",
|
||||
" Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
|
||||
" Extracts Key Performance Indicators (KPIs) from the financial reports.\n",
|
||||
" Args:\n",
|
||||
" report_content (str): The plain-text content of a single financial report to search for KPIs.\n",
|
||||
" Returns:\n",
|
||||
" dict: A dictionary mapping the name of each KPI found in the report to its parsed numeric value.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" kpis = {}\n",
|
||||
"\n",
|
||||
" # Define KPI patterns to search for\n",
|
||||
" kpi_patterns = {\n",
|
||||
" \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" report_kpis = {}\n",
|
||||
" for kpi, pattern in kpi_patterns.items():\n",
|
||||
" match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
|
||||
" if match:\n",
|
||||
" value = match.group(1)\n",
|
||||
"\n",
|
||||
" # Clean and validate the extracted number\n",
|
||||
" try:\n",
|
||||
" if not value: # Check if value is empty\n",
|
||||
" cleaned_value = None\n",
|
||||
" else:\n",
|
||||
" multiplier = 1\n",
|
||||
" if value[-1].lower() == \"m\":\n",
|
||||
" value = value[:-1]\n",
|
||||
" multiplier = 1_000_000\n",
|
||||
" elif value[-1].lower() == \"b\":\n",
|
||||
" value = value[:-1]\n",
|
||||
" multiplier = 1_000_000_000\n",
|
||||
"\n",
|
||||
" # Normalize the German number format (dot as thousands separator, comma as decimal mark) after checking for multipliers\n",
|
||||
" value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
|
||||
" cleaned_value = float(value) * multiplier\n",
|
||||
" except ValueError:\n",
|
||||
" cleaned_value = None\n",
|
||||
"\n",
|
||||
" if cleaned_value is not None:\n",
|
||||
" report_kpis[kpi] = cleaned_value\n",
|
||||
" return report_kpis\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"extract_kpis(\n",
|
||||
" BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"with open(\"./temp.txt\", \"w\") as file:\n",
|
||||
" file.write(\n",
|
||||
" BeautifulSoup(sample_report, features=\"html.parser\")\n",
|
||||
" .get_text()\n",
|
||||
" .replace(\"\\n\", \" \")\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Aktiva', '31.12.2020 EUR'),\n",
|
||||
" ('Aktiva', '31.12.2019 EUR')],\n",
|
||||
" )\n",
|
||||
"Aktiva Unnamed: 0_level_1 object\n",
|
||||
" 31.12.2020 EUR object\n",
|
||||
" 31.12.2019 EUR object\n",
|
||||
"dtype: object\n",
|
||||
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Passiva', '31.12.2020 EUR'),\n",
|
||||
" ('Passiva', '31.12.2019 EUR')],\n",
|
||||
" )\n",
|
||||
"Passiva Unnamed: 0_level_1 object\n",
|
||||
" 31.12.2020 EUR object\n",
|
||||
" 31.12.2019 EUR object\n",
|
||||
"dtype: object\n",
|
||||
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
|
||||
"Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
|
||||
"Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
|
||||
"dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{}"
|
||||
]
|
||||
},
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def parse_tables(report: str) -> list:\n",
|
||||
" result = {}\n",
|
||||
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
|
||||
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
|
||||
" df = pd.read_html(StringIO(str(table)))[0]\n",
|
||||
" print(df.columns)\n",
|
||||
" print(df.dtypes)\n",
|
||||
" return result\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"parse_tables(sample_report)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "KeyError",
|
||||
"evalue": "'Passiva'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n",
|
||||
"\u001b[1;31mKeyError\u001b[0m: 'Passiva'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def get_bilanz(report: str) -> any:\n",
|
||||
" result = {}\n",
|
||||
@ -672,30 +609,30 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['Gesellschafterbeschluss', 'Shareholder Resolution'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2020 TEUR',\n",
|
||||
"Int64Index([0, 1], dtype='int64')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
|
||||
" 'Vorjahr TEUR'],\n",
|
||||
" dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Inland und Ausland', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2020'], dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Int64Index([0, 1, 2], dtype='int64')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '2020 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
||||
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
|
||||
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
|
||||
@ -707,24 +644,23 @@
|
||||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||||
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
|
||||
" )\n",
|
||||
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
||||
" ( 'Abschreibungen', 'Stand 01.01.2020 EUR'),\n",
|
||||
" ( 'Abschreibungen', 'Abschreibungen des Geschäftsjahres EUR'),\n",
|
||||
" ( 'Abschreibungen', 'Abgänge Umbuchung EUR'),\n",
|
||||
" ( 'Abschreibungen', 'Stand 31.12.2020 EUR')],\n",
|
||||
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
|
||||
" ( 'Abschreibungen', ...),\n",
|
||||
" ( 'Abschreibungen', ...),\n",
|
||||
" ( 'Abschreibungen', ...),\n",
|
||||
" ( 'Abschreibungen', ...)],\n",
|
||||
" )\n",
|
||||
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2020 EUR'),\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2019 EUR')],\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
|
||||
" )\n",
|
||||
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2020', '2019',\n",
|
||||
" '2018'],\n",
|
||||
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
|
||||
" '2019'],\n",
|
||||
" dtype='object')\n",
|
||||
"Index(['Gewinn- und Verlustrechnung', '2020 TEUR', 'Vorjahr TEUR',\n",
|
||||
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
|
||||
" 'Veränderung TEUR'],\n",
|
||||
" dtype='object')\n",
|
||||
"Index(['Bilanz', '31.12.2020 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n",
|
||||
"Index(['Bericht des Aufsichtsrats', 'Report of the Supervisory Board'], dtype='object')\n"
|
||||
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
1
Jupyter/API-tests/News/.gitignore
vendored
Normal file
1
Jupyter/API-tests/News/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
data/
|
879
Jupyter/API-tests/News/notebook.ipynb
Normal file
879
Jupyter/API-tests/News/notebook.ipynb
Normal file
File diff suppressed because one or more lines are too long
1
Jupyter/API-tests/News/requirements.txt
Normal file
1
Jupyter/API-tests/News/requirements.txt
Normal file
@ -0,0 +1 @@
|
||||
pymongo
|
1
Jupyter/API-tests/Unternehmensregister/.gitignore
vendored
Normal file
1
Jupyter/API-tests/Unternehmensregister/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
data/*
|
192
Jupyter/API-tests/Unternehmensregister/main.py
Normal file
192
Jupyter/API-tests/Unternehmensregister/main.py
Normal file
@ -0,0 +1,192 @@
|
||||
"""Unternehmensregister Scraping."""
|
||||
import glob
|
||||
import logging
|
||||
import multiprocessing
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support import expected_conditions as ec
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from tqdm import tqdm
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
|
||||
def scrape(query: str, download_dir: list[str]):
    """Fetch results from Unternehmensregister for given query.

    Opens a headless Chrome session, submits the search on
    unternehmensregister.de and, for every entry on every result page, walks
    through the document-cart flow to download the document behind the "SI"
    link. Each finished download is renamed to the company name reduced to
    its alphanumeric characters plus ``.xml``.

    Args:
        query (str): Search Query (RegEx supported)
        download_dir (list[str]): Path components, relative to the current
            working directory, of the directory to place output files in
    """
    download_path = os.path.join(str(Path.cwd()), *download_dir)
    options = webdriver.ChromeOptions()
    preferences = {
        "profile.default_content_settings.popups": 0,
        "safebrowsing.enabled": True,
        "download": {
            "directory_upgrade": True,
            "prompt_for_download": False,
            "extensions_to_open": "",
            "default_directory": download_path,
        },
    }
    options.add_argument("--headless=new")
    options.add_experimental_option("prefs", preferences)

    driver = webdriver.Chrome(options=options)
    # Everything below runs inside try/finally so the browser (and its driver
    # process) is always shut down, even when a page element is missing.
    try:
        driver.get("https://www.unternehmensregister.de/ureg/")
        # Accept Cookies
        driver.find_elements(
            By.XPATH, '//button[text()="Nur technisch notwendige Cookies akzeptieren"]'
        )[0].click()
        # Enter search query
        driver.find_elements(By.ID, "globalSearchForm:extendedResearchCompanyName")[
            0
        ].send_keys(query)
        # Trigger search
        driver.find_elements(By.ID, "globalSearchForm:btnExecuteSearchOld")[0].click()
        # Wait for results
        wait = WebDriverWait(driver, 15)
        wait.until(
            lambda driver: driver.current_url
            != "https://www.unternehmensregister.de/ureg/"
        )

        num_pages = int(
            driver.find_element(By.XPATH, '//*[@class="page_count"]').text.split(" ")[0]
        )

        processed_companies = []

        for _ in tqdm(range(num_pages)):
            # Find all "Registerinformationen"
            companies_tab = driver.find_elements(
                By.LINK_TEXT, "Registerinformationen des Registergerichts"
            )
            company_names = [
                elem.text
                for elem in driver.find_elements(
                    By.XPATH, '//div[@class="company_result"]/span/b'
                )
            ]
            for index, company_link in enumerate(companies_tab):
                company_name = company_names[index]
                if company_name in processed_companies:
                    continue
                # Go to intermediary page
                company_link.click()
                # Trigger next redirect
                driver.find_element(
                    By.LINK_TEXT, "Registerinformationen anzeigen"
                ).click()
                # Trigger SI download
                driver.find_element(By.LINK_TEXT, "SI").click()
                # Show shopping cart
                wait.until(
                    ec.visibility_of_element_located(
                        (By.LINK_TEXT, "Dokumentenkorb ansehen")
                    )
                )
                driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
                # Get document
                elems = driver.find_elements(By.TAG_NAME, "input")
                elems[-2].click()

                wait.until(
                    ec.visibility_of_element_located(
                        (By.ID, "paymentFormOverview:btnNext")
                    )
                )
                driver.find_element(By.ID, "paymentFormOverview:btnNext").click()

                wait.until(
                    ec.visibility_of_element_located(
                        (By.LINK_TEXT, "Zum Dokumentenkorb")
                    )
                )
                driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()

                # Remember the file count before the click so the new
                # download can be detected afterwards.
                num_files = get_num_files(download_path)
                driver.find_element(By.CLASS_NAME, "download-wrapper").click()

                try:
                    # WebDriverWait.until needs a callable it can poll; the
                    # helper itself returns a plain bool, so wrap it.
                    wait.until(
                        lambda _driver: wait_for_download_condition(
                            download_path, num_files
                        )
                    )
                    file_name = "".join(e for e in company_name if e.isalnum()) + ".xml"
                    rename_latest_file(
                        download_path,
                        file_name,
                    )
                    processed_companies.append(company_name)
                except Exception:
                    # Best effort: a failed download must not abort the run,
                    # but keep the traceback so the cause stays visible.
                    logger.warning("Exception caught in Scraping", exc_info=True)
                finally:
                    # Step back through the six pages of the download flow to
                    # return to the result list.
                    for _ in range(6):
                        driver.back()
            # Advance to the next result page.
            # NOTE(review): assumes the arrow is also present on the last page
            # — confirm, otherwise the final iteration raises here.
            driver.find_element(By.XPATH, '//*[@class="fas fa-angle-right"]').click()
    finally:
        driver.quit()
|
||||
|
||||
|
||||
def wait_for_download_condition(
|
||||
path: str, num_files: int, pattern: str = "*.xml"
|
||||
) -> bool:
|
||||
"""Selenium wait condition monitoring number of files in a dir.
|
||||
|
||||
Args:
|
||||
path (str): Directory path
|
||||
num_files (int): Current number of file
|
||||
pattern (str, optional): File pattern. Defaults to "*.xml".
|
||||
|
||||
Returns:
|
||||
bool: Current num file exceeded
|
||||
"""
|
||||
return len(glob.glob1(path, pattern)) > num_files
|
||||
|
||||
|
||||
def get_num_files(path: str, pattern: str = "*.xml") -> int:
|
||||
"""Get number of files in directory.
|
||||
|
||||
Args:
|
||||
path (str): Directory to scan
|
||||
pattern (str, optional): File pattern. Defaults to "*.xml".
|
||||
|
||||
Returns:
|
||||
int: Number of files matching pattern
|
||||
"""
|
||||
return len(glob.glob1(path, pattern))
|
||||
|
||||
|
||||
def rename_latest_file(path: str, filename: str, pattern: str = "*.xml"):
|
||||
"""Rename file in dir with latest change date.
|
||||
|
||||
Args:
|
||||
path (str): Dir to check
|
||||
filename (str): Name of file
|
||||
pattern (str, optional): File pattern. Defaults to "*.xml".
|
||||
"""
|
||||
list_of_files = [os.path.join(path, file) for file in glob.glob1(path, pattern)]
|
||||
latest_download = max(list_of_files, key=os.path.getctime)
|
||||
os.rename(latest_download, os.path.join(path, filename))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Main procedure: scrape the register once per company name listed in the
    # "Toplist" sheet, spreading the queries over a pool of worker processes.
    import pandas as pd

    source_file = "./data/study_id42887_top-100-unternehmen-deutschland.xlsx"
    df_relevant_companies = pd.read_excel(
        source_file, sheet_name="Toplist", skiprows=1
    )
    # Keep only the rows that actually carry a company name.
    has_name = df_relevant_companies["Name"].notna()
    df_relevant_companies = df_relevant_companies[has_name]

    batch_size = 5
    pool = multiprocessing.Pool(processes=batch_size)

    # One (query, download_dir) argument tuple per company; each company gets
    # its own download directory named after the stripped query.
    params = []
    for query in df_relevant_companies.Name:
        target_dir = ["data", "Unternehmensregister", "scraping", query.strip()]
        params.append((query, target_dir))

    # Run scrape(query, download_dir) for every tuple; blocks until all finish.
    pool.starmap(scrape, params)

    # No further tasks will be submitted; wait for the workers to exit.
    pool.close()
    pool.join()
|
4322
Jupyter/API-tests/Unternehmensregister/notebook.ipynb
Normal file
4322
Jupyter/API-tests/Unternehmensregister/notebook.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
10
Jupyter/API-tests/Unternehmensregister/requirements.txt
Normal file
10
Jupyter/API-tests/Unternehmensregister/requirements.txt
Normal file
@ -0,0 +1,10 @@
|
||||
ocrmypdf
|
||||
pytesseract
|
||||
opencv-python
|
||||
pdf2image
|
||||
bs4
|
||||
selenium
|
||||
xmltodict
|
||||
tqdm
|
||||
openpyxl
|
||||
pandas
|
28
Jupyter/API-tests/docker-compose.yml
Normal file
28
Jupyter/API-tests/docker-compose.yml
Normal file
@ -0,0 +1,28 @@
|
||||
# Local development stack: a MongoDB instance plus the mongo-express web UI.
# NOTE(review): the root credentials below are committed in plain text and
# both ports are published on the host — treat this file as dev-only and
# confirm it is never used for a reachable deployment.
version: '3.8'
services:
  mongodb:
    image: mongo:6.0.6
    container_name: mongodb
    restart: unless-stopped
    environment:
      MONGO_INITDB_ROOT_USERNAME: root
      MONGO_INITDB_ROOT_PASSWORD: pR0R0v2e2
      MONGO_INITDB_DATABASE: transparenzregister
    ports:
      - 27017:27017
    volumes:
      # Named volume so the database files survive container re-creation.
      - mongodb_data:/data/db

  mongo-express:
    image: mongo-express:1.0.0-alpha
    container_name: mongo-express
    restart: unless-stopped
    ports:
      - 8081:8081
    environment:
      # Must match the service name and root credentials of the mongodb
      # service above.
      ME_CONFIG_MONGODB_SERVER: mongodb
      ME_CONFIG_MONGODB_ADMINUSERNAME: root
      ME_CONFIG_MONGODB_ADMINPASSWORD: pR0R0v2e2

volumes:
  mongodb_data:
|
Binary file not shown.
1028
poetry.lock
generated
1028
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -30,9 +30,12 @@ version = "0.1.0"
|
||||
loguru = "^0.7.0"
|
||||
matplotlib = "^3.7.1"
|
||||
plotly = "^5.14.1"
|
||||
pymongo = "^4.4.1"
|
||||
python = "^3.11"
|
||||
seaborn = "^0.12.2"
|
||||
selenium = "^4.10.0"
|
||||
tqdm = "^4.65.0"
|
||||
types-tqdm = "^4.65.0"
|
||||
|
||||
[tool.poetry.group.develop.dependencies]
|
||||
black = {extras = ["jupyter"], version = "^23.3.0"}
|
||||
@ -100,8 +103,11 @@ target-version = "py311"
|
||||
# Avoid trying to fix flake8-bugbear (`B`) violations.
|
||||
unfixable = ["B"]
|
||||
|
||||
[tool.ruff.flake8-builtins]
|
||||
builtins-ignorelist = ["id"]
|
||||
|
||||
[tool.ruff.per-file-ignores]
|
||||
"tests/*.py" = ["S101"]
|
||||
"tests/*.py" = ["S101", "D100", "D101", "D107", "D103"]
|
||||
|
||||
[tool.ruff.pydocstyle]
|
||||
convention = "google"
|
||||
|
1
src/aki_prj23_transparenzregister/models/__init__.py
Normal file
1
src/aki_prj23_transparenzregister/models/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Model classes."""
|
68
src/aki_prj23_transparenzregister/models/company.py
Normal file
68
src/aki_prj23_transparenzregister/models/company.py
Normal file
@ -0,0 +1,68 @@
|
||||
"""Company model."""
|
||||
from abc import ABC
|
||||
from dataclasses import asdict, dataclass
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class RelationshipRoleEnum(Enum):
    """Role of the related party in a company relationship.

    Used as the type of ``CompanyRelationship.role``.
    """

    # NOTE(review): the empty string value looks like a placeholder — confirm
    # the intended value for stakeholders.
    STAKEHOLDER = ""
    ORGANISATION = "ORGANISATION"
|
||||
|
||||
|
||||
@dataclass
class CompanyID:
    """Identifier of a company, made up of two string parts."""

    # Name of the district court — presumably the court keeping the register
    # entry; confirm against the data source.
    district_court: str
    # Register number — presumably the Handelsregister (HR) number; confirm.
    hr_number: str
|
||||
|
||||
|
||||
@dataclass
class Location:
    """Postal location; only the city is mandatory."""

    city: str
    # The remaining address parts are optional and default to None.
    street: str | None = None
    house_number: str | None = None
    zip_code: str | None = None
|
||||
|
||||
|
||||
@dataclass
class CompanyRelationship(ABC):
    """Base class for a relationship attached to a company.

    Derives from ``ABC`` but declares no abstract methods itself.
    """

    # Role of the related party.
    role: RelationshipRoleEnum
    # Location of the related party — presumably its address; confirm.
    location: Location
|
||||
|
||||
|
||||
@dataclass
class Company:
    """A company with its identifier, location and relationships."""

    id: CompanyID
    location: Location
    name: str
    # Kept as a plain string — format not visible here; TODO confirm.
    last_update: str
    relationships: list[CompanyRelationship]

    def to_dict(self) -> dict:
        """Convert the company into a plain dictionary.

        Returns:
            dict: Result of ``dataclasses.asdict``, i.e. nested dataclasses
                such as ``id`` and ``location`` are converted to dicts too.
        """
        return asdict(self)
|
25
src/aki_prj23_transparenzregister/models/news.py
Normal file
25
src/aki_prj23_transparenzregister/models/news.py
Normal file
@ -0,0 +1,25 @@
|
||||
"""News model."""
|
||||
from dataclasses import asdict, dataclass
|
||||
|
||||
|
||||
@dataclass
class News:
    """A single news article; all fields are plain strings."""

    id: str
    title: str
    # Kept as a plain string — format not visible here; TODO confirm.
    date: str
    text: str
    source_url: str

    def to_dict(self) -> dict:
        """Convert the news entry into a plain dictionary.

        Returns:
            dict: Result of ``dataclasses.asdict`` with one key per field.
        """
        return asdict(self)
|
1
src/aki_prj23_transparenzregister/utils/__init__.py
Normal file
1
src/aki_prj23_transparenzregister/utils/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Util classes and services."""
|
@ -0,0 +1,49 @@
|
||||
"""CompanyMongoService."""
|
||||
from aki_prj23_transparenzregister.models.company import Company, CompanyID
|
||||
from aki_prj23_transparenzregister.utils.mongo import MongoConnector
|
||||
|
||||
|
||||
class CompanyMongoService:
    """Access to the ``companies`` collection of a MongoDB database."""

    def __init__(self, connector: MongoConnector):
        """Bind the service to the ``companies`` collection.

        Args:
            connector (MongoConnector): Connector whose ``database`` holds
                the collection
        """
        self.collection = connector.database["companies"]

    def get_all(self) -> list[Company]:
        """Fetch every document of the collection.

        Returns:
            list[Company]: All stored entries. The documents are returned as
                delivered by the collection (no conversion to ``Company``
                objects takes place).
        """
        result = self.collection.find()
        return list(result)

    def get_by_id(self, id: CompanyID) -> Company | None:
        """Fetch the entry whose ``id`` field equals the given identifier.

        Args:
            id (CompanyID): Identifier to look for. Dataclass instances are
                converted to a dict, matching the shape ``insert`` stores;
                any other value is used in the query unchanged.

        Returns:
            Company | None: The matching document if exactly one entry
                matches, otherwise None
        """
        # Imported locally so the module's top-level imports stay untouched.
        from dataclasses import asdict, is_dataclass

        # A dataclass instance cannot be encoded as a query value, and stored
        # documents hold the id as a nested dict (see Company.to_dict).
        if is_dataclass(id) and not isinstance(id, type):
            id_value = asdict(id)
        else:
            id_value = id
        result = list(self.collection.find({"id": id_value}))
        if len(result) == 1:
            return result[0]
        return None

    def insert(self, company: Company):
        """Store a company as a new document.

        Args:
            company (Company): Company to store; serialised via ``to_dict``

        Returns:
            The return value of the collection's ``insert_one`` call.
        """
        return self.collection.insert_one(company.to_dict())
|
47
src/aki_prj23_transparenzregister/utils/mongo.py
Normal file
47
src/aki_prj23_transparenzregister/utils/mongo.py
Normal file
@ -0,0 +1,47 @@
|
||||
"""Mongo Wrapper."""
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pymongo
|
||||
|
||||
|
||||
@dataclass
|
||||
class MongoConnection:
|
||||
"""_summary_."""
|
||||
|
||||
hostname: str
|
||||
database: str
|
||||
port: int | None
|
||||
username: str | None
|
||||
password: str | None
|
||||
|
||||
def get_conn_string(self) -> str:
|
||||
"""Transforms the information of the object to a MongoDB connection string.
|
||||
|
||||
Returns:
|
||||
str: Connection string
|
||||
"""
|
||||
if self.username is not None and self.password is not None:
|
||||
connection_string = (
|
||||
f"mongodb+srv://{self.username}:{self.password}@{self.hostname}"
|
||||
)
|
||||
else:
|
||||
connection_string = f"mongodb+srv://{self.hostname}"
|
||||
if self.port is not None:
|
||||
connection_string += f":{self.port}"
|
||||
connection_string = connection_string.replace("mongodb+srv", "mongodb")
|
||||
return connection_string
|
||||
|
||||
|
||||
class MongoConnector:
    """Holds a MongoDB client together with the selected database."""

    def __init__(self, connection: MongoConnection):
        """Open a client for the given connection and select its database.

        Args:
            connection (MongoConnection): Supplies the connection string and
                the name of the database to select
        """
        conn_string = connection.get_conn_string()
        self.client: pymongo.MongoClient = pymongo.MongoClient(conn_string)
        # Item access on the client yields the database of that name.
        self.database = self.client[connection.database]
|
@ -0,0 +1,94 @@
|
||||
"""MongoNewsService."""
|
||||
from aki_prj23_transparenzregister.models.news import News
|
||||
from aki_prj23_transparenzregister.utils.mongo import MongoConnector
|
||||
|
||||
|
||||
class MongoNewsService:
    """Access to the ``news`` collection of a MongoDB database."""

    def __init__(self, connector: MongoConnector):
        """Bind the service to the ``news`` collection.

        Args:
            connector (MongoConnector): Connector whose ``database`` holds
                the collection
        """
        self.collection = connector.database["news"]

    def get_all(self) -> list[News]:
        """Fetch every stored news entry.

        Returns:
            list[News]: One ``News`` object per document in the collection
        """
        entries = []
        for document in self.collection.find():
            entries.append(MongoEntryTransformer.transform_outgoing(document))
        return entries

    def get_by_id(self, id: str) -> News | None:
        """Fetch the news entry stored under the given ``_id``.

        Args:
            id (str): Value of the ``_id`` field to look for

        Returns:
            News | None: The entry if exactly one document matches,
                otherwise None
        """
        matches = list(self.collection.find({"_id": id}))
        if len(matches) != 1:
            return None
        return MongoEntryTransformer.transform_outgoing(matches[0])

    def insert(self, news: News):
        """Store a news entry as a new document.

        Args:
            news (News): Entry to store

        Returns:
            The return value of the collection's ``insert_one`` call.
        """
        document = MongoEntryTransformer.transform_ingoing(news)
        return self.collection.insert_one(document)
|
||||
|
||||
|
||||
class MongoEntryTransformer:
    """Converts between ``News`` objects and MongoDB documents.

    The only difference between the two shapes is the key of the identifier:
    ``id`` on the object, ``_id`` in the document.
    """

    @staticmethod
    def transform_ingoing(news: News) -> dict:
        """Convert a News object to a dictionary compatible with a MongoDB entry.

        Args:
            news (News): News object to be transformed

        Returns:
            dict: Transformed data with added _id field
        """
        document = news.to_dict()
        # Swap the "id" key for "_id", taking the value from the object.
        document.pop("id")
        document["_id"] = news.id
        return document

    @staticmethod
    def transform_outgoing(data: dict) -> News:
        """Reverse the transform_ingoing method.

        Args:
            data (dict): dict from the MongoDB to be transformed

        Returns:
            News: News entry based on MongoDB document
        """
        field_values = {
            "id": data["_id"],
            "title": data["title"],
            "date": data["date"],
            "text": data["text"],
            "source_url": data["source_url"],
        }
        return News(**field_values)
|
35
tests/models/company_test.py
Normal file
35
tests/models/company_test.py
Normal file
@ -0,0 +1,35 @@
|
||||
"""Test Models.company."""
|
||||
|
||||
|
||||
from aki_prj23_transparenzregister.models.company import Company, CompanyID, Location
|
||||
|
||||
|
||||
def test_to_dict() -> None:
    """Tests that Company.to_dict converts nested dataclasses into dicts."""
    company_id = CompanyID("The Shire", "420")
    location = Location(
        city="Insmouth", house_number="19", street="Harbor", zip_code="1890"
    )
    company = Company(
        id=company_id,
        last_update="Tomorrow",
        location=location,
        name="BLANK GmbH",
        relationships=[],
    )

    # id and location must come back as plain dictionaries, field by field.
    assert company.to_dict() == {
        "id": {
            "district_court": company_id.district_court,
            "hr_number": company_id.hr_number,
        },
        "last_update": company.last_update,
        "location": {
            "city": location.city,
            "house_number": location.house_number,
            "street": location.street,
            "zip_code": location.zip_code,
        },
        "name": "BLANK GmbH",
        "relationships": [],
    }
|
23
tests/models/news_test.py
Normal file
23
tests/models/news_test.py
Normal file
@ -0,0 +1,23 @@
|
||||
"""Test Models.news."""
|
||||
|
||||
|
||||
from aki_prj23_transparenzregister.models.news import News
|
||||
|
||||
|
||||
def test_to_dict() -> None:
    """Tests that News.to_dict returns one key per field with its value."""
    # Positional order follows the field order: id, title, date, text,
    # source_url.
    news = News(
        "4711",
        "Economy collapses",
        "2042",
        "Toilet paper prices rising",
        "https://www.google.com",
    )

    assert news.to_dict() == {
        "id": news.id,
        "title": news.title,
        "date": news.date,
        "text": news.text,
        "source_url": news.source_url,
    }
|
103
tests/utils/company_mongo_service_test.py
Normal file
103
tests/utils/company_mongo_service_test.py
Normal file
@ -0,0 +1,103 @@
|
||||
"""Test utils.company_mongo_service."""
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from aki_prj23_transparenzregister.models.company import Company
|
||||
from aki_prj23_transparenzregister.utils.company_mongo_service import (
|
||||
CompanyMongoService,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def mock_mongo_connector(mocker) -> Mock:
|
||||
"""Mock MongoConnector class.
|
||||
|
||||
Args:
|
||||
mocker (any): Library mocker
|
||||
|
||||
Returns:
|
||||
Mock: Mocked MongoConnector
|
||||
"""
|
||||
mock = Mock()
|
||||
mocker.patch(
|
||||
"aki_prj23_transparenzregister.utils.mongo.MongoConnector", return_value=mock
|
||||
)
|
||||
return mock
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def mock_collection() -> Mock:
|
||||
"""Mock mongo collection.
|
||||
|
||||
Returns:
|
||||
Mock: Mock object
|
||||
"""
|
||||
return Mock()
|
||||
|
||||
|
||||
def test_init(mock_mongo_connector, mock_collection):
|
||||
"""Test CompanyMongoService constructor.
|
||||
|
||||
Args:
|
||||
mock_mongo_connector (Mock): Mocked MongoConnector library
|
||||
mock_collection (Mock): Mocked pymongo collection
|
||||
"""
|
||||
mock_mongo_connector.database = {"companies": mock_collection}
|
||||
service = CompanyMongoService(mock_mongo_connector)
|
||||
assert service.collection == mock_collection
|
||||
|
||||
|
||||
def test_get_all(mock_mongo_connector, mock_collection):
|
||||
"""Test CompanyMongoService get_all method.
|
||||
|
||||
Args:
|
||||
mock_mongo_connector (Mock): Mocked MongoConnector library
|
||||
mock_collection (Mock): Mocked pymongo collection
|
||||
"""
|
||||
mock_mongo_connector.database = {"companies": mock_collection}
|
||||
service = CompanyMongoService(mock_mongo_connector)
|
||||
mock_result = [{"id": "42"}]
|
||||
mock_collection.find.return_value = mock_result
|
||||
assert service.get_all() == mock_result
|
||||
|
||||
|
||||
def test_by_id_no_result(mock_mongo_connector, mock_collection):
|
||||
"""Test CompanyMongoService get_by_id with no result.
|
||||
|
||||
Args:
|
||||
mock_mongo_connector (Mock): Mocked MongoConnector library
|
||||
mock_collection (Mock): Mocked pymongo collection
|
||||
"""
|
||||
mock_mongo_connector.database = {"companies": mock_collection}
|
||||
service = CompanyMongoService(mock_mongo_connector)
|
||||
mock_collection.find.return_value = []
|
||||
assert service.get_by_id("Does not exist") is None
|
||||
|
||||
|
||||
def test_by_id_result(mock_mongo_connector, mock_collection):
|
||||
"""Test CompanyMongoService get_by_id with result.
|
||||
|
||||
Args:
|
||||
mock_mongo_connector (Mock): Mocked MongoConnector library
|
||||
mock_collection (Mock): Mocked pymongo collection
|
||||
"""
|
||||
mock_mongo_connector.database = {"companies": mock_collection}
|
||||
service = CompanyMongoService(mock_mongo_connector)
|
||||
mock_entry = {"id": "Does exist", "vaue": 42}
|
||||
mock_collection.find.return_value = [mock_entry]
|
||||
assert service.get_by_id("Does exist") == mock_entry
|
||||
|
||||
|
||||
def test_insert(mock_mongo_connector, mock_collection):
|
||||
"""Test CompanyMongoService insert method.
|
||||
|
||||
Args:
|
||||
mock_mongo_connector (Mock): Mocked MongoConnector library
|
||||
mock_collection (Mock): Mocked pymongo collection
|
||||
"""
|
||||
mock_mongo_connector.database = {"companies": mock_collection}
|
||||
service = CompanyMongoService(mock_mongo_connector)
|
||||
mock_result = 42
|
||||
mock_collection.insert_one.return_value = mock_result
|
||||
assert service.insert(Company(None, None, "", "", [])) == mock_result
|
26
tests/utils/mongo_test.py
Normal file
26
tests/utils/mongo_test.py
Normal file
@ -0,0 +1,26 @@
|
||||
from unittest.mock import patch
|
||||
|
||||
from aki_prj23_transparenzregister.utils.mongo import MongoConnection, MongoConnector
|
||||
|
||||
|
||||
def test_get_conn_string_no_credentials():
|
||||
conn = MongoConnection("localhost", "", 27017, None, None)
|
||||
assert conn.get_conn_string() == "mongodb://localhost:27017"
|
||||
|
||||
|
||||
def test_get_conn_string_no_port_but_credentials():
|
||||
conn = MongoConnection("localhost", "", None, "admin", "password")
|
||||
assert conn.get_conn_string() == "mongodb+srv://admin:password@localhost"
|
||||
|
||||
|
||||
def test_get_conn_simple():
|
||||
conn = MongoConnection("localhost", "", None, None, None)
|
||||
assert conn.get_conn_string() == "mongodb+srv://localhost"
|
||||
|
||||
|
||||
def test_mongo_connector():
|
||||
with patch("pymongo.MongoClient") as mock_mongo_client:
|
||||
expected_result = 42
|
||||
mock_mongo_client.return_value = {"db": expected_result}
|
||||
temp = MongoConnector(MongoConnection("localhost", "db", None, None, None))
|
||||
assert temp.database == expected_result
|
115
tests/utils/news_mongo_service_test.py
Normal file
115
tests/utils/news_mongo_service_test.py
Normal file
@ -0,0 +1,115 @@
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from aki_prj23_transparenzregister.models.news import News
|
||||
from aki_prj23_transparenzregister.utils.news_mongo_service import (
|
||||
MongoEntryTransformer,
|
||||
MongoNewsService,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def mock_mongo_connector(mocker) -> Mock:
|
||||
"""Mock MongoConnector class.
|
||||
|
||||
Args:
|
||||
mocker (any): Library mocker
|
||||
|
||||
Returns:
|
||||
Mock: Mocked MongoConnector
|
||||
"""
|
||||
mock = Mock()
|
||||
mocker.patch(
|
||||
"aki_prj23_transparenzregister.utils.mongo.MongoConnector", return_value=mock
|
||||
)
|
||||
return mock
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def mock_collection() -> Mock:
|
||||
"""Mock mongo collection.
|
||||
|
||||
Returns:
|
||||
Mock: Mock object
|
||||
"""
|
||||
return Mock()
|
||||
|
||||
|
||||
def test_init(mock_mongo_connector, mock_collection):
    """Test MongoNewsService constructor.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    # The constructor must pick the "news" entry of the connector's database.
    mock_mongo_connector.database = {"news": mock_collection}
    service = MongoNewsService(mock_mongo_connector)
    assert service.collection == mock_collection
|
||||
|
||||
|
||||
def test_get_all(mock_mongo_connector, mock_collection):
|
||||
mock_mongo_connector.database = {"news": mock_collection}
|
||||
service = MongoNewsService(mock_mongo_connector)
|
||||
|
||||
mock_collection.find.return_value = []
|
||||
assert service.get_all() == []
|
||||
|
||||
|
||||
def test_get_by_id_with_result(mock_mongo_connector, mock_collection):
|
||||
mock_mongo_connector.database = {"news": mock_collection}
|
||||
service = MongoNewsService(mock_mongo_connector)
|
||||
|
||||
with patch(
|
||||
"aki_prj23_transparenzregister.utils.news_mongo_service.MongoEntryTransformer.transform_outgoing"
|
||||
) as mock_out:
|
||||
mock_collection.find.return_value = [{}]
|
||||
mock_out.return_value = {}
|
||||
assert service.get_by_id("foadh") == {}
|
||||
|
||||
|
||||
def test_get_by_id_no_result(mock_mongo_connector, mock_collection):
|
||||
mock_mongo_connector.database = {"news": mock_collection}
|
||||
service = MongoNewsService(mock_mongo_connector)
|
||||
|
||||
mock_collection.find.return_value = []
|
||||
assert service.get_by_id("foadh") is None
|
||||
|
||||
|
||||
def test_insert(mock_mongo_connector, mock_collection):
|
||||
mock_mongo_connector.database = {"news": mock_collection}
|
||||
service = MongoNewsService(mock_mongo_connector)
|
||||
|
||||
with patch(
|
||||
"aki_prj23_transparenzregister.utils.news_mongo_service.MongoEntryTransformer.transform_ingoing"
|
||||
) as mock_in:
|
||||
mock_collection.insert_one.return_value = {}
|
||||
mock_in.return_value = {}
|
||||
assert service.insert({}) == {}
|
||||
|
||||
|
||||
def test_transform_ingoing():
|
||||
news = News("42", None, None, None, None)
|
||||
result = MongoEntryTransformer.transform_ingoing(news)
|
||||
assert result["_id"] == "42"
|
||||
assert "id" not in result
|
||||
|
||||
|
||||
def test_transform_outgoing():
|
||||
data = {
|
||||
"_id": "4711",
|
||||
"title": "Hello",
|
||||
"date": "Today",
|
||||
"text": "World",
|
||||
"source_url": "chat.openai.com",
|
||||
}
|
||||
expected_result = News(
|
||||
**{
|
||||
"id": "4711",
|
||||
"title": "Hello",
|
||||
"date": "Today",
|
||||
"text": "World",
|
||||
"source_url": "chat.openai.com",
|
||||
}
|
||||
)
|
||||
assert MongoEntryTransformer.transform_outgoing(data) == expected_result
|
Loading…
x
Reference in New Issue
Block a user