Merge pull request #39 from fhswf/feature/data-extraktion

Feature/data extraktion
This commit is contained in:
Tristan Nolde 2023-07-20 17:11:50 +02:00 committed by GitHub
commit ebedf7c630
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 10277 additions and 4251 deletions

View File

@ -12,7 +12,7 @@ repos:
- id: check-xml
- id: check-ast
- id: check-added-large-files
args: [--enforce-all]
args: [--enforce-all, --maxkb=50000]
- id: name-tests-test
- id: detect-private-key
- id: check-case-conflict

3
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,3 @@
{
"files.eol": "\n"
}

View File

@ -8,14 +8,6 @@
"# Daten Extraktion aus dem Bundesanzeiger"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"In order to run this notebook, download the `deutschland` library source code from: [TrisNol/deutschland](https://github.com/TrisNol/deutschland/tree/feat/bundesanzeiger-raw-report) and place it in the `Jupyter/API-tests/Bundesanzeiger/deutschland` directory. Since the PR adding the required features to the main repo has not been completed yet (see: [PR](https://github.com/bundesAPI/deutschland/pull/88)), we have to include it in another way..."
]
},
{
"attachments": {},
"cell_type": "markdown",
@ -26,18 +18,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\trist\\AppData\\Roaming\\Python\\Python310\\site-packages\\requests\\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (5.1.0)/charset_normalizer (2.0.12) doesn't match a supported version!\n",
" warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n"
]
}
],
"outputs": [],
"source": [
"import pandas as pd\n",
"from deutschland.bundesanzeiger import Bundesanzeiger"
@ -45,26 +28,28 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_keys(['040860c00ef9020cfb6db2f58a163256', '9eb401c5af2f0289b8207233bf852b81', 'b14d979ea9f42367d413589050bd04e5', 'ecb8c1011456ea0d40f87e850fc216bf', '3a7c6c1f1d1b89bf5ceb165f0ee88053', '03b2e6aac2f2da2c0c5e8f23de9caec4', 'a5f8dc87fa797e7d2f8fb88c49a23c36', '9a3a8a3e84290ee650cbccf32323b3d7', '6c0fcc20a58aaa18a9d13f35a51e3996', 'bf276d441c339e787e22385d2b69b277', '90a79d28f3c11a2122d2827d2bf6acda', '88c785ce3b3c580dcc285661c7790cca', 'd3064baa8246c3ed02e30b5038200edc', '5bf92eed2808b484c005409764b825b7', 'fece6303c991a280850be1900ff78f8f', '26b0624c60cdbf647f3d45f4917ec6ea', '9f98bee55f598908cca60b6a47e5d49d', '99267bb7474e6d1d5d9e091ba5ef3ee8', '102738ef4b91408ed043d84fe785b50b', '94711f3e509518d073e1760d97550347'])\n"
"dict_keys(['c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n"
]
}
],
"source": [
"ba = Bundesanzeiger()\n",
"reports = ba.get_reports(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
"reports = ba.get_reports(\n",
" \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n",
") # \"Atos IT-Dienstleistung und Beratung GmbH\")\n",
"print(reports.keys())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
@ -75,7 +60,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 35,
"metadata": {},
"outputs": [
{
@ -109,42 +94,18 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-03-17</td>\n",
" <td>Aufsichtsrat</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
" <td>2023-05-25</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2022-03-25</td>\n",
" <td>2023-05-24</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2021-03-11</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2020-03-24</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2018-12-11</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" </tbody>\n",
@ -153,35 +114,23 @@
],
"text/plain": [
" date name \\\n",
"0 2023-03-17 Aufsichtsrat \n",
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"\n",
" company \\\n",
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
" company \\\n",
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"\n",
" report \\\n",
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"\n",
" raw_report \n",
"0 <div class=\"publication_container\">\\n <div cla... \n",
"1 <div class=\"publication_container\">\\n <div cla... \n",
"2 <div class=\"publication_container\">\\n <div cla... \n",
"3 <div class=\"publication_container\">\\n <div cla... \n",
"4 <div class=\"publication_container\">\\n <div cla... "
"1 <div class=\"publication_container\">\\n <div cla... "
]
},
"execution_count": 8,
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
@ -193,7 +142,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 36,
"metadata": {},
"outputs": [
{
@ -228,46 +177,19 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-03-17</td>\n",
" <td>Aufsichtsrat</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
" <td>2023-05-25</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Aufsichtsrat</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2022-03-25</td>\n",
" <td>2023-05-24</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2021-03-11</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2020-03-24</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2018-12-11</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
@ -277,35 +199,23 @@
],
"text/plain": [
" date name \\\n",
"0 2023-03-17 Aufsichtsrat \n",
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"\n",
" company \\\n",
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
" company \\\n",
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"\n",
" report \\\n",
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"\n",
" raw_report type \n",
"0 <div class=\"publication_container\">\\n <div cla... Aufsichtsrat \n",
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
"2 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
"3 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
"4 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
"0 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
]
},
"execution_count": 9,
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
@ -317,21 +227,9 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_6460\\963182859.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_jahresabschluss['jahr'] = df_jahresabschluss.name.apply(\n"
]
},
{
"data": {
"text/html": [
@ -361,61 +259,34 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2022-03-25</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <th>0</th>\n",
" <td>2023-05-25</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2020</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2021-03-11</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <th>1</th>\n",
" <td>2023-05-24</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2019</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2020-03-24</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2018-12-11</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2017</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2018-01-03</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2016</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date company \\\n",
"1 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
"2 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
"3 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
"4 2018-12-11 Atos IT-Dienstleistung und Beratung GmbH \n",
"6 2018-01-03 Atos IT-Dienstleistung und Beratung GmbH \n",
" date company \\\n",
"0 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"1 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"\n",
" raw_report jahr \n",
"1 <div class=\"publication_container\">\\n <div cla... 2020 \n",
"2 <div class=\"publication_container\">\\n <div cla... 2019 \n",
"3 <div class=\"publication_container\">\\n <div cla... 2018 \n",
"4 <div class=\"publication_container\">\\n <div cla... 2017 \n",
"6 <div class=\"publication_container\">\\n <div cla... 2016 "
"0 <div class=\"publication_container\">\\n <div cla... 2020 \n",
"1 <div class=\"publication_container\">\\n <div cla... 2019 "
]
},
"execution_count": 10,
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
@ -439,7 +310,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
@ -449,11 +320,12 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"sample_report = df_jahresabschluss.iloc[0].raw_report"
"sample_report = df_jahresabschluss.iloc[0].raw_report\n",
"sample_report_content = df_jahresabschluss.iloc[0].raw_report"
]
},
{
@ -466,45 +338,20 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from dataclasses import dataclass\n",
"\n",
"\n",
"@dataclass\n",
"class Auditor:\n",
" name: str\n",
" company: str\n",
"\n",
"\n",
"def extract_auditors(report: str) -> list:\n",
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
" hits = re.findall(auditor_regex, report)\n",
" return [hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip() for hit in hits]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Eckhard Lewe', 'Renate Hermsdorf']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_auditors(sample_report)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def extract_auditor_company(report: str) -> str:\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" temp = soup.find_all(\"b\")\n",
@ -512,27 +359,37 @@
" br = elem.findChildren(\"br\")\n",
" if len(br) > 0:\n",
" return elem.text.split(\"\\n\")[1].strip()\n",
" return None"
" return None\n",
"\n",
"\n",
"def extract_auditors(report: str) -> list:\n",
" auditor_company = extract_auditor_company(report)\n",
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
" hits = re.findall(auditor_regex, report)\n",
" return [\n",
" Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
" for hit in hits\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Warth & Klein Grant Thornton AG'"
"[]"
]
},
"execution_count": 17,
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_auditor_company(sample_report)"
"extract_auditors(sample_report)"
]
},
{
@ -561,97 +418,177 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Anhang</th>\n",
" <th>2020 TEUR</th>\n",
" <th>Vorjahr TEUR</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1. Umsatzerlöse</td>\n",
" <td>(1)</td>\n",
" <td>69.819</td>\n",
" <td>77.429</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2. Veränderung des Bestandes an unfertigen Lei...</td>\n",
" <td>NaN</td>\n",
" <td>-41.000</td>\n",
" <td>-66.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3. Sonstige betriebliche Erträge</td>\n",
" <td>(2)</td>\n",
" <td>489.000</td>\n",
" <td>1.816</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4. Materialaufwand</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>a) Aufwendungen für bezogene Waren</td>\n",
" <td>NaN</td>\n",
" <td>-1.220</td>\n",
" <td>-3.003</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Anhang 2020 TEUR \\\n",
"0 1. Umsatzerlöse (1) 69.819 \n",
"1 2. Veränderung des Bestandes an unfertigen Lei... NaN -41.000 \n",
"2 3. Sonstige betriebliche Erträge (2) 489.000 \n",
"3 4. Materialaufwand NaN NaN \n",
"4 a) Aufwendungen für bezogene Waren NaN -1.220 \n",
"\n",
" Vorjahr TEUR \n",
"0 77.429 \n",
"1 -66.000 \n",
"2 1.816 \n",
"3 NaN \n",
"4 -3.003 "
"{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}"
]
},
"execution_count": 18,
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def extract_kpis(report_content) -> dict:\n",
" \"\"\"\n",
" Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
" Extracts Key Performance Indicators (KPIs) from the financial reports.\n",
" Args:\n",
" report_content (str): Plain text of a single financial report (HTML already stripped).\n",
" Returns:\n",
" dict: The extracted KPIs, mapping each KPI name (e.g. `net_income`) to its parsed numeric value; KPIs without a match are omitted.\n",
" \"\"\"\n",
"\n",
" kpis = {}\n",
"\n",
" # Define KPI patterns to search for\n",
" kpi_patterns = {\n",
" \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" }\n",
"\n",
" report_kpis = {}\n",
" for kpi, pattern in kpi_patterns.items():\n",
" match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
" if match:\n",
" value = match.group(1)\n",
"\n",
" # Clean and validate the extracted number\n",
" try:\n",
" if not value: # Check if value is empty\n",
" cleaned_value = None\n",
" else:\n",
" multiplier = 1\n",
" if value[-1].lower() == \"m\":\n",
" value = value[:-1]\n",
" multiplier = 1_000_000\n",
" elif value[-1].lower() == \"b\":\n",
" value = value[:-1]\n",
" multiplier = 1_000_000_000\n",
"\n",
" # Convert German number format: strip '.' thousands separators, turn the decimal ',' into '.'\n",
" value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
" cleaned_value = float(value) * multiplier\n",
" except ValueError:\n",
" cleaned_value = None\n",
"\n",
" if cleaned_value is not None:\n",
" report_kpis[kpi] = cleaned_value\n",
" return report_kpis\n",
"\n",
"\n",
"extract_kpis(\n",
" BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"with open(\"./temp.txt\", \"w\") as file:\n",
" file.write(\n",
" BeautifulSoup(sample_report, features=\"html.parser\")\n",
" .get_text()\n",
" .replace(\"\\n\", \" \")\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
" ('Aktiva', '31.12.2020 EUR'),\n",
" ('Aktiva', '31.12.2019 EUR')],\n",
" )\n",
"Aktiva Unnamed: 0_level_1 object\n",
" 31.12.2020 EUR object\n",
" 31.12.2019 EUR object\n",
"dtype: object\n",
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
" ('Passiva', '31.12.2020 EUR'),\n",
" ('Passiva', '31.12.2019 EUR')],\n",
" )\n",
"Passiva Unnamed: 0_level_1 object\n",
" 31.12.2020 EUR object\n",
" 31.12.2019 EUR object\n",
"dtype: object\n",
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
"Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
"Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
"dtype: object\n"
]
},
{
"data": {
"text/plain": [
"{}"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def parse_tables(report: str) -> list:\n",
" result = {}\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
" df = pd.read_html(StringIO(str(table)))[0]\n",
" print(df.columns)\n",
" print(df.dtypes)\n",
" return result\n",
"\n",
"\n",
"parse_tables(sample_report)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'Passiva'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n",
"\u001b[1;31mKeyError\u001b[0m: 'Passiva'"
]
}
],
"source": [
"def get_bilanz(report: str) -> any:\n",
" result = {}\n",
@ -672,30 +609,30 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Gesellschafterbeschluss', 'Shareholder Resolution'], dtype='object')\n",
"Index(['Unnamed: 0', 'Anhang', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'Anhang', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2020 TEUR',\n",
"Int64Index([0, 1], dtype='int64')\n",
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
" 'Vorjahr TEUR'],\n",
" dtype='object')\n",
"Index(['Aufgliederung nach Inland und Ausland', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2020'], dtype='object')\n",
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Int64Index([0, 1, 2], dtype='int64')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '2020 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
@ -707,24 +644,23 @@
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Abschreibungen', 'Stand 01.01.2020 EUR'),\n",
" ( 'Abschreibungen', 'Abschreibungen des Geschäftsjahres EUR'),\n",
" ( 'Abschreibungen', 'Abgänge Umbuchung EUR'),\n",
" ( 'Abschreibungen', 'Stand 31.12.2020 EUR')],\n",
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Buchwerte', 'Stand 31.12.2020 EUR'),\n",
" ( 'Buchwerte', 'Stand 31.12.2019 EUR')],\n",
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
" )\n",
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2020', '2019',\n",
" '2018'],\n",
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
" '2019'],\n",
" dtype='object')\n",
"Index(['Gewinn- und Verlustrechnung', '2020 TEUR', 'Vorjahr TEUR',\n",
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
" 'Veränderung TEUR'],\n",
" dtype='object')\n",
"Index(['Bilanz', '31.12.2020 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n",
"Index(['Bericht des Aufsichtsrats', 'Report of the Supervisory Board'], dtype='object')\n"
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
]
}
],

1
Jupyter/API-tests/News/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
data/

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
pymongo

View File

@ -0,0 +1 @@
data/*

View File

@ -0,0 +1,192 @@
"""Unternehmensregister Scraping."""
import glob
import logging
import multiprocessing
import os
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
# Root logger (no name is passed); scrape() reports failed downloads through it.
logger = logging.getLogger()
def scrape(query: str, download_dir: list[str]):
    """Fetch results from Unternehmensregister for given query.

    Drives a headless Chrome session: submits the search, walks every result
    page and, for each company that has not been handled yet, requests the
    "SI" document and stores it in the download directory as
    ``<alphanumeric characters of the company name>.xml``.

    A failure while waiting for or renaming a single download is logged and the
    scraper moves on to the next company. The browser session is always shut
    down when the function returns or raises.

    Args:
        query (str): Search Query (RegEx supported)
        download_dir (list[str]): Directory to place output files in, given as
            path segments relative to the current working directory
    """
    download_path = os.path.join(str(Path.cwd()), *download_dir)
    options = webdriver.ChromeOptions()
    preferences = {
        "profile.default_content_settings.popups": 0,
        "safebrowsing.enabled": True,
        "download": {
            "directory_upgrade": True,
            "prompt_for_download": False,
            "extensions_to_open": "",
            "default_directory": download_path,
        },
    }
    options.add_argument("--headless=new")
    options.add_experimental_option("prefs", preferences)

    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://www.unternehmensregister.de/ureg/")
        # Accept Cookies
        driver.find_elements(
            By.XPATH, '//button[text()="Nur technisch notwendige Cookies akzeptieren"]'
        )[0].click()
        # Enter search query
        driver.find_elements(By.ID, "globalSearchForm:extendedResearchCompanyName")[
            0
        ].send_keys(query)
        # Trigger search
        driver.find_elements(By.ID, "globalSearchForm:btnExecuteSearchOld")[0].click()
        # Wait for results: the URL changes once the result list is shown
        wait = WebDriverWait(driver, 15)
        wait.until(
            lambda driver: driver.current_url
            != "https://www.unternehmensregister.de/ureg/"
        )

        num_pages = int(
            driver.find_element(By.XPATH, '//*[@class="page_count"]').text.split(" ")[0]
        )

        processed_companies = []

        for _ in tqdm(range(num_pages)):
            # Find all "Registerinformationen"
            companies_tab = driver.find_elements(
                By.LINK_TEXT, "Registerinformationen des Registergerichts"
            )
            company_names = [
                elem.text
                for elem in driver.find_elements(
                    By.XPATH, '//div[@class="company_result"]/span/b'
                )
            ]
            for index, company_link in enumerate(companies_tab):
                company_name = company_names[index]
                if company_name in processed_companies:
                    continue
                # Go to intermediary page
                company_link.click()
                # Trigger next redirect
                driver.find_element(
                    By.LINK_TEXT, "Registerinformationen anzeigen"
                ).click()
                # Trigger SI download
                driver.find_element(By.LINK_TEXT, "SI").click()
                # Show shopping cart
                wait.until(
                    ec.visibility_of_element_located(
                        (By.LINK_TEXT, "Dokumentenkorb ansehen")
                    )
                )
                driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
                # Get document
                elems = driver.find_elements(By.TAG_NAME, "input")
                elems[-2].click()

                wait.until(
                    ec.visibility_of_element_located(
                        (By.ID, "paymentFormOverview:btnNext")
                    )
                )
                driver.find_element(By.ID, "paymentFormOverview:btnNext").click()

                wait.until(
                    ec.visibility_of_element_located(
                        (By.LINK_TEXT, "Zum Dokumentenkorb")
                    )
                )
                driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()

                # Remember the file count before the click so the new file can
                # be detected afterwards.
                num_files = get_num_files(download_path)
                driver.find_element(By.CLASS_NAME, "download-wrapper").click()

                try:
                    # WebDriverWait.until() needs a callable that it can poll;
                    # passing the bool result directly raises a TypeError.
                    wait.until(
                        lambda _driver: wait_for_download_condition(
                            download_path, num_files
                        )
                    )
                    file_name = "".join(e for e in company_name if e.isalnum()) + ".xml"
                    rename_latest_file(
                        download_path,
                        file_name,
                    )
                    processed_companies.append(company_name)
                except Exception:
                    # Best effort: keep going with the next company, but keep
                    # the traceback so the failure can be diagnosed.
                    logger.warning("Exception caught in Scraping", exc_info=True)
                finally:
                    # Step back through the visited pages to the result list
                    for _ in range(6):
                        driver.back()
            # Advance to the next result page
            driver.find_element(By.XPATH, '//*[@class="fas fa-angle-right"]').click()
    finally:
        # quit() ends the whole browser session, also when scraping fails
        driver.quit()
def wait_for_download_condition(
path: str, num_files: int, pattern: str = "*.xml"
) -> bool:
"""Selenium wait condition monitoring number of files in a dir.
Args:
path (str): Directory path
num_files (int): Current number of file
pattern (str, optional): File pattern. Defaults to "*.xml".
Returns:
bool: Current num file exceeded
"""
return len(glob.glob1(path, pattern)) > num_files
def get_num_files(path: str, pattern: str = "*.xml") -> int:
"""Get number of files in directory.
Args:
path (str): Directory to scan
pattern (str, optional): File pattern. Defaults to "*.xml".
Returns:
int: Number of files matching pattern
"""
return len(glob.glob1(path, pattern))
def rename_latest_file(path: str, filename: str, pattern: str = "*.xml"):
"""Rename file in dir with latest change date.
Args:
path (str): Dir to check
filename (str): Name of file
pattern (str, optional): File pattern. Defaults to "*.xml".
"""
list_of_files = [os.path.join(path, file) for file in glob.glob1(path, pattern)]
latest_download = max(list_of_files, key=os.path.getctime)
os.rename(latest_download, os.path.join(path, filename))
if __name__ == "__main__":
    # Main procedure: run `scrape` once per company name found in the
    # "Toplist" sheet, spread over a small pool of worker processes.
    import pandas as pd

    df_relevant_companies = pd.read_excel(
        "./data/study_id42887_top-100-unternehmen-deutschland.xlsx",
        sheet_name="Toplist",
        skiprows=1,
    )
    # Rows without a company name cannot be used as a search query.
    df_relevant_companies = df_relevant_companies[df_relevant_companies["Name"].notna()]

    batch_size = 5
    # One argument tuple per company: the query itself and the path segments
    # of the per-company target directory.
    params = [
        (query, ["data", "Unternehmensregister", "scraping", query.strip()])
        for query in df_relevant_companies.Name
    ]

    # The context manager tears the workers down even if a task raises,
    # so no orphaned processes are left behind.
    with multiprocessing.Pool(processes=batch_size) as pool:
        # Map the scrape function to the parameter list using the Pool
        pool.starmap(scrape, params)
        # Close the Pool to prevent any more tasks from being submitted
        pool.close()
        # Wait for all the processes to complete
        pool.join()

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,10 @@
ocrmypdf
pytesseract
opencv-python
pdf2image
bs4
selenium
xmltodict
tqdm
openpyxl
pandas

View File

@ -0,0 +1,28 @@
version: '3.8'
services:
  mongodb:
    image: mongo:6.0.6
    container_name: mongodb
    restart: unless-stopped
    environment:
      # Credentials can be overridden via the environment or an .env file;
      # the defaults keep the previous local-development values.
      MONGO_INITDB_ROOT_USERNAME: ${MONGO_ROOT_USERNAME:-root}
      MONGO_INITDB_ROOT_PASSWORD: ${MONGO_ROOT_PASSWORD:-pR0R0v2e2}
      MONGO_INITDB_DATABASE: transparenzregister
    ports:
      - 27017:27017
    volumes:
      - mongodb_data:/data/db
  mongo-express:
    image: mongo-express:1.0.0-alpha
    container_name: mongo-express
    restart: unless-stopped
    ports:
      - 8081:8081
    environment:
      ME_CONFIG_MONGODB_SERVER: mongodb
      # Must match the root credentials of the mongodb service above.
      ME_CONFIG_MONGODB_ADMINUSERNAME: ${MONGO_ROOT_USERNAME:-root}
      ME_CONFIG_MONGODB_ADMINPASSWORD: ${MONGO_ROOT_PASSWORD:-pR0R0v2e2}
volumes:
  mongodb_data:

1028
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -30,9 +30,12 @@ version = "0.1.0"
loguru = "^0.7.0"
matplotlib = "^3.7.1"
plotly = "^5.14.1"
pymongo = "^4.4.1"
python = "^3.11"
seaborn = "^0.12.2"
selenium = "^4.10.0"
tqdm = "^4.65.0"
types-tqdm = "^4.65.0"
[tool.poetry.group.develop.dependencies]
black = {extras = ["jupyter"], version = "^23.3.0"}
@ -100,8 +103,11 @@ target-version = "py311"
# Avoid trying to fix flake8-bugbear (`B`) violations.
unfixable = ["B"]
[tool.ruff.flake8-builtins]
builtins-ignorelist = ["id"]
[tool.ruff.per-file-ignores]
"tests/*.py" = ["S101"]
"tests/*.py" = ["S101", "D100", "D101", "D107", "D103"]
[tool.ruff.pydocstyle]
convention = "google"

View File

@ -0,0 +1 @@
"""Model classes."""

View File

@ -0,0 +1,68 @@
"""Company model."""
from abc import ABC
from dataclasses import asdict, dataclass
from enum import Enum
class RelationshipRoleEnum(Enum):
    """Roles that can be assigned to a company relationship.

    Attributes:
        STAKEHOLDER: Stored as an empty string.
        ORGANISATION: Stored as the string "ORGANISATION".
    """

    # NOTE(review): the empty value looks like a placeholder - confirm it is intended.
    STAKEHOLDER = ""
    ORGANISATION = "ORGANISATION"
@dataclass
class CompanyID:
    """Composite identifier of a company.

    Attributes:
        district_court (str): District court part of the identifier.
        hr_number (str): Register number part of the identifier.
    """

    district_court: str
    hr_number: str
@dataclass
class Location:
"""_summary_."""
city: str
street: str | None = None
house_number: str | None = None
zip_code: str | None = None
@dataclass
class CompanyRelationship(ABC):
    """Base data shared by the relationships of a company.

    Derives from ABC but declares no abstract methods itself.

    Attributes:
        role (RelationshipRoleEnum): Role of the related party.
        location (Location): Location of the related party.
    """

    role: RelationshipRoleEnum
    location: Location
@dataclass
class Company:
    """Company record.

    Attributes:
        id (CompanyID): Identifier of the company.
        location (Location): Location of the company.
        name (str): Company name.
        last_update (str): Last update marker, kept as a string.
        relationships (list[CompanyRelationship]): Related parties.
    """

    id: CompanyID
    location: Location
    name: str
    last_update: str
    relationships: list[CompanyRelationship]

    def to_dict(self) -> dict:
        """Convert the company, including nested dataclasses, to a dict.

        Returns:
            dict: Field names mapped to (recursively converted) values
        """
        # NOTE(review): asdict leaves Enum members (e.g. a relationship role)
        # as Enum objects - confirm downstream serialisers can handle them.
        as_plain_dict = asdict(self)
        return as_plain_dict

View File

@ -0,0 +1,25 @@
"""News model."""
from dataclasses import asdict, dataclass
@dataclass
class News:
    """Single news entry.

    Attributes:
        id (str): Identifier of the entry.
        title (str): Headline.
        date (str): Date, kept as a string.
        text (str): Body text.
        source_url (str): URL the entry comes from.
    """

    id: str
    title: str
    date: str
    text: str
    source_url: str

    def to_dict(self) -> dict:
        """Convert the entry to a plain dictionary.

        Returns:
            dict: Field names mapped to their values
        """
        as_plain_dict = asdict(self)
        return as_plain_dict

View File

@ -0,0 +1 @@
"""Util classes and services."""

View File

@ -0,0 +1,49 @@
"""CompanyMongoService."""
from aki_prj23_transparenzregister.models.company import Company, CompanyID
from aki_prj23_transparenzregister.utils.mongo import MongoConnector
class CompanyMongoService:
    """Access layer for the ``companies`` collection."""

    def __init__(self, connector: MongoConnector):
        """Bind the service to the ``companies`` collection.

        Args:
            connector (MongoConnector): Connector whose ``database`` is used
        """
        self.collection = connector.database["companies"]

    def get_all(self) -> list[Company]:
        """Fetch every entry of the collection.

        Returns:
            list[Company]: All entries, exactly as returned by the collection
                (no conversion into Company objects is performed)
        """
        result = self.collection.find()
        return list(result)

    def get_by_id(self, id: CompanyID) -> Company | None:
        """Look up the single entry whose ``id`` field equals the given id.

        Args:
            id (CompanyID): Identifier to search for. Dataclass instances are
                converted to a dict first; any other value is used as-is.

        Returns:
            Company | None: The entry if exactly one matches, otherwise None
        """
        from dataclasses import asdict, is_dataclass

        # pymongo cannot BSON-encode dataclass instances. Entries are written
        # via Company.to_dict(), so the stored id is the asdict() form.
        if is_dataclass(id) and not isinstance(id, type):
            key = asdict(id)
        else:
            key = id
        result = list(self.collection.find({"id": key}))
        if len(result) == 1:
            return result[0]
        return None

    def insert(self, company: Company):
        """Insert a company into the collection.

        Args:
            company (Company): Company to store; serialised with ``to_dict``

        Returns:
            The value returned by the collection's ``insert_one`` call
        """
        return self.collection.insert_one(company.to_dict())

View File

@ -0,0 +1,47 @@
"""Mongo Wrapper."""
from dataclasses import dataclass
import pymongo
@dataclass
class MongoConnection:
"""_summary_."""
hostname: str
database: str
port: int | None
username: str | None
password: str | None
def get_conn_string(self) -> str:
"""Transforms the information of the object to a MongoDB connection string.
Returns:
str: Connection string
"""
if self.username is not None and self.password is not None:
connection_string = (
f"mongodb+srv://{self.username}:{self.password}@{self.hostname}"
)
else:
connection_string = f"mongodb+srv://{self.hostname}"
if self.port is not None:
connection_string += f":{self.port}"
connection_string = connection_string.replace("mongodb+srv", "mongodb")
return connection_string
class MongoConnector:
    """Holds a MongoDB client together with the selected database."""

    def __init__(self, connection: MongoConnection):
        """Create the client and select the configured database.

        Args:
            connection (MongoConnection): Settings providing the connection
                string and the database name
        """
        uri = connection.get_conn_string()
        self.client: pymongo.MongoClient = pymongo.MongoClient(uri)
        self.database = self.client[connection.database]

View File

@ -0,0 +1,94 @@
"""MongoNewsService."""
from aki_prj23_transparenzregister.models.news import News
from aki_prj23_transparenzregister.utils.mongo import MongoConnector
class MongoNewsService:
    """Access layer for the ``news`` collection."""

    def __init__(self, connector: MongoConnector):
        """Bind the service to the ``news`` collection.

        Args:
            connector (MongoConnector): Connector whose ``database`` is used
        """
        self.collection = connector.database["news"]

    def get_all(self) -> list[News]:
        """Fetch all entries and convert each one to a News object.

        Returns:
            list[News]: Converted entries, in the order the collection yields them
        """
        news_entries: list[News] = []
        for document in self.collection.find():
            news_entries.append(MongoEntryTransformer.transform_outgoing(document))
        return news_entries

    def get_by_id(self, id: str) -> News | None:
        """Look up the single entry stored under the given ``_id``.

        Args:
            id (str): Value of the ``_id`` field to search for

        Returns:
            News | None: Converted entry if exactly one matches, otherwise None
        """
        matches = list(self.collection.find({"_id": id}))
        if len(matches) != 1:
            return None
        return MongoEntryTransformer.transform_outgoing(matches[0])

    def insert(self, news: News):
        """Store a news entry.

        Args:
            news (News): Entry to store

        Returns:
            The value returned by the collection's ``insert_one`` call
        """
        document = MongoEntryTransformer.transform_ingoing(news)
        return self.collection.insert_one(document)
class MongoEntryTransformer:
    """Converts between News objects and their MongoDB document form."""

    @staticmethod
    def transform_ingoing(news: News) -> dict:
        """Turn a News object into a dict usable as a MongoDB document.

        Args:
            news (News): News object to be transformed

        Returns:
            dict: The object's fields with ``id`` moved to ``_id``
        """
        document = news.to_dict()
        document["_id"] = news.id
        # Raises KeyError if to_dict() did not provide an "id" entry.
        document.pop("id")
        return document

    @staticmethod
    def transform_outgoing(data: dict) -> News:
        """Build a News object from a MongoDB document.

        Args:
            data (dict): Document holding ``_id``, ``title``, ``date``,
                ``text`` and ``source_url``

        Returns:
            News: News entry based on MongoDB document
        """
        # News attribute -> document key; only the identifier is renamed.
        field_to_key = {
            "id": "_id",
            "title": "title",
            "date": "date",
            "text": "text",
            "source_url": "source_url",
        }
        return News(**{field: data[key] for field, key in field_to_key.items()})

View File

@ -0,0 +1,35 @@
"""Test Models.company."""
from aki_prj23_transparenzregister.models.company import Company, CompanyID, Location
def test_to_dict() -> None:
    """Company.to_dict returns nested dicts for id and location."""
    company_id = CompanyID("The Shire", "420")
    location = Location(
        city="Insmouth", house_number="19", street="Harbor", zip_code="1890"
    )
    company = Company(
        id=company_id,
        last_update="Tomorrow",
        location=location,
        name="BLANK GmbH",
        relationships=[],
    )
    expected_id = {
        "district_court": company_id.district_court,
        "hr_number": company_id.hr_number,
    }
    expected_location = {
        "city": location.city,
        "house_number": location.house_number,
        "street": location.street,
        "zip_code": location.zip_code,
    }
    assert company.to_dict() == {
        "id": expected_id,
        "last_update": company.last_update,
        "location": expected_location,
        "name": "BLANK GmbH",
        "relationships": [],
    }

23
tests/models/news_test.py Normal file
View File

@ -0,0 +1,23 @@
"""Test Models.news."""
from aki_prj23_transparenzregister.models.news import News
def test_to_dict() -> None:
    """News.to_dict exposes every field under its attribute name."""
    news = News(
        "4711",
        "Economy collapses",
        "2042",
        "Toilet paper prices rising",
        "https://www.google.com",
    )
    expected = {
        "id": news.id,
        "title": news.title,
        "date": news.date,
        "text": news.text,
        "source_url": news.source_url,
    }
    assert news.to_dict() == expected

View File

@ -0,0 +1,103 @@
"""Test utils.company_mongo_service."""
from unittest.mock import Mock
import pytest
from aki_prj23_transparenzregister.models.company import Company
from aki_prj23_transparenzregister.utils.company_mongo_service import (
CompanyMongoService,
)
@pytest.fixture()
def mock_mongo_connector(mocker) -> Mock:
    """Provide a Mock standing in for a MongoConnector.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Mocked MongoConnector
    """
    connector_mock = Mock()
    mocker.patch(
        "aki_prj23_transparenzregister.utils.mongo.MongoConnector",
        return_value=connector_mock,
    )
    return connector_mock
@pytest.fixture()
def mock_collection() -> Mock:
    """Provide a Mock standing in for a mongo collection.

    Returns:
        Mock: Mock object
    """
    collection_mock = Mock()
    return collection_mock
def test_init(mock_mongo_connector, mock_collection):
    """The constructor picks the ``companies`` collection from the connector.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    assert company_service.collection == mock_collection
def test_get_all(mock_mongo_connector, mock_collection):
    """get_all returns whatever the collection's find() yields.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    stored_entries = [{"id": "42"}]
    mock_collection.find.return_value = stored_entries
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    assert company_service.get_all() == stored_entries
def test_by_id_no_result(mock_mongo_connector, mock_collection):
    """get_by_id returns None when find() yields nothing.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_collection.find.return_value = []
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    assert company_service.get_by_id("Does not exist") is None
def test_by_id_result(mock_mongo_connector, mock_collection):
    """get_by_id returns the entry when find() yields exactly one.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    stored_entry = {"id": "Does exist", "vaue": 42}
    mock_collection.find.return_value = [stored_entry]
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    assert company_service.get_by_id("Does exist") == stored_entry
def test_insert(mock_mongo_connector, mock_collection):
    """Insert passes through the return value of insert_one.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    insert_result = 42
    mock_collection.insert_one.return_value = insert_result
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    company = Company(None, None, "", "", [])
    assert company_service.insert(company) == insert_result

26
tests/utils/mongo_test.py Normal file
View File

@ -0,0 +1,26 @@
from unittest.mock import patch
from aki_prj23_transparenzregister.utils.mongo import MongoConnection, MongoConnector
def test_get_conn_string_no_credentials():
    """A port without credentials yields a plain ``mongodb://`` URI."""
    connection = MongoConnection("localhost", "", 27017, None, None)
    expected = "mongodb://localhost:27017"
    assert connection.get_conn_string() == expected
def test_get_conn_string_no_port_but_credentials():
    """Credentials without a port yield a ``mongodb+srv://`` URI with user info."""
    connection = MongoConnection("localhost", "", None, "admin", "password")
    expected = "mongodb+srv://admin:password@localhost"
    assert connection.get_conn_string() == expected
def test_get_conn_simple():
    """Only a hostname yields a bare ``mongodb+srv://`` URI."""
    connection = MongoConnection("localhost", "", None, None, None)
    expected = "mongodb+srv://localhost"
    assert connection.get_conn_string() == expected
def test_mongo_connector():
    """MongoConnector exposes the database looked up on the patched client."""
    expected_result = 42
    with patch("pymongo.MongoClient") as mock_mongo_client:
        # The patched client is a dict, so indexing by database name works.
        mock_mongo_client.return_value = {"db": expected_result}
        settings = MongoConnection("localhost", "db", None, None, None)
        connector = MongoConnector(settings)
        assert connector.database == expected_result

View File

@ -0,0 +1,115 @@
from unittest.mock import Mock, patch
import pytest
from aki_prj23_transparenzregister.models.news import News
from aki_prj23_transparenzregister.utils.news_mongo_service import (
MongoEntryTransformer,
MongoNewsService,
)
@pytest.fixture()
def mock_mongo_connector(mocker) -> Mock:
    """Provide a Mock standing in for a MongoConnector.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Mocked MongoConnector
    """
    connector_mock = Mock()
    mocker.patch(
        "aki_prj23_transparenzregister.utils.mongo.MongoConnector",
        return_value=connector_mock,
    )
    return connector_mock
@pytest.fixture()
def mock_collection() -> Mock:
    """Provide a Mock standing in for a mongo collection.

    Returns:
        Mock: Mock object
    """
    collection_mock = Mock()
    return collection_mock
def test_init(mock_mongo_connector, mock_collection):
    """The MongoNewsService constructor picks the ``news`` collection.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)
    assert news_service.collection == mock_collection
def test_get_all(mock_mongo_connector, mock_collection):
    """get_all returns an empty list when find() yields nothing."""
    mock_collection.find.return_value = []
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)
    assert news_service.get_all() == []
def test_get_by_id_with_result(mock_mongo_connector, mock_collection):
    """get_by_id returns the transformed entry when exactly one is found."""
    transformer_target = "aki_prj23_transparenzregister.utils.news_mongo_service.MongoEntryTransformer.transform_outgoing"
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)
    with patch(transformer_target) as mock_out:
        mock_collection.find.return_value = [{}]
        mock_out.return_value = {}
        assert news_service.get_by_id("foadh") == {}
def test_get_by_id_no_result(mock_mongo_connector, mock_collection):
    """get_by_id returns None when find() yields nothing."""
    mock_collection.find.return_value = []
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)
    assert news_service.get_by_id("foadh") is None
def test_insert(mock_mongo_connector, mock_collection):
    """Insert passes through the return value of insert_one."""
    transformer_target = "aki_prj23_transparenzregister.utils.news_mongo_service.MongoEntryTransformer.transform_ingoing"
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)
    with patch(transformer_target) as mock_in:
        mock_collection.insert_one.return_value = {}
        mock_in.return_value = {}
        assert news_service.insert({}) == {}
def test_transform_ingoing():
    """transform_ingoing moves ``id`` to ``_id``."""
    entry = News("42", None, None, None, None)
    document = MongoEntryTransformer.transform_ingoing(entry)
    assert document["_id"] == "42"
    assert "id" not in document
def test_transform_outgoing():
    """transform_outgoing maps ``_id`` back to ``id`` and keeps the other fields."""
    document = {
        "_id": "4711",
        "title": "Hello",
        "date": "Today",
        "text": "World",
        "source_url": "chat.openai.com",
    }
    expected_result = News(
        id="4711",
        title="Hello",
        date="Today",
        text="World",
        source_url="chat.openai.com",
    )
    assert MongoEntryTransformer.transform_outgoing(document) == expected_result