mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-24 16:42:34 +02:00
Feat/fetch financials (#79)
This commit is contained in:
commit
2cd8def200
@ -18,216 +18,125 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>date</th>\n",
|
||||
" <th>company</th>\n",
|
||||
" <th>raw_report</th>\n",
|
||||
" <th>jahr</th>\n",
|
||||
" <th>auditors</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-07-07</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2021</td>\n",
|
||||
" <td>[]</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2023-05-10</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2021</td>\n",
|
||||
" <td>[Auditor(name='Eckhard Lewe', company='Grant T...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2022-03-25</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2020</td>\n",
|
||||
" <td>[Auditor(name='Eckhard Lewe', company='Warth &...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>2021-03-11</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2019</td>\n",
|
||||
" <td>[Auditor(name='Eckhard Lewe', company='Warth &...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>2020-03-24</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2018</td>\n",
|
||||
" <td>[Auditor(name='Ulrich Diersch', company='Warth...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" date company \\\n",
|
||||
"0 2023-07-07 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"2 2023-05-10 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"4 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"5 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"6 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"\n",
|
||||
" raw_report jahr \\\n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... 2021 \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... 2021 \n",
|
||||
"4 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||||
"5 <div class=\"publication_container\">\\n <div cla... 2019 \n",
|
||||
"6 <div class=\"publication_container\">\\n <div cla... 2018 \n",
|
||||
"\n",
|
||||
" auditors \n",
|
||||
"0 [] \n",
|
||||
"2 [Auditor(name='Eckhard Lewe', company='Grant T... \n",
|
||||
"4 [Auditor(name='Eckhard Lewe', company='Warth &... \n",
|
||||
"5 [Auditor(name='Eckhard Lewe', company='Warth &... \n",
|
||||
"6 [Auditor(name='Ulrich Diersch', company='Warth... "
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from deutschland.bundesanzeiger import Bundesanzeiger"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"dict_keys(['c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ba = Bundesanzeiger()\n",
|
||||
"reports = ba.get_reports(\n",
|
||||
" \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n",
|
||||
") # \"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
||||
"print(reports.keys())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"report_contents = []\n",
|
||||
"for key in reports.keys():\n",
|
||||
" report_contents.append(reports[key])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>date</th>\n",
|
||||
" <th>name</th>\n",
|
||||
" <th>company</th>\n",
|
||||
" <th>report</th>\n",
|
||||
" <th>raw_report</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" date name \\\n",
|
||||
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"\n",
|
||||
" company \\\n",
|
||||
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" report \\\n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"\n",
|
||||
" raw_report \n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... "
|
||||
]
|
||||
},
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df_reports = pd.DataFrame(report_contents)\n",
|
||||
"\n",
|
||||
"from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (\n",
|
||||
" Bundesanzeiger,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"ba_wrapper = Bundesanzeiger()\n",
|
||||
"df_reports = ba_wrapper.get_information(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
||||
"df_reports.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>date</th>\n",
|
||||
" <th>name</th>\n",
|
||||
" <th>company</th>\n",
|
||||
" <th>report</th>\n",
|
||||
" <th>raw_report</th>\n",
|
||||
" <th>type</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" date name \\\n",
|
||||
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"\n",
|
||||
" company \\\n",
|
||||
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" report \\\n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"\n",
|
||||
" raw_report type \n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
|
||||
]
|
||||
},
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df_reports[\"type\"] = df_reports.name.apply(lambda name: name.split(\" \")[0])\n",
|
||||
"df_reports.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -260,13 +169,20 @@
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-07-11</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2021</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2020</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
@ -278,15 +194,17 @@
|
||||
],
|
||||
"text/plain": [
|
||||
" date company \\\n",
|
||||
"0 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"0 2023-07-11 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"2 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" raw_report jahr \n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... 2019 "
|
||||
"0 <div class=\"publication_container\">\\n <div cla... 2021 \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... 2019 "
|
||||
]
|
||||
},
|
||||
"execution_count": 37,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -310,7 +228,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -320,7 +238,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -338,18 +256,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"from dataclasses import dataclass\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@dataclass\n",
|
||||
"class Auditor:\n",
|
||||
" name: str\n",
|
||||
" company: str\n",
|
||||
"from aki_prj23_transparenzregister.models.auditor import Auditor\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def extract_auditor_company(report: str) -> str:\n",
|
||||
@ -374,7 +286,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -383,7 +295,7 @@
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 41,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -418,16 +330,16 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}"
|
||||
"{'net_income': 100238.5, 'equity': 165322.34, 'current_assets': 435344.07}"
|
||||
]
|
||||
},
|
||||
"execution_count": 42,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -502,7 +414,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -518,7 +430,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -526,24 +438,30 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Aktiva', '31.12.2020 EUR'),\n",
|
||||
" ('Aktiva', '31.12.2019 EUR')],\n",
|
||||
" ('Aktiva', '31.12.2021 EUR'),\n",
|
||||
" ('Aktiva', '31.12.2020 EUR')],\n",
|
||||
" )\n",
|
||||
"Aktiva Unnamed: 0_level_1 object\n",
|
||||
" 31.12.2021 EUR object\n",
|
||||
" 31.12.2020 EUR object\n",
|
||||
" 31.12.2019 EUR object\n",
|
||||
"dtype: object\n",
|
||||
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Passiva', '31.12.2020 EUR'),\n",
|
||||
" ('Passiva', '31.12.2019 EUR')],\n",
|
||||
" ('Passiva', '31.12.2021 EUR'),\n",
|
||||
" ('Passiva', '31.12.2020 EUR')],\n",
|
||||
" )\n",
|
||||
"Passiva Unnamed: 0_level_1 object\n",
|
||||
" 31.12.2021 EUR object\n",
|
||||
" 31.12.2020 EUR object\n",
|
||||
" 31.12.2019 EUR object\n",
|
||||
"dtype: object\n",
|
||||
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
|
||||
"Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
|
||||
"Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
|
||||
"dtype: object\n",
|
||||
"MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),\n",
|
||||
" ( 'Betrag', 'EUR')],\n",
|
||||
" )\n",
|
||||
"Kreditentwicklung Unnamed: 0_level_1 object\n",
|
||||
"Betrag EUR object\n",
|
||||
"dtype: object\n"
|
||||
]
|
||||
},
|
||||
@ -553,7 +471,7 @@
|
||||
"{}"
|
||||
]
|
||||
},
|
||||
"execution_count": 46,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -574,19 +492,46 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "KeyError",
|
||||
"evalue": "'Passiva'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n",
|
||||
"\u001b[1;31mKeyError\u001b[0m: 'Passiva'"
|
||||
]
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
"Empty DataFrame\n",
|
||||
"Columns: []\n",
|
||||
"Index: []"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@ -600,6 +545,8 @@
|
||||
" StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n",
|
||||
" )[0]\n",
|
||||
" result[pos] = pos_results\n",
|
||||
" else:\n",
|
||||
" result[pos] = pd.DataFrame([])\n",
|
||||
" return result\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@ -609,58 +556,25 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Int64Index([0, 1], dtype='int64')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
|
||||
" 'Vorjahr TEUR'],\n",
|
||||
" dtype='object')\n",
|
||||
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Int64Index([0, 1, 2], dtype='int64')\n",
|
||||
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
|
||||
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
|
||||
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
|
||||
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
|
||||
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Aktiva', '31.12.2021 EUR'),\n",
|
||||
" ('Aktiva', '31.12.2020 EUR')],\n",
|
||||
" )\n",
|
||||
"Int64Index([0, 1], dtype='int64')\n",
|
||||
"MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
|
||||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||||
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
|
||||
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
|
||||
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
|
||||
" ('Passiva', '31.12.2021 EUR'),\n",
|
||||
" ('Passiva', '31.12.2020 EUR')],\n",
|
||||
" )\n",
|
||||
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
|
||||
" ( 'Abschreibungen', ...),\n",
|
||||
" ( 'Abschreibungen', ...),\n",
|
||||
" ( 'Abschreibungen', ...),\n",
|
||||
" ( 'Abschreibungen', ...)],\n",
|
||||
" )\n",
|
||||
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
|
||||
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
|
||||
" )\n",
|
||||
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
|
||||
" '2019'],\n",
|
||||
" dtype='object')\n",
|
||||
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
|
||||
" 'Veränderung TEUR'],\n",
|
||||
" dtype='object')\n",
|
||||
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
|
||||
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
|
||||
"MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),\n",
|
||||
" ( 'Betrag', 'EUR')],\n",
|
||||
" )\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -698,7 +612,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.7"
|
||||
"version": "3.11.3"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
|
10377
poetry.lock
generated
10377
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -38,6 +38,7 @@ version = "0.1.0"
|
||||
SQLAlchemy = {version = "^1.4.46", extras = ["mypy"]}
|
||||
dash = "^2.11.1"
|
||||
dash-bootstrap-components = "^1.4.2"
|
||||
deutschland = {git = "https://github.com/TrisNol/deutschland.git", branch = "hotfix/python-3.11-support"}
|
||||
loguru = "^0.7.0"
|
||||
matplotlib = "^3.7.1"
|
||||
plotly = "^5.14.1"
|
||||
@ -48,6 +49,10 @@ seaborn = "^0.12.2"
|
||||
selenium = "^4.10.0"
|
||||
tqdm = "^4.65.0"
|
||||
|
||||
# TODO Add dependent libraries (e.g., deutschland, plotly)
|
||||
[tool.poetry.extras]
|
||||
ingest = ["selenium"]
|
||||
|
||||
[tool.poetry.group.develop.dependencies]
|
||||
black = {extras = ["jupyter"], version = "^23.3.0"}
|
||||
jupyterlab = "^4.0.0"
|
||||
|
1
src/aki_prj23_transparenzregister/apps/__init__.py
Normal file
1
src/aki_prj23_transparenzregister/apps/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Main applications."""
|
@ -0,0 +1,63 @@
|
||||
"""Add financial data to companies."""
|
||||
import typing
|
||||
|
||||
from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
|
||||
from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (
|
||||
Bundesanzeiger,
|
||||
)
|
||||
from aki_prj23_transparenzregister.utils.mongo.company_mongo_service import (
|
||||
CompanyMongoService,
|
||||
)
|
||||
from aki_prj23_transparenzregister.utils.mongo.connector import MongoConnector
|
||||
|
||||
|
||||
def work(company: typing.Any, company_service: CompanyMongoService) -> None:
    """Fetch the yearly results of one company and store them on MongoDB.

    The company's name and city are used to query the Bundesanzeiger wrapper.
    Every returned row is turned into one entry keyed by the row's ``jahr``
    value; if several rows share the same year, the last one wins.

    Args:
        company (dict): Company document; ``name``, ``location.city`` and
            ``_id`` are read.
        company_service (CompanyMongoService): Interface to the Company
            collection on MongoDB.
    """
    reports = Bundesanzeiger().get_information(
        company["name"], company["location"]["city"]
    )
    results_by_year = {
        report.jahr: {
            "auditors": [auditor.to_dict() for auditor in report.auditors],
            "financials": report.financial_results,
        }
        for _, report in reports.iterrows()
    }
    company_service.add_yearly_results(company["_id"], results_by_year)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import concurrent.futures

    from loguru import logger

    config_provider = JsonFileConfigProvider("./secrets.json")

    mongo_connector = MongoConnector(config_provider.get_mongo_connection_string())
    company_service = CompanyMongoService(mongo_connector)

    num_threads = 25
    companies = company_service.get_where_no_financial_results()
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit one task per company and remember which company each future
        # belongs to, so it can be named once the future completes.
        future_to_entry = {
            executor.submit(work, entry, company_service): entry for entry in companies
        }

        # Handle the tasks in completion order.
        for future in concurrent.futures.as_completed(future_to_entry):
            entry = future_to_entry[future]
            logger.info(entry["name"])
            try:
                # ``work`` returns nothing; ``result()`` is only called to
                # re-raise any exception that occurred inside the worker.
                future.result()
            except Exception as e:
                # Log and carry on so one failing company does not abort
                # the remaining ones.
                logger.error(f"Error processing entry {e}")
|
18
src/aki_prj23_transparenzregister/models/auditor.py
Normal file
18
src/aki_prj23_transparenzregister/models/auditor.py
Normal file
@ -0,0 +1,18 @@
|
||||
"""Auditor model."""
|
||||
from dataclasses import asdict, dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class Auditor:
|
||||
"""Auditor."""
|
||||
|
||||
name: str
|
||||
company: str | None
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""_summary_.
|
||||
|
||||
Returns:
|
||||
dict: _description_
|
||||
"""
|
||||
return asdict(self)
|
@ -45,19 +45,61 @@ class CompanyRelationship(ABC):
|
||||
location: Location
|
||||
|
||||
|
||||
class FinancialKPIEnum(Enum):
    """Financial KPI keys.

    Each member's value is the string key used for that KPI. The comment above
    a member gives the English meaning followed by the German term(s).
    """

    # Revenue (German: Umsatz / Erlöse)
    REVENUE = "revenue"
    # Net income (German: Jahresüberschuss / Nettoeinkommen)
    NET_INCOME = "net_income"
    # EBIT
    EBIT = "ebit"
    # EBITDA
    EBITDA = "ebitda"
    # Gross profit (German: Bruttogewinn)
    GROSS_PROFIT = "gross_profit"
    # Operating profit (German: Betriebsgewinn)
    OPERATING_PROFIT = "operating_profit"
    # Balance sheet total (German: Bilanzsumme)
    ASSETS = "assets"
    # Total liabilities (German: Gesamtverbindlichkeiten)
    LIABILITIES = "liabilities"
    # Equity (German: Eigenkapital)
    EQUITY = "equity"
    # Current assets (German: Umlaufvermögen)
    CURRENT_ASSETS = "current_assets"
    # Current liabilities (German: Kurzfristige Verbindlichkeiten)
    CURRENT_LIABILITIES = "current_liabilities"
    # Long-term debt (German: Langfristige Verbindlichkeiten)
    LONG_TERM_DEBT = "long_term_debt"
    # Short-term debt. NOTE(review): the original comment reused the German
    # term of CURRENT_LIABILITIES ("Kurzfristige Verbindlichkeiten"); confirm
    # the intended distinction between the two keys.
    SHORT_TERM_DEBT = "short_term_debt"
    # Cash (German: Barmittel)
    CASH_AND_CASH_EQUIVALENTS = "cash_and_cash_equivalents"
    # Dividend (German: Dividende)
    DIVIDENDS = "dividends"
    # Cash flow
    CASH_FLOW = "cash_flow"
|
||||
|
||||
|
||||
@dataclass
class YearlyResult:
    """Financial result of a company for a single year.

    Attributes:
        year: Year the result belongs to.
        kpis: KPI values of that year, keyed by ``FinancialKPIEnum`` member.
    """

    year: int
    kpis: dict[FinancialKPIEnum, float]
|
||||
|
||||
|
||||
@dataclass
|
||||
class Company:
|
||||
"""_summary_.
|
||||
|
||||
Returns:
|
||||
_type_: _description_
|
||||
"""
|
||||
"""Company dataclass."""
|
||||
|
||||
id: CompanyID
|
||||
location: Location
|
||||
name: str
|
||||
last_update: str
|
||||
relationships: list[CompanyRelationship]
|
||||
# yearly_results: list[FinancialResults]
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""_summary_.
|
||||
|
@ -0,0 +1 @@
|
||||
"""Everything regarding data extraction from various sources."""
|
@ -0,0 +1,183 @@
|
||||
"""Fetch data from Bundesanzeiger."""
|
||||
import re
|
||||
|
||||
import pandas as pd
|
||||
from bs4 import BeautifulSoup
|
||||
from deutschland.bundesanzeiger import Bundesanzeiger as Ba
|
||||
|
||||
from aki_prj23_transparenzregister.models.auditor import Auditor
|
||||
from aki_prj23_transparenzregister.models.company import FinancialKPIEnum
|
||||
|
||||
pd.options.mode.chained_assignment = None # type: ignore
|
||||
|
||||
|
||||
class Bundesanzeiger:
|
||||
"""Bundesanzeiger wrapper to export relevant information."""
|
||||
|
||||
def get_information(
    self, company_name: str, city: str | None = None
) -> pd.DataFrame:
    """Extract relevant information from all yearly results found for a company.

    Args:
        company_name (str): Name of the company to search for. Only reports
            whose ``company`` field equals this name exactly are kept.
        city (str | None): City where the company is registered. It is
            appended to the search query when given; ``None`` (the default)
            or an empty string searches by company name only.

    Returns:
        pd.DataFrame: One row per remaining annual report with the added
            columns ``auditors`` and ``financial_results`` and without
            ``raw_report``. An empty DataFrame without columns is returned
            if the search yields no reports at all.
    """
    # Only append the city when one is known: formatting ``None`` into the
    # query would search for the literal word "None".
    query = f"{company_name} {city}" if city else company_name

    # Get Bundesanzeiger entries for company
    reports = Ba().get_reports(query)
    report_contents = list(reports.values())
    if not report_contents:
        return pd.DataFrame()

    # Transform to DataFrame and filter out irrelevant entries
    df_data = self.filter_reports(pd.DataFrame(report_contents))

    # Filter out entries of different companies. Copy the selection so the
    # column assignments below act on an independent frame, not on a slice.
    df_data = df_data.loc[df_data.company == company_name].copy()

    # Add Auditor information
    df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors)

    # Add Financial information
    df_data["financial_results"] = df_data.raw_report.apply(
        self.extract_financial_results
    )

    # Remove irrelevant columns
    return df_data.drop(["raw_report"], axis=1)
|
||||
|
||||
def filter_reports(self, df_reports: pd.DataFrame) -> pd.DataFrame:
    """Return only reports of type `Jahresabschluss` and extract their year.

    The report type is the first space-separated word of the ``name`` column.
    The year is taken from the last space-separated word of ``name``, using
    the part after its final dot (e.g. ``"31.12.2020"`` gives ``"2020"``).
    The given DataFrame is left unmodified.

    Args:
        df_reports (pd.DataFrame): DataFrame containing the list of reports;
            must provide the columns ``name`` and ``report``.

    Returns:
        pd.DataFrame: Rows of type `Jahresabschluss` with the additional
            string column ``jahr`` and without ``name`` and ``report``.
    """
    # ``assign`` builds new frames, so neither the caller's DataFrame nor a
    # slice of it is written to.
    typed = df_reports.assign(
        type=df_reports.name.apply(lambda name: name.split(" ")[0])
    )
    annual = typed.loc[typed["type"] == "Jahresabschluss"]
    annual = annual.assign(
        jahr=annual.name.apply(lambda name: name.split(" ")[-1].split(".")[-1])
    )
    return annual.drop(["name", "report", "type"], axis=1)
|
||||
|
||||
def extract_auditor_company(self, report: str) -> str | None:
    """Extract the name of an auditor company from the given yearly results report.

    The first ``<b>`` element that contains a ``<br>`` and whose text spans
    more than one line is used. The second line of its text (index 1 after
    splitting on newlines) is returned with surrounding whitespace removed.

    Args:
        report (str): Yearly results report as raw string

    Returns:
        str | None: Name of the auditor company if found, otherwise None
    """
    soup = BeautifulSoup(report, features="html.parser")
    for bold in soup.find_all("b"):
        if not bold.find_all("br"):
            continue
        lines = bold.text.split("\n")
        # A single-line text has no second line; skip the element instead
        # of failing with an IndexError.
        if len(lines) > 1:
            return lines[1].strip()
    return None
|
||||
|
||||
def extract_auditors(self, report: str) -> list:
    """Find the list of auditors involved in the given yearly results report.

    An auditor is recognised by a name directly followed by
    ``, Wirtschaftsprüfer``. All hits share the auditor company returned by
    ``extract_auditor_company`` for the same report.

    Args:
        report (str): Yearly results report as raw string

    Returns:
        list[Auditor]: List of Auditors found in the given report
    """
    auditor_company = self.extract_auditor_company(report)
    # The Latin-1 letter ranges admit umlauts and "ß", so names such as
    # "Jürgen Müller" are no longer cut off at the first non-ASCII letter.
    # The quantifier is lazy because the class now also matches the "ü" of
    # "Wirtschaftsprüfer"; a greedy match could swallow several auditors.
    auditor_regex = r"([a-zA-ZÀ-ÖØ-öø-ÿ ,.'-]+?), Wirtschaftsprüfer"
    return [
        Auditor(name.lstrip(), auditor_company)
        for name in re.findall(auditor_regex, report)
    ]
|
||||
|
||||
def __extract_kpis__(self, report: str) -> dict:
    """Extract Key Performance Indicators (KPIs) from a financial report.

    Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd.

    For every KPI the text is searched case-insensitively for one of its
    German or English labels followed by a number. Numbers are read in German
    notation (``.`` groups thousands, ``,`` is the decimal mark) and may carry
    a trailing ``m`` (millions) or ``b`` (billions). The first occurrence
    whose number can be parsed is used.

    Args:
        report (str): The yearly report as a parsed string

    Returns:
        dict: Mapping from ``FinancialKPIEnum`` values to the extracted amount
            as float. KPIs without a parsable number are omitted.
    """

    def parse_amount(raw: str) -> float | None:
        """Convert a captured amount such as ``1.234,5m`` to a float, or None."""
        if not raw:
            return None
        multiplier = 1
        suffix = raw[-1].lower()
        if suffix == "m":
            raw, multiplier = raw[:-1], 1_000_000
        elif suffix == "b":
            raw, multiplier = raw[:-1], 1_000_000_000
        # Trailing separators are sentence punctuation captured by the
        # pattern. Afterwards drop the thousands dots and turn the decimal
        # comma into a dot.
        normalized = raw.rstrip(".,").replace(".", "").replace(",", ".").strip()
        try:
            return float(normalized) * multiplier
        except ValueError:
            return None

    # Define KPI patterns to search for
    kpi_patterns = {
        FinancialKPIEnum.REVENUE.value: r"(?:revenue|umsatz|erlöse)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.NET_INCOME.value: r"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.EBIT.value: r"(?:ebit|operating income)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.EBITDA.value: r"(?:ebitda)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.GROSS_PROFIT.value: r"(?:gross profit|bruttogewinn)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.OPERATING_PROFIT.value: r"(?:operating profit|betriebsgewinn)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.ASSETS.value: r"(?:total assets|bilanzsumme)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.LIABILITIES.value: r"(?:total liabilities|gesamtverbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.EQUITY.value: r"(?:shareholders'? equity|eigenkapital)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.CURRENT_ASSETS.value: r"(?:current assets|umlaufvermögen)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.CURRENT_LIABILITIES.value: r"(?:current liabilities|kurzfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.LONG_TERM_DEBT.value: r"(?:long[-\s]?term debt|langfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.SHORT_TERM_DEBT.value: r"(?:short[-\s]?term debt|kurzfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.CASH_AND_CASH_EQUIVALENTS.value: r"(?:cash (?:and cash equivalents)?|barmittel)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.DIVIDENDS.value: r"(?:dividends?|dividende)[:\s]*([\d,.]+[mmb]?)",
        FinancialKPIEnum.CASH_FLOW.value: r"(?:cash flow|cashflow|cash flow from operating activities)[:\s]*([\d,.]+[mmb]?)",
    }

    kpis = {}
    for kpi, pattern in kpi_patterns.items():
        # A label followed only by punctuation (e.g. "Eigenkapital, ...")
        # matches the pattern but is no number; keep looking for a later
        # occurrence instead of giving up on the KPI.
        for match in re.finditer(pattern, report, flags=re.IGNORECASE | re.UNICODE):
            amount = parse_amount(match.group(1))
            if amount is not None:
                kpis[kpi] = amount
                break
    return kpis
|
||||
|
||||
def extract_financial_results(self, report: str) -> dict:
    """Extract financial data from the given report.

    The report is parsed as HTML, reduced to its text content, flattened to a
    single line and handed to ``__extract_kpis__``.

    Args:
        report (str): Report to be analyzed

    Returns:
        dict: Results as returned by ``__extract_kpis__``
    """
    plain_text = BeautifulSoup(report, features="html.parser").get_text()
    single_line = plain_text.replace("\n", " ")
    return self.__extract_kpis__(single_line)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual run: fetch the information for one sample company without a city.
    Bundesanzeiger().get_information("Atos IT-Dienstleistung und Beratung GmbH", None)
|
@ -1,7 +1,10 @@
|
||||
"""CompanyMongoService."""
|
||||
from pymongo.results import InsertOneResult
|
||||
from threading import Lock
|
||||
|
||||
from aki_prj23_transparenzregister.models.company import Company, CompanyID
|
||||
from bson.objectid import ObjectId
|
||||
from pymongo.results import InsertOneResult, UpdateResult
|
||||
|
||||
from aki_prj23_transparenzregister.models.company import Company
|
||||
from aki_prj23_transparenzregister.utils.mongo.connector import MongoConnector
|
||||
|
||||
|
||||
@ -15,6 +18,7 @@ class CompanyMongoService:
|
||||
connector (MongoConnector): _description_
|
||||
"""
|
||||
self.collection = connector.database["companies"]
|
||||
self.lock = Lock() # Create a lock for synchronization
|
||||
|
||||
def get_all(self) -> list[Company]:
|
||||
"""_summary_.
|
||||
@ -22,10 +26,11 @@ class CompanyMongoService:
|
||||
Returns:
|
||||
list[Company]: _description_
|
||||
"""
|
||||
result = self.collection.find()
|
||||
return list(result)
|
||||
with self.lock:
|
||||
result = self.collection.find()
|
||||
return list(result)
|
||||
|
||||
def get_by_id(self, id: CompanyID) -> Company | None:
|
||||
def get_by_id(self, id: str) -> Company | None:
|
||||
"""_summary_.
|
||||
|
||||
Args:
|
||||
@ -34,10 +39,46 @@ class CompanyMongoService:
|
||||
Returns:
|
||||
Company | None: _description_
|
||||
"""
|
||||
result = list(self.collection.find({"id": id}))
|
||||
if len(result) == 1:
|
||||
return result[0]
|
||||
return None
|
||||
with self.lock:
|
||||
result = list(self.collection.find({"id": id}))
|
||||
if len(result) == 1:
|
||||
return result[0]
|
||||
return None
|
||||
|
||||
def get_by_object_id(self, _id: str) -> dict | None:
    """Look up a single entry by its ``_id``.

    Args:
        _id (str): ID; converted to an ``ObjectId`` for the query

    Returns:
        dict | None: The entry if the query matched exactly one document,
            otherwise None
    """
    with self.lock:
        matches = list(self.collection.find({"_id": ObjectId(_id)}))
        return matches[0] if len(matches) == 1 else None
|
||||
|
||||
def get_where_no_financial_results(self) -> list[dict]:
    """Fetch every entry that lacks the ``yearly_results`` field.

    Returns:
        list[dict]: List of companies found
    """
    missing_results = {"$or": [{"yearly_results": {"$exists": False}}]}
    with self.lock:
        cursor = self.collection.find(missing_results)
        return list(cursor)
|
||||
|
||||
def get_where_yearly_results(self) -> list[dict]:
    """Fetch all companies whose ``yearly_results`` compares greater than an empty document.

    Returns:
        list[dict]: List of companies
    """
    has_results = {"yearly_results": {"$gt": {}}}
    with self.lock:
        cursor = self.collection.find(has_results)
        return list(cursor)
|
||||
|
||||
def insert(self, company: Company) -> InsertOneResult:
|
||||
"""_summary_.
|
||||
@ -48,4 +89,20 @@ class CompanyMongoService:
|
||||
Returns:
|
||||
_type_: _description_
|
||||
"""
|
||||
return self.collection.insert_one(company.to_dict())
|
||||
with self.lock:
|
||||
return self.collection.insert_one(company.to_dict())
|
||||
|
||||
def add_yearly_results(self, _id: str, yearly_results: dict) -> UpdateResult:
    """Set the `yearly_results` field of a Company entry.

    Args:
        _id (str): ID of the object; converted to an ``ObjectId`` for the filter
        yearly_results (dict): Yearly results dictionary

    Returns:
        UpdateResult: Result of the ``update_one`` call
    """
    with self.lock:
        selector = {"_id": ObjectId(_id)}
        change = {"$set": {"yearly_results": yearly_results}}
        return self.collection.update_one(selector, change)
|
||||
|
36
tests/apps/enrich_company_financials_test.py
Normal file
36
tests/apps/enrich_company_financials_test.py
Normal file
@ -0,0 +1,36 @@
|
||||
"""Tests for the enrich_company_financials module."""
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from aki_prj23_transparenzregister.apps import enrich_company_financials
|
||||
from aki_prj23_transparenzregister.models.auditor import Auditor
|
||||
|
||||
|
||||
def test_import_enrich_company_financials() -> None:
    """Smoke test: the enrich_company_financials module was imported."""
    assert enrich_company_financials is not None
|
||||
|
||||
|
||||
@patch(
    "aki_prj23_transparenzregister.apps.enrich_company_financials.Bundesanzeiger.get_information"
)
@patch(
    "aki_prj23_transparenzregister.apps.enrich_company_financials.CompanyMongoService"
)
def test_work(mock_compnay_service: Mock, mock_bundesanzeiger: Mock) -> None:
    """Smoke test: `work` runs for one company against patched dependencies."""
    report_row = {
        "jahr": "2042",
        "auditors": [Auditor(name="", company="")],
        "financial_results": [],
    }
    mock_bundesanzeiger.return_value = pd.DataFrame([report_row])
    company_entry = {
        "_id": "",
        "name": "ABC AG",
        "location": {"city": "Haltern am See"},
    }
    enrich_company_financials.work(company_entry, mock_compnay_service)
    assert enrich_company_financials
|
1
tests/utils/data_extraction/__init__.py
Normal file
1
tests/utils/data_extraction/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Tests for data_extraction."""
|
111
tests/utils/data_extraction/bundesanzeiger_test.py
Normal file
111
tests/utils/data_extraction/bundesanzeiger_test.py
Normal file
@ -0,0 +1,111 @@
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from aki_prj23_transparenzregister.models.company import FinancialKPIEnum
|
||||
from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (
|
||||
Bundesanzeiger,
|
||||
)
|
||||
|
||||
|
||||
def test_extract_auditor_company_no_hits() -> None:
    """A bold element without a line break yields no auditor company."""
    input_data = """
        <b>
            Nothing to see here
        </b>
    """
    assert Bundesanzeiger().extract_auditor_company(input_data) is None
|
||||
|
||||
|
||||
def test_extract_auditor_company() -> None:
    """The line before the line break inside a bold element is the company name."""
    company_name = "Korrupte Wirtschaftsprüfer GmbH & Co. KG"
    input_data = f"""
        <b>
            {company_name}
            <br>
            Max Mustermann
        </b>
    """
    extracted = Bundesanzeiger().extract_auditor_company(input_data)
    assert extracted == company_name
|
||||
|
||||
|
||||
def test_extract_kpis() -> None:
    """Amounts in German number notation are extracted from plain text."""
    input_data = """
        Die Prj23_Transparenzregister GmbH erwirtschaftete einen Jahresüberschuss 10.000,43 €.
        Des Weiteren sanken die Gesamtverbindlichkeiten 42,00 €
    """
    expected = {
        FinancialKPIEnum.NET_INCOME.value: 10000.43,
        FinancialKPIEnum.LIABILITIES.value: 42.00,
    }
    result = Bundesanzeiger().__extract_kpis__(input_data)
    for kpi, amount in expected.items():
        assert result[kpi] == amount
|
||||
|
||||
|
||||
def test_extracct_financial_results() -> None:
    """KPIs are extracted from an HTML report after the markup is stripped."""
    input_data = """
        <br>
            Die Prj23_Transparenzregister GmbH erwirtschaftete einen Jahresüberschuss 10.000,43 €.
        </br>
        <h2>Dies ist ein Platzhalter, der ignoriert werden soll</h2>
        <b>Des Weiteren sanken die Gesamtverbindlichkeiten 42,00 €</b>
    """
    expected = {
        FinancialKPIEnum.NET_INCOME.value: 10000.43,
        FinancialKPIEnum.LIABILITIES.value: 42.00,
    }
    result = Bundesanzeiger().extract_financial_results(input_data)
    for kpi, amount in expected.items():
        assert result[kpi] == amount
|
||||
|
||||
|
||||
def test_filter_reports() -> None:
    """Only the `Jahresabschluss` entry survives and its year is extracted."""
    test_df = pd.DataFrame(
        [
            {"name": "Bedienungsanleitung", "report": "", "raw_report": ""},
            {"name": "Jahresabschluss 1998", "report": "", "raw_report": ""},
        ]
    )
    filtered = Bundesanzeiger().filter_reports(test_df)
    assert len(filtered) == 1
    assert filtered.iloc[0].jahr == "1998"
|
||||
|
||||
|
||||
@patch(
    "aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger.Ba.get_reports"
)
def test_get_information(mock_bundesanzeiger: Mock) -> None:
    """Of two patched reports only the matching annual report is returned."""
    manual = {
        "name": "Bedienungsanleitung",
        "report": "",
        "company": "",
        "raw_report": "",
    }
    annual_report = {
        "name": "Jahresabschluss 1998",
        "report": "",
        "company": "PRJ 23 Transparenzregister GmbH",
        "raw_report": "",
    }
    mock_bundesanzeiger.return_value = {"1": manual, "2": annual_report}
    information = Bundesanzeiger().get_information(
        "PRJ 23 Transparenzregister GmbH", "Iserlohn"
    )
    assert len(information) == 1
|
||||
|
||||
|
||||
@patch(
    "aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger.Ba.get_reports"
)
def test_get_information_no_results(mock_bundesanzeiger: Mock) -> None:
    """An empty search result produces an empty DataFrame."""
    mock_bundesanzeiger.return_value = {}
    information = Bundesanzeiger().get_information(
        "PRJ 23 Transparenzregister GmbH", "Iserlohn"
    )
    assert len(information) == 0
|
@ -3,7 +3,7 @@ from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from aki_prj23_transparenzregister.models.company import Company
|
||||
from aki_prj23_transparenzregister.models.company import Company, CompanyID, Location
|
||||
from aki_prj23_transparenzregister.utils.mongo.company_mongo_service import (
|
||||
CompanyMongoService,
|
||||
)
|
||||
@ -73,7 +73,7 @@ def test_by_id_no_result(mock_mongo_connector: Mock, mock_collection: Mock) -> N
|
||||
mock_mongo_connector.database = {"companies": mock_collection}
|
||||
service = CompanyMongoService(mock_mongo_connector)
|
||||
mock_collection.find.return_value = []
|
||||
assert service.get_by_id("Does not exist") is None # type: ignore
|
||||
assert service.get_by_id("Does not exist") is None
|
||||
|
||||
|
||||
def test_by_id_result(mock_mongo_connector: Mock, mock_collection: Mock) -> None:
|
||||
@ -87,7 +87,7 @@ def test_by_id_result(mock_mongo_connector: Mock, mock_collection: Mock) -> None
|
||||
service = CompanyMongoService(mock_mongo_connector)
|
||||
mock_entry = {"id": "Does exist", "vaue": 42}
|
||||
mock_collection.find.return_value = [mock_entry]
|
||||
assert service.get_by_id("Does exist") == mock_entry # type: ignore
|
||||
assert service.get_by_id("Does exist") == mock_entry
|
||||
|
||||
|
||||
def test_insert(mock_mongo_connector: Mock, mock_collection: Mock) -> None:
|
||||
@ -101,4 +101,55 @@ def test_insert(mock_mongo_connector: Mock, mock_collection: Mock) -> None:
|
||||
service = CompanyMongoService(mock_mongo_connector)
|
||||
mock_result = 42
|
||||
mock_collection.insert_one.return_value = mock_result
|
||||
assert service.insert(Company(None, None, "", "", [])) == mock_result # type: ignore
|
||||
assert (
|
||||
service.insert(
|
||||
Company(CompanyID("", ""), Location("Hier und Dort"), "", "", [])
|
||||
)
|
||||
== mock_result
|
||||
)
|
||||
|
||||
|
||||
def test_get_by_object_id_no_result(
    mock_mongo_connector: Mock, mock_collection: Mock
) -> None:
    """An empty query result is reported as None."""
    mock_mongo_connector.database = {"companies": mock_collection}
    service = CompanyMongoService(mock_mongo_connector)
    mock_collection.find.return_value = []
    found = service.get_by_object_id("649f16a1e198338c3b44299e")
    assert found is None
|
||||
|
||||
|
||||
def test_get_by_object_id(mock_mongo_connector: Mock, mock_collection: Mock) -> None:
    """A single matching document is returned as-is."""
    mock_mongo_connector.database = {"companies": mock_collection}
    service = CompanyMongoService(mock_mongo_connector)
    document = {"_id": "abc", "brille?": "Fielmann", "Hotel?": "Trivago"}
    mock_collection.find.return_value = [document]
    assert service.get_by_object_id("612316a1e198338c3b44299e") == document
|
||||
|
||||
|
||||
def test_get_where_financial_no_results(
    mock_mongo_connector: Mock, mock_collection: Mock
) -> None:
    """The documents returned by the collection are passed through as a list."""
    mock_mongo_connector.database = {"companies": mock_collection}
    service = CompanyMongoService(mock_mongo_connector)
    documents: list = [{"_id": "abc", "brille?": "Fielmann", "Hotel?": "Trivago"}]
    mock_collection.find.return_value = documents
    found = service.get_where_no_financial_results()
    assert found == documents
|
||||
|
||||
|
||||
def test_get_where_financial_results(
    mock_mongo_connector: Mock, mock_collection: Mock
) -> None:
    """The documents returned by the collection are passed through as a list."""
    mock_mongo_connector.database = {"companies": mock_collection}
    service = CompanyMongoService(mock_mongo_connector)
    documents: list = [{"_id": "abc", "brille?": "Fielmann", "Hotel?": "Trivago"}]
    mock_collection.find.return_value = documents
    found = service.get_where_yearly_results()
    assert found == documents
|
||||
|
||||
|
||||
def test_add_yearly_reslults(mock_mongo_connector: Mock, mock_collection: Mock) -> None:
    """The value returned by `update_one` is handed back to the caller."""
    mock_mongo_connector.database = {"companies": mock_collection}
    service = CompanyMongoService(mock_mongo_connector)
    update_outcome: list = [{"_id": "abc", "brille?": "Fielmann", "Hotel?": "Trivago"}]
    mock_collection.update_one.return_value = update_outcome
    returned = service.add_yearly_results("612316a1e198338c3b44299e", {})
    assert returned == update_outcome
|
||||
|
Loading…
x
Reference in New Issue
Block a user