Merge pull request #39 from fhswf/feature/data-extraktion

Feature/data extraktion
This commit is contained in:
Tristan Nolde 2023-07-20 17:11:50 +02:00 committed by GitHub
commit ebedf7c630
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 10277 additions and 4251 deletions

View File

@ -12,7 +12,7 @@ repos:
- id: check-xml
- id: check-ast
- id: check-added-large-files
args: [--enforce-all]
args: [--enforce-all, --maxkb=50000]
- id: name-tests-test
- id: detect-private-key
- id: check-case-conflict

3
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,3 @@
{
"files.eol": "\n"
}

View File

@ -8,14 +8,6 @@
"# Daten Extraktion aus dem Bundesanzeiger"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"In order to run this notebooks, download the `deutschland` library source code from: [TrisNol/deutschland](https://github.com/TrisNol/deutschland/tree/feat/bundesanzeiger-raw-report) and place it in the `Jupyter/API-tests/Bundesanzeiger/deutschland` directory. Since the PR adding the required features to the main repo has not been completet as of of yet (see: [PR](https://github.com/bundesAPI/deutschland/pull/88)) we have to include it in another way..."
]
},
{
"attachments": {},
"cell_type": "markdown",
@ -26,18 +18,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\trist\\AppData\\Roaming\\Python\\Python310\\site-packages\\requests\\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (5.1.0)/charset_normalizer (2.0.12) doesn't match a supported version!\n",
" warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n"
]
}
],
"outputs": [],
"source": [
"import pandas as pd\n",
"from deutschland.bundesanzeiger import Bundesanzeiger"
@ -45,26 +28,28 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_keys(['040860c00ef9020cfb6db2f58a163256', '9eb401c5af2f0289b8207233bf852b81', 'b14d979ea9f42367d413589050bd04e5', 'ecb8c1011456ea0d40f87e850fc216bf', '3a7c6c1f1d1b89bf5ceb165f0ee88053', '03b2e6aac2f2da2c0c5e8f23de9caec4', 'a5f8dc87fa797e7d2f8fb88c49a23c36', '9a3a8a3e84290ee650cbccf32323b3d7', '6c0fcc20a58aaa18a9d13f35a51e3996', 'bf276d441c339e787e22385d2b69b277', '90a79d28f3c11a2122d2827d2bf6acda', '88c785ce3b3c580dcc285661c7790cca', 'd3064baa8246c3ed02e30b5038200edc', '5bf92eed2808b484c005409764b825b7', 'fece6303c991a280850be1900ff78f8f', '26b0624c60cdbf647f3d45f4917ec6ea', '9f98bee55f598908cca60b6a47e5d49d', '99267bb7474e6d1d5d9e091ba5ef3ee8', '102738ef4b91408ed043d84fe785b50b', '94711f3e509518d073e1760d97550347'])\n"
"dict_keys(['c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n"
]
}
],
"source": [
"ba = Bundesanzeiger()\n",
"reports = ba.get_reports(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
"reports = ba.get_reports(\n",
" \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n",
") # \"Atos IT-Dienstleistung und Beratung GmbH\")\n",
"print(reports.keys())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
@ -75,7 +60,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 35,
"metadata": {},
"outputs": [
{
@ -109,42 +94,18 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-03-17</td>\n",
" <td>Aufsichtsrat</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
" <td>2023-05-25</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2022-03-25</td>\n",
" <td>2023-05-24</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2021-03-11</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2020-03-24</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2018-12-11</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" </tbody>\n",
@ -153,35 +114,23 @@
],
"text/plain": [
" date name \\\n",
"0 2023-03-17 Aufsichtsrat \n",
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"\n",
" company \\\n",
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
" company \\\n",
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"\n",
" report \\\n",
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"\n",
" raw_report \n",
"0 <div class=\"publication_container\">\\n <div cla... \n",
"1 <div class=\"publication_container\">\\n <div cla... \n",
"2 <div class=\"publication_container\">\\n <div cla... \n",
"3 <div class=\"publication_container\">\\n <div cla... \n",
"4 <div class=\"publication_container\">\\n <div cla... "
"1 <div class=\"publication_container\">\\n <div cla... "
]
},
"execution_count": 8,
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
@ -193,7 +142,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 36,
"metadata": {},
"outputs": [
{
@ -228,46 +177,19 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-03-17</td>\n",
" <td>Aufsichtsrat</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
" <td>2023-05-25</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Aufsichtsrat</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2022-03-25</td>\n",
" <td>2023-05-24</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2021-03-11</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2020-03-24</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2018-12-11</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
@ -277,35 +199,23 @@
],
"text/plain": [
" date name \\\n",
"0 2023-03-17 Aufsichtsrat \n",
"1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"\n",
" company \\\n",
"0 Atos IT-Dienstleistung und Beratung GmbH \n",
"1 Atos IT-Dienstleistung und Beratung GmbH \n",
"2 Atos IT-Dienstleistung und Beratung GmbH \n",
"3 Atos IT-Dienstleistung und Beratung GmbH \n",
"4 Atos IT-Dienstleistung und Beratung GmbH \n",
" company \\\n",
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"\n",
" report \\\n",
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n",
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
"3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n",
"4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n",
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"\n",
" raw_report type \n",
"0 <div class=\"publication_container\">\\n <div cla... Aufsichtsrat \n",
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
"2 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
"3 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
"4 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
"0 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
]
},
"execution_count": 9,
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
@ -317,21 +227,9 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_6460\\963182859.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_jahresabschluss['jahr'] = df_jahresabschluss.name.apply(\n"
]
},
{
"data": {
"text/html": [
@ -361,61 +259,34 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2022-03-25</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <th>0</th>\n",
" <td>2023-05-25</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2020</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2021-03-11</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <th>1</th>\n",
" <td>2023-05-24</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2019</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2020-03-24</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2018-12-11</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2017</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2018-01-03</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2016</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date company \\\n",
"1 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
"2 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
"3 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
"4 2018-12-11 Atos IT-Dienstleistung und Beratung GmbH \n",
"6 2018-01-03 Atos IT-Dienstleistung und Beratung GmbH \n",
" date company \\\n",
"0 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"1 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"\n",
" raw_report jahr \n",
"1 <div class=\"publication_container\">\\n <div cla... 2020 \n",
"2 <div class=\"publication_container\">\\n <div cla... 2019 \n",
"3 <div class=\"publication_container\">\\n <div cla... 2018 \n",
"4 <div class=\"publication_container\">\\n <div cla... 2017 \n",
"6 <div class=\"publication_container\">\\n <div cla... 2016 "
"0 <div class=\"publication_container\">\\n <div cla... 2020 \n",
"1 <div class=\"publication_container\">\\n <div cla... 2019 "
]
},
"execution_count": 10,
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
@ -439,7 +310,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
@ -449,11 +320,12 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"sample_report = df_jahresabschluss.iloc[0].raw_report"
"sample_report = df_jahresabschluss.iloc[0].raw_report\n",
"sample_report_content = df_jahresabschluss.iloc[0].raw_report"
]
},
{
@ -466,45 +338,20 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from dataclasses import dataclass\n",
"\n",
"\n",
"@dataclass\n",
"class Auditor:\n",
" name: str\n",
" company: str\n",
"\n",
"\n",
"def extract_auditors(report: str) -> list:\n",
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
" hits = re.findall(auditor_regex, report)\n",
" return [hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip() for hit in hits]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Eckhard Lewe', 'Renate Hermsdorf']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_auditors(sample_report)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def extract_auditor_company(report: str) -> str:\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" temp = soup.find_all(\"b\")\n",
@ -512,27 +359,37 @@
" br = elem.findChildren(\"br\")\n",
" if len(br) > 0:\n",
" return elem.text.split(\"\\n\")[1].strip()\n",
" return None"
" return None\n",
"\n",
"\n",
"def extract_auditors(report: str) -> list:\n",
" auditor_company = extract_auditor_company(report)\n",
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
" hits = re.findall(auditor_regex, report)\n",
" return [\n",
" Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
" for hit in hits\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Warth & Klein Grant Thornton AG'"
"[]"
]
},
"execution_count": 17,
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_auditor_company(sample_report)"
"extract_auditors(sample_report)"
]
},
{
@ -561,97 +418,177 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Anhang</th>\n",
" <th>2020 TEUR</th>\n",
" <th>Vorjahr TEUR</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1. Umsatzerlöse</td>\n",
" <td>(1)</td>\n",
" <td>69.819</td>\n",
" <td>77.429</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2. Veränderung des Bestandes an unfertigen Lei...</td>\n",
" <td>NaN</td>\n",
" <td>-41.000</td>\n",
" <td>-66.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3. Sonstige betriebliche Erträge</td>\n",
" <td>(2)</td>\n",
" <td>489.000</td>\n",
" <td>1.816</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4. Materialaufwand</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>a) Aufwendungen für bezogene Waren</td>\n",
" <td>NaN</td>\n",
" <td>-1.220</td>\n",
" <td>-3.003</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Anhang 2020 TEUR \\\n",
"0 1. Umsatzerlöse (1) 69.819 \n",
"1 2. Veränderung des Bestandes an unfertigen Lei... NaN -41.000 \n",
"2 3. Sonstige betriebliche Erträge (2) 489.000 \n",
"3 4. Materialaufwand NaN NaN \n",
"4 a) Aufwendungen für bezogene Waren NaN -1.220 \n",
"\n",
" Vorjahr TEUR \n",
"0 77.429 \n",
"1 -66.000 \n",
"2 1.816 \n",
"3 NaN \n",
"4 -3.003 "
"{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}"
]
},
"execution_count": 18,
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def extract_kpis(report_content) -> dict:\n",
" \"\"\"\n",
" Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
" Extracts Key Performance Indicators (KPIs) from the financial reports.\n",
" Args:\n",
" reports (dict): A dictionary containing the financial reports with their hash as keys and report details as values.\n",
" Returns:\n",
" dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.\n",
" \"\"\"\n",
"\n",
" kpis = {}\n",
"\n",
" # Define KPI patterns to search for\n",
" kpi_patterns = {\n",
" \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" }\n",
"\n",
" report_kpis = {}\n",
" for kpi, pattern in kpi_patterns.items():\n",
" match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
" if match:\n",
" value = match.group(1)\n",
"\n",
" # Clean and validate the extracted number\n",
" try:\n",
" if not value: # Check if value is empty\n",
" cleaned_value = None\n",
" else:\n",
" multiplier = 1\n",
" if value[-1].lower() == \"m\":\n",
" value = value[:-1]\n",
" multiplier = 1_000_000\n",
" elif value[-1].lower() == \"b\":\n",
" value = value[:-1]\n",
" multiplier = 1_000_000_000\n",
"\n",
" # Remove commas after checking for multipliers\n",
" value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
" cleaned_value = float(value) * multiplier\n",
" except ValueError:\n",
" cleaned_value = None\n",
"\n",
" if cleaned_value is not None:\n",
" report_kpis[kpi] = cleaned_value\n",
" return report_kpis\n",
"\n",
"\n",
"extract_kpis(\n",
" BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"with open(\"./temp.txt\", \"w\") as file:\n",
" file.write(\n",
" BeautifulSoup(sample_report, features=\"html.parser\")\n",
" .get_text()\n",
" .replace(\"\\n\", \" \")\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
" ('Aktiva', '31.12.2020 EUR'),\n",
" ('Aktiva', '31.12.2019 EUR')],\n",
" )\n",
"Aktiva Unnamed: 0_level_1 object\n",
" 31.12.2020 EUR object\n",
" 31.12.2019 EUR object\n",
"dtype: object\n",
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
" ('Passiva', '31.12.2020 EUR'),\n",
" ('Passiva', '31.12.2019 EUR')],\n",
" )\n",
"Passiva Unnamed: 0_level_1 object\n",
" 31.12.2020 EUR object\n",
" 31.12.2019 EUR object\n",
"dtype: object\n",
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
"Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
"Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
"dtype: object\n"
]
},
{
"data": {
"text/plain": [
"{}"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def parse_tables(report: str) -> list:\n",
" result = {}\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
" df = pd.read_html(StringIO(str(table)))[0]\n",
" print(df.columns)\n",
" print(df.dtypes)\n",
" return result\n",
"\n",
"\n",
"parse_tables(sample_report)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'Passiva'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n",
"\u001b[1;31mKeyError\u001b[0m: 'Passiva'"
]
}
],
"source": [
"def get_bilanz(report: str) -> any:\n",
" result = {}\n",
@ -672,30 +609,30 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Gesellschafterbeschluss', 'Shareholder Resolution'], dtype='object')\n",
"Index(['Unnamed: 0', 'Anhang', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'Anhang', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2020 TEUR',\n",
"Int64Index([0, 1], dtype='int64')\n",
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
" 'Vorjahr TEUR'],\n",
" dtype='object')\n",
"Index(['Aufgliederung nach Inland und Ausland', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2020'], dtype='object')\n",
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Int64Index([0, 1, 2], dtype='int64')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '2020 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
@ -707,24 +644,23 @@
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Abschreibungen', 'Stand 01.01.2020 EUR'),\n",
" ( 'Abschreibungen', 'Abschreibungen des Geschäftsjahres EUR'),\n",
" ( 'Abschreibungen', 'Abgänge Umbuchung EUR'),\n",
" ( 'Abschreibungen', 'Stand 31.12.2020 EUR')],\n",
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Buchwerte', 'Stand 31.12.2020 EUR'),\n",
" ( 'Buchwerte', 'Stand 31.12.2019 EUR')],\n",
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
" )\n",
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2020', '2019',\n",
" '2018'],\n",
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
" '2019'],\n",
" dtype='object')\n",
"Index(['Gewinn- und Verlustrechnung', '2020 TEUR', 'Vorjahr TEUR',\n",
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
" 'Veränderung TEUR'],\n",
" dtype='object')\n",
"Index(['Bilanz', '31.12.2020 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n",
"Index(['Bericht des Aufsichtsrats', 'Report of the Supervisory Board'], dtype='object')\n"
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
]
}
],

1
Jupyter/API-tests/News/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
data/

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
pymongo

View File

@ -0,0 +1 @@
data/*

View File

@ -0,0 +1,192 @@
"""Unternehmensregister Scraping."""
import glob
import logging
import multiprocessing
import os
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
logger = logging.getLogger()
def scrape(query: str, download_dir: list[str]) -> None:
    """Fetch results from Unternehmensregister for given query.

    Drives a headless Chrome session: submits ``query`` on the search form and,
    for every result page, walks through each company's "Registerinformationen"
    link, requests the "SI" document and downloads it. A successfully
    downloaded file is renamed to the alphanumeric characters of the company
    name plus ``.xml``.

    Args:
        query (str): Search Query (RegEx supported)
        download_dir (list[str]): Path components (relative to the current
            working directory) of the directory to place output files in
    """
    start_url = "https://www.unternehmensregister.de/ureg/"
    download_path = os.path.join(str(Path.cwd()), *download_dir)
    options = webdriver.ChromeOptions()
    preferences = {
        "profile.default_content_settings.popups": 0,
        "safebrowsing.enabled": True,
        "download": {
            "directory_upgrade": True,
            "prompt_for_download": False,
            "extensions_to_open": "",
            "default_directory": download_path,
        },
    }
    options.add_argument("--headless=new")
    options.add_experimental_option("prefs", preferences)

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(start_url)
        # Accept Cookies
        driver.find_elements(
            By.XPATH, '//button[text()="Nur technisch notwendige Cookies akzeptieren"]'
        )[0].click()
        # Enter search query
        driver.find_elements(By.ID, "globalSearchForm:extendedResearchCompanyName")[
            0
        ].send_keys(query)
        # Trigger search
        driver.find_elements(By.ID, "globalSearchForm:btnExecuteSearchOld")[0].click()
        # Wait for results: the URL changes once the result list is shown
        wait = WebDriverWait(driver, 15)
        wait.until(lambda driver: driver.current_url != start_url)

        # The page counter text starts with the number of result pages
        num_pages = int(
            driver.find_element(By.XPATH, '//*[@class="page_count"]').text.split(" ")[0]
        )

        processed_companies = []

        for _ in tqdm(range(num_pages)):
            # Find all "Registerinformationen"
            companies_tab = driver.find_elements(
                By.LINK_TEXT, "Registerinformationen des Registergerichts"
            )
            company_names = [
                elem.text
                for elem in driver.find_elements(
                    By.XPATH, '//div[@class="company_result"]/span/b'
                )
            ]
            for index, company_link in enumerate(companies_tab):
                company_name = company_names[index]
                if company_name in processed_companies:
                    continue
                # Go to intermediary page
                company_link.click()
                # Trigger next redirect
                driver.find_element(
                    By.LINK_TEXT, "Registerinformationen anzeigen"
                ).click()
                # Trigger SI download
                driver.find_element(By.LINK_TEXT, "SI").click()
                # Show shopping cart
                wait.until(
                    ec.visibility_of_element_located(
                        (By.LINK_TEXT, "Dokumentenkorb ansehen")
                    )
                )
                driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
                # Get document
                # NOTE(review): relies on the second-to-last <input> being the
                # control that continues the order - confirm against the page.
                elems = driver.find_elements(By.TAG_NAME, "input")
                elems[-2].click()

                wait.until(
                    ec.visibility_of_element_located(
                        (By.ID, "paymentFormOverview:btnNext")
                    )
                )
                driver.find_element(By.ID, "paymentFormOverview:btnNext").click()

                wait.until(
                    ec.visibility_of_element_located(
                        (By.LINK_TEXT, "Zum Dokumentenkorb")
                    )
                )
                driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()

                # Remember the file count so the new download can be detected
                num_files = get_num_files(download_path)
                driver.find_element(By.CLASS_NAME, "download-wrapper").click()

                try:
                    # WebDriverWait.until needs a callable that it can poll;
                    # wait_for_download_condition returns a plain bool, so it
                    # is wrapped here instead of being evaluated only once.
                    wait.until(
                        lambda _driver: wait_for_download_condition(
                            download_path, num_files
                        )
                    )
                    file_name = "".join(e for e in company_name if e.isalnum()) + ".xml"
                    rename_latest_file(
                        download_path,
                        file_name,
                    )
                    processed_companies.append(company_name)
                except Exception:
                    # Best effort: skip this company but keep the traceback
                    logger.warning("Exception caught in Scraping", exc_info=True)
                finally:
                    # Step back through the six pages visited for this company
                    # (presumably back to the result list - TODO confirm count)
                    for _ in range(6):
                        driver.back()
            # Advance to the next result page
            driver.find_element(By.XPATH, '//*[@class="fas fa-angle-right"]').click()
    finally:
        # quit() ends the whole browser session, also when scraping failed
        driver.quit()
def wait_for_download_condition(
path: str, num_files: int, pattern: str = "*.xml"
) -> bool:
"""Selenium wait condition monitoring number of files in a dir.
Args:
path (str): Directory path
num_files (int): Current number of file
pattern (str, optional): File pattern. Defaults to "*.xml".
Returns:
bool: Current num file exceeded
"""
return len(glob.glob1(path, pattern)) > num_files
def get_num_files(path: str, pattern: str = "*.xml") -> int:
"""Get number of files in directory.
Args:
path (str): Directory to scan
pattern (str, optional): File pattern. Defaults to "*.xml".
Returns:
int: Number of files matching pattern
"""
return len(glob.glob1(path, pattern))
def rename_latest_file(path: str, filename: str, pattern: str = "*.xml"):
"""Rename file in dir with latest change date.
Args:
path (str): Dir to check
filename (str): Name of file
pattern (str, optional): File pattern. Defaults to "*.xml".
"""
list_of_files = [os.path.join(path, file) for file in glob.glob1(path, pattern)]
latest_download = max(list_of_files, key=os.path.getctime)
os.rename(latest_download, os.path.join(path, filename))
if __name__ == "__main__":
    # Main procedure: read the list of relevant companies and scrape each of
    # them in a pool of worker processes.
    import pandas as pd

    df_relevant_companies = pd.read_excel(
        "./data/study_id42887_top-100-unternehmen-deutschland.xlsx",
        sheet_name="Toplist",
        skiprows=1,
    )
    df_relevant_companies = df_relevant_companies[df_relevant_companies["Name"].notna()]

    batch_size = 5
    # One (query, download path parts) tuple per company name.
    params = [
        (query, ["data", "Unternehmensregister", "scraping", query.strip()])
        for query in df_relevant_companies.Name
    ]
    # The context manager terminates the workers even if starmap raises, so no
    # worker processes are left behind after a failure.
    with multiprocessing.Pool(processes=batch_size) as pool:
        # Map the scrape function to the parameter list using the Pool
        pool.starmap(scrape, params)
        # Close the Pool to prevent any more tasks from being submitted
        pool.close()
        # Wait for all the processes to complete
        pool.join()

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,10 @@
ocrmypdf
pytesseract
opencv-python
pdf2image
bs4
selenium
xmltodict
tqdm
openpyxl
pandas

View File

@ -0,0 +1,28 @@
# Local stack: a MongoDB instance plus the mongo-express web UI, both published on host ports.
version: '3.8'
services:
  mongodb:
    image: mongo:6.0.6
    container_name: mongodb
    restart: unless-stopped
    environment:
      # NOTE(review): root credentials are committed in plain text; presumably
      # intended for local development only — confirm before any shared deployment.
      MONGO_INITDB_ROOT_USERNAME: root
      MONGO_INITDB_ROOT_PASSWORD: pR0R0v2e2
      MONGO_INITDB_DATABASE: transparenzregister
    ports:
      - 27017:27017
    volumes:
      # Named volume (declared at the bottom of this file) mounted at /data/db.
      - mongodb_data:/data/db
  mongo-express:
    image: mongo-express:1.0.0-alpha
    container_name: mongo-express
    restart: unless-stopped
    ports:
      - 8081:8081
    environment:
      # Same values as the root credentials of the mongodb service above.
      ME_CONFIG_MONGODB_SERVER: mongodb
      ME_CONFIG_MONGODB_ADMINUSERNAME: root
      ME_CONFIG_MONGODB_ADMINPASSWORD: pR0R0v2e2
volumes:
  mongodb_data:

View File

@ -1,35 +1,35 @@
---
title: "Automatisierte Daten Extraktion aus Internetquellen als Grundlage für die Analyse von Kapitalgesellschaften"
author: "Nolde, Tristan Norbert"
date: "2023-05-06"
---
# Abstract: Automatisierte Daten Extraktion aus Internetquellen als Grundlage für die Analyse von Kapitalgesellschaften
## Gliederung
1. Einleitung (Zielsetzung/Problemstellung, Vorgehen)
2. Web Scraping/Crawling
1. Definition und Theorie
2. Technologien
3. Umsetzung
3. RSS Feeds
1. Definition und Theorie
2. Technologien
3. Umsetzung
4. APIs
1. Definition und Theorie
2. Technologien
3. Umsetzung
5. Rechtliche Rahmenbedingungen
6. Vergleich der Lösungsansätze
7. Zusammenfassung
## Inhalt
In Zeiten von Big Data und AI stellen Daten und ihre Verfügbarkeit zunehmend eines der wichtigsten Wirtschaftsgüter dar. Als solches können sie auch eingesetzt werden, um Kapitalgesellschaften (eine Subklasse von Unternehmen) anhand verschiedener Kennzahlen wie der Mitarbeiterzahl oder dem Jahresgewinn zu analysieren. Obwohl solche Daten zu Genüge in Zeitungsartikeln, Newslettern oder dedizierten Aktienanalysen zu finden sind, so gestaltet sich eine automatisierte Extraktion dieser Daten aufgrund verschiedener Formate sowie weiterer Restriktionen schwierig.
Daher sollen im Rahmen dieser Seminararbeit verschiedene Wege betrachtet werden, die eben diese Daten erheben und zur Verfügung stellen können. Zu den nennenswerten Quellen gehören: Der Bundesanzeiger, RSS Feeds, Nachrichten APIs. Ziel ist es, aus diesen Quellen wertvolle Informationen bezogen auf den wirtschaftlichen Erfolg einer Kapitalgesellschaft sowie aktueller Nachrichten zu extrahieren und in ein einheitliches Format zu überführen.
Neben dem technischen Einsatz von Web Scraping/Crawling, um Informationen aus Webseiten zu gewinnen, sowie dem Abfragen verfügbarer APIs soll auch der rechtliche Aspekt dieses Vorgehens Berücksichtigung finden, um die Rechtmäßigkeit zu bewerten.
Abschließend wird der Einsatz der verschiedenen Technologien anhand der Faktoren Flexibilität, Simplizität, Verfügbarkeit und Rechtmäßigkeit verglichen, ein Fazit gezogen sowie ein Ausblick auf den weiteren Einsatz gegeben.
---
title: "Automatisierte Daten Extraktion aus Internetquellen als Grundlage für die Analyse von Kapitalgesellschaften"
author: "Nolde, Tristan Norbert"
date: "2023-05-06"
---
# Abstract: Automatisierte Daten Extraktion aus Internetquellen als Grundlage für die Analyse von Kapitalgesellschaften
## Gliederung
1. Einleitung (Zielsetzung/Problemstellung, Vorgehen)
2. Web Scraping/Crawling
1. Definition und Theorie
2. Technologien
3. Umsetzung
3. RSS Feeds
1. Definition und Theorie
2. Technologien
3. Umsetzung
4. APIs
1. Definition und Theorie
2. Technologien
3. Umsetzung
5. Rechtliche Rahmenbedingungen
6. Vergleich der Lösungsansätze
7. Zusammenfassung
## Inhalt
In Zeiten von Big Data und AI stellen Daten und ihre Verfügbarkeit zunehmend eines der wichtigsten Wirtschaftsgüter dar. Als solches können sie auch eingesetzt werden, um Kapitalgesellschaften (eine Subklasse von Unternehmen) anhand verschiedener Kennzahlen wie der Mitarbeiterzahl oder dem Jahresgewinn zu analysieren. Obwohl solche Daten zu Genüge in Zeitungsartikeln, Newslettern oder dedizierten Aktienanalysen zu finden sind, so gestaltet sich eine automatisierte Extraktion dieser Daten aufgrund verschiedener Formate sowie weiterer Restriktionen schwierig.
Daher sollen im Rahmen dieser Seminararbeit verschiedene Wege betrachtet werden, die eben diese Daten erheben und zur Verfügung stellen können. Zu den nennenswerten Quellen gehören: Der Bundesanzeiger, RSS Feeds, Nachrichten APIs. Ziel ist es, aus diesen Quellen wertvolle Informationen bezogen auf den wirtschaftlichen Erfolg einer Kapitalgesellschaft sowie aktueller Nachrichten zu extrahieren und in ein einheitliches Format zu überführen.
Neben dem technischen Einsatz von Web Scraping/Crawling, um Informationen aus Webseiten zu gewinnen, sowie dem Abfragen verfügbarer APIs soll auch der rechtliche Aspekt dieses Vorgehens Berücksichtigung finden, um die Rechtmäßigkeit zu bewerten.
Abschließend wird der Einsatz der verschiedenen Technologien anhand der Faktoren Flexibilität, Simplizität, Verfügbarkeit und Rechtmäßigkeit verglichen, ein Fazit gezogen sowie ein Ausblick auf den weiteren Einsatz gegeben.

7616
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,107 +1,113 @@
[build-system]
build-backend = "poetry.core.masonry.api"
requires = ["poetry-core"]
# mypy only reads the `[tool.mypy]` table; a misspelled table name is silently ignored.
[tool.mypy]
disallow_untyped_defs = true
follow_imports = "silent"
python_version = "3.11"
warn_redundant_casts = true
warn_unused_ignores = true
[tool.black]
target-version = ["py311"]
[tool.coverage.run]
branch = true
dynamic_context = "test_function"
relative_files = true
source = ["src"]
[tool.poetry]
authors = ["AKI Projektgruppe 23"]
description = "A project analysing the german transparenzregister and other data sources to find shared business interests and shared personal and other links for lots of companies."
name = "aki-prj23-transparenzregister"
packages = [{include = "aki_prj23_transparenzregister", from = "src"}]
readme = "README.md"
version = "0.1.0"
[tool.poetry.dependencies]
loguru = "^0.7.0"
matplotlib = "^3.7.1"
plotly = "^5.14.1"
python = "^3.11"
seaborn = "^0.12.2"
tqdm = "^4.65.0"
[tool.poetry.group.develop.dependencies]
black = {extras = ["jupyter"], version = "^23.3.0"}
jupyterlab = "^4.0.0"
nbconvert = "^7.4.0"
pre-commit = "^3.3.2"
rise = "^5.7.1"
[tool.poetry.group.doc.dependencies]
jupyter = "^1.0.0"
myst-parser = "^1.0.0"
nbsphinx = "^0.9.2"
sphinx = "^6.0.0"
sphinx-copybutton = "^0.5.2"
sphinx-rtd-theme = "^1.2.1"
sphinx_autodoc_typehints = "*"
sphinxcontrib-mermaid = "^0.9.2"
sphinxcontrib-napoleon = "^0.7"
[tool.poetry.group.lint.dependencies]
black = "^23.3.0"
mypy = "^1.3.0"
pandas-stubs = "^2.0.1.230501"
ruff = "^0.0.270"
types-requests = "^2.31.0.1"
[tool.poetry.group.test.dependencies]
pytest = "^7.3.1"
pytest-clarity = "^1.0.1"
pytest-cov = "^4.1.0"
pytest-mock = "^3.10.0"
pytest-repeat = "^0.9.1"
[tool.ruff]
exclude = [
".bzr",
".direnv",
".eggs",
".git",
".git-rewrite",
".hg",
".mypy_cache",
".nox",
".pants.d",
".pytype",
".ruff_cache",
".svn",
".tox",
".venv",
"__pypackages__",
"_build",
"buck-out",
"build",
"dist",
"node_modules",
"venv"
]
# Never enforce `E501` (line length violations).
ignore = ["E501"]
line-length = 88
# Enable flake8-bugbear (`B`) rules.
select = ["E", "F", "B", "I", "S", "RSE", "RET", "SLF", "SIM", "TID", "PD", "PL", "PLE", "PLR", "PLW", "NPY", "UP", "D", "N", "A", "C4", "T20", "PT"]
src = ["src"]
target-version = "py311"
# Avoid trying to fix flake8-bugbear (`B`) violations.
unfixable = ["B"]
[tool.ruff.per-file-ignores]
"tests/*.py" = ["S101"]
[tool.ruff.pydocstyle]
convention = "google"
[build-system]
build-backend = "poetry.core.masonry.api"
requires = ["poetry-core"]
# mypy only reads the `[tool.mypy]` table; a misspelled table name is silently ignored.
[tool.mypy]
disallow_untyped_defs = true
follow_imports = "silent"
python_version = "3.11"
warn_redundant_casts = true
warn_unused_ignores = true
[tool.black]
target-version = ["py311"]
[tool.coverage.run]
branch = true
dynamic_context = "test_function"
relative_files = true
source = ["src"]
[tool.poetry]
authors = ["AKI Projektgruppe 23"]
description = "A project analysing the german transparenzregister and other data sources to find shared business interests and shared personal and other links for lots of companies."
name = "aki-prj23-transparenzregister"
packages = [{include = "aki_prj23_transparenzregister", from = "src"}]
readme = "README.md"
version = "0.1.0"
[tool.poetry.dependencies]
loguru = "^0.7.0"
matplotlib = "^3.7.1"
plotly = "^5.14.1"
pymongo = "^4.4.1"
python = "^3.11"
seaborn = "^0.12.2"
selenium = "^4.10.0"
tqdm = "^4.65.0"
types-tqdm = "^4.65.0"
[tool.poetry.group.develop.dependencies]
black = {extras = ["jupyter"], version = "^23.3.0"}
jupyterlab = "^4.0.0"
nbconvert = "^7.4.0"
pre-commit = "^3.3.2"
rise = "^5.7.1"
[tool.poetry.group.doc.dependencies]
jupyter = "^1.0.0"
myst-parser = "^1.0.0"
nbsphinx = "^0.9.2"
sphinx = "^6.0.0"
sphinx-copybutton = "^0.5.2"
sphinx-rtd-theme = "^1.2.1"
sphinx_autodoc_typehints = "*"
sphinxcontrib-mermaid = "^0.9.2"
sphinxcontrib-napoleon = "^0.7"
[tool.poetry.group.lint.dependencies]
black = "^23.3.0"
mypy = "^1.3.0"
pandas-stubs = "^2.0.1.230501"
ruff = "^0.0.270"
types-requests = "^2.31.0.1"
[tool.poetry.group.test.dependencies]
pytest = "^7.3.1"
pytest-clarity = "^1.0.1"
pytest-cov = "^4.1.0"
pytest-mock = "^3.10.0"
pytest-repeat = "^0.9.1"
[tool.ruff]
exclude = [
".bzr",
".direnv",
".eggs",
".git",
".git-rewrite",
".hg",
".mypy_cache",
".nox",
".pants.d",
".pytype",
".ruff_cache",
".svn",
".tox",
".venv",
"__pypackages__",
"_build",
"buck-out",
"build",
"dist",
"node_modules",
"venv"
]
# Never enforce `E501` (line length violations).
ignore = ["E501"]
line-length = 88
# Enable flake8-bugbear (`B`) rules.
select = ["E", "F", "B", "I", "S", "RSE", "RET", "SLF", "SIM", "TID", "PD", "PL", "PLE", "PLR", "PLW", "NPY", "UP", "D", "N", "A", "C4", "T20", "PT"]
src = ["src"]
target-version = "py311"
# Avoid trying to fix flake8-bugbear (`B`) violations.
unfixable = ["B"]
[tool.ruff.flake8-builtins]
builtins-ignorelist = ["id"]
[tool.ruff.per-file-ignores]
"tests/*.py" = ["S101", "D100", "D101", "D107", "D103"]
[tool.ruff.pydocstyle]
convention = "google"

View File

@ -0,0 +1 @@
"""Model classes."""

View File

@ -0,0 +1,68 @@
"""Company model."""
from abc import ABC
from dataclasses import asdict, dataclass
from enum import Enum
class RelationshipRoleEnum(Enum):
    """Role assigned to a company relationship.

    Used as the type of ``CompanyRelationship.role``; every member carries a
    string value.
    """

    # NOTE(review): the value is an empty string, unlike ORGANISATION whose
    # value mirrors its name — confirm this is intended.
    STAKEHOLDER = ""
    ORGANISATION = "ORGANISATION"
@dataclass
class CompanyID:
    """Composite identifier of a company: district court plus HR number."""

    # Name of the district court.
    district_court: str
    # presumably the commercial register (Handelsregister) number — TODO confirm
    hr_number: str
@dataclass
class Location:
    """Postal location; only the city is mandatory."""

    city: str
    # The remaining address parts are optional and default to None.
    street: str | None = None
    house_number: str | None = None
    zip_code: str | None = None
@dataclass
class CompanyRelationship(ABC):
    """Base dataclass for an entry in ``Company.relationships``.

    Derives from ``ABC`` but declares no abstract methods itself.
    """

    # Role of the related party.
    role: RelationshipRoleEnum
    # Location of the related party.
    location: Location
@dataclass
class Company:
    """Company record combining identifier, location, name and relationships."""

    # Composite identifier (district court + HR number).
    id: CompanyID
    location: Location
    name: str
    # Kept as a plain string; no date format is enforced here.
    last_update: str
    relationships: list[CompanyRelationship]

    def to_dict(self) -> dict:
        """Convert the company into a dictionary.

        Relies on ``dataclasses.asdict``, so nested dataclass fields (``id``,
        ``location``, relationship entries) are converted to dictionaries too.
        NOTE(review): ``RelationshipRoleEnum`` members presumably stay enum
        objects in the result — confirm the consumer can serialise them.

        Returns:
            dict: Dictionary keyed by field name
        """
        return asdict(self)

View File

@ -0,0 +1,25 @@
"""News model."""
from dataclasses import asdict, dataclass
@dataclass
class News:
    """Single news entry; all fields are stored as plain strings."""

    # Identifier of the entry.
    id: str
    title: str
    # Kept as a string; no date format is enforced here.
    date: str
    text: str
    source_url: str

    def to_dict(self) -> dict:
        """Convert the news entry into a dictionary via ``dataclasses.asdict``.

        Returns:
            dict: Dictionary keyed by field name
        """
        return asdict(self)

View File

@ -0,0 +1 @@
"""Util classes and services."""

View File

@ -0,0 +1,49 @@
"""CompanyMongoService."""
from aki_prj23_transparenzregister.models.company import Company, CompanyID
from aki_prj23_transparenzregister.utils.mongo import MongoConnector
class CompanyMongoService:
    """Data-access service for the ``companies`` collection."""

    def __init__(self, connector: MongoConnector):
        """Bind the service to the ``companies`` collection.

        Args:
            connector (MongoConnector): Connector exposing the target database
        """
        self.collection = connector.database["companies"]

    def get_all(self) -> list[Company]:
        """Fetch every entry of the collection.

        Returns:
            list[Company]: All stored entries. NOTE(review): the documents are
                returned as delivered by the driver and are not converted into
                ``Company`` instances.
        """
        result = self.collection.find()
        return list(result)

    def get_by_id(self, id: CompanyID) -> Company | None:
        """Look up a single company by its identifier.

        Args:
            id (CompanyID): Identifier to search for. Dataclass identifiers are
                converted to a dictionary first, because a dataclass instance
                cannot be encoded as part of a MongoDB filter; this mirrors the
                form in which ``insert`` stores the ``id`` field. Any other
                value is passed to the query unchanged.

        Returns:
            Company | None: The matching entry if exactly one exists, else None
        """
        from dataclasses import asdict, is_dataclass

        # is_dataclass is also true for dataclass *types*; only instances can be converted.
        if is_dataclass(id) and not isinstance(id, type):
            query_id = asdict(id)
        else:
            query_id = id
        result = list(self.collection.find({"id": query_id}))
        if len(result) == 1:
            return result[0]
        return None

    def insert(self, company: Company):
        """Insert a company into the collection.

        Args:
            company (Company): Company to store (serialised via ``to_dict``)

        Returns:
            The value returned by the collection's ``insert_one`` call
        """
        return self.collection.insert_one(company.to_dict())

View File

@ -0,0 +1,47 @@
"""Mongo Wrapper."""
from dataclasses import dataclass
import pymongo
@dataclass
class MongoConnection:
"""_summary_."""
hostname: str
database: str
port: int | None
username: str | None
password: str | None
def get_conn_string(self) -> str:
"""Transforms the information of the object to a MongoDB connection string.
Returns:
str: Connection string
"""
if self.username is not None and self.password is not None:
connection_string = (
f"mongodb+srv://{self.username}:{self.password}@{self.hostname}"
)
else:
connection_string = f"mongodb+srv://{self.hostname}"
if self.port is not None:
connection_string += f":{self.port}"
connection_string = connection_string.replace("mongodb+srv", "mongodb")
return connection_string
class MongoConnector:
    """Opens a MongoDB client and exposes the configured database."""

    def __init__(self, connection: MongoConnection):
        """Create the client and select the database named in the connection.

        Args:
            connection (MongoConnection): Connection parameters; supplies the
                connection string and the database name
        """
        uri = connection.get_conn_string()
        self.client: pymongo.MongoClient = pymongo.MongoClient(uri)
        database_name = connection.database
        self.database = self.client[database_name]

View File

@ -0,0 +1,94 @@
"""MongoNewsService."""
from aki_prj23_transparenzregister.models.news import News
from aki_prj23_transparenzregister.utils.mongo import MongoConnector
class MongoNewsService:
    """Data-access service for the ``news`` collection."""

    def __init__(self, connector: MongoConnector):
        """Bind the service to the ``news`` collection.

        Args:
            connector (MongoConnector): Connector exposing the target database
        """
        self.collection = connector.database["news"]

    def get_all(self) -> list[News]:
        """Fetch all stored news entries.

        Returns:
            list[News]: Every document of the collection converted to ``News``
        """
        documents = self.collection.find()
        return list(map(MongoEntryTransformer.transform_outgoing, documents))

    def get_by_id(self, id: str) -> News | None:
        """Look up a single news entry by its ``_id``.

        Args:
            id (str): Identifier of the entry

        Returns:
            News | None: The entry if exactly one document matches, else None
        """
        matches = list(self.collection.find({"_id": id}))
        if len(matches) != 1:
            return None
        return MongoEntryTransformer.transform_outgoing(matches[0])

    def insert(self, news: News):
        """Store a news entry in the collection.

        Args:
            news (News): Entry to store

        Returns:
            The value returned by the collection's ``insert_one`` call
        """
        document = MongoEntryTransformer.transform_ingoing(news)
        return self.collection.insert_one(document)
class MongoEntryTransformer:
    """Converts between ``News`` objects and MongoDB documents."""

    @staticmethod
    def transform_ingoing(news: News) -> dict:
        """Turn a News object into a dictionary usable as a MongoDB document.

        Args:
            news (News): News object to be transformed

        Returns:
            dict: The entry's fields with ``id`` moved to the ``_id`` key
        """
        document = news.to_dict()
        identifier = news.id
        # MongoDB keys documents by "_id"; the original "id" key is dropped.
        document.pop("id")
        document["_id"] = identifier
        return document

    @staticmethod
    def transform_outgoing(data: dict) -> News:
        """Build a News object from a MongoDB document (inverse of transform_ingoing).

        Args:
            data (dict): Document as read from the MongoDB

        Returns:
            News: News entry based on the document
        """
        identifier = data["_id"]
        remaining = {
            key: data[key] for key in ("title", "date", "text", "source_url")
        }
        return News(id=identifier, **remaining)

View File

@ -0,0 +1,35 @@
"""Test Models.company."""
from aki_prj23_transparenzregister.models.company import Company, CompanyID, Location
def test_to_dict() -> None:
    """Check that Company.to_dict turns nested dataclasses into plain dicts."""
    court, register_number = "The Shire", "420"
    address = Location(
        city="Insmouth", house_number="19", street="Harbor", zip_code="1890"
    )
    subject = Company(
        id=CompanyID(court, register_number),
        last_update="Tomorrow",
        location=address,
        name="BLANK GmbH",
        relationships=[],
    )
    expected = {
        "id": {"district_court": court, "hr_number": register_number},
        "last_update": "Tomorrow",
        "location": {
            "city": "Insmouth",
            "house_number": "19",
            "street": "Harbor",
            "zip_code": "1890",
        },
        "name": "BLANK GmbH",
        "relationships": [],
    }
    assert subject.to_dict() == expected

23
tests/models/news_test.py Normal file
View File

@ -0,0 +1,23 @@
"""Test Models.news."""
from aki_prj23_transparenzregister.models.news import News
def test_to_dict() -> None:
    """Check that News.to_dict returns every field under its own name."""
    # Insertion order matches the positional field order of News.
    field_values = {
        "id": "4711",
        "title": "Economy collapses",
        "date": "2042",
        "text": "Toilet paper prices rising",
        "source_url": "https://www.google.com",
    }
    news = News(*field_values.values())
    assert news.to_dict() == field_values

View File

@ -0,0 +1,103 @@
"""Test utils.company_mongo_service."""
from unittest.mock import Mock
import pytest
from aki_prj23_transparenzregister.models.company import Company
from aki_prj23_transparenzregister.utils.company_mongo_service import (
CompanyMongoService,
)
@pytest.fixture()
def mock_mongo_connector(mocker) -> Mock:
    """Provide a Mock standing in for MongoConnector.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Mocked MongoConnector
    """
    connector_double = Mock()
    target = "aki_prj23_transparenzregister.utils.mongo.MongoConnector"
    mocker.patch(target, return_value=connector_double)
    return connector_double
@pytest.fixture()
def mock_collection() -> Mock:
    """Provide a Mock standing in for a mongo collection.

    Returns:
        Mock: Mock object
    """
    collection_double = Mock()
    return collection_double
def test_init(mock_mongo_connector, mock_collection):
    """CompanyMongoService picks the "companies" collection from the connector.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"companies": mock_collection}
    assert CompanyMongoService(mock_mongo_connector).collection == mock_collection
def test_get_all(mock_mongo_connector, mock_collection):
    """get_all returns whatever the collection's find() yields.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    stored_documents = [{"id": "42"}]
    mock_collection.find.return_value = stored_documents
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    assert company_service.get_all() == stored_documents
def test_by_id_no_result(mock_mongo_connector, mock_collection):
    """get_by_id returns None when find() yields no document.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_collection.find.return_value = []
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    assert company_service.get_by_id("Does not exist") is None
def test_by_id_result(mock_mongo_connector, mock_collection):
    """get_by_id returns the document when find() yields exactly one.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    stored_document = {"id": "Does exist", "vaue": 42}
    mock_collection.find.return_value = [stored_document]
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    assert company_service.get_by_id("Does exist") == stored_document
def test_insert(mock_mongo_connector, mock_collection):
    """insert hands back the result of the collection's insert_one().

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    insert_outcome = 42
    mock_collection.insert_one.return_value = insert_outcome
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    blank_company = Company(None, None, "", "", [])
    assert company_service.insert(blank_company) == insert_outcome

26
tests/utils/mongo_test.py Normal file
View File

@ -0,0 +1,26 @@
from unittest.mock import patch
from aki_prj23_transparenzregister.utils.mongo import MongoConnection, MongoConnector
def test_get_conn_string_no_credentials():
    """Host and port without credentials give a plain mongodb:// URI."""
    connection_string = MongoConnection(
        "localhost", "", 27017, None, None
    ).get_conn_string()
    assert connection_string == "mongodb://localhost:27017"
def test_get_conn_string_no_port_but_credentials():
    """Credentials without a port give a mongodb+srv:// URI with user info."""
    connection_string = MongoConnection(
        "localhost", "", None, "admin", "password"
    ).get_conn_string()
    assert connection_string == "mongodb+srv://admin:password@localhost"
def test_get_conn_simple():
    """A bare host name gives a mongodb+srv:// URI."""
    connection_string = MongoConnection(
        "localhost", "", None, None, None
    ).get_conn_string()
    assert connection_string == "mongodb+srv://localhost"
def test_mongo_connector():
    """MongoConnector selects the configured database from the client."""
    expected_result = 42
    # The patched client is a plain dict keyed by database name.
    with patch("pymongo.MongoClient", return_value={"db": expected_result}):
        connector = MongoConnector(
            MongoConnection("localhost", "db", None, None, None)
        )
    assert connector.database == expected_result

View File

@ -0,0 +1,115 @@
from unittest.mock import Mock, patch
import pytest
from aki_prj23_transparenzregister.models.news import News
from aki_prj23_transparenzregister.utils.news_mongo_service import (
MongoEntryTransformer,
MongoNewsService,
)
@pytest.fixture()
def mock_mongo_connector(mocker) -> Mock:
    """Provide a Mock standing in for MongoConnector.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Mocked MongoConnector
    """
    connector_double = Mock()
    target = "aki_prj23_transparenzregister.utils.mongo.MongoConnector"
    mocker.patch(target, return_value=connector_double)
    return connector_double
@pytest.fixture()
def mock_collection() -> Mock:
    """Provide a Mock standing in for a mongo collection.

    Returns:
        Mock: Mock object
    """
    collection_double = Mock()
    return collection_double
def test_init(mock_mongo_connector, mock_collection):
    """MongoNewsService picks the "news" collection from the connector.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"news": mock_collection}
    assert MongoNewsService(mock_mongo_connector).collection == mock_collection
def test_get_all(mock_mongo_connector, mock_collection):
    """get_all returns an empty list when find() yields nothing."""
    mock_collection.find.return_value = []
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)
    assert news_service.get_all() == []
def test_get_by_id_with_result(mock_mongo_connector, mock_collection):
    """get_by_id returns the transformed document when exactly one matches."""
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)
    transformer_target = "aki_prj23_transparenzregister.utils.news_mongo_service.MongoEntryTransformer.transform_outgoing"
    with patch(transformer_target, return_value={}):
        mock_collection.find.return_value = [{}]
        assert news_service.get_by_id("foadh") == {}
def test_get_by_id_no_result(mock_mongo_connector, mock_collection):
    """get_by_id returns None when find() yields no document."""
    mock_collection.find.return_value = []
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)
    assert news_service.get_by_id("foadh") is None
def test_insert(mock_mongo_connector, mock_collection):
    """insert hands back the result of the collection's insert_one()."""
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)
    transformer_target = "aki_prj23_transparenzregister.utils.news_mongo_service.MongoEntryTransformer.transform_ingoing"
    with patch(transformer_target, return_value={}):
        mock_collection.insert_one.return_value = {}
        assert news_service.insert({}) == {}
def test_transform_ingoing():
    """transform_ingoing moves the id to "_id" and drops the "id" key."""
    entry = News(id="42", title=None, date=None, text=None, source_url=None)
    document = MongoEntryTransformer.transform_ingoing(entry)
    assert document["_id"] == "42"
    assert "id" not in document
def test_transform_outgoing():
    """transform_outgoing rebuilds a News object from a MongoDB document."""
    document = {
        "_id": "4711",
        "title": "Hello",
        "date": "Today",
        "text": "World",
        "source_url": "chat.openai.com",
    }
    expected_news = News(
        id="4711",
        title="Hello",
        date="Today",
        text="World",
        source_url="chat.openai.com",
    )
    assert MongoEntryTransformer.transform_outgoing(document) == expected_news