mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-05-13 19:48:47 +02:00
refactor: Pull Auditor extraction into Bundesanzeiger utils
This commit is contained in:
parent
f64e0dd96e
commit
1e15656028
@ -18,235 +18,119 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>date</th>\n",
|
||||
" <th>company</th>\n",
|
||||
" <th>raw_report</th>\n",
|
||||
" <th>jahr</th>\n",
|
||||
" <th>auditors</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-07-07</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2021</td>\n",
|
||||
" <td>[]</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2023-05-10</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2021</td>\n",
|
||||
" <td>[Auditor(name='Eckhard Lewe', company='Grant T...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2022-03-25</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2020</td>\n",
|
||||
" <td>[Auditor(name='Eckhard Lewe', company='Warth &...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>2021-03-11</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2019</td>\n",
|
||||
" <td>[Auditor(name='Eckhard Lewe', company='Warth &...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>2020-03-24</td>\n",
|
||||
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>2018</td>\n",
|
||||
" <td>[Auditor(name='Ulrich Diersch', company='Warth...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" date company \\\n",
|
||||
"0 2023-07-07 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"2 2023-05-10 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"4 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"5 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"6 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
|
||||
"\n",
|
||||
" raw_report jahr \\\n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... 2021 \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... 2021 \n",
|
||||
"4 <div class=\"publication_container\">\\n <div cla... 2020 \n",
|
||||
"5 <div class=\"publication_container\">\\n <div cla... 2019 \n",
|
||||
"6 <div class=\"publication_container\">\\n <div cla... 2018 \n",
|
||||
"\n",
|
||||
" auditors \n",
|
||||
"0 [] \n",
|
||||
"2 [Auditor(name='Eckhard Lewe', company='Grant T... \n",
|
||||
"4 [Auditor(name='Eckhard Lewe', company='Warth &... \n",
|
||||
"5 [Auditor(name='Eckhard Lewe', company='Warth &... \n",
|
||||
"6 [Auditor(name='Ulrich Diersch', company='Warth... "
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from deutschland.bundesanzeiger import Bundesanzeiger"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"dict_keys(['7e53c9211957c6a4c17264ab86946c3b', 'c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ba = Bundesanzeiger()\n",
|
||||
"reports = ba.get_reports(\n",
|
||||
" \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n",
|
||||
") # \"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
||||
"print(reports.keys())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"report_contents = []\n",
|
||||
"for key in reports.keys():\n",
|
||||
" report_contents.append(reports[key])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>date</th>\n",
|
||||
" <th>name</th>\n",
|
||||
" <th>company</th>\n",
|
||||
" <th>report</th>\n",
|
||||
" <th>raw_report</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-07-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" date name \\\n",
|
||||
"0 2023-07-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"1 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"2 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"\n",
|
||||
" company \\\n",
|
||||
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"2 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" report \\\n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"\n",
|
||||
" raw_report \n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... "
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df_reports = pd.DataFrame(report_contents)\n",
|
||||
"df_reports.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>date</th>\n",
|
||||
" <th>name</th>\n",
|
||||
" <th>company</th>\n",
|
||||
" <th>report</th>\n",
|
||||
" <th>raw_report</th>\n",
|
||||
" <th>type</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2023-07-11</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2023-05-25</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2023-05-24</td>\n",
|
||||
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
|
||||
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
|
||||
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
|
||||
" <td><div class=\"publication_container\">\\n <div cla...</td>\n",
|
||||
" <td>Jahresabschluss</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" date name \\\n",
|
||||
"0 2023-07-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"1 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"2 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
|
||||
"\n",
|
||||
" company \\\n",
|
||||
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"2 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
|
||||
"\n",
|
||||
" report \\\n",
|
||||
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
|
||||
"\n",
|
||||
" raw_report type \n",
|
||||
"0 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
|
||||
"2 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df_reports[\"type\"] = df_reports.name.apply(lambda name: name.split(\" \")[0])\n",
|
||||
"\n",
|
||||
"from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (\n",
|
||||
" Bundesanzeiger,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"ba_wrapper = Bundesanzeiger()\n",
|
||||
"df_reports = ba_wrapper.get_information(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
|
||||
"df_reports.head()"
|
||||
]
|
||||
},
|
||||
|
@ -7,4 +7,4 @@ class Auditor:
|
||||
"""Auditor."""
|
||||
|
||||
name: str
|
||||
company: str
|
||||
company: str | None
|
||||
|
@ -0,0 +1 @@
|
||||
"""Everything regarding data extraction from various sources."""
|
@ -0,0 +1,75 @@
|
||||
"""Fetch data from Bundesanzeiger."""
|
||||
import re
|
||||
|
||||
import pandas as pd
|
||||
from bs4 import BeautifulSoup
|
||||
from deutschland.bundesanzeiger import Bundesanzeiger as Ba
|
||||
|
||||
from aki_prj23_transparenzregister.models.auditor import Auditor
|
||||
|
||||
|
||||
class Bundesanzeiger:
|
||||
"""Bundesanzeiger wrapper to export relevant information."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Init."""
|
||||
self.__ba = Ba()
|
||||
|
||||
def get_information(self, company_name: str) -> pd.DataFrame:
|
||||
"""Extract relevant information from all found yearly results for the given company.
|
||||
|
||||
Args:
|
||||
company_name (str): Name of the company to search for
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: Result
|
||||
"""
|
||||
reports = self.__ba.get_reports(company_name)
|
||||
report_contents = []
|
||||
for key in reports:
|
||||
report_contents.append(reports[key])
|
||||
|
||||
df_data = pd.DataFrame(report_contents)
|
||||
df_data["type"] = df_data.name.apply(lambda name: name.split(" ")[0])
|
||||
df_data = df_data.loc[df_data.type == "Jahresabschluss"]
|
||||
df_data["jahr"] = df_data.name.apply(
|
||||
lambda name: name.split(" ")[-1].split(".")[-1]
|
||||
)
|
||||
df_data = df_data.drop(["name", "report", "type"], axis=1)
|
||||
|
||||
df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors)
|
||||
return df_data
|
||||
|
||||
def extract_auditor_company(self, report: str) -> str | None:
|
||||
"""Extract the name of an auditor company from the given yearly results report.
|
||||
|
||||
Args:
|
||||
report (str): Yearly results report as raw string
|
||||
|
||||
Returns:
|
||||
str | None: Name of the auditor company if found, otherwise None
|
||||
"""
|
||||
soup = BeautifulSoup(report, features="html.parser")
|
||||
temp = soup.find_all("b")
|
||||
for elem in temp:
|
||||
br = elem.findChildren("br")
|
||||
if len(br) > 0:
|
||||
return elem.text.split("\n")[1].strip()
|
||||
return None
|
||||
|
||||
def extract_auditors(self, report: str) -> list:
|
||||
"""Find the list of auditors involved in the given yearly results report.
|
||||
|
||||
Args:
|
||||
report (str): Yearly results report as raw string
|
||||
|
||||
Returns:
|
||||
list[Auditor]: List of Auditors found in the given report
|
||||
"""
|
||||
auditor_company = self.extract_auditor_company(report)
|
||||
auditor_regex = r"[a-z A-Z,.'-]+, Wirtschaftsprüfer"
|
||||
hits = re.findall(auditor_regex, report)
|
||||
return [
|
||||
Auditor(hit.replace(", Wirtschaftsprüfer", "").lstrip(), auditor_company)
|
||||
for hit in hits
|
||||
]
|
1
tests/utils/data_extraction/__init__.py
Normal file
1
tests/utils/data_extraction/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Tests for data_extraction."""
|
26
tests/utils/data_extraction/bundesanzeiger_test.py
Normal file
26
tests/utils/data_extraction/bundesanzeiger_test.py
Normal file
@ -0,0 +1,26 @@
|
||||
from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (
|
||||
Bundesanzeiger,
|
||||
)
|
||||
|
||||
|
||||
def test_extract_auditor_company_no_hits() -> None:
    """Check that a report without any markup yields no auditor company."""
    # The backslash is escaped explicitly: a bare "\O" is an invalid escape
    # sequence that newer Python versions warn about. The runtime text is
    # unchanged.
    input_data = "\nNothing to see here \\O_O/\n"
    ba = Bundesanzeiger()
    result = ba.extract_auditor_company(input_data)
    assert result is None
|
||||
|
||||
|
||||
def test_extract_auditor_company() -> None:
    """Check that the company name is read from a ``<b>`` element containing a ``<br>``."""
    company_name = "Korrupte Wirtschaftsprüfer GmbH & Co. KG"
    # One entry per line of the report snippet; the empty first and last
    # entries produce the leading and trailing newline.
    markup_lines = ["", "<b>", company_name, "<br>", "Max Mustermann", "</b>", ""]
    input_data = "\n".join(markup_lines)

    extractor = Bundesanzeiger()
    assert extractor.extract_auditor_company(input_data) == company_name
|
Loading…
x
Reference in New Issue
Block a user