refactor: Pull Auditor extraction into Bundesanzeiger utils

This commit is contained in:
TrisNol 2023-08-18 16:21:52 +02:00
parent f64e0dd96e
commit 1e15656028
6 changed files with 214 additions and 227 deletions

View File

@ -18,235 +18,119 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>company</th>\n",
" <th>raw_report</th>\n",
" <th>jahr</th>\n",
" <th>auditors</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-07-07</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2021</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-05-10</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2021</td>\n",
" <td>[Auditor(name='Eckhard Lewe', company='Grant T...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2022-03-25</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2020</td>\n",
" <td>[Auditor(name='Eckhard Lewe', company='Warth &amp;...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2021-03-11</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2019</td>\n",
" <td>[Auditor(name='Eckhard Lewe', company='Warth &amp;...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2020-03-24</td>\n",
" <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>2018</td>\n",
" <td>[Auditor(name='Ulrich Diersch', company='Warth...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date company \\\n",
"0 2023-07-07 Atos IT-Dienstleistung und Beratung GmbH \n",
"2 2023-05-10 Atos IT-Dienstleistung und Beratung GmbH \n",
"4 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n",
"5 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n",
"6 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n",
"\n",
" raw_report jahr \\\n",
"0 <div class=\"publication_container\">\\n <div cla... 2021 \n",
"2 <div class=\"publication_container\">\\n <div cla... 2021 \n",
"4 <div class=\"publication_container\">\\n <div cla... 2020 \n",
"5 <div class=\"publication_container\">\\n <div cla... 2019 \n",
"6 <div class=\"publication_container\">\\n <div cla... 2018 \n",
"\n",
" auditors \n",
"0 [] \n",
"2 [Auditor(name='Eckhard Lewe', company='Grant T... \n",
"4 [Auditor(name='Eckhard Lewe', company='Warth &... \n",
"5 [Auditor(name='Eckhard Lewe', company='Warth &... \n",
"6 [Auditor(name='Ulrich Diersch', company='Warth... "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"from deutschland.bundesanzeiger import Bundesanzeiger"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_keys(['7e53c9211957c6a4c17264ab86946c3b', 'c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n"
]
}
],
"source": [
"ba = Bundesanzeiger()\n",
"reports = ba.get_reports(\n",
" \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n",
") # \"Atos IT-Dienstleistung und Beratung GmbH\")\n",
"print(reports.keys())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"report_contents = []\n",
"for key in reports.keys():\n",
" report_contents.append(reports[key])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>name</th>\n",
" <th>company</th>\n",
" <th>report</th>\n",
" <th>raw_report</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-07-11</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023-05-25</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-05-24</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date name \\\n",
"0 2023-07-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"1 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"2 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"\n",
" company \\\n",
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"2 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"\n",
" report \\\n",
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"\n",
" raw_report \n",
"0 <div class=\"publication_container\">\\n <div cla... \n",
"1 <div class=\"publication_container\">\\n <div cla... \n",
"2 <div class=\"publication_container\">\\n <div cla... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_reports = pd.DataFrame(report_contents)\n",
"df_reports.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>name</th>\n",
" <th>company</th>\n",
" <th>report</th>\n",
" <th>raw_report</th>\n",
" <th>type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-07-11</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023-05-25</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-05-24</td>\n",
" <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
" <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
" <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
" <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
" <td>Jahresabschluss</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date name \\\n",
"0 2023-07-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"1 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"2 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n",
"\n",
" company \\\n",
"0 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"1 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"2 Volkswagen Economy Service Erdle Bernhard Erdl... \n",
"\n",
" report \\\n",
"0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n",
"\n",
" raw_report type \n",
"0 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
"1 <div class=\"publication_container\">\\n <div cla... Jahresabschluss \n",
"2 <div class=\"publication_container\">\\n <div cla... Jahresabschluss "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_reports[\"type\"] = df_reports.name.apply(lambda name: name.split(\" \")[0])\n",
"\n",
"from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (\n",
" Bundesanzeiger,\n",
")\n",
"\n",
"ba_wrapper = Bundesanzeiger()\n",
"df_reports = ba_wrapper.get_information(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
"df_reports.head()"
]
},

View File

@ -7,4 +7,4 @@ class Auditor:
"""Auditor."""
name: str
company: str
company: str | None

View File

@ -0,0 +1 @@
"""Everything regarding data extraction from various sources."""

View File

@ -0,0 +1,75 @@
"""Fetch data from Bundesanzeiger."""
import re
import pandas as pd
from bs4 import BeautifulSoup
from deutschland.bundesanzeiger import Bundesanzeiger as Ba
from aki_prj23_transparenzregister.models.auditor import Auditor
class Bundesanzeiger:
"""Bundesanzeiger wrapper to export relevant information."""
def __init__(self) -> None:
"""Init."""
self.__ba = Ba()
def get_information(self, company_name: str) -> pd.DataFrame:
"""Extract relevant information from all found yearly results for the given company.
Args:
company_name (str): Name of the company to search for
Returns:
pd.DataFrame: Result
"""
reports = self.__ba.get_reports(company_name)
report_contents = []
for key in reports:
report_contents.append(reports[key])
df_data = pd.DataFrame(report_contents)
df_data["type"] = df_data.name.apply(lambda name: name.split(" ")[0])
df_data = df_data.loc[df_data.type == "Jahresabschluss"]
df_data["jahr"] = df_data.name.apply(
lambda name: name.split(" ")[-1].split(".")[-1]
)
df_data = df_data.drop(["name", "report", "type"], axis=1)
df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors)
return df_data
def extract_auditor_company(self, report: str) -> str | None:
"""Extract the name of an auditor company from the given yearly results report.
Args:
report (str): Yearly results report as raw string
Returns:
str | None: Name of the auditor company if found, otherwise None
"""
soup = BeautifulSoup(report, features="html.parser")
temp = soup.find_all("b")
for elem in temp:
br = elem.findChildren("br")
if len(br) > 0:
return elem.text.split("\n")[1].strip()
return None
def extract_auditors(self, report: str) -> list:
"""Find the list of auditors involved in the given yearly results report.
Args:
report (str): Yearly results report as raw string
Returns:
list[Auditor]: List of Auditors found in the given report
"""
auditor_company = self.extract_auditor_company(report)
auditor_regex = r"[a-z A-Z,.'-]+, Wirtschaftsprüfer"
hits = re.findall(auditor_regex, report)
return [
Auditor(hit.replace(", Wirtschaftsprüfer", "").lstrip(), auditor_company)
for hit in hits
]

View File

@ -0,0 +1 @@
"""Tests for data_extraction."""

View File

@ -0,0 +1,26 @@
from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (
Bundesanzeiger,
)
def test_extract_auditor_company_no_hits() -> None:
input_data = """
Nothing to see here \O_O/
"""
ba = Bundesanzeiger()
result = ba.extract_auditor_company(input_data)
assert result is None
def test_extract_auditor_company() -> None:
company_name = "Korrupte Wirtschaftsprüfer GmbH & Co. KG"
input_data = f"""
<b>
{company_name}
<br>
Max Mustermann
</b>
"""
ba = Bundesanzeiger()
result = ba.extract_auditor_company(input_data)
assert result == company_name