diff --git a/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb b/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb index e32082e..f67b7e7 100644 --- a/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb +++ b/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb @@ -18,235 +18,119 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datecompanyraw_reportjahrauditors
02023-07-07Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2021[]
22023-05-10Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2021[Auditor(name='Eckhard Lewe', company='Grant T...
42022-03-25Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2020[Auditor(name='Eckhard Lewe', company='Warth &...
52021-03-11Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2019[Auditor(name='Eckhard Lewe', company='Warth &...
62020-03-24Atos IT-Dienstleistung und Beratung GmbH<div class=\"publication_container\">\\n <div cla...2018[Auditor(name='Ulrich Diersch', company='Warth...
\n", + "
" + ], + "text/plain": [ + " date company \\\n", + "0 2023-07-07 Atos IT-Dienstleistung und Beratung GmbH \n", + "2 2023-05-10 Atos IT-Dienstleistung und Beratung GmbH \n", + "4 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n", + "5 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n", + "6 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n", + "\n", + " raw_report jahr \\\n", + "0
\\n
\\n
\\n
\\n
\\n
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datenamecompanyreportraw_report
02023-07-11Jahresabschluss zum Geschäftsjahr vom 01.01.20...Volkswagen Economy Service Erdle Bernhard Erdl...\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...<div class=\"publication_container\">\\n <div cla...
12023-05-25Jahresabschluss zum Geschäftsjahr vom 01.01.20...Volkswagen Economy Service Erdle Bernhard Erdl...\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...<div class=\"publication_container\">\\n <div cla...
22023-05-24Jahresabschluss zum Geschäftsjahr vom 01.01.20...Volkswagen Economy Service Erdle Bernhard Erdl...\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...<div class=\"publication_container\">\\n <div cla...
\n", - "
" - ], - "text/plain": [ - " date name \\\n", - "0 2023-07-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", - "1 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", - "2 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", - "\n", - " company \\\n", - "0 Volkswagen Economy Service Erdle Bernhard Erdl... \n", - "1 Volkswagen Economy Service Erdle Bernhard Erdl... \n", - "2 Volkswagen Economy Service Erdle Bernhard Erdl... \n", - "\n", - " report \\\n", - "0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", - "1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", - "2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", - "\n", - " raw_report \n", - "0
\\n
\\n
\\n
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datenamecompanyreportraw_reporttype
02023-07-11Jahresabschluss zum Geschäftsjahr vom 01.01.20...Volkswagen Economy Service Erdle Bernhard Erdl...\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...<div class=\"publication_container\">\\n <div cla...Jahresabschluss
12023-05-25Jahresabschluss zum Geschäftsjahr vom 01.01.20...Volkswagen Economy Service Erdle Bernhard Erdl...\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...<div class=\"publication_container\">\\n <div cla...Jahresabschluss
22023-05-24Jahresabschluss zum Geschäftsjahr vom 01.01.20...Volkswagen Economy Service Erdle Bernhard Erdl...\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...<div class=\"publication_container\">\\n <div cla...Jahresabschluss
\n", - "
" - ], - "text/plain": [ - " date name \\\n", - "0 2023-07-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", - "1 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", - "2 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", - "\n", - " company \\\n", - "0 Volkswagen Economy Service Erdle Bernhard Erdl... \n", - "1 Volkswagen Economy Service Erdle Bernhard Erdl... \n", - "2 Volkswagen Economy Service Erdle Bernhard Erdl... \n", - "\n", - " report \\\n", - "0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", - "1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", - "2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", - "\n", - " raw_report type \n", - "0
\\n
\\n
\\n
None: + """Init.""" + self.__ba = Ba() + + def get_information(self, company_name: str) -> pd.DataFrame: + """Extract relevant information from all found yearly results for the given company. + + Args: + company_name (str): Name of the company to search for + + Returns: + pd.DataFrame: Result + """ + reports = self.__ba.get_reports(company_name) + report_contents = [] + for key in reports: + report_contents.append(reports[key]) + + df_data = pd.DataFrame(report_contents) + df_data["type"] = df_data.name.apply(lambda name: name.split(" ")[0]) + df_data = df_data.loc[df_data.type == "Jahresabschluss"] + df_data["jahr"] = df_data.name.apply( + lambda name: name.split(" ")[-1].split(".")[-1] + ) + df_data = df_data.drop(["name", "report", "type"], axis=1) + + df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors) + return df_data + + def extract_auditor_company(self, report: str) -> str | None: + """Extract the name of an auditor company from the given yearly results report. + + Args: + report (str): Yearly results report as raw string + + Returns: + str | None: Name of the auditor company if found, otherwise None + """ + soup = BeautifulSoup(report, features="html.parser") + temp = soup.find_all("b") + for elem in temp: + br = elem.findChildren("br") + if len(br) > 0: + return elem.text.split("\n")[1].strip() + return None + + def extract_auditors(self, report: str) -> list: + """Find the list of auditors involved in the given yearly results report. + + Args: + report (str): Yearly results report as raw string + + Returns: + list[Auditor]: List of Auditors found in the given report + """ + auditor_company = self.extract_auditor_company(report) + auditor_regex = r"[a-z A-Z,.'-]+, Wirtschaftsprüfer" + hits = re.findall(auditor_regex, report) + return [ + Auditor(hit.replace(", Wirtschaftsprüfer", "").lstrip(), auditor_company) + for hit in hits + ] diff --git a/tests/utils/data_extraction/__init__.py b/tests/utils/data_extraction/__init__.py new file mode 100644 index 0000000..0388525 --- /dev/null +++ b/tests/utils/data_extraction/__init__.py @@ -0,0 +1 @@ +"""Tests for data_extraction.""" diff --git a/tests/utils/data_extraction/bundesanzeiger_test.py b/tests/utils/data_extraction/bundesanzeiger_test.py new file mode 100644 index 0000000..6c870ff --- /dev/null +++ b/tests/utils/data_extraction/bundesanzeiger_test.py @@ -0,0 +1,26 @@ +from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import ( + Bundesanzeiger, +) + + +def test_extract_auditor_company_no_hits() -> None: + input_data = """ + Nothing to see here \O_O/ + """ + ba = Bundesanzeiger() + result = ba.extract_auditor_company(input_data) + assert result is None + + +def test_extract_auditor_company() -> None: + company_name = "Korrupte Wirtschaftsprüfer GmbH & Co. KG" + input_data = f""" + + {company_name} +
+ Max Mustermann +
+ """ + ba = Bundesanzeiger() + result = ba.extract_auditor_company(input_data) + assert result == company_name