\\n
None:
+ """Init."""
+ self.__ba = Ba()
+
+    def get_information(self, company_name: str) -> pd.DataFrame:
+        """Extract relevant information from all found yearly results for the given company.
+
+        Only reports whose name starts with "Jahresabschluss" are kept. For
+        each of them the text after the last dot of the last word of the
+        report name is stored in the "jahr" column, and the result of
+        `extract_auditors` on the "raw_report" column is stored in the
+        "auditors" column.
+
+        Args:
+            company_name (str): Name of the company to search for
+
+        Returns:
+            pd.DataFrame: One row per yearly result without the "name",
+                "report" and "type" columns; empty if no report was found.
+        """
+        reports = self.__ba.get_reports(company_name)
+        if not reports:
+            # An empty frame has no "name" column, so the derivations below
+            # would fail; return an empty result carrying the added columns.
+            return pd.DataFrame(columns=["jahr", "auditors"])
+
+        df_data = pd.DataFrame(list(reports.values()))
+        df_data["type"] = df_data["name"].apply(lambda name: name.split(" ")[0])
+        # Copy the filtered rows so the column assignments below write to an
+        # independent frame instead of a slice of the unfiltered one.
+        df_data = df_data.loc[df_data["type"] == "Jahresabschluss"].copy()
+        # Last dot-separated part of the last word of the report name
+        # (presumably the year of a DD.MM.YYYY date - confirm against data).
+        df_data["jahr"] = df_data["name"].apply(
+            lambda name: name.split(" ")[-1].split(".")[-1]
+        )
+        df_data = df_data.drop(["name", "report", "type"], axis=1)
+
+        df_data["auditors"] = df_data["raw_report"].apply(self.extract_auditors)
+        return df_data
+
+    def extract_auditor_company(self, report: str) -> str | None:
+        """Extract the name of an auditor company from the given yearly results report.
+
+        The report is parsed as HTML. The first `<b>` element that contains a
+        `<br>` tag and whose text spans more than one line wins; the second
+        line of its text is returned with surrounding whitespace removed.
+
+        Args:
+            report (str): Yearly results report as raw string
+
+        Returns:
+            str | None: Name of the auditor company if found, otherwise None
+        """
+        soup = BeautifulSoup(report, features="html.parser")
+        for bold in soup.find_all("b"):
+            if bold.find("br") is None:
+                continue
+            lines = bold.text.split("\n")
+            # The name is expected on the second text line; a `<b>` without a
+            # line break in its text cannot carry it, so keep looking instead
+            # of failing with an IndexError.
+            if len(lines) > 1:
+                return lines[1].strip()
+        return None
+
+    def extract_auditors(self, report: str) -> list:
+        """Find the list of auditors involved in the given yearly results report.
+
+        Every text run directly followed by ", Wirtschaftsprüfer" is treated as
+        the name of one auditor. All auditors are paired with the company
+        returned by `extract_auditor_company` for the same report.
+
+        Args:
+            report (str): Yearly results report as raw string
+
+        Returns:
+            list[Auditor]: List of Auditors found in the given report
+        """
+        auditor_company = self.extract_auditor_company(report)
+        # `[^\W\d_]` matches any Unicode letter, so names with umlauts or
+        # accents are captured in full instead of being cut at the first
+        # non-ASCII character. The lazy quantifier ends each name at the first
+        # following title, which keeps adjacent auditors as separate hits.
+        auditor_regex = r"((?:[^\W\d_]|[ ,.'-])+?), Wirtschaftsprüfer"
+        return [
+            Auditor(name.strip(), auditor_company)
+            for name in re.findall(auditor_regex, report)
+        ]
diff --git a/tests/utils/data_extraction/__init__.py b/tests/utils/data_extraction/__init__.py
new file mode 100644
index 0000000..0388525
--- /dev/null
+++ b/tests/utils/data_extraction/__init__.py
@@ -0,0 +1 @@
+"""Tests for data_extraction."""
diff --git a/tests/utils/data_extraction/bundesanzeiger_test.py b/tests/utils/data_extraction/bundesanzeiger_test.py
new file mode 100644
index 0000000..6c870ff
--- /dev/null
+++ b/tests/utils/data_extraction/bundesanzeiger_test.py
@@ -0,0 +1,26 @@
+from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (
+ Bundesanzeiger,
+)
+
+
+def test_extract_auditor_company_no_hits() -> None:
+    """Report without a <b> element yields None (raw string: literal backslash)."""
+    input_data = r"""
+    Nothing to see here \O_O/
+    """
+    result = Bundesanzeiger().extract_auditor_company(input_data)
+    assert result is None
+
+
+def test_extract_auditor_company() -> None:
+    """Return the second text line of a <b> element that contains a <br>."""
+    company_name = "Korrupte Wirtschaftsprüfer GmbH & Co. KG"
+    input_data = f"""
+    <b>
+        {company_name}
+        <br>
+        Max Mustermann
+    </b>
+    """
+    result = Bundesanzeiger().extract_auditor_company(input_data)
+    assert result == company_name