diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py index f59c889..b19398d 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py @@ -42,7 +42,11 @@ class Bundesanzeiger: df_data = df_data.loc[df_data.company == company_name] # Add Auditor information - df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors) + audits = [] + for report in df_data["raw_report"]: + audit = self.extract_auditors(report, company_name) + audits.append(audit) + df_data["auditors"] = audits # Add Financial information df_data["financial_results"] = df_data.raw_report.apply( @@ -75,11 +79,12 @@ class Bundesanzeiger: return df_reports.drop(["name", "report", "type"], axis=1) @staticmethod - def extract_auditor_company(report: str) -> str | None: + def extract_auditor_company(report: str, company_name: str) -> str | None: """Extract the name of an auditor company from the given yearly results report. Args: report (str): Yearly results report as raw string + company_name (str): Name of the company the report originates from, used for filtering Returns: str | None: Name of the auditor company if found, otherwise None @@ -89,19 +94,22 @@ class Bundesanzeiger: for elem in temp: br = elem.findChildren("br") if len(br) > 0: - return elem.text.split("\n")[1].strip() + temp = elem.text.split("\n")[1].strip() + if temp != company_name: + return temp return None - def extract_auditors(self, report: str) -> list: + def extract_auditors(self, report: str, company_name: str) -> list: """Find the list of auditors involved in the given yearly results report. Args: report (str): Yearly results report as raw string + company_name (str): Name of the company the report originates from, used for filtering Returns: list[Auditor]: List of Auditors found in the given report """ - auditor_company = self.extract_auditor_company(report) + auditor_company = self.extract_auditor_company(report, company_name) auditor_regex = r"([a-z A-ZÄäÜüÖö,.'-]+), Wirtschaftsprüfer(in)?" hits = re.findall(auditor_regex, report) return [Auditor(hit[0].strip(), auditor_company) for hit in hits]