diff --git a/src/aki_prj23_transparenzregister/apps/fix_company_financials.py b/src/aki_prj23_transparenzregister/apps/fix_company_financials.py index d39fabc..8d902b9 100644 --- a/src/aki_prj23_transparenzregister/apps/fix_company_financials.py +++ b/src/aki_prj23_transparenzregister/apps/fix_company_financials.py @@ -15,7 +15,7 @@ if __name__ == "__main__": company_service = CompanyMongoService(mongo_connector) entries = company_service.get_where_malformed_yearly_results() - + logger.info(f"Num. entries: {len(entries)}") for company in entries: work(company, company_service) - logger.info(f"Processed {company['name']}") + logger.info(f"Processed: {company['name']}") diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py index 0a4a4e9..b19398d 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py @@ -42,7 +42,11 @@ class Bundesanzeiger: df_data = df_data.loc[df_data.company == company_name] # Add Auditor information - df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors) + audits = [] + for report in df_data["raw_report"]: + audit = self.extract_auditors(report, company_name) + audits.append(audit) + df_data["auditors"] = audits # Add Financial information df_data["financial_results"] = df_data.raw_report.apply( @@ -75,11 +79,12 @@ class Bundesanzeiger: return df_reports.drop(["name", "report", "type"], axis=1) @staticmethod - def extract_auditor_company(report: str) -> str | None: + def extract_auditor_company(report: str, company_name: str) -> str | None: """Extract the name of an auditor company from the given yearly results report. 
Args: report (str): Yearly results report as raw string + company_name (str): Name of the company the report originates from, used for filtering Returns: str | None: Name of the auditor company if found, otherwise None @@ -89,25 +94,25 @@ class Bundesanzeiger: for elem in temp: br = elem.findChildren("br") if len(br) > 0: - return elem.text.split("\n")[1].strip() + temp = elem.text.split("\n")[1].strip() + if temp != company_name: + return temp return None - def extract_auditors(self, report: str) -> list: + def extract_auditors(self, report: str, company_name: str) -> list: """Find the list of auditors involved in the given yearly results report. Args: report (str): Yearly results report as raw string + company_name (str): Name of the company the report originates from, used for filtering Returns: list[Auditor]: List of Auditors found in the given report """ - auditor_company = self.extract_auditor_company(report) - auditor_regex = r"[a-z A-Z,.'-]+, Wirtschaftsprüfer" + auditor_company = self.extract_auditor_company(report, company_name) + auditor_regex = r"([a-z A-ZÄäÜüÖö,.'-]+), Wirtschaftsprüfer(in)?" hits = re.findall(auditor_regex, report) - return [ - Auditor(hit.replace(", Wirtschaftsprüfer", "").lstrip(), auditor_company) - for hit in hits - ] + return [Auditor(hit[0].strip(), auditor_company) for hit in hits] def __extract_kpis__(self, report: str) -> dict: """Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd. 
diff --git a/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py b/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py index d84ffdb..d175be2 100644 --- a/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py +++ b/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py @@ -88,19 +88,53 @@ class CompanyMongoService: with self.lock: return list(self.collection.find({"yearly_results": {"$gt": {}}})) + def is_where_yearly_result_key_is_not_number(self, data: dict) -> bool: + """Does the entry contain yearly_results which are malformed as non-numbers? + + Args: + data (dict): Entry from MongoDB + + Returns: + bool: Is yearly_results key malformed? + """ + return any(not re.match("^[0-9]{4}$", key) for key in data["yearly_results"]) + + def is_self_referencing_auditors(self, data: dict) -> bool: + """Does the entry contain yearly_results which are self-referencing? + + Args: + data (dict): Entry from MongoDB + + Returns: + bool: Is self referencing? + """ + for key in data["yearly_results"]: + for auditor in data["yearly_results"][key]["auditors"]: + if ( + auditor["company"] is not None + and auditor["company"].strip() == data["name"].strip() + ): + return True + return False + def get_where_malformed_yearly_results(self) -> list[dict]: - """Finds all entries with malformed yearly_results (e.g., key is not a year). + """Finds all entries with malformed yearly_results (e.g., key is not a year, self-referencing). 
Returns: list[dict]: List of companies """ preliminary_results = self.get_where_yearly_results() + + filters = [ + self.is_self_referencing_auditors, + self.is_where_yearly_result_key_is_not_number, + ] malformed_entries = [] # TODO There should be a cleaner solution using pure MongoDB queries/aggregations for entry in preliminary_results: - for key in entry["yearly_results"]: - if not re.match(r"^[0-9]{4}$", key): - malformed_entries.append(entry) + valid = [filter_func(entry) for filter_func in filters] + if any(valid): + malformed_entries.append(entry) return malformed_entries def insert(self, company: Company) -> InsertOneResult: diff --git a/tests/utils/data_extraction/bundesanzeiger_test.py b/tests/utils/data_extraction/bundesanzeiger_test.py index d764a52..73bbbd9 100644 --- a/tests/utils/data_extraction/bundesanzeiger_test.py +++ b/tests/utils/data_extraction/bundesanzeiger_test.py @@ -12,14 +12,36 @@ from aki_prj23_transparenzregister.utils.enum_types import FinancialKPIEnum def test_extract_auditor_company_no_hits() -> None: input_data = """ + Mega GmbH Nothing to see here """ ba = Bundesanzeiger() - result = ba.extract_auditor_company(input_data) + result = ba.extract_auditor_company(input_data, "Mega GmbH") assert result is None +def test_extract_auditor_company_self_referencing() -> None: + company = "Mega GmbH" + auditor_company = "Super AG" + + input_data = f""" + + {company} +
+ Nothing to see here +
+ + {auditor_company} +
+ Nothing to see here +
+ """ + ba = Bundesanzeiger() + result = ba.extract_auditor_company(input_data, company) + assert result == auditor_company + + def test_extract_auditor_company() -> None: company_name = "Korrupte Wirtschaftsprüfer GmbH & Co. KG" input_data = f""" @@ -30,7 +52,7 @@ def test_extract_auditor_company() -> None: """ ba = Bundesanzeiger() - result = ba.extract_auditor_company(input_data) + result = ba.extract_auditor_company(input_data, "Super AG") assert result == company_name diff --git a/tests/utils/mongo/company_mongo_service_test.py b/tests/utils/mongo/company_mongo_service_test.py index 52b58af..434198f 100644 --- a/tests/utils/mongo/company_mongo_service_test.py +++ b/tests/utils/mongo/company_mongo_service_test.py @@ -172,19 +172,28 @@ def test_get_where_malformed_yearly_results( "_id": "abc", "name": "Fielmann", "Hotel?": "Trivago", - "yearly_results": {"Vor Aeonen": 42, "2022": 4711}, + "yearly_results": { + "Vor Aeonen": {"auditors": [], 42: 1}, + "2022": {"auditors": [], 42: 1}, + }, }, { "_id": "abc", "name": "Fielmann", "Hotel?": "Trivago", - "yearly_results": {"1998": 42, "2022": 4711}, + "yearly_results": { + "1998": {"auditors": [], 42: 1}, + "2022": {"auditors": [], 42: 1}, + }, }, { "_id": "abc", "name": "Fielmann", "Hotel?": "Trivago", - "yearly_results": {"19": 42, "2022": 4711}, + "yearly_results": { + "19": {"auditors": [], 42: 1}, + "2022": {"auditors": [], 42: 1}, + }, }, ] mock_collection.find.return_value = mock_result