From 35d54301e8f72bff1add71ee27bd0447c81b48c2 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Tue, 17 Oct 2023 18:42:03 +0200 Subject: [PATCH] fix(data-extraction): Fix data on staging DB using custom function --- .../apps/fix_company_financials.py | 4 +- .../utils/mongo/company_mongo_service.py | 42 +++++++++++++++++-- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/src/aki_prj23_transparenzregister/apps/fix_company_financials.py b/src/aki_prj23_transparenzregister/apps/fix_company_financials.py index d39fabc..8d902b9 100644 --- a/src/aki_prj23_transparenzregister/apps/fix_company_financials.py +++ b/src/aki_prj23_transparenzregister/apps/fix_company_financials.py @@ -15,7 +15,7 @@ if __name__ == "__main__": company_service = CompanyMongoService(mongo_connector) entries = company_service.get_where_malformed_yearly_results() - + logger.info(f"Num. entries: {len(entries)}") for company in entries: work(company, company_service) - logger.info(f"Processed {company['name']}") + logger.info(f"Processed: {company['name']}") diff --git a/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py b/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py index d84ffdb..d175be2 100644 --- a/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py +++ b/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py @@ -88,19 +88,53 @@ class CompanyMongoService: with self.lock: return list(self.collection.find({"yearly_results": {"$gt": {}}})) + def is_where_yearly_result_key_is_not_number(self, data: dict) -> bool: + """Does the entry contain yearly_results which are malformed as non-numbers? + + Args: + data (dict): Entry from MongoDB + + Returns: + bool: Is yearly_results key malformed? + """ + return any(not re.match("^[0-9]{4}$", key) for key in data["yearly_results"]) + + def is_self_referencing_auditors(self, data: dict) -> bool: + """Does the entry contain yearly_results which are self-referencing? 
+ + Args: + data (dict): Entry from MongoDB + + Returns: + bool: Is self referencing? + """ + for key in data["yearly_results"]: + for auditor in data["yearly_results"][key]["auditors"]: + if ( + auditor["company"] is not None + and auditor["company"].strip() == data["name"].strip() + ): + return True + return False + def get_where_malformed_yearly_results(self) -> list[dict]: - """Finds all entries with malformed yearly_results (e.g., key is not a year). + """Finds all entries with malformed yearly_results (e.g., key is not a year, self-referencing). Returns: list[dict]: List of companies """ preliminary_results = self.get_where_yearly_results() + + filters = [ + self.is_self_referencing_auditors, + self.is_where_yearly_result_key_is_not_number, + ] malformed_entries = [] # TODO There should be a cleaner solution using pure MongoDB queries/aggregations for entry in preliminary_results: - for key in entry["yearly_results"]: - if not re.match(r"^[0-9]{4}$", key): - malformed_entries.append(entry) + valid = [filter_func(entry) for filter_func in filters] + if any(valid): + malformed_entries.append(entry) return malformed_entries def insert(self, company: Company) -> InsertOneResult: