fix(data-extraction): Fix data on staging DB using custom function

This commit is contained in:
TrisNol 2023-10-17 18:42:03 +02:00
parent 600039207d
commit 35d54301e8
2 changed files with 40 additions and 6 deletions

View File

@ -15,7 +15,7 @@ if __name__ == "__main__":
company_service = CompanyMongoService(mongo_connector)
entries = company_service.get_where_malformed_yearly_results()
logger.info(f"Num. entries: {len(entries)}")
for company in entries:
work(company, company_service)
logger.info(f"Processed {company['name']}")
logger.info(f"Processed: {company['name']}")

View File

@ -88,19 +88,53 @@ class CompanyMongoService:
with self.lock:
return list(self.collection.find({"yearly_results": {"$gt": {}}}))
def is_where_yearly_result_key_is_not_number(self, data: dict) -> bool:
    """Does the entry contain yearly_results keys which are not four-digit years?

    Args:
        data (dict): Entry from MongoDB; must contain a "yearly_results" mapping.

    Returns:
        bool: True if at least one yearly_results key is malformed.
    """
    # fullmatch instead of match("^...$"): "$" also matches right before a
    # trailing newline, so a key such as "2020\n" used to pass as a valid year.
    # A non-string key is reported as malformed rather than raising TypeError.
    return any(
        not isinstance(key, str) or re.fullmatch(r"[0-9]{4}", key) is None
        for key in data["yearly_results"]
    )
def is_self_referencing_auditors(self, data: dict) -> bool:
    """Does the entry contain yearly_results which are self-referencing?

    An entry is self-referencing when any auditor listed under any of its
    yearly results names the entry's own company (compared after stripping
    surrounding whitespace).

    Args:
        data (dict): Entry from MongoDB.

    Returns:
        bool: Is self referencing?
    """
    own_name = data.get("name")
    if not isinstance(own_name, str):
        # Without a usable company name nothing can reference it.
        return False
    # Strip once instead of once per auditor.
    own_name = own_name.strip()

    for result in (data.get("yearly_results") or {}).values():
        # This check runs on entries that are being screened for malformed
        # data, so a yearly result may lack the expected shape; skip such
        # results instead of aborting the whole scan.
        if not isinstance(result, dict):
            continue
        for auditor in result.get("auditors") or []:
            if not isinstance(auditor, dict):
                continue
            auditor_company = auditor.get("company")
            if isinstance(auditor_company, str) and auditor_company.strip() == own_name:
                return True
    return False
def get_where_malformed_yearly_results(self) -> list[dict]:
    """Finds all entries with malformed yearly_results.

    An entry counts as malformed when at least one check flags it: a
    yearly_results key that is not a year, or an auditor that references the
    company itself.

    Returns:
        list[dict]: List of companies; each flagged entry is added exactly
            once, in the order returned by get_where_yearly_results.
    """
    # The key check runs first: it only looks at the keys, so with the
    # short-circuiting any() below an entry with broken keys is flagged
    # before the auditor check has to walk its (possibly broken) values.
    checks = (
        self.is_where_yearly_result_key_is_not_number,
        self.is_self_referencing_auditors,
    )
    # TODO There should be a cleaner solution using pure MongoDB queries/aggregations
    return [
        entry
        for entry in self.get_where_yearly_results()
        if any(check(entry) for check in checks)
    ]
def insert(self, company: Company) -> InsertOneResult: