mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-25 20:32:34 +02:00
fix(data-extraction): Fix data on staging DB using custom function
This commit is contained in:
parent
600039207d
commit
35d54301e8
@ -15,7 +15,7 @@ if __name__ == "__main__":
|
|||||||
company_service = CompanyMongoService(mongo_connector)
|
company_service = CompanyMongoService(mongo_connector)
|
||||||
|
|
||||||
entries = company_service.get_where_malformed_yearly_results()
|
entries = company_service.get_where_malformed_yearly_results()
|
||||||
|
logger.info(f"Num. entries: {len(entries)}")
|
||||||
for company in entries:
|
for company in entries:
|
||||||
work(company, company_service)
|
work(company, company_service)
|
||||||
logger.info(f"Processed {company['name']}")
|
logger.info(f"Processed: {company['name']}")
|
||||||
|
@ -88,18 +88,52 @@ class CompanyMongoService:
|
|||||||
with self.lock:
|
with self.lock:
|
||||||
return list(self.collection.find({"yearly_results": {"$gt": {}}}))
|
return list(self.collection.find({"yearly_results": {"$gt": {}}}))
|
||||||
|
|
||||||
|
def is_where_yearly_result_key_is_not_number(self, data: dict) -> bool:
    """Check whether any key of the entry's yearly_results is not a four-digit year.

    Args:
        data (dict): Entry from MongoDB

    Returns:
        bool: True if at least one yearly_results key is malformed.
    """
    # fullmatch instead of match with "$": "$" also matches right before a
    # trailing newline, so a key with a dangling line break would slip through.
    return any(
        not (isinstance(key, str) and re.fullmatch(r"[0-9]{4}", key))
        for key in data["yearly_results"]
    )
|
||||||
|
|
||||||
|
def is_self_referencing_auditors(self, data: dict) -> bool:
    """Check whether any auditor listed in yearly_results is the company itself.

    Names are compared after stripping surrounding whitespace.

    Args:
        data (dict): Entry from MongoDB

    Returns:
        bool: True if an auditor's company equals the entry's own name.
    """
    own_name = data.get("name")
    if not isinstance(own_name, str):
        # Without a usable name there is nothing an auditor could match
        return False
    own_name = own_name.strip()

    for yearly_result in data["yearly_results"].values():
        # This check runs on potentially malformed entries, so tolerate
        # yearly results that are not dicts or carry no auditor list.
        if not isinstance(yearly_result, dict):
            continue
        for auditor in yearly_result.get("auditors") or []:
            company = auditor.get("company") if isinstance(auditor, dict) else None
            if isinstance(company, str) and company.strip() == own_name:
                return True
    return False
|
||||||
|
|
||||||
def get_where_malformed_yearly_results(self) -> list[dict]:
    """Finds all entries with malformed yearly_results (e.g., key is not a year, self-referencing).

    Each entry is reported at most once, even if several checks flag it.

    Returns:
        list[dict]: List of companies
    """
    # Each check returns True when the entry is malformed in its own way
    malformed_checks = (
        self.is_self_referencing_auditors,
        self.is_where_yearly_result_key_is_not_number,
    )
    # TODO There should be a cleaner solution using pure MongoDB queries/aggregations
    return [
        entry
        for entry in self.get_where_yearly_results()
        # Generator lets any() stop at the first check that flags the entry
        if any(check(entry) for check in malformed_checks)
    ]
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user