diff --git a/src/aki_prj23_transparenzregister/apps/fix_company_financials.py b/src/aki_prj23_transparenzregister/apps/fix_company_financials.py
index d39fabc..8d902b9 100644
--- a/src/aki_prj23_transparenzregister/apps/fix_company_financials.py
+++ b/src/aki_prj23_transparenzregister/apps/fix_company_financials.py
@@ -15,7 +15,7 @@ if __name__ == "__main__":
company_service = CompanyMongoService(mongo_connector)
entries = company_service.get_where_malformed_yearly_results()
-
+ logger.info(f"Num. entries: {len(entries)}")
for company in entries:
work(company, company_service)
- logger.info(f"Processed {company['name']}")
+ logger.info(f"Processed: {company['name']}")
diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
index 0a4a4e9..b19398d 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
@@ -42,7 +42,11 @@ class Bundesanzeiger:
df_data = df_data.loc[df_data.company == company_name]
# Add Auditor information
- df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors)
+ audits = []
+ for report in df_data["raw_report"]:
+ audit = self.extract_auditors(report, company_name)
+ audits.append(audit)
+ df_data["auditors"] = audits
# Add Financial information
df_data["financial_results"] = df_data.raw_report.apply(
@@ -75,11 +79,12 @@ class Bundesanzeiger:
return df_reports.drop(["name", "report", "type"], axis=1)
@staticmethod
- def extract_auditor_company(report: str) -> str | None:
+ def extract_auditor_company(report: str, company_name: str) -> str | None:
"""Extract the name of an auditor company from the given yearly results report.
Args:
report (str): Yearly results report as raw string
+ company_name (str): Name of the company the report originates from, used for filtering
Returns:
str | None: Name of the auditor company if found, otherwise None
@@ -89,25 +94,25 @@ class Bundesanzeiger:
for elem in temp:
br = elem.findChildren("br")
if len(br) > 0:
- return elem.text.split("\n")[1].strip()
+ temp = elem.text.split("\n")[1].strip()
+ if temp != company_name:
+ return temp
return None
- def extract_auditors(self, report: str) -> list:
+ def extract_auditors(self, report: str, company_name: str) -> list:
"""Find the list of auditors involved in the given yearly results report.
Args:
report (str): Yearly results report as raw string
+ company_name (str): Name of the company the report originates from, used for filtering
Returns:
list[Auditor]: List of Auditors found in the given report
"""
- auditor_company = self.extract_auditor_company(report)
- auditor_regex = r"[a-z A-Z,.'-]+, Wirtschaftsprüfer"
+ auditor_company = self.extract_auditor_company(report, company_name)
+ auditor_regex = r"([a-z A-ZÄäÜüÖö,.'-]+), Wirtschaftsprüfer(in)?"
hits = re.findall(auditor_regex, report)
- return [
- Auditor(hit.replace(", Wirtschaftsprüfer", "").lstrip(), auditor_company)
- for hit in hits
- ]
+ return [Auditor(hit[0].strip(), auditor_company) for hit in hits]
def __extract_kpis__(self, report: str) -> dict:
"""Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd.
diff --git a/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py b/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py
index d84ffdb..d175be2 100644
--- a/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py
+++ b/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py
@@ -88,19 +88,53 @@ class CompanyMongoService:
with self.lock:
return list(self.collection.find({"yearly_results": {"$gt": {}}}))
+ def is_where_yearly_result_key_is_not_number(self, data: dict) -> bool:
+ """Does the entry contain yearly_results which are malformed as non-numbers?
+
+ Args:
+ data (dict): Entry from MongoDB
+
+ Returns:
+ bool: Is yearly_results key malformed?
+ """
+ return any(not re.match("^[0-9]{4}$", key) for key in data["yearly_results"])
+
+ def is_self_referencing_auditors(self, data: dict) -> bool:
+ """Does the entry contain yearly_results which are self-referencing?
+
+ Args:
+ data (dict): Entry from MongoDB
+
+ Returns:
+ bool: Is self referencing?
+ """
+ for key in data["yearly_results"]:
+ for auditor in data["yearly_results"][key]["auditors"]:
+ if (
+ auditor["company"] is not None
+ and auditor["company"].strip() == data["name"].strip()
+ ):
+ return True
+ return False
+
def get_where_malformed_yearly_results(self) -> list[dict]:
- """Finds all entries with malformed yearly_results (e.g., key is not a year).
+ """Finds all entries with malformed yearly_results (e.g., key is not a year, self-referencing).
Returns:
list[dict]: List of companies
"""
preliminary_results = self.get_where_yearly_results()
+
+ filters = [
+ self.is_self_referencing_auditors,
+ self.is_where_yearly_result_key_is_not_number,
+ ]
malformed_entries = []
# TODO There should be a cleaner solution using pure MongoDB queries/aggregations
for entry in preliminary_results:
- for key in entry["yearly_results"]:
- if not re.match(r"^[0-9]{4}$", key):
- malformed_entries.append(entry)
+ valid = [filter_func(entry) for filter_func in filters]
+ if any(valid):
+ malformed_entries.append(entry)
return malformed_entries
def insert(self, company: Company) -> InsertOneResult:
diff --git a/tests/utils/data_extraction/bundesanzeiger_test.py b/tests/utils/data_extraction/bundesanzeiger_test.py
index d764a52..73bbbd9 100644
--- a/tests/utils/data_extraction/bundesanzeiger_test.py
+++ b/tests/utils/data_extraction/bundesanzeiger_test.py
@@ -12,14 +12,36 @@ from aki_prj23_transparenzregister.utils.enum_types import FinancialKPIEnum
def test_extract_auditor_company_no_hits() -> None:
input_data = """
+ Mega GmbH
Nothing to see here
"""
ba = Bundesanzeiger()
- result = ba.extract_auditor_company(input_data)
+ result = ba.extract_auditor_company(input_data, "Mega GmbH")
assert result is None
+def test_extract_auditor_company_self_referencing() -> None:
+ company = "Mega GmbH"
+ auditor_company = "Super AG"
+
+ input_data = f"""
+
+ {company}
+
+ Nothing to see here
+
+
+ {auditor_company}
+
+ Nothing to see here
+
+ """
+ ba = Bundesanzeiger()
+ result = ba.extract_auditor_company(input_data, company)
+ assert result == auditor_company
+
+
def test_extract_auditor_company() -> None:
company_name = "Korrupte Wirtschaftsprüfer GmbH & Co. KG"
input_data = f"""
@@ -30,7 +52,7 @@ def test_extract_auditor_company() -> None:
"""
ba = Bundesanzeiger()
- result = ba.extract_auditor_company(input_data)
+ result = ba.extract_auditor_company(input_data, "Super AG")
assert result == company_name
diff --git a/tests/utils/mongo/company_mongo_service_test.py b/tests/utils/mongo/company_mongo_service_test.py
index 52b58af..434198f 100644
--- a/tests/utils/mongo/company_mongo_service_test.py
+++ b/tests/utils/mongo/company_mongo_service_test.py
@@ -172,19 +172,28 @@ def test_get_where_malformed_yearly_results(
"_id": "abc",
"name": "Fielmann",
"Hotel?": "Trivago",
- "yearly_results": {"Vor Aeonen": 42, "2022": 4711},
+ "yearly_results": {
+ "Vor Aeonen": {"auditors": [], 42: 1},
+ "2022": {"auditors": [], 42: 1},
+ },
},
{
"_id": "abc",
"name": "Fielmann",
"Hotel?": "Trivago",
- "yearly_results": {"1998": 42, "2022": 4711},
+ "yearly_results": {
+ "1998": {"auditors": [], 42: 1},
+ "2022": {"auditors": [], 42: 1},
+ },
},
{
"_id": "abc",
"name": "Fielmann",
"Hotel?": "Trivago",
- "yearly_results": {"19": 42, "2022": 4711},
+ "yearly_results": {
+ "19": {"auditors": [], 42: 1},
+ "2022": {"auditors": [], 42: 1},
+ },
},
]
mock_collection.find.return_value = mock_result