mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-06-22 09:43:55 +02:00
fix(data-extraction): self-referencing auditors (#224)
fix(data-extraction): self-referencing auditors
This commit is contained in:
@ -15,7 +15,7 @@ if __name__ == "__main__":
|
|||||||
company_service = CompanyMongoService(mongo_connector)
|
company_service = CompanyMongoService(mongo_connector)
|
||||||
|
|
||||||
entries = company_service.get_where_malformed_yearly_results()
|
entries = company_service.get_where_malformed_yearly_results()
|
||||||
|
logger.info(f"Num. entries: {len(entries)}")
|
||||||
for company in entries:
|
for company in entries:
|
||||||
work(company, company_service)
|
work(company, company_service)
|
||||||
logger.info(f"Processed {company['name']}")
|
logger.info(f"Processed: {company['name']}")
|
||||||
|
@ -42,7 +42,11 @@ class Bundesanzeiger:
|
|||||||
df_data = df_data.loc[df_data.company == company_name]
|
df_data = df_data.loc[df_data.company == company_name]
|
||||||
|
|
||||||
# Add Auditor information
|
# Add Auditor information
|
||||||
df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors)
|
audits = []
|
||||||
|
for report in df_data["raw_report"]:
|
||||||
|
audit = self.extract_auditors(report, company_name)
|
||||||
|
audits.append(audit)
|
||||||
|
df_data["auditors"] = audits
|
||||||
|
|
||||||
# Add Financial information
|
# Add Financial information
|
||||||
df_data["financial_results"] = df_data.raw_report.apply(
|
df_data["financial_results"] = df_data.raw_report.apply(
|
||||||
@ -75,11 +79,12 @@ class Bundesanzeiger:
|
|||||||
return df_reports.drop(["name", "report", "type"], axis=1)
|
return df_reports.drop(["name", "report", "type"], axis=1)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def extract_auditor_company(report: str) -> str | None:
|
def extract_auditor_company(report: str, company_name: str) -> str | None:
|
||||||
"""Extract the name of an auditor company from the given yearly results report.
|
"""Extract the name of an auditor company from the given yearly results report.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
report (str): Yearly results report as raw string
|
report (str): Yearly results report as raw string
|
||||||
|
company_name (str): Name of the company the report originates from, used for filtering
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str | None: Name of the auditor company if found, otherwise None
|
str | None: Name of the auditor company if found, otherwise None
|
||||||
@ -89,25 +94,25 @@ class Bundesanzeiger:
|
|||||||
for elem in temp:
|
for elem in temp:
|
||||||
br = elem.findChildren("br")
|
br = elem.findChildren("br")
|
||||||
if len(br) > 0:
|
if len(br) > 0:
|
||||||
return elem.text.split("\n")[1].strip()
|
temp = elem.text.split("\n")[1].strip()
|
||||||
|
if temp != company_name:
|
||||||
|
return temp
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def extract_auditors(self, report: str) -> list:
|
def extract_auditors(self, report: str, company_name: str) -> list:
|
||||||
"""Find the list of auditors involved in the given yearly results report.
|
"""Find the list of auditors involved in the given yearly results report.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
report (str): Yearly results report as raw string
|
report (str): Yearly results report as raw string
|
||||||
|
company_name (str): Name of the company the report originates from, used for filtering
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list[Auditor]: List of Auditors found in the given report
|
list[Auditor]: List of Auditors found in the given report
|
||||||
"""
|
"""
|
||||||
auditor_company = self.extract_auditor_company(report)
|
auditor_company = self.extract_auditor_company(report, company_name)
|
||||||
auditor_regex = r"[a-z A-Z,.'-]+, Wirtschaftsprüfer"
|
auditor_regex = r"([a-z A-ZÄäÜüÖö,.'-]+), Wirtschaftsprüfer(in)?"
|
||||||
hits = re.findall(auditor_regex, report)
|
hits = re.findall(auditor_regex, report)
|
||||||
return [
|
return [Auditor(hit[0].strip(), auditor_company) for hit in hits]
|
||||||
Auditor(hit.replace(", Wirtschaftsprüfer", "").lstrip(), auditor_company)
|
|
||||||
for hit in hits
|
|
||||||
]
|
|
||||||
|
|
||||||
def __extract_kpis__(self, report: str) -> dict:
|
def __extract_kpis__(self, report: str) -> dict:
|
||||||
"""Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd.
|
"""Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd.
|
||||||
|
@ -88,18 +88,52 @@ class CompanyMongoService:
|
|||||||
with self.lock:
|
with self.lock:
|
||||||
return list(self.collection.find({"yearly_results": {"$gt": {}}}))
|
return list(self.collection.find({"yearly_results": {"$gt": {}}}))
|
||||||
|
|
||||||
|
def is_where_yearly_result_key_is_not_number(self, data: dict) -> bool:
    """Check whether any key of the entry's yearly_results is not a four-digit year.

    Args:
        data (dict): Entry from MongoDB; its "yearly_results" keys are inspected.

    Returns:
        bool: True if at least one yearly_results key is malformed, otherwise False.
    """
    # fullmatch instead of match with "^...$": "$" also matches right before a
    # trailing newline, which would let a key with a trailing line break pass
    # as a valid year.
    return any(
        re.fullmatch(r"[0-9]{4}", key) is None for key in data["yearly_results"]
    )
|
||||||
|
|
||||||
|
def is_self_referencing_auditors(self, data: dict) -> bool:
    """Check whether any yearly result lists the company itself as auditor company.

    Args:
        data (dict): Entry from MongoDB; its "name" and "yearly_results" are read.

    Returns:
        bool: True if an auditor's company equals the entry's own name
            (surrounding whitespace ignored), otherwise False.
    """
    for result in data["yearly_results"].values():
        # Entries inspected here may be malformed in other ways. A result that
        # is not a mapping, or that has no auditors, cannot be self-referencing,
        # so it is skipped instead of raising.
        if not isinstance(result, dict):
            continue
        for auditor in result.get("auditors") or []:
            company = auditor.get("company")
            # The name is only touched once a company is present, as before.
            if company is not None and company.strip() == data["name"].strip():
                return True
    return False
|
||||||
|
|
||||||
def get_where_malformed_yearly_results(self) -> list[dict]:
|
def get_where_malformed_yearly_results(self) -> list[dict]:
|
||||||
"""Finds all entries with malformed yearly_results (e.g., key is not a year).
|
"""Finds all entries with malformed yearly_results (e.g., key is not a year, self-referencing).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list[dict]: List of companies
|
list[dict]: List of companies
|
||||||
"""
|
"""
|
||||||
preliminary_results = self.get_where_yearly_results()
|
preliminary_results = self.get_where_yearly_results()
|
||||||
|
|
||||||
|
filters = [
|
||||||
|
self.is_self_referencing_auditors,
|
||||||
|
self.is_where_yearly_result_key_is_not_number,
|
||||||
|
]
|
||||||
malformed_entries = []
|
malformed_entries = []
|
||||||
# TODO There should be a cleaner solution using pure MongoDB queries/aggregations
|
# TODO There should be a cleaner solution using pure MongoDB queries/aggregations
|
||||||
for entry in preliminary_results:
|
for entry in preliminary_results:
|
||||||
for key in entry["yearly_results"]:
|
valid = [filter_func(entry) for filter_func in filters]
|
||||||
if not re.match(r"^[0-9]{4}$", key):
|
if any(valid):
|
||||||
malformed_entries.append(entry)
|
malformed_entries.append(entry)
|
||||||
return malformed_entries
|
return malformed_entries
|
||||||
|
|
||||||
|
@ -12,14 +12,36 @@ from aki_prj23_transparenzregister.utils.enum_types import FinancialKPIEnum
|
|||||||
def test_extract_auditor_company_no_hits() -> None:
|
def test_extract_auditor_company_no_hits() -> None:
|
||||||
input_data = """
|
input_data = """
|
||||||
<b>
|
<b>
|
||||||
|
Mega GmbH
|
||||||
Nothing to see here
|
Nothing to see here
|
||||||
</b>
|
</b>
|
||||||
"""
|
"""
|
||||||
ba = Bundesanzeiger()
|
ba = Bundesanzeiger()
|
||||||
result = ba.extract_auditor_company(input_data)
|
result = ba.extract_auditor_company(input_data, "Mega GmbH")
|
||||||
assert result is None
|
assert result is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_auditor_company_self_referencing() -> None:
|
||||||
|
company = "Mega GmbH"
|
||||||
|
auditor_company = "Super AG"
|
||||||
|
|
||||||
|
input_data = f"""
|
||||||
|
<b>
|
||||||
|
{company}
|
||||||
|
<br>
|
||||||
|
Nothing to see here
|
||||||
|
</b>
|
||||||
|
<b>
|
||||||
|
{auditor_company}
|
||||||
|
<br>
|
||||||
|
Nothing to see here
|
||||||
|
</b>
|
||||||
|
"""
|
||||||
|
ba = Bundesanzeiger()
|
||||||
|
result = ba.extract_auditor_company(input_data, company)
|
||||||
|
assert result == auditor_company
|
||||||
|
|
||||||
|
|
||||||
def test_extract_auditor_company() -> None:
|
def test_extract_auditor_company() -> None:
|
||||||
company_name = "Korrupte Wirtschaftsprüfer GmbH & Co. KG"
|
company_name = "Korrupte Wirtschaftsprüfer GmbH & Co. KG"
|
||||||
input_data = f"""
|
input_data = f"""
|
||||||
@ -30,7 +52,7 @@ def test_extract_auditor_company() -> None:
|
|||||||
</b>
|
</b>
|
||||||
"""
|
"""
|
||||||
ba = Bundesanzeiger()
|
ba = Bundesanzeiger()
|
||||||
result = ba.extract_auditor_company(input_data)
|
result = ba.extract_auditor_company(input_data, "Super AG")
|
||||||
assert result == company_name
|
assert result == company_name
|
||||||
|
|
||||||
|
|
||||||
|
@ -172,19 +172,28 @@ def test_get_where_malformed_yearly_results(
|
|||||||
"_id": "abc",
|
"_id": "abc",
|
||||||
"name": "Fielmann",
|
"name": "Fielmann",
|
||||||
"Hotel?": "Trivago",
|
"Hotel?": "Trivago",
|
||||||
"yearly_results": {"Vor Aeonen": 42, "2022": 4711},
|
"yearly_results": {
|
||||||
|
"Vor Aeonen": {"auditors": [], 42: 1},
|
||||||
|
"2022": {"auditors": [], 42: 1},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"_id": "abc",
|
"_id": "abc",
|
||||||
"name": "Fielmann",
|
"name": "Fielmann",
|
||||||
"Hotel?": "Trivago",
|
"Hotel?": "Trivago",
|
||||||
"yearly_results": {"1998": 42, "2022": 4711},
|
"yearly_results": {
|
||||||
|
"1998": {"auditors": [], 42: 1},
|
||||||
|
"2022": {"auditors": [], 42: 1},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"_id": "abc",
|
"_id": "abc",
|
||||||
"name": "Fielmann",
|
"name": "Fielmann",
|
||||||
"Hotel?": "Trivago",
|
"Hotel?": "Trivago",
|
||||||
"yearly_results": {"19": 42, "2022": 4711},
|
"yearly_results": {
|
||||||
|
"19": {"auditors": [], 42: 1},
|
||||||
|
"2022": {"auditors": [], 42: 1},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
mock_collection.find.return_value = mock_result
|
mock_collection.find.return_value = mock_result
|
||||||
|
Reference in New Issue
Block a user