From 4058824f154d43ee5692bdd8ebd4bdb17335bc60 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Tue, 17 Oct 2023 17:56:26 +0200 Subject: [PATCH] fix(data-extraction): Resolve regex issue in detecting auditors --- .../utils/data_extraction/bundesanzeiger.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py index 0a4a4e9..f59c889 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py @@ -102,12 +102,9 @@ class Bundesanzeiger: list[Auditor]: List of Auditors found in the given report """ auditor_company = self.extract_auditor_company(report) - auditor_regex = r"[a-z A-Z,.'-]+, Wirtschaftsprüfer" + auditor_regex = r"([a-z A-ZÄäÜüÖö,.'-]+), Wirtschaftsprüfer(in)?" hits = re.findall(auditor_regex, report) - return [ - Auditor(hit.replace(", Wirtschaftsprüfer", "").lstrip(), auditor_company) - for hit in hits - ] + return [Auditor(hit[0].strip(), auditor_company) for hit in hits] def __extract_kpis__(self, report: str) -> dict: """Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd.