mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-25 00:12:35 +02:00
fix(data-extraction): Resolve self referencing auditor companies
This commit is contained in:
parent
4058824f15
commit
b7d877ef81
@@ -42,7 +42,11 @@ class Bundesanzeiger:
|
||||
df_data = df_data.loc[df_data.company == company_name]
|
||||
|
||||
# Add Auditor information
|
||||
df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors)
|
||||
audits = []
|
||||
for report in df_data["raw_report"]:
|
||||
audit = self.extract_auditors(report, company_name)
|
||||
audits.append(audit)
|
||||
df_data["auditors"] = audits
|
||||
|
||||
# Add Financial information
|
||||
df_data["financial_results"] = df_data.raw_report.apply(
|
||||
@@ -75,11 +79,12 @@ class Bundesanzeiger:
|
||||
return df_reports.drop(["name", "report", "type"], axis=1)
|
||||
|
||||
@staticmethod
|
||||
def extract_auditor_company(report: str) -> str | None:
|
||||
def extract_auditor_company(report: str, company_name: str) -> str | None:
|
||||
"""Extract the name of an auditor company from the given yearly results report.
|
||||
|
||||
Args:
|
||||
report (str): Yearly results report as raw string
|
||||
company_name (str): Name of the company the report originates from, used for filtering
|
||||
|
||||
Returns:
|
||||
str | None: Name of the auditor company if found, otherwise None
|
||||
@@ -89,19 +94,22 @@ class Bundesanzeiger:
|
||||
for elem in temp:
|
||||
br = elem.findChildren("br")
|
||||
if len(br) > 0:
|
||||
return elem.text.split("\n")[1].strip()
|
||||
temp = elem.text.split("\n")[1].strip()
|
||||
if temp != company_name:
|
||||
return temp
|
||||
return None
|
||||
|
||||
def extract_auditors(self, report: str) -> list:
|
||||
def extract_auditors(self, report: str, company_name: str) -> list:
|
||||
"""Find the list of auditors involved in the given yearly results report.
|
||||
|
||||
Args:
|
||||
report (str): Yearly results report as raw string
|
||||
company_name (str): Name of the company the report originates from, used for filtering
|
||||
|
||||
Returns:
|
||||
list[Auditor]: List of Auditors found in the given report
|
||||
"""
|
||||
auditor_company = self.extract_auditor_company(report)
|
||||
auditor_company = self.extract_auditor_company(report, company_name)
|
||||
auditor_regex = r"([a-z A-ZÄäÜüÖö,.'-]+), Wirtschaftsprüfer(in)?"
|
||||
hits = re.findall(auditor_regex, report)
|
||||
return [Auditor(hit[0].strip(), auditor_company) for hit in hits]
|
||||
|
Loading…
x
Reference in New Issue
Block a user