fix(data-extraction): Resolve self-referencing auditor companies

This commit is contained in:
TrisNol 2023-10-17 18:08:20 +02:00
parent 4058824f15
commit b7d877ef81

View File

@ -42,7 +42,11 @@ class Bundesanzeiger:
df_data = df_data.loc[df_data.company == company_name]
# Add Auditor information
df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors)
audits = []
for report in df_data["raw_report"]:
audit = self.extract_auditors(report, company_name)
audits.append(audit)
df_data["auditors"] = audits
# Add Financial information
df_data["financial_results"] = df_data.raw_report.apply(
@ -75,11 +79,12 @@ class Bundesanzeiger:
return df_reports.drop(["name", "report", "type"], axis=1)
@staticmethod
def extract_auditor_company(report: str) -> str | None:
def extract_auditor_company(report: str, company_name: str) -> str | None:
"""Extract the name of an auditor company from the given yearly results report.
Args:
report (str): Yearly results report as raw string
company_name (str): Name of the company the report originates from, used for filtering
Returns:
str | None: Name of the auditor company if found, otherwise None
@ -89,19 +94,22 @@ class Bundesanzeiger:
for elem in temp:
br = elem.findChildren("br")
if len(br) > 0:
return elem.text.split("\n")[1].strip()
temp = elem.text.split("\n")[1].strip()
if temp != company_name:
return temp
return None
def extract_auditors(self, report: str, company_name: str) -> list:
    """Collect every auditor named in the given yearly results report.

    Args:
        report (str): Yearly results report as raw string
        company_name (str): Name of the company the report originates from; forwarded to
            extract_auditor_company, which uses it for filtering

    Returns:
        list[Auditor]: One Auditor per match, in order of appearance, each paired with the
            auditor company resolved for this report (None if none was found)
    """
    firm = self.extract_auditor_company(report, company_name)
    # A name is the run of letters, spaces and , . ' - characters that directly
    # precedes the ", Wirtschaftsprüfer" / ", Wirtschaftsprüferin" title.
    name_pattern = r"([a-z A-ZÄäÜüÖö,.'-]+), Wirtschaftsprüfer(in)?"
    found = []
    for match in re.finditer(name_pattern, report):
        # Group 1 is the name; the optional "(in)" suffix group is not needed.
        found.append(Auditor(match.group(1).strip(), firm))
    return found