mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-25 12:32:34 +02:00
fix(data-extraction): Resolve self-referencing auditor companies
This commit is contained in:
parent
4058824f15
commit
b7d877ef81
@ -42,7 +42,11 @@ class Bundesanzeiger:
|
|||||||
df_data = df_data.loc[df_data.company == company_name]
|
df_data = df_data.loc[df_data.company == company_name]
|
||||||
|
|
||||||
# Add Auditor information
|
# Add Auditor information
|
||||||
df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors)
|
audits = []
|
||||||
|
for report in df_data["raw_report"]:
|
||||||
|
audit = self.extract_auditors(report, company_name)
|
||||||
|
audits.append(audit)
|
||||||
|
df_data["auditors"] = audits
|
||||||
|
|
||||||
# Add Financial information
|
# Add Financial information
|
||||||
df_data["financial_results"] = df_data.raw_report.apply(
|
df_data["financial_results"] = df_data.raw_report.apply(
|
||||||
@ -75,11 +79,12 @@ class Bundesanzeiger:
|
|||||||
return df_reports.drop(["name", "report", "type"], axis=1)
|
return df_reports.drop(["name", "report", "type"], axis=1)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def extract_auditor_company(report: str) -> str | None:
|
def extract_auditor_company(report: str, company_name: str) -> str | None:
|
||||||
"""Extract the name of an auditor company from the given yearly results report.
|
"""Extract the name of an auditor company from the given yearly results report.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
report (str): Yearly results report as raw string
|
report (str): Yearly results report as raw string
|
||||||
|
company_name (str): Name of the company the report originates from, used for filtering
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str | None: Name of the auditor company if found, otherwise None
|
str | None: Name of the auditor company if found, otherwise None
|
||||||
@ -89,19 +94,22 @@ class Bundesanzeiger:
|
|||||||
for elem in temp:
|
for elem in temp:
|
||||||
br = elem.findChildren("br")
|
br = elem.findChildren("br")
|
||||||
if len(br) > 0:
|
if len(br) > 0:
|
||||||
return elem.text.split("\n")[1].strip()
|
temp = elem.text.split("\n")[1].strip()
|
||||||
|
if temp != company_name:
|
||||||
|
return temp
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def extract_auditors(self, report: str) -> list:
|
def extract_auditors(self, report: str, company_name: str) -> list:
|
||||||
"""Find the list of auditors involved in the given yearly results report.
|
"""Find the list of auditors involved in the given yearly results report.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
report (str): Yearly results report as raw string
|
report (str): Yearly results report as raw string
|
||||||
|
company_name (str): Name of the company the report originates from, used for filtering
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list[Auditor]: List of Auditors found in the given report
|
list[Auditor]: List of Auditors found in the given report
|
||||||
"""
|
"""
|
||||||
auditor_company = self.extract_auditor_company(report)
|
auditor_company = self.extract_auditor_company(report, company_name)
|
||||||
auditor_regex = r"([a-z A-ZÄäÜüÖö,.'-]+), Wirtschaftsprüfer(in)?"
|
auditor_regex = r"([a-z A-ZÄäÜüÖö,.'-]+), Wirtschaftsprüfer(in)?"
|
||||||
hits = re.findall(auditor_regex, report)
|
hits = re.findall(auditor_regex, report)
|
||||||
return [Auditor(hit[0].strip(), auditor_company) for hit in hits]
|
return [Auditor(hit[0].strip(), auditor_company) for hit in hits]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user