mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-24 13:42:34 +02:00
feat(data-extraction): Minimal Financial data fetch
This commit is contained in:
parent
1e15656028
commit
e8e354932c
@ -24,21 +24,43 @@ class Bundesanzeiger:
|
||||
Returns:
|
||||
pd.DataFrame: Result
|
||||
"""
|
||||
# Get Bundesanzeiger entries for company
|
||||
reports = self.__ba.get_reports(company_name)
|
||||
# Transform to list of data
|
||||
report_contents = []
|
||||
for key in reports:
|
||||
report_contents.append(reports[key])
|
||||
|
||||
# Transform to DataFrame and filter out irrelevant entries
|
||||
df_data = pd.DataFrame(report_contents)
|
||||
df_data["type"] = df_data.name.apply(lambda name: name.split(" ")[0])
|
||||
df_data = df_data.loc[df_data.type == "Jahresabschluss"]
|
||||
df_data["jahr"] = df_data.name.apply(
|
||||
df_data = self.filter_reports(df_data)
|
||||
|
||||
# Add Auditor information
|
||||
df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors)
|
||||
|
||||
# Add Financial information
|
||||
df_data["financial_results"] = df_data.raw_report.apply(
|
||||
self.extract_financial_results
|
||||
)
|
||||
|
||||
# Remove irrelevant columns
|
||||
return df_data.drop(["raw_report"], axis=1)
|
||||
|
||||
def filter_reports(self, df_reports: pd.DataFrame) -> pd.DataFrame:
    """Return only reports of type `Jahresabschluss` and extract the year of each report.

    The report type is the first space-separated token of the ``name`` column.
    The year is the text after the last ``.`` in the last space-separated token
    of ``name``. The DataFrame passed in is not modified.

    Args:
        df_reports (pd.DataFrame): DataFrame containing list of reports; the
            columns ``name`` and ``report`` are required

    Returns:
        pd.DataFrame: Filtered and pruned copy with an added ``jahr`` column and
            without the ``name``, ``report`` and ``type`` columns
    """
    is_annual_report = (
        df_reports["name"].apply(lambda name: name.split(" ")[0])
        == "Jahresabschluss"
    )
    # Work on an explicit copy: adding "jahr" must neither write through to the
    # caller's frame nor trigger pandas' SettingWithCopyWarning.
    df_filtered = df_reports.loc[is_annual_report].copy()
    df_filtered["jahr"] = df_filtered["name"].apply(
        lambda name: name.split(" ")[-1].split(".")[-1]
    )
    df_filtered = df_filtered.drop(["name", "report"], axis=1)
    # A "type" column is never part of the result, even if the input carried one.
    return df_filtered.drop(columns=["type"], errors="ignore")
|
||||
|
||||
def extract_auditor_company(self, report: str) -> str | None:
|
||||
"""Extract the name of an auditor company from the given yearly results report.
|
||||
@ -73,3 +95,83 @@ class Bundesanzeiger:
|
||||
Auditor(hit.replace(", Wirtschaftsprüfer", "").lstrip(), auditor_company)
|
||||
for hit in hits
|
||||
]
|
||||
|
||||
def __extract_kpis__(self, report: str) -> dict:
    """Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd.

    Searches the report text for a fixed set of English/German KPI keywords and
    parses the number that directly follows the first hit of each keyword.

    Numbers are read in German notation (``.`` as thousands separator, ``,`` as
    decimal separator). A trailing ``m`` or ``b`` scales the value by one
    million or one billion respectively.

    Args:
        report (str): The yearly report as a parsed string

    Returns:
        dict: Maps KPI name (e.g. ``"revenue"``) to its value as float. KPIs
            without a hit or with an unparsable number are left out.
    """
    # Scale factors for the optional one-letter suffix of a captured number
    suffix_factors = {"m": 1_000_000, "b": 1_000_000_000}

    # Keyword alternatives per KPI; group 1 captures the number incl. suffix
    kpi_patterns = {
        "revenue": r"(?:revenue|umsatz|erlöse)[:\s]*([\d,.]+[mmb]?)",
        "net_income": r"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\s]*([\d,.]+[mmb]?)",
        "ebit": r"(?:ebit|operating income)[:\s]*([\d,.]+[mmb]?)",
        "ebitda": r"(?:ebitda)[:\s]*([\d,.]+[mmb]?)",
        "gross_profit": r"(?:gross profit|bruttogewinn)[:\s]*([\d,.]+[mmb]?)",
        "operating_profit": r"(?:operating profit|betriebsgewinn)[:\s]*([\d,.]+[mmb]?)",
        "assets": r"(?:total assets|bilanzsumme)[:\s]*([\d,.]+[mmb]?)",
        "liabilities": r"(?:total liabilities|gesamtverbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
        "equity": r"(?:shareholders'? equity|eigenkapital)[:\s]*([\d,.]+[mmb]?)",
        "current_assets": r"(?:current assets|umlaufvermögen)[:\s]*([\d,.]+[mmb]?)",
        "current_liabilities": r"(?:current liabilities|kurzfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
        "long_term_debt": r"(?:long[-\s]?term debt|langfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
        "short_term_debt": r"(?:short[-\s]?term debt|kurzfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
        "cash_and_cash_equivalents": r"(?:cash (?:and cash equivalents)?|barmittel)[:\s]*([\d,.]+[mmb]?)",
        "dividends": r"(?:dividends?|dividende)[:\s]*([\d,.]+[mmb]?)",
        "cash_flow": r"(?:cash flow|cashflow|cash flow from operating activities)[:\s]*([\d,.]+[mmb]?)",
    }
    search_flags = re.IGNORECASE | re.UNICODE

    extracted = {}
    for kpi_name, regex in kpi_patterns.items():
        hit = re.search(regex, report, flags=search_flags)
        if hit is None:
            continue

        raw_number = hit.group(1)
        if not raw_number:
            continue

        # Split off the scale suffix before touching the separators
        factor = suffix_factors.get(raw_number[-1].lower(), 1)
        digits = raw_number[:-1] if factor != 1 else raw_number

        # German notation -> Python float literal
        normalized = digits.replace(".", "").replace(",", ".").strip()
        try:
            amount = float(normalized) * factor
        except ValueError:
            # e.g. a lone separator was captured; skip this KPI
            continue
        extracted[kpi_name] = amount
    return extracted
|
||||
|
||||
def extract_financial_results(self, report: str) -> dict:
    """Strip the markup from a report and collect the KPIs found in its text.

    Args:
        report (str): Report markup to be analyzed (parsed with ``html.parser``)

    Returns:
        dict: Result of ``__extract_kpis__`` for the report's plain text
    """
    soup = BeautifulSoup(report, features="html.parser")
    # Put the whole text on a single line before running the KPI search
    single_line_text = soup.get_text().replace("\n", " ")
    return self.__extract_kpis__(single_line_text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual run when executed as a script: fetch the information for one
    # example company; the result is discarded.
    Bundesanzeiger().get_information("Atos IT-Dienstleistung und Beratung GmbH")
|
||||
|
Loading…
x
Reference in New Issue
Block a user