feat(data-extraction): Minimal Financial data fetch

This commit is contained in:
TrisNol 2023-08-18 16:44:49 +02:00
parent 1e15656028
commit e8e354932c

View File

@ -24,21 +24,43 @@ class Bundesanzeiger:
Returns:
pd.DataFrame: Result
"""
# Get Bundesanzeiger entries for company
reports = self.__ba.get_reports(company_name)
# Transform to list of data
report_contents = []
for key in reports:
report_contents.append(reports[key])
# Transform to DataFrame and filter out irrelevant entries
df_data = pd.DataFrame(report_contents)
df_data["type"] = df_data.name.apply(lambda name: name.split(" ")[0])
df_data = df_data.loc[df_data.type == "Jahresabschluss"]
df_data["jahr"] = df_data.name.apply(
df_data = self.filter_reports(df_data)
# Add Auditor information
df_data["auditors"] = df_data.raw_report.apply(self.extract_auditors)
# Add Financial information
df_data["financial_results"] = df_data.raw_report.apply(
self.extract_financial_results
)
# Remove irrelevant columns
return df_data.drop(["raw_report"], axis=1)
def filter_reports(self, df_reports: pd.DataFrame) -> pd.DataFrame:
    """Return only reports of type `Jahresabschluss` and extract the year of each report.

    The report type is the first space-separated token of the ``name`` column.
    The year is whatever follows the last ``.`` in the final space-separated
    token of ``name`` (e.g. ``"... zum 31.12.2020"`` yields ``"2020"``).
    The DataFrame passed in is left untouched; a new one is returned.

    Args:
        df_reports (pd.DataFrame): DataFrame containing list of reports. The
            columns ``name`` and ``report`` must be present.

    Returns:
        pd.DataFrame: Filtered and pruned DataFrame. The columns ``name``,
            ``report`` and ``type`` are removed, ``jahr`` (a string) is added.
    """
    report_names = df_reports["name"]
    report_types = report_names.apply(lambda name: name.split(" ")[0])

    # Copy the selection so that adding a column neither writes through to the
    # caller's DataFrame nor triggers pandas' chained-assignment warning.
    df_annual = df_reports.loc[report_types == "Jahresabschluss"].copy()
    df_annual["jahr"] = df_annual["name"].apply(
        lambda name: name.split(" ")[-1].split(".")[-1]
    )

    df_annual = df_annual.drop(["name", "report"], axis=1)
    # A pre-existing "type" column is never part of the result; it is only
    # dropped if the caller's data happened to contain one.
    return df_annual.drop(["type"], axis=1, errors="ignore")
def extract_auditor_company(self, report: str) -> str | None:
"""Extract the name of an auditor company from the given yearly results report.
@ -73,3 +95,83 @@ class Bundesanzeiger:
Auditor(hit.replace(", Wirtschaftsprüfer", "").lstrip(), auditor_company)
for hit in hits
]
def __extract_kpis__(self, report: str) -> dict:
"""Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd.
Extracts Key Performance Indicators (KPIs) from the financial reports.
Args:
report (str): The yearly report as a parsed string
Returns:
dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.
"""
kpis = {}
# Define KPI patterns to search for
kpi_patterns = {
"revenue": r"(?:revenue|umsatz|erlöse)[:\s]*([\d,.]+[mmb]?)",
"net_income": r"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\s]*([\d,.]+[mmb]?)",
"ebit": r"(?:ebit|operating income)[:\s]*([\d,.]+[mmb]?)",
"ebitda": r"(?:ebitda)[:\s]*([\d,.]+[mmb]?)",
"gross_profit": r"(?:gross profit|bruttogewinn)[:\s]*([\d,.]+[mmb]?)",
"operating_profit": r"(?:operating profit|betriebsgewinn)[:\s]*([\d,.]+[mmb]?)",
"assets": r"(?:total assets|bilanzsumme)[:\s]*([\d,.]+[mmb]?)",
"liabilities": r"(?:total liabilities|gesamtverbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
"equity": r"(?:shareholders'? equity|eigenkapital)[:\s]*([\d,.]+[mmb]?)",
"current_assets": r"(?:current assets|umlaufvermögen)[:\s]*([\d,.]+[mmb]?)",
"current_liabilities": r"(?:current liabilities|kurzfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
"long_term_debt": r"(?:long[-\s]?term debt|langfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
"short_term_debt": r"(?:short[-\s]?term debt|kurzfristige verbindlichkeiten)[:\s]*([\d,.]+[mmb]?)",
"cash_and_cash_equivalents": r"(?:cash (?:and cash equivalents)?|barmittel)[:\s]*([\d,.]+[mmb]?)",
"dividends": r"(?:dividends?|dividende)[:\s]*([\d,.]+[mmb]?)",
"cash_flow": r"(?:cash flow|cashflow|cash flow from operating activities)[:\s]*([\d,.]+[mmb]?)",
}
for kpi, pattern in kpi_patterns.items():
match = re.search(pattern, report, flags=re.IGNORECASE | re.UNICODE)
if match:
value = match.group(1)
# Clean and validate the extracted number
try:
if not value: # Check if value is empty
cleaned_value = None
else:
multiplier = 1
if value[-1].lower() == "m":
value = value[:-1]
multiplier = 1_000_000
elif value[-1].lower() == "b":
value = value[:-1]
multiplier = 1_000_000_000
# Remove commas after checking for multipliers
value = value.replace(".", "").replace(",", ".").strip()
cleaned_value = float(value) * multiplier
except ValueError:
cleaned_value = None
if cleaned_value is not None:
kpis[kpi] = cleaned_value
return kpis
def extract_financial_results(self, report: str) -> dict:
    """Extract financial data from given report.

    The report is parsed with BeautifulSoup's ``html.parser``, reduced to its
    text content with line breaks replaced by spaces, and then handed to
    ``__extract_kpis__``.

    Args:
        report (str): Report to be analyzed

    Returns:
        dict: Results as returned by ``__extract_kpis__``
    """
    soup = BeautifulSoup(report, features="html.parser")
    # Flatten the text to a single line before matching
    flattened_text = soup.get_text().replace("\n", " ")
    return self.__extract_kpis__(flattened_text)
if __name__ == "__main__":
    # Ad-hoc run for one hard-coded company; the returned value is discarded.
    example_company = "Atos IT-Dienstleistung und Beratung GmbH"
    Bundesanzeiger().get_information(example_company)