str:\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" temp = soup.find_all(\"b\")\n",
" for elem in temp:\n",
" br = elem.findChildren(\"br\")\n",
" if len(br) > 0:\n",
" return elem.text.split(\"\\n\")[1].strip()\n",
" return None\n",
"\n",
"\n",
"def extract_auditors(report: str) -> list:\n",
" auditor_company = extract_auditor_company(report)\n",
" auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
" hits = re.findall(auditor_regex, report)\n",
" return [\n",
" Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
" for hit in hits\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_auditors(sample_report)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Aufsichtsrat"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"**TODO**"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Bilanz bzw. GuV"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def extract_kpis(report_content) -> dict:\n",
" \"\"\"\n",
" Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
" Extracts Key Performance Indicators (KPIs) from the financial reports.\n",
" Args:\n",
" reports (dict): A dictionary containing the financial reports with their hash as keys and report details as values.\n",
" Returns:\n",
" dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.\n",
" \"\"\"\n",
"\n",
" kpis = {}\n",
"\n",
" # Define KPI patterns to search for\n",
" kpi_patterns = {\n",
" \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
" }\n",
"\n",
" report_kpis = {}\n",
" for kpi, pattern in kpi_patterns.items():\n",
" match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
" if match:\n",
" value = match.group(1)\n",
"\n",
" # Clean and validate the extracted number\n",
" try:\n",
" if not value: # Check if value is empty\n",
" cleaned_value = None\n",
" else:\n",
" multiplier = 1\n",
" if value[-1].lower() == \"m\":\n",
" value = value[:-1]\n",
" multiplier = 1_000_000\n",
" elif value[-1].lower() == \"b\":\n",
" value = value[:-1]\n",
" multiplier = 1_000_000_000\n",
"\n",
" # Remove commas after checking for multipliers\n",
" value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
" cleaned_value = float(value) * multiplier\n",
" except ValueError:\n",
" cleaned_value = None\n",
"\n",
" if cleaned_value is not None:\n",
" report_kpis[kpi] = cleaned_value\n",
" return report_kpis\n",
"\n",
"\n",
"extract_kpis(\n",
" BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"with open(\"./temp.txt\", \"w\") as file:\n",
" file.write(\n",
" BeautifulSoup(sample_report, features=\"html.parser\")\n",
" .get_text()\n",
" .replace(\"\\n\", \" \")\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
" ('Aktiva', '31.12.2020 EUR'),\n",
" ('Aktiva', '31.12.2019 EUR')],\n",
" )\n",
"Aktiva Unnamed: 0_level_1 object\n",
" 31.12.2020 EUR object\n",
" 31.12.2019 EUR object\n",
"dtype: object\n",
"MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
" ('Passiva', '31.12.2020 EUR'),\n",
" ('Passiva', '31.12.2019 EUR')],\n",
" )\n",
"Passiva Unnamed: 0_level_1 object\n",
" 31.12.2020 EUR object\n",
" 31.12.2019 EUR object\n",
"dtype: object\n",
"Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
"Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
"Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
"dtype: object\n"
]
},
{
"data": {
"text/plain": [
"{}"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def parse_tables(report: str) -> list:\n",
" result = {}\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
" df = pd.read_html(StringIO(str(table)))[0]\n",
" print(df.columns)\n",
" print(df.dtypes)\n",
" return result\n",
"\n",
"\n",
"parse_tables(sample_report)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'Passiva'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m
11\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m
14\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m--->
15\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n",
"\u001b[1;31mKeyError\u001b[0m: 'Passiva'"
]
}
],
"source": [
"def get_bilanz(report: str) -> any:\n",
" result = {}\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" for pos in [\"Aktiva\", \"Passiva\"]:\n",
" tag = soup.find(\"b\", string=re.compile(pos))\n",
" if tag:\n",
" pos_results = pd.read_html(\n",
" StringIO(str(tag.findNext(\"table\", {\"class\": \"std_table\"})))\n",
" )[0]\n",
" result[pos] = pos_results\n",
" return result\n",
"\n",
"\n",
"bilanz = get_bilanz(sample_report)\n",
"bilanz[\"Passiva\"].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Int64Index([0, 1], dtype='int64')\n",
"Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
" 'Vorjahr TEUR'],\n",
" dtype='object')\n",
"Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Int64Index([0, 1, 2], dtype='int64')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
" )\n",
"Int64Index([0, 1], dtype='int64')\n",
"MultiIndex([( 'Unnamed: 0_level_0', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...),\n",
" ( 'Abschreibungen', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
" ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
" ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
" )\n",
"Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
" '2019'],\n",
" dtype='object')\n",
"Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
" 'Veränderung TEUR'],\n",
" dtype='object')\n",
"Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
]
}
],
"source": [
"def get_tables(raw_report: str) -> list:\n",
" soup = BeautifulSoup(raw_report, features=\"html.parser\")\n",
" tables = soup.find_all(\"table\", {\"class\": \"std_table\"})\n",
" dfs = []\n",
" for table in tables:\n",
" for df in pd.read_html(StringIO(str(table))):\n",
" dfs.append(df)\n",
" return dfs\n",
"\n",
"\n",
"for df in get_tables(sample_report):\n",
" print(df.columns)\n",
"\n",
"tables = get_tables(sample_report)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}