{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Datenextraktion aus dem Bundesanzeiger"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Vorbereitung"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>company</th>\n",
       "      <th>raw_report</th>\n",
       "      <th>jahr</th>\n",
       "      <th>auditors</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2023-07-07</td>\n",
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
       "      <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
       "      <td>2021</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2023-05-10</td>\n",
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
       "      <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
       "      <td>2021</td>\n",
       "      <td>[Auditor(name='Eckhard Lewe', company='Grant T...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2022-03-25</td>\n",
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
       "      <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
       "      <td>2020</td>\n",
       "      <td>[Auditor(name='Eckhard Lewe', company='Warth &amp;...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2021-03-11</td>\n",
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
       "      <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
       "      <td>2019</td>\n",
       "      <td>[Auditor(name='Eckhard Lewe', company='Warth &amp;...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2020-03-24</td>\n",
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
       "      <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
       "      <td>2018</td>\n",
       "      <td>[Auditor(name='Ulrich Diersch', company='Warth...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        date                                   company  \\\n",
       "0 2023-07-07  Atos IT-Dienstleistung und Beratung GmbH   \n",
       "2 2023-05-10  Atos IT-Dienstleistung und Beratung GmbH   \n",
       "4 2022-03-25  Atos IT-Dienstleistung und Beratung GmbH   \n",
       "5 2021-03-11  Atos IT-Dienstleistung und Beratung GmbH   \n",
       "6 2020-03-24  Atos IT-Dienstleistung und Beratung GmbH   \n",
       "\n",
       "                                          raw_report  jahr  \\\n",
       "0  <div class=\"publication_container\">\\n <div cla...  2021   \n",
       "2  <div class=\"publication_container\">\\n <div cla...  2021   \n",
       "4  <div class=\"publication_container\">\\n <div cla...  2020   \n",
       "5  <div class=\"publication_container\">\\n <div cla...  2019   \n",
       "6  <div class=\"publication_container\">\\n <div cla...  2018   \n",
       "\n",
       "                                            auditors  \n",
       "0                                                 []  \n",
       "2  [Auditor(name='Eckhard Lewe', company='Grant T...  \n",
       "4  [Auditor(name='Eckhard Lewe', company='Warth &...  \n",
       "5  [Auditor(name='Eckhard Lewe', company='Warth &...  \n",
       "6  [Auditor(name='Ulrich Diersch', company='Warth...  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from aki_prj23_transparenzregister.utils.data_extraction.bundesanzeiger import (\n",
    "    Bundesanzeiger,\n",
    ")\n",
    "\n",
    "# Look up the reports of one example company (get_information presumably\n",
    "# queries the Bundesanzeiger -- confirm in the wrapper implementation).\n",
    "ba_wrapper = Bundesanzeiger()\n",
    "df_reports = ba_wrapper.get_information(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
    "# Show the first rows as the cell output.\n",
    "df_reports.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>company</th>\n",
       "      <th>raw_report</th>\n",
       "      <th>jahr</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2023-07-11</td>\n",
       "      <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
       "      <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
       "      <td>2021</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2023-05-25</td>\n",
       "      <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
       "      <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
       "      <td>2020</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2023-05-24</td>\n",
       "      <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
       "      <td>&lt;div class=\"publication_container\"&gt;\\n &lt;div cla...</td>\n",
       "      <td>2019</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        date                                            company  \\\n",
       "0 2023-07-11  Volkswagen Economy Service Erdle Bernhard Erdl...   \n",
       "1 2023-05-25  Volkswagen Economy Service Erdle Bernhard Erdl...   \n",
       "2 2023-05-24  Volkswagen Economy Service Erdle Bernhard Erdl...   \n",
       "\n",
       "                                          raw_report  jahr  \n",
       "0  <div class=\"publication_container\">\\n <div cla...  2021  \n",
       "1  <div class=\"publication_container\">\\n <div cla...  2020  \n",
       "2  <div class=\"publication_container\">\\n <div cla...  2019  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the rows whose type is \"Jahresabschluss\". The explicit .copy()\n",
    "# makes the result an independent frame, so adding the \"jahr\" column below\n",
    "# does not trigger SettingWithCopyWarning or depend on view/copy semantics.\n",
    "df_jahresabschluss = df_reports.loc[df_reports[\"type\"] == \"Jahresabschluss\"].copy()\n",
    "# The year is the last dot-separated part of the last word of the report name\n",
    "# (a name ending in \"31.12.2021\" yields \"2021\").\n",
    "df_jahresabschluss[\"jahr\"] = df_jahresabschluss[\"name\"].apply(\n",
    "    lambda name: name.split(\" \")[-1].split(\".\")[-1]\n",
    ")\n",
    "df_jahresabschluss = df_jahresabschluss.drop(columns=[\"name\", \"report\", \"type\"])\n",
    "df_jahresabschluss.head()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Datenextraktion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "from io import StringIO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the first remaining report as the running example; both names refer\n",
    "# to the same raw report value.\n",
    "sample_report = df_jahresabschluss.iloc[0].raw_report\n",
    "sample_report_content = sample_report"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Wirtschaftsprüfer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from aki_prj23_transparenzregister.models.auditor import Auditor\n",
    "\n",
    "\n",
    "def extract_auditor_company(report: str) -> str | None:\n",
    "    \"\"\"Extract what is presumably the auditing company's name from a report.\n",
    "\n",
    "    The function scans the ``<b>`` elements of the report and uses the first\n",
    "    one that contains a ``<br>`` tag and whose text spans at least two lines;\n",
    "    the second line of that text is returned.\n",
    "\n",
    "    Args:\n",
    "        report: Raw HTML of the report.\n",
    "\n",
    "    Returns:\n",
    "        The stripped second text line of the matching element, or ``None``\n",
    "        when no element qualifies.\n",
    "    \"\"\"\n",
    "    soup = BeautifulSoup(report, features=\"html.parser\")\n",
    "    for bold in soup.find_all(\"b\"):\n",
    "        if bold.find(\"br\") is None:\n",
    "            continue\n",
    "        text_lines = bold.text.split(\"\\n\")\n",
    "        # A bold block whose text has no line break has no second line;\n",
    "        # skip it instead of failing with an IndexError.\n",
    "        if len(text_lines) > 1:\n",
    "            return text_lines[1].strip()\n",
    "    return None\n",
    "\n",
    "\n",
    "def extract_auditors(report: str) -> list:\n",
    "    \"\"\"Find the auditors named in a report.\n",
    "\n",
    "    Each run of name characters directly followed by \", Wirtschaftsprüfer\" is\n",
    "    taken as one auditor name. Every hit is paired with the company returned\n",
    "    by ``extract_auditor_company`` for the same report.\n",
    "\n",
    "    Args:\n",
    "        report: Raw HTML of the report; the pattern is applied to it as is.\n",
    "\n",
    "    Returns:\n",
    "        One ``Auditor`` per hit, in order of appearance; empty if nothing matches.\n",
    "    \"\"\"\n",
    "    auditor_company = extract_auditor_company(report)\n",
    "    # Besides ASCII letters the class accepts the Latin-1 letters (À-Ö, Ø-ö, ø-ÿ),\n",
    "    # so names with umlauts or ß such as \"Jürgen Müller\" are captured whole\n",
    "    # instead of being cut off at the first non-ASCII character.\n",
    "    auditor_regex = r\"[a-z A-ZÀ-ÖØ-öø-ÿ,.'-]+, Wirtschaftsprüfer\"\n",
    "    hits = re.findall(auditor_regex, report)\n",
    "    return [\n",
    "        Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
    "        for hit in hits\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run the auditor extraction on the sample report.\n",
    "extract_auditors(sample_report)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Aufsichtsrat"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**TODO**"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Bilanz bzw. GuV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'net_income': 100238.5, 'equity': 165322.34, 'current_assets': 435344.07}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def extract_kpis(report_content) -> dict:\n",
    "    \"\"\"\n",
    "    Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
    "    Extracts Key Performance Indicators (KPIs) from the financial reports.\n",
    "    Args:\n",
    "        reports (dict): A dictionary containing the financial reports with their hash as keys and report details as values.\n",
    "    Returns:\n",
    "        dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.\n",
    "    \"\"\"\n",
    "\n",
    "    kpis = {}\n",
    "\n",
    "    # Define KPI patterns to search for\n",
    "    kpi_patterns = {\n",
    "        \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "        \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
    "    }\n",
    "\n",
    "    report_kpis = {}\n",
    "    for kpi, pattern in kpi_patterns.items():\n",
    "        match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
    "        if match:\n",
    "            value = match.group(1)\n",
    "\n",
    "            # Clean and validate the extracted number\n",
    "            try:\n",
    "                if not value:  # Check if value is empty\n",
    "                    cleaned_value = None\n",
    "                else:\n",
    "                    multiplier = 1\n",
    "                    if value[-1].lower() == \"m\":\n",
    "                        value = value[:-1]\n",
    "                        multiplier = 1_000_000\n",
    "                    elif value[-1].lower() == \"b\":\n",
    "                        value = value[:-1]\n",
    "                        multiplier = 1_000_000_000\n",
    "\n",
    "                    # Remove commas after checking for multipliers\n",
    "                    value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
    "                    cleaned_value = float(value) * multiplier\n",
    "            except ValueError:\n",
    "                cleaned_value = None\n",
    "\n",
    "            if cleaned_value is not None:\n",
    "                report_kpis[kpi] = cleaned_value\n",
    "    return report_kpis\n",
    "\n",
    "\n",
    "# Strip the HTML markup and flatten line breaks so the KPI patterns run on plain text.\n",
    "extract_kpis(\n",
    "    BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Dump the plain text of the sample report to a file for manual inspection.\n",
    "# The encoding is pinned to UTF-8: the German report text may contain non-ASCII\n",
    "# characters and the platform default encoding is not UTF-8 everywhere.\n",
    "with open(\"./temp.txt\", \"w\", encoding=\"utf-8\") as file:\n",
    "    file.write(\n",
    "        BeautifulSoup(sample_report, features=\"html.parser\")\n",
    "        .get_text()\n",
    "        .replace(\"\\n\", \" \")\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
      "            ('Aktiva',    '31.12.2021  EUR'),\n",
      "            ('Aktiva',    '31.12.2020  EUR')],\n",
      "           )\n",
      "Aktiva  Unnamed: 0_level_1    object\n",
      "        31.12.2021  EUR       object\n",
      "        31.12.2020  EUR       object\n",
      "dtype: object\n",
      "MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
      "            ('Passiva',    '31.12.2021  EUR'),\n",
      "            ('Passiva',    '31.12.2020  EUR')],\n",
      "           )\n",
      "Passiva  Unnamed: 0_level_1    object\n",
      "         31.12.2021  EUR       object\n",
      "         31.12.2020  EUR       object\n",
      "dtype: object\n",
      "Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
      "Angaben zur Identifikation der Gesellschaft laut Registergericht      object\n",
      "Angaben zur Identifikation der Gesellschaft laut Registergericht.1    object\n",
      "dtype: object\n",
      "MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),\n",
      "            (           'Betrag',                'EUR')],\n",
      "           )\n",
      "Kreditentwicklung  Unnamed: 0_level_1    object\n",
      "Betrag             EUR                   object\n",
      "dtype: object\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def parse_tables(report: str) -> dict:\n",
    "    \"\"\"Print the columns and dtypes of every ``std_table`` table in a report.\n",
    "\n",
    "    Exploratory helper: nothing is collected yet, so the returned dict is\n",
    "    always empty.\n",
    "\n",
    "    Args:\n",
    "        report: Raw HTML of the report.\n",
    "\n",
    "    Returns:\n",
    "        An empty dict.\n",
    "    \"\"\"\n",
    "    result = {}\n",
    "    soup = BeautifulSoup(report, features=\"html.parser\")\n",
    "    for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
    "        # read_html returns a list of frames; only the first one is inspected\n",
    "        df = pd.read_html(StringIO(str(table)))[0]\n",
    "        print(df.columns)\n",
    "        print(df.dtypes)\n",
    "    return result\n",
    "\n",
    "\n",
    "# Inspect the table structure of the sample report.\n",
    "parse_tables(sample_report)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: []"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_bilanz(report: str) -> dict[str, pd.DataFrame]:\n",
    "    \"\"\"Read the \"Aktiva\" and \"Passiva\" tables of a report.\n",
    "\n",
    "    For each of the two positions the first ``<b>`` element whose string\n",
    "    matches the position name is located, and the next ``std_table`` table\n",
    "    after it is parsed with pandas.\n",
    "\n",
    "    Args:\n",
    "        report: Raw HTML of the report.\n",
    "\n",
    "    Returns:\n",
    "        A dict with the keys \"Aktiva\" and \"Passiva\". Each value is the parsed\n",
    "        table, or an empty DataFrame when the heading or the table is missing.\n",
    "    \"\"\"\n",
    "    soup = BeautifulSoup(report, features=\"html.parser\")\n",
    "    result = {}\n",
    "    for pos in [\"Aktiva\", \"Passiva\"]:\n",
    "        heading = soup.find(\"b\", string=re.compile(pos))\n",
    "        table = None\n",
    "        if heading is not None:\n",
    "            table = heading.find_next(\"table\", {\"class\": \"std_table\"})\n",
    "        if table is None:\n",
    "            # No heading or no table after it: passing str(None) to read_html\n",
    "            # would raise, so fall back to an empty frame instead.\n",
    "            result[pos] = pd.DataFrame([])\n",
    "            continue\n",
    "        result[pos] = pd.read_html(StringIO(str(table)))[0]\n",
    "    return result\n",
    "\n",
    "\n",
    "# Parse the balance sheet tables of the sample report and preview \"Passiva\".\n",
    "bilanz = get_bilanz(sample_report)\n",
    "bilanz[\"Passiva\"].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
      "            ('Aktiva',    '31.12.2021  EUR'),\n",
      "            ('Aktiva',    '31.12.2020  EUR')],\n",
      "           )\n",
      "MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
      "            ('Passiva',    '31.12.2021  EUR'),\n",
      "            ('Passiva',    '31.12.2020  EUR')],\n",
      "           )\n",
      "Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
      "MultiIndex([('Kreditentwicklung', 'Unnamed: 0_level_1'),\n",
      "            (           'Betrag',                'EUR')],\n",
      "           )\n"
     ]
    }
   ],
   "source": [
    "def get_tables(raw_report: str) -> list:\n",
    "    \"\"\"Collect every ``std_table`` table of a report as a DataFrame.\n",
    "\n",
    "    Args:\n",
    "        raw_report: Raw HTML of the report.\n",
    "\n",
    "    Returns:\n",
    "        All frames pandas reads from the matching tables, in document order.\n",
    "    \"\"\"\n",
    "    parsed = BeautifulSoup(raw_report, features=\"html.parser\")\n",
    "    return [\n",
    "        frame\n",
    "        for table_tag in parsed.find_all(\"table\", {\"class\": \"std_table\"})\n",
    "        for frame in pd.read_html(StringIO(str(table_tag)))\n",
    "    ]\n",
    "\n",
    "\n",
    "# Parse the tables once and reuse the result instead of parsing the report twice.\n",
    "tables = get_tables(sample_report)\n",
    "\n",
    "for df in tables:\n",
    "    print(df.columns)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}