From 421b1e8c87df3afcec5d281e71af62c7aea2ed0d Mon Sep 17 00:00:00 2001 From: TrisNol Date: Tue, 27 Jun 2023 19:17:54 +0200 Subject: [PATCH] Bundesanzeiger preparation, Handeslblatt RSS feed export --- .../API-tests/Bundesanzeiger/notebook.ipynb | 596 ++++++++---------- Jupyter/API-tests/News/notebook.ipynb | 256 ++++++++ 2 files changed, 522 insertions(+), 330 deletions(-) diff --git a/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb b/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb index c5b8d39..952cc6d 100644 --- a/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb +++ b/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb @@ -8,14 +8,6 @@ "# Daten Extraktion aus dem Bundesanzeiger" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to run this notebooks, download the `deutschland` library source code from: [TrisNol/deutschland](https://github.com/TrisNol/deutschland/tree/feat/bundesanzeiger-raw-report) and place it in the `Jupyter/API-tests/Bundesanzeiger/deutschland` directory. Since the PR adding the required features to the main repo has not been completet as of of yet (see: [PR](https://github.com/bundesAPI/deutschland/pull/88)) we have to include it in another way..." - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -26,18 +18,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 32, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\trist\\AppData\\Roaming\\Python\\Python310\\site-packages\\requests\\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (5.1.0)/charset_normalizer (2.0.12) doesn't match a supported version!\n", - " warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "from deutschland.bundesanzeiger import Bundesanzeiger" @@ -45,26 +28,28 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "dict_keys(['040860c00ef9020cfb6db2f58a163256', '9eb401c5af2f0289b8207233bf852b81', 'b14d979ea9f42367d413589050bd04e5', 'ecb8c1011456ea0d40f87e850fc216bf', '3a7c6c1f1d1b89bf5ceb165f0ee88053', '03b2e6aac2f2da2c0c5e8f23de9caec4', 'a5f8dc87fa797e7d2f8fb88c49a23c36', '9a3a8a3e84290ee650cbccf32323b3d7', '6c0fcc20a58aaa18a9d13f35a51e3996', 'bf276d441c339e787e22385d2b69b277', '90a79d28f3c11a2122d2827d2bf6acda', '88c785ce3b3c580dcc285661c7790cca', 'd3064baa8246c3ed02e30b5038200edc', '5bf92eed2808b484c005409764b825b7', 'fece6303c991a280850be1900ff78f8f', '26b0624c60cdbf647f3d45f4917ec6ea', '9f98bee55f598908cca60b6a47e5d49d', '99267bb7474e6d1d5d9e091ba5ef3ee8', '102738ef4b91408ed043d84fe785b50b', '94711f3e509518d073e1760d97550347'])\n" + "dict_keys(['c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n" ] } ], "source": [ "ba = Bundesanzeiger()\n", - "reports = ba.get_reports(\"Atos IT-Dienstleistung und Beratung GmbH\")\n", + "reports = ba.get_reports(\n", + " \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n", + ") # \"Atos IT-Dienstleistung und Beratung GmbH\")\n", "print(reports.keys())" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -75,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -109,42 +94,18 @@ " \n", " \n", " 0\n", - " 2023-03-17\n", - " Aufsichtsrat\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...\n", + " 2023-05-25\n", + " Jahresabschluss zum Geschäftsjahr vom 01.01.20...\n", + " Volkswagen Economy Service Erdle Bernhard Erdl...\n", + " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...\n", " <div class=\"publication_container\">\\n <div cla...\n", " \n", " \n", " 1\n", - " 2022-03-25\n", + " 2023-05-24\n", " Jahresabschluss zum Geschäftsjahr vom 01.01.20...\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...\n", - " <div class=\"publication_container\">\\n <div cla...\n", - " \n", - " \n", - " 2\n", - " 2021-03-11\n", - " Jahresabschluss zum Geschäftsjahr vom 01.01.20...\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...\n", - " <div class=\"publication_container\">\\n <div cla...\n", - " \n", - " \n", - " 3\n", - " 2020-03-24\n", - " Jahresabschluss zum Geschäftsjahr vom 01.01.20...\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...\n", - " <div class=\"publication_container\">\\n <div cla...\n", - " \n", - " \n", - " 4\n", - " 2018-12-11\n", - " Jahresabschluss zum Geschäftsjahr vom 01.01.20...\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...\n", + " Volkswagen Economy Service Erdle Bernhard Erdl...\n", + " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...\n", " <div class=\"publication_container\">\\n <div cla...\n", " \n", " \n", @@ -153,35 +114,23 @@ ], "text/plain": [ " date name \\\n", - "0 2023-03-17 Aufsichtsrat \n", - "1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", - "2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", - "3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", - "4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", + "0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", + "1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", "\n", - " company \\\n", - "0 Atos IT-Dienstleistung und Beratung GmbH \n", - "1 Atos IT-Dienstleistung und Beratung GmbH \n", - "2 Atos IT-Dienstleistung und Beratung GmbH \n", - "3 Atos IT-Dienstleistung und Beratung GmbH \n", - "4 Atos IT-Dienstleistung und Beratung GmbH \n", + " company \\\n", + "0 Volkswagen Economy Service Erdle Bernhard Erdl... \n", + "1 Volkswagen Economy Service Erdle Bernhard Erdl... \n", "\n", " report \\\n", - "0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n", - "1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n", - "2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n", - "3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n", - "4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n", + "0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", + "1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", "\n", " raw_report \n", "0
\\n
\\n
\\n
\\n
\\n
\\n
\n", " \n", " 0\n", - " 2023-03-17\n", - " Aufsichtsrat\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...\n", + " 2023-05-25\n", + " Jahresabschluss zum Geschäftsjahr vom 01.01.20...\n", + " Volkswagen Economy Service Erdle Bernhard Erdl...\n", + " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...\n", " <div class=\"publication_container\">\\n <div cla...\n", - " Aufsichtsrat\n", + " Jahresabschluss\n", " \n", " \n", " 1\n", - " 2022-03-25\n", + " 2023-05-24\n", " Jahresabschluss zum Geschäftsjahr vom 01.01.20...\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...\n", - " <div class=\"publication_container\">\\n <div cla...\n", - " Jahresabschluss\n", - " \n", - " \n", - " 2\n", - " 2021-03-11\n", - " Jahresabschluss zum Geschäftsjahr vom 01.01.20...\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...\n", - " <div class=\"publication_container\">\\n <div cla...\n", - " Jahresabschluss\n", - " \n", - " \n", - " 3\n", - " 2020-03-24\n", - " Jahresabschluss zum Geschäftsjahr vom 01.01.20...\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...\n", - " <div class=\"publication_container\">\\n <div cla...\n", - " Jahresabschluss\n", - " \n", - " \n", - " 4\n", - " 2018-12-11\n", - " Jahresabschluss zum Geschäftsjahr vom 01.01.20...\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...\n", + " Volkswagen Economy Service Erdle Bernhard Erdl...\n", + " \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...\n", " <div class=\"publication_container\">\\n <div cla...\n", " Jahresabschluss\n", " \n", @@ -277,35 +199,23 @@ ], "text/plain": [ " date name \\\n", - "0 2023-03-17 Aufsichtsrat \n", - "1 2022-03-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", - "2 2021-03-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", - "3 2020-03-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", - "4 2018-12-11 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", + "0 2023-05-25 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", + "1 2023-05-24 Jahresabschluss zum Geschäftsjahr vom 01.01.20... \n", "\n", - " company \\\n", - "0 Atos IT-Dienstleistung und Beratung GmbH \n", - "1 Atos IT-Dienstleistung und Beratung GmbH \n", - "2 Atos IT-Dienstleistung und Beratung GmbH \n", - "3 Atos IT-Dienstleistung und Beratung GmbH \n", - "4 Atos IT-Dienstleistung und Beratung GmbH \n", + " company \\\n", + "0 Volkswagen Economy Service Erdle Bernhard Erdl... \n", + "1 Volkswagen Economy Service Erdle Bernhard Erdl... \n", "\n", " report \\\n", - "0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl... \n", - "1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n", - "2 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n", - "3 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung... \n", - "4 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun... \n", + "0 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", + "1 \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se... \n", "\n", " raw_report type \n", - "0
\\n
\\n
\\n
\\n
\\n
\\n
\\n
\n", " \n", " \n", - " 1\n", - " 2022-03-25\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", + " 0\n", + " 2023-05-25\n", + " Volkswagen Economy Service Erdle Bernhard Erdl...\n", " <div class=\"publication_container\">\\n <div cla...\n", " 2020\n", " \n", " \n", - " 2\n", - " 2021-03-11\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", + " 1\n", + " 2023-05-24\n", + " Volkswagen Economy Service Erdle Bernhard Erdl...\n", " <div class=\"publication_container\">\\n <div cla...\n", " 2019\n", " \n", - " \n", - " 3\n", - " 2020-03-24\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " <div class=\"publication_container\">\\n <div cla...\n", - " 2018\n", - " \n", - " \n", - " 4\n", - " 2018-12-11\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " <div class=\"publication_container\">\\n <div cla...\n", - " 2017\n", - " \n", - " \n", - " 6\n", - " 2018-01-03\n", - " Atos IT-Dienstleistung und Beratung GmbH\n", - " <div class=\"publication_container\">\\n <div cla...\n", - " 2016\n", - " \n", " \n", "\n", "
" ], "text/plain": [ - " date company \\\n", - "1 2022-03-25 Atos IT-Dienstleistung und Beratung GmbH \n", - "2 2021-03-11 Atos IT-Dienstleistung und Beratung GmbH \n", - "3 2020-03-24 Atos IT-Dienstleistung und Beratung GmbH \n", - "4 2018-12-11 Atos IT-Dienstleistung und Beratung GmbH \n", - "6 2018-01-03 Atos IT-Dienstleistung und Beratung GmbH \n", + " date company \\\n", + "0 2023-05-25 Volkswagen Economy Service Erdle Bernhard Erdl... \n", + "1 2023-05-24 Volkswagen Economy Service Erdle Bernhard Erdl... \n", "\n", " raw_report jahr \n", - "1
\\n
\\n
\\n
\\n
\\n
\\n
\\n
list:\n", - " auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n", - " hits = re.findall(auditor_regex, report)\n", - " return [hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip() for hit in hits]" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Eckhard Lewe', 'Renate Hermsdorf']" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "extract_auditors(sample_report)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ "def extract_auditor_company(report: str) -> str:\n", " soup = BeautifulSoup(report, features=\"html.parser\")\n", " temp = soup.find_all(\"b\")\n", @@ -512,27 +359,37 @@ " br = elem.findChildren(\"br\")\n", " if len(br) > 0:\n", " return elem.text.split(\"\\n\")[1].strip()\n", - " return None" + " return None\n", + "\n", + "\n", + "def extract_auditors(report: str) -> list:\n", + " auditor_company = extract_auditor_company(report)\n", + " auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n", + " hits = re.findall(auditor_regex, report)\n", + " return [\n", + " Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n", + " for hit in hits\n", + " ]" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'Warth & Klein Grant Thornton AG'" + "[]" ] }, - "execution_count": 17, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "extract_auditor_company(sample_report)" + "extract_auditors(sample_report)" ] }, { @@ -561,97 +418,177 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 42, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0Anhang2020 TEURVorjahr TEUR
01. Umsatzerlöse(1)69.81977.429
12. Veränderung des Bestandes an unfertigen Lei...NaN-41.000-66.000
23. Sonstige betriebliche Erträge(2)489.0001.816
34. MaterialaufwandNaNNaNNaN
4a) Aufwendungen für bezogene WarenNaN-1.220-3.003
\n", - "
" - ], "text/plain": [ - " Unnamed: 0 Anhang 2020 TEUR \\\n", - "0 1. Umsatzerlöse (1) 69.819 \n", - "1 2. Veränderung des Bestandes an unfertigen Lei... NaN -41.000 \n", - "2 3. Sonstige betriebliche Erträge (2) 489.000 \n", - "3 4. Materialaufwand NaN NaN \n", - "4 a) Aufwendungen für bezogene Waren NaN -1.220 \n", - "\n", - " Vorjahr TEUR \n", - "0 77.429 \n", - "1 -66.000 \n", - "2 1.816 \n", - "3 NaN \n", - "4 -3.003 " + "{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}" ] }, - "execution_count": 18, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], + "source": [ + "def extract_kpis(report_content) -> dict:\n", + " \"\"\"\n", + " Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n", + " Extracts Key Performance Indicators (KPIs) from the financial reports.\n", + " Args:\n", + " reports (dict): A dictionary containing the financial reports with their hash as keys and report details as values.\n", + " Returns:\n", + " dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.\n", + " \"\"\"\n", + "\n", + " kpis = {}\n", + "\n", + " # Define KPI patterns to search for\n", + " kpi_patterns = {\n", + " \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n", + " }\n", + "\n", + " report_kpis = {}\n", + " for kpi, pattern in kpi_patterns.items():\n", + " match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n", + " if match:\n", + " value = match.group(1)\n", + "\n", + " # Clean and validate the extracted number\n", + " try:\n", + " if not value: # Check if value is empty\n", + " cleaned_value = None\n", + " else:\n", + " multiplier = 1\n", + " if value[-1].lower() == \"m\":\n", + " value = value[:-1]\n", + " multiplier = 1_000_000\n", + " elif value[-1].lower() == \"b\":\n", + " value = value[:-1]\n", + " multiplier = 1_000_000_000\n", + "\n", + " # Remove commas after checking for multipliers\n", + " value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n", + " cleaned_value = float(value) * multiplier\n", + " except ValueError:\n", + " cleaned_value = None\n", + "\n", + " if cleaned_value is not None:\n", + " report_kpis[kpi] = cleaned_value\n", + " return report_kpis\n", + "\n", + "\n", + "extract_kpis(\n", + " BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "with open(\"./temp.txt\", \"w\") as file:\n", + " file.write(\n", + " BeautifulSoup(sample_report, features=\"html.parser\")\n", + " .get_text()\n", + " .replace(\"\\n\", \" \")\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n", + " ('Aktiva', '31.12.2020 EUR'),\n", + " ('Aktiva', '31.12.2019 EUR')],\n", + " )\n", + "Aktiva Unnamed: 0_level_1 object\n", + " 31.12.2020 EUR object\n", + " 31.12.2019 EUR object\n", + "dtype: object\n", + "MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n", + " ('Passiva', '31.12.2020 EUR'),\n", + " ('Passiva', '31.12.2019 EUR')],\n", + " )\n", + "Passiva Unnamed: 0_level_1 object\n", + " 31.12.2020 EUR object\n", + " 31.12.2019 EUR object\n", + "dtype: object\n", + "Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n", + "Angaben zur Identifikation der Gesellschaft laut Registergericht object\n", + "Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n", + "dtype: object\n" + ] + }, + { + "data": { + "text/plain": [ + "{}" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def parse_tables(report: str) -> list:\n", + " result = {}\n", + " soup = BeautifulSoup(report, features=\"html.parser\")\n", + " for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n", + " df = pd.read_html(StringIO(str(table)))[0]\n", + " print(df.columns)\n", + " print(df.dtypes)\n", + " return result\n", + "\n", + "\n", + "parse_tables(sample_report)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'Passiva'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m 11\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m 14\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m---> 15\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n", + "\u001b[1;31mKeyError\u001b[0m: 'Passiva'" + ] + } + ], "source": [ "def get_bilanz(report: str) -> any:\n", " result = {}\n", @@ -672,30 +609,30 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Index(['Gesellschafterbeschluss', 'Shareholder Resolution'], dtype='object')\n", - "Index(['Unnamed: 0', 'Anhang', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n", - "Index(['Unnamed: 0', 'Anhang', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n", - "Index(['Aufgliederung nach Tätigkeitsbereichen', '2020 TEUR',\n", + "Int64Index([0, 1], dtype='int64')\n", + "Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n", " 'Vorjahr TEUR'],\n", " dtype='object')\n", - "Index(['Aufgliederung nach Inland und Ausland', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n", - "Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n", - "Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n", - "Index(['Unnamed: 0', '31.12.2020'], dtype='object')\n", + "Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n", "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", "Int64Index([0, 1, 2], dtype='int64')\n", "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n", - "Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n", - "Index(['Unnamed: 0', '2020 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n", + "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n", + "Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n", "MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n", " ('Art der Beziehung', 'Gesellschafterin TEUR'),\n", " ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n", @@ -707,24 +644,23 @@ " ('Anschaffungs- oder Herstellungskosten', ...),\n", " ('Anschaffungs- oder Herstellungskosten', ...)],\n", " )\n", - "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", - " ( 'Abschreibungen', 'Stand 01.01.2020 EUR'),\n", - " ( 'Abschreibungen', 'Abschreibungen des Geschäftsjahres EUR'),\n", - " ( 'Abschreibungen', 'Abgänge Umbuchung EUR'),\n", - " ( 'Abschreibungen', 'Stand 31.12.2020 EUR')],\n", + "MultiIndex([('Unnamed: 0_level_0', ...),\n", + " ( 'Abschreibungen', ...),\n", + " ( 'Abschreibungen', ...),\n", + " ( 'Abschreibungen', ...),\n", + " ( 'Abschreibungen', ...)],\n", " )\n", "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n", - " ( 'Buchwerte', 'Stand 31.12.2020 EUR'),\n", - " ( 'Buchwerte', 'Stand 31.12.2019 EUR')],\n", + " ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n", + " ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n", " )\n", - "Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2020', '2019',\n", - " '2018'],\n", + "Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n", + " '2019'],\n", " dtype='object')\n", - "Index(['Gewinn- und Verlustrechnung', '2020 TEUR', 'Vorjahr TEUR',\n", + "Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n", " 'Veränderung TEUR'],\n", " dtype='object')\n", - "Index(['Bilanz', '31.12.2020 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n", - "Index(['Bericht des Aufsichtsrats', 'Report of the Supervisory Board'], dtype='object')\n" + "Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n" ] } ], diff --git a/Jupyter/API-tests/News/notebook.ipynb b/Jupyter/API-tests/News/notebook.ipynb index 89d9e9b..981ba18 100644 --- a/Jupyter/API-tests/News/notebook.ipynb +++ b/Jupyter/API-tests/News/notebook.ipynb @@ -596,6 +596,262 @@ "source": [ "service.get_by_id(\"abc\")" ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Handelsblatt RSS Feed" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import xmltodict\n", + "from bs4 import BeautifulSoup\n", + "\n", + "\n", + "class HandelsblattRSS:\n", + " def __init__(self):\n", + " self.base_url = \"https://www.handelsblatt.com/contentexport/feed\"\n", + "\n", + " def get_news_for_category(self, category: str = \"unternehmen\") -> dict:\n", + " url = f\"{self.base_url}/{category}\"\n", + " result = requests.get(url=url)\n", + " if result.status_code == 200:\n", + " return xmltodict.parse(result.text)[\"rss\"][\"channel\"][\"item\"]\n", + " return None\n", + "\n", + " def get_news_details_text(self, url: str) -> dict:\n", + " content = requests.get(url)\n", + " soup = BeautifulSoup(content.text, features=\"html.parser\")\n", + "\n", + " return \" \".join(\n", + " [elem.text.replace(\"\\n\", \" \") for elem in soup.find_all(\"p\")][:]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "handelsblatt = HandelsblattRSS()\n", + "\n", + "items = handelsblatt.get_news_for_category()\n", + "\n", + "from utils.mongodb.mongo import MongoConnector, MongoNewsService\n", + "\n", + "connector = MongoConnector(\n", + " hostname=\"trisnol.tech\",\n", + " database=\"transparenzregister\",\n", + " username=\"root\",\n", + " password=\"pR0R0v2e2\",\n", + ")\n", + "\n", + "service = MongoNewsService(connector)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2023-06-27T09:20:32+0200'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datetime import datetime\n", + "\n", + "d = items[0][\"pubDate\"]\n", + "datetime.strptime(d, \"%a, %d %b %Y %H:%M:%S %z\").strftime(\"%Y-%m-%dT%H:%M:%S%z\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 50/50 [01:04<00:00, 1.30s/it]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtitledatesource_urltext
0https://www.handelsblatt.com/29227224.htmlDieselskandal: Ex-Audi-Chef Rupert Stadler zu ...2023-06-27T09:20:32+0200https://www.handelsblatt.com/unternehmen/indus...Der frühere Audi-Chef wurde wegen Betrugs zu e...
1https://www.handelsblatt.com/29226410.htmlLuftfahrt: Größer, reichweitenstärker – aber n...2023-06-27T16:28:53+0200https://www.handelsblatt.com/unternehmen/hande...Honda Aircraft arbeitet an einem Privatflugzeu...
2https://www.handelsblatt.com/29226522.htmlAsien: Deutsche Unternehmen wetten auf den Ind...2023-06-27T00:30:00+0200https://www.handelsblatt.com/politik/internati...Unternehmen gehen von einer positiven wirtscha...
3https://www.handelsblatt.com/29228524.htmlElektromobilität: US-Elektroautohersteller Lor...2023-06-27T18:45:29+0200https://www.handelsblatt.com/unternehmen/indus...Das Start-up plante die Massenproduktion mit e...
4https://www.handelsblatt.com/29228272.htmlUS-Konzern: „Gewaltige Komplexität“ – BGH prüf...2023-06-27T16:23:03+0200https://www.handelsblatt.com/unternehmen/hande...Das Kartellamt stufte den US-Konzern vergangen...
\n", + "
" + ], + "text/plain": [ + " id \\\n", + "0 https://www.handelsblatt.com/29227224.html \n", + "1 https://www.handelsblatt.com/29226410.html \n", + "2 https://www.handelsblatt.com/29226522.html \n", + "3 https://www.handelsblatt.com/29228524.html \n", + "4 https://www.handelsblatt.com/29228272.html \n", + "\n", + " title \\\n", + "0 Dieselskandal: Ex-Audi-Chef Rupert Stadler zu ... \n", + "1 Luftfahrt: Größer, reichweitenstärker – aber n... \n", + "2 Asien: Deutsche Unternehmen wetten auf den Ind... \n", + "3 Elektromobilität: US-Elektroautohersteller Lor... \n", + "4 US-Konzern: „Gewaltige Komplexität“ – BGH prüf... \n", + "\n", + " date \\\n", + "0 2023-06-27T09:20:32+0200 \n", + "1 2023-06-27T16:28:53+0200 \n", + "2 2023-06-27T00:30:00+0200 \n", + "3 2023-06-27T18:45:29+0200 \n", + "4 2023-06-27T16:23:03+0200 \n", + "\n", + " source_url \\\n", + "0 https://www.handelsblatt.com/unternehmen/indus... \n", + "1 https://www.handelsblatt.com/unternehmen/hande... \n", + "2 https://www.handelsblatt.com/politik/internati... \n", + "3 https://www.handelsblatt.com/unternehmen/indus... \n", + "4 https://www.handelsblatt.com/unternehmen/hande... \n", + "\n", + " text \n", + "0 Der frühere Audi-Chef wurde wegen Betrugs zu e... \n", + "1 Honda Aircraft arbeitet an einem Privatflugzeu... \n", + "2 Unternehmen gehen von einer positiven wirtscha... \n", + "3 Das Start-up plante die Massenproduktion mit e... \n", + "4 Das Kartellamt stufte den US-Konzern vergangen... " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datetime import datetime\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "\n", + "\n", + "news = []\n", + "for news_article in tqdm(items):\n", + " info = {\n", + " \"id\": news_article[\"guid\"],\n", + " \"title\": news_article[\"title\"],\n", + " \"date\": datetime.strptime(\n", + " news_article[\"pubDate\"], \"%a, %d %b %Y %H:%M:%S %z\"\n", + " ).strftime(\"%Y-%m-%dT%H:%M:%S%z\"),\n", + " \"source_url\": news_article[\"link\"],\n", + " \"text\": handelsblatt.get_news_details_text(news_article[\"link\"]),\n", + " }\n", + " news.append(info)\n", + "\n", + "df = pd.DataFrame(news)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 50/50 [00:00<00:00, 81.98it/s]\n" + ] + } + ], + "source": [ + "from models.News import News\n", + "\n", + "for article in tqdm(news):\n", + " news_article = News(**article)\n", + " if service.get_by_id(news_article.id) is None:\n", + " service.insert(news_article)" + ] } ], "metadata": {