list:\n",
- " auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
- " hits = re.findall(auditor_regex, report)\n",
- " return [hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip() for hit in hits]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['Eckhard Lewe', 'Renate Hermsdorf']"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "extract_auditors(sample_report)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
"def extract_auditor_company(report: str) -> str:\n",
" soup = BeautifulSoup(report, features=\"html.parser\")\n",
" temp = soup.find_all(\"b\")\n",
@@ -512,27 +359,37 @@
" br = elem.findChildren(\"br\")\n",
" if len(br) > 0:\n",
" return elem.text.split(\"\\n\")[1].strip()\n",
- " return None"
+ " return None\n",
+ "\n",
+ "\n",
+ "def extract_auditors(report: str) -> list:\n",
+ " auditor_company = extract_auditor_company(report)\n",
+ " auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
+ " hits = re.findall(auditor_regex, report)\n",
+ " return [\n",
+ " Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
+ " for hit in hits\n",
+ " ]"
]
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'Warth & Klein Grant Thornton AG'"
+ "[]"
]
},
- "execution_count": 17,
+ "execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "extract_auditor_company(sample_report)"
+ "extract_auditors(sample_report)"
]
},
{
@@ -561,97 +418,177 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Unnamed: 0 | \n",
- " Anhang | \n",
- " 2020 TEUR | \n",
- " Vorjahr TEUR | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 1. Umsatzerlöse | \n",
- " (1) | \n",
- " 69.819 | \n",
- " 77.429 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2. Veränderung des Bestandes an unfertigen Lei... | \n",
- " NaN | \n",
- " -41.000 | \n",
- " -66.000 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 3. Sonstige betriebliche Erträge | \n",
- " (2) | \n",
- " 489.000 | \n",
- " 1.816 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 4. Materialaufwand | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " a) Aufwendungen für bezogene Waren | \n",
- " NaN | \n",
- " -1.220 | \n",
- " -3.003 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
"text/plain": [
- " Unnamed: 0 Anhang 2020 TEUR \\\n",
- "0 1. Umsatzerlöse (1) 69.819 \n",
- "1 2. Veränderung des Bestandes an unfertigen Lei... NaN -41.000 \n",
- "2 3. Sonstige betriebliche Erträge (2) 489.000 \n",
- "3 4. Materialaufwand NaN NaN \n",
- "4 a) Aufwendungen für bezogene Waren NaN -1.220 \n",
- "\n",
- " Vorjahr TEUR \n",
- "0 77.429 \n",
- "1 -66.000 \n",
- "2 1.816 \n",
- "3 NaN \n",
- "4 -3.003 "
+ "{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}"
]
},
- "execution_count": 18,
+ "execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
+ "source": [
+ "def extract_kpis(report_content) -> dict:\n",
+ " \"\"\"\n",
+ " Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
+ " Extracts Key Performance Indicators (KPIs) from the text of a single financial report.\n",
+ " Args:\n",
+ " report_content (str): Plain text of the report; each KPI pattern is matched case-insensitively and only the first match is used.\n",
+ " Returns:\n",
+ " dict: Maps KPI names (e.g. 'net_income') to the parsed float value; only KPIs with a parseable value are included.\n",
+ " \"\"\"\n",
+ "\n",
+ " kpis = {}\n",
+ "\n",
+ " # Define KPI patterns to search for\n",
+ " kpi_patterns = {\n",
+ " \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
+ " }\n",
+ "\n",
+ " report_kpis = {}\n",
+ " for kpi, pattern in kpi_patterns.items():\n",
+ " match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
+ " if match:\n",
+ " value = match.group(1)\n",
+ "\n",
+ " # Clean and validate the extracted number\n",
+ " try:\n",
+ " if not value: # Check if value is empty\n",
+ " cleaned_value = None\n",
+ " else:\n",
+ " multiplier = 1\n",
+ " if value[-1].lower() == \"m\":\n",
+ " value = value[:-1]\n",
+ " multiplier = 1_000_000\n",
+ " elif value[-1].lower() == \"b\":\n",
+ " value = value[:-1]\n",
+ " multiplier = 1_000_000_000\n",
+ "\n",
+ " # Convert German number format to a float literal: drop '.' thousands separators, then turn the decimal ',' into '.'\n",
+ " value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
+ " cleaned_value = float(value) * multiplier\n",
+ " except ValueError:\n",
+ " cleaned_value = None\n",
+ "\n",
+ " if cleaned_value is not None:\n",
+ " report_kpis[kpi] = cleaned_value\n",
+ " return report_kpis\n",
+ "\n",
+ "\n",
+ "extract_kpis(\n",
+ " BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "with open(\"./temp.txt\", \"w\") as file:\n",
+ " file.write(\n",
+ " BeautifulSoup(sample_report, features=\"html.parser\")\n",
+ " .get_text()\n",
+ " .replace(\"\\n\", \" \")\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
+ " ('Aktiva', '31.12.2020 EUR'),\n",
+ " ('Aktiva', '31.12.2019 EUR')],\n",
+ " )\n",
+ "Aktiva Unnamed: 0_level_1 object\n",
+ " 31.12.2020 EUR object\n",
+ " 31.12.2019 EUR object\n",
+ "dtype: object\n",
+ "MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
+ " ('Passiva', '31.12.2020 EUR'),\n",
+ " ('Passiva', '31.12.2019 EUR')],\n",
+ " )\n",
+ "Passiva Unnamed: 0_level_1 object\n",
+ " 31.12.2020 EUR object\n",
+ " 31.12.2019 EUR object\n",
+ "dtype: object\n",
+ "Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
+ "Angaben zur Identifikation der Gesellschaft laut Registergericht object\n",
+ "Angaben zur Identifikation der Gesellschaft laut Registergericht.1 object\n",
+ "dtype: object\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{}"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def parse_tables(report: str) -> list:\n",
+ " result = {}\n",
+ " soup = BeautifulSoup(report, features=\"html.parser\")\n",
+ " for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
+ " df = pd.read_html(StringIO(str(table)))[0]\n",
+ " print(df.columns)\n",
+ " print(df.dtypes)\n",
+ " return result\n",
+ "\n",
+ "\n",
+ "parse_tables(sample_report)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "KeyError",
+ "evalue": "'Passiva'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m
11\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m
14\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m--->
15\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n",
+ "\u001b[1;31mKeyError\u001b[0m: 'Passiva'"
+ ]
+ }
+ ],
"source": [
"def get_bilanz(report: str) -> any:\n",
" result = {}\n",
@@ -672,30 +609,30 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Index(['Gesellschafterbeschluss', 'Shareholder Resolution'], dtype='object')\n",
- "Index(['Unnamed: 0', 'Anhang', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
- "Index(['Unnamed: 0', 'Anhang', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
- "Index(['Aufgliederung nach Tätigkeitsbereichen', '2020 TEUR',\n",
+ "Int64Index([0, 1], dtype='int64')\n",
+ "Index(['Unnamed: 0', 'Anhang', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', 'Anhang', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Index(['Aufgliederung nach Tätigkeitsbereichen', '2021 TEUR',\n",
" 'Vorjahr TEUR'],\n",
" dtype='object')\n",
- "Index(['Aufgliederung nach Inland und Ausland', '2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
- "Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
- "Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
- "Index(['Unnamed: 0', '31.12.2020'], dtype='object')\n",
+ "Index(['Aufgliederung nach Inland und Ausland', '2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
"Int64Index([0, 1, 2], dtype='int64')\n",
"Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
- "Index(['Unnamed: 0', '31.12.2020 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
- "Index(['Unnamed: 0', '2020 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
+ "Index(['Unnamed: 0', '31.12.2021 TEUR', 'Vorjahr TEUR'], dtype='object')\n",
+ "Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
"MultiIndex([('Art des Geschäfts', 'Unnamed: 0_level_1'),\n",
" ('Art der Beziehung', 'Gesellschafterin TEUR'),\n",
" ('Art der Beziehung', 'Verbundene Unternehmen TEUR')],\n",
@@ -707,24 +644,23 @@
" ('Anschaffungs- oder Herstellungskosten', ...),\n",
" ('Anschaffungs- oder Herstellungskosten', ...)],\n",
" )\n",
- "MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
- " ( 'Abschreibungen', 'Stand 01.01.2020 EUR'),\n",
- " ( 'Abschreibungen', 'Abschreibungen des Geschäftsjahres EUR'),\n",
- " ( 'Abschreibungen', 'Abgänge Umbuchung EUR'),\n",
- " ( 'Abschreibungen', 'Stand 31.12.2020 EUR')],\n",
+ "MultiIndex([('Unnamed: 0_level_0', ...),\n",
+ " ( 'Abschreibungen', ...),\n",
+ " ( 'Abschreibungen', ...),\n",
+ " ( 'Abschreibungen', ...),\n",
+ " ( 'Abschreibungen', ...)],\n",
" )\n",
"MultiIndex([('Unnamed: 0_level_0', 'Unnamed: 0_level_1'),\n",
- " ( 'Buchwerte', 'Stand 31.12.2020 EUR'),\n",
- " ( 'Buchwerte', 'Stand 31.12.2019 EUR')],\n",
+ " ( 'Buchwerte', 'Stand 31.12.2021 EUR'),\n",
+ " ( 'Buchwerte', 'Stand 31.12.2020 EUR')],\n",
" )\n",
- "Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2020', '2019',\n",
- " '2018'],\n",
+ "Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
+ " '2019'],\n",
" dtype='object')\n",
- "Index(['Gewinn- und Verlustrechnung', '2020 TEUR', 'Vorjahr TEUR',\n",
+ "Index(['Gewinn- und Verlustrechnung', '2021 TEUR', 'Vorjahr TEUR',\n",
" 'Veränderung TEUR'],\n",
" dtype='object')\n",
- "Index(['Bilanz', '31.12.2020 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n",
- "Index(['Bericht des Aufsichtsrats', 'Report of the Supervisory Board'], dtype='object')\n"
+ "Index(['Bilanz', '31.12.2021 TEUR', 'Vorjahr TEUR', 'Veränderung TEUR'], dtype='object')\n"
]
}
],
diff --git a/Jupyter/API-tests/News/notebook.ipynb b/Jupyter/API-tests/News/notebook.ipynb
index 89d9e9b..981ba18 100644
--- a/Jupyter/API-tests/News/notebook.ipynb
+++ b/Jupyter/API-tests/News/notebook.ipynb
@@ -596,6 +596,262 @@
"source": [
"service.get_by_id(\"abc\")"
]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Handelsblatt RSS Feed"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "import xmltodict\n",
+ "from bs4 import BeautifulSoup\n",
+ "\n",
+ "\n",
+ "class HandelsblattRSS:\n",
+ " def __init__(self):\n",
+ " self.base_url = \"https://www.handelsblatt.com/contentexport/feed\"\n",
+ "\n",
+ " def get_news_for_category(self, category: str = \"unternehmen\") -> dict:\n",
+ " url = f\"{self.base_url}/{category}\"\n",
+ " result = requests.get(url=url)\n",
+ " if result.status_code == 200:\n",
+ " return xmltodict.parse(result.text)[\"rss\"][\"channel\"][\"item\"]\n",
+ " return None\n",
+ "\n",
+ " def get_news_details_text(self, url: str) -> dict:\n",
+ " content = requests.get(url)\n",
+ " soup = BeautifulSoup(content.text, features=\"html.parser\")\n",
+ "\n",
+ " return \" \".join(\n",
+ " [elem.text.replace(\"\\n\", \" \") for elem in soup.find_all(\"p\")][:]\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "handelsblatt = HandelsblattRSS()\n",
+ "\n",
+ "items = handelsblatt.get_news_for_category()\n",
+ "\n",
+ "from utils.mongodb.mongo import MongoConnector, MongoNewsService\n",
+ "\n",
+ "connector = MongoConnector(\n",
+ " hostname=\"trisnol.tech\",\n",
+ " database=\"transparenzregister\",\n",
+ " username=\"root\",\n",
+ " password=\"pR0R0v2e2\",\n",
+ ")\n",
+ "\n",
+ "service = MongoNewsService(connector)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'2023-06-27T09:20:32+0200'"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from datetime import datetime\n",
+ "\n",
+ "d = items[0][\"pubDate\"]\n",
+ "datetime.strptime(d, \"%a, %d %b %Y %H:%M:%S %z\").strftime(\"%Y-%m-%dT%H:%M:%S%z\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 50/50 [01:04<00:00, 1.30s/it]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " title | \n",
+ " date | \n",
+ " source_url | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " https://www.handelsblatt.com/29227224.html | \n",
+ " Dieselskandal: Ex-Audi-Chef Rupert Stadler zu ... | \n",
+ " 2023-06-27T09:20:32+0200 | \n",
+ " https://www.handelsblatt.com/unternehmen/indus... | \n",
+ " Der frühere Audi-Chef wurde wegen Betrugs zu e... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " https://www.handelsblatt.com/29226410.html | \n",
+ " Luftfahrt: Größer, reichweitenstärker – aber n... | \n",
+ " 2023-06-27T16:28:53+0200 | \n",
+ " https://www.handelsblatt.com/unternehmen/hande... | \n",
+ " Honda Aircraft arbeitet an einem Privatflugzeu... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " https://www.handelsblatt.com/29226522.html | \n",
+ " Asien: Deutsche Unternehmen wetten auf den Ind... | \n",
+ " 2023-06-27T00:30:00+0200 | \n",
+ " https://www.handelsblatt.com/politik/internati... | \n",
+ " Unternehmen gehen von einer positiven wirtscha... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " https://www.handelsblatt.com/29228524.html | \n",
+ " Elektromobilität: US-Elektroautohersteller Lor... | \n",
+ " 2023-06-27T18:45:29+0200 | \n",
+ " https://www.handelsblatt.com/unternehmen/indus... | \n",
+ " Das Start-up plante die Massenproduktion mit e... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " https://www.handelsblatt.com/29228272.html | \n",
+ " US-Konzern: „Gewaltige Komplexität“ – BGH prüf... | \n",
+ " 2023-06-27T16:23:03+0200 | \n",
+ " https://www.handelsblatt.com/unternehmen/hande... | \n",
+ " Das Kartellamt stufte den US-Konzern vergangen... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id \\\n",
+ "0 https://www.handelsblatt.com/29227224.html \n",
+ "1 https://www.handelsblatt.com/29226410.html \n",
+ "2 https://www.handelsblatt.com/29226522.html \n",
+ "3 https://www.handelsblatt.com/29228524.html \n",
+ "4 https://www.handelsblatt.com/29228272.html \n",
+ "\n",
+ " title \\\n",
+ "0 Dieselskandal: Ex-Audi-Chef Rupert Stadler zu ... \n",
+ "1 Luftfahrt: Größer, reichweitenstärker – aber n... \n",
+ "2 Asien: Deutsche Unternehmen wetten auf den Ind... \n",
+ "3 Elektromobilität: US-Elektroautohersteller Lor... \n",
+ "4 US-Konzern: „Gewaltige Komplexität“ – BGH prüf... \n",
+ "\n",
+ " date \\\n",
+ "0 2023-06-27T09:20:32+0200 \n",
+ "1 2023-06-27T16:28:53+0200 \n",
+ "2 2023-06-27T00:30:00+0200 \n",
+ "3 2023-06-27T18:45:29+0200 \n",
+ "4 2023-06-27T16:23:03+0200 \n",
+ "\n",
+ " source_url \\\n",
+ "0 https://www.handelsblatt.com/unternehmen/indus... \n",
+ "1 https://www.handelsblatt.com/unternehmen/hande... \n",
+ "2 https://www.handelsblatt.com/politik/internati... \n",
+ "3 https://www.handelsblatt.com/unternehmen/indus... \n",
+ "4 https://www.handelsblatt.com/unternehmen/hande... \n",
+ "\n",
+ " text \n",
+ "0 Der frühere Audi-Chef wurde wegen Betrugs zu e... \n",
+ "1 Honda Aircraft arbeitet an einem Privatflugzeu... \n",
+ "2 Unternehmen gehen von einer positiven wirtscha... \n",
+ "3 Das Start-up plante die Massenproduktion mit e... \n",
+ "4 Das Kartellamt stufte den US-Konzern vergangen... "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from datetime import datetime\n",
+ "from tqdm import tqdm\n",
+ "import pandas as pd\n",
+ "\n",
+ "\n",
+ "news = []\n",
+ "for news_article in tqdm(items):\n",
+ " info = {\n",
+ " \"id\": news_article[\"guid\"],\n",
+ " \"title\": news_article[\"title\"],\n",
+ " \"date\": datetime.strptime(\n",
+ " news_article[\"pubDate\"], \"%a, %d %b %Y %H:%M:%S %z\"\n",
+ " ).strftime(\"%Y-%m-%dT%H:%M:%S%z\"),\n",
+ " \"source_url\": news_article[\"link\"],\n",
+ " \"text\": handelsblatt.get_news_details_text(news_article[\"link\"]),\n",
+ " }\n",
+ " news.append(info)\n",
+ "\n",
+ "df = pd.DataFrame(news)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 50/50 [00:00<00:00, 81.98it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from models.News import News\n",
+ "\n",
+ "for article in tqdm(news):\n",
+ " news_article = News(**article)\n",
+ " if service.get_by_id(news_article.id) is None:\n",
+ " service.insert(news_article)"
+ ]
}
],
"metadata": {