Bundesanzeiger preparation, Handelsblatt RSS feed export

This commit is contained in:
TrisNol
2023-06-27 19:17:54 +02:00
parent 37fb1b1da3
commit 421b1e8c87
2 changed files with 522 additions and 330 deletions

View File

@ -596,6 +596,262 @@
"source": [
"service.get_by_id(\"abc\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Handelsblatt RSS Feed"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import xmltodict\n",
"from bs4 import BeautifulSoup\n",
"\n",
"\n",
"class HandelsblattRSS:\n",
" def __init__(self):\n",
" self.base_url = \"https://www.handelsblatt.com/contentexport/feed\"\n",
"\n",
" def get_news_for_category(self, category: str = \"unternehmen\") -> dict:\n",
" url = f\"{self.base_url}/{category}\"\n",
" result = requests.get(url=url)\n",
" if result.status_code == 200:\n",
" return xmltodict.parse(result.text)[\"rss\"][\"channel\"][\"item\"]\n",
" return None\n",
"\n",
" def get_news_details_text(self, url: str) -> dict:\n",
" content = requests.get(url)\n",
" soup = BeautifulSoup(content.text, features=\"html.parser\")\n",
"\n",
" return \" \".join(\n",
" [elem.text.replace(\"\\n\", \" \") for elem in soup.find_all(\"p\")][:]\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"handelsblatt = HandelsblattRSS()\n",
"\n",
"items = handelsblatt.get_news_for_category()\n",
"\n",
"from utils.mongodb.mongo import MongoConnector, MongoNewsService\n",
"\n",
"connector = MongoConnector(\n",
" hostname=\"trisnol.tech\",\n",
" database=\"transparenzregister\",\n",
" username=\"root\",\n",
" password=\"pR0R0v2e2\",\n",
")\n",
"\n",
"service = MongoNewsService(connector)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2023-06-27T09:20:32+0200'"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from datetime import datetime\n",
"\n",
"d = items[0][\"pubDate\"]\n",
"datetime.strptime(d, \"%a, %d %b %Y %H:%M:%S %z\").strftime(\"%Y-%m-%dT%H:%M:%S%z\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 50/50 [01:04<00:00, 1.30s/it]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>date</th>\n",
" <th>source_url</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>https://www.handelsblatt.com/29227224.html</td>\n",
" <td>Dieselskandal: Ex-Audi-Chef Rupert Stadler zu ...</td>\n",
" <td>2023-06-27T09:20:32+0200</td>\n",
" <td>https://www.handelsblatt.com/unternehmen/indus...</td>\n",
" <td>Der frühere Audi-Chef wurde wegen Betrugs zu e...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>https://www.handelsblatt.com/29226410.html</td>\n",
" <td>Luftfahrt: Größer, reichweitenstärker aber n...</td>\n",
" <td>2023-06-27T16:28:53+0200</td>\n",
" <td>https://www.handelsblatt.com/unternehmen/hande...</td>\n",
" <td>Honda Aircraft arbeitet an einem Privatflugzeu...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>https://www.handelsblatt.com/29226522.html</td>\n",
" <td>Asien: Deutsche Unternehmen wetten auf den Ind...</td>\n",
" <td>2023-06-27T00:30:00+0200</td>\n",
" <td>https://www.handelsblatt.com/politik/internati...</td>\n",
" <td>Unternehmen gehen von einer positiven wirtscha...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>https://www.handelsblatt.com/29228524.html</td>\n",
" <td>Elektromobilität: US-Elektroautohersteller Lor...</td>\n",
" <td>2023-06-27T18:45:29+0200</td>\n",
" <td>https://www.handelsblatt.com/unternehmen/indus...</td>\n",
" <td>Das Start-up plante die Massenproduktion mit e...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>https://www.handelsblatt.com/29228272.html</td>\n",
" <td>US-Konzern: „Gewaltige Komplexität“ BGH prüf...</td>\n",
" <td>2023-06-27T16:23:03+0200</td>\n",
" <td>https://www.handelsblatt.com/unternehmen/hande...</td>\n",
" <td>Das Kartellamt stufte den US-Konzern vergangen...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id \\\n",
"0 https://www.handelsblatt.com/29227224.html \n",
"1 https://www.handelsblatt.com/29226410.html \n",
"2 https://www.handelsblatt.com/29226522.html \n",
"3 https://www.handelsblatt.com/29228524.html \n",
"4 https://www.handelsblatt.com/29228272.html \n",
"\n",
" title \\\n",
"0 Dieselskandal: Ex-Audi-Chef Rupert Stadler zu ... \n",
"1 Luftfahrt: Größer, reichweitenstärker aber n... \n",
"2 Asien: Deutsche Unternehmen wetten auf den Ind... \n",
"3 Elektromobilität: US-Elektroautohersteller Lor... \n",
"4 US-Konzern: „Gewaltige Komplexität“ BGH prüf... \n",
"\n",
" date \\\n",
"0 2023-06-27T09:20:32+0200 \n",
"1 2023-06-27T16:28:53+0200 \n",
"2 2023-06-27T00:30:00+0200 \n",
"3 2023-06-27T18:45:29+0200 \n",
"4 2023-06-27T16:23:03+0200 \n",
"\n",
" source_url \\\n",
"0 https://www.handelsblatt.com/unternehmen/indus... \n",
"1 https://www.handelsblatt.com/unternehmen/hande... \n",
"2 https://www.handelsblatt.com/politik/internati... \n",
"3 https://www.handelsblatt.com/unternehmen/indus... \n",
"4 https://www.handelsblatt.com/unternehmen/hande... \n",
"\n",
" text \n",
"0 Der frühere Audi-Chef wurde wegen Betrugs zu e... \n",
"1 Honda Aircraft arbeitet an einem Privatflugzeu... \n",
"2 Unternehmen gehen von einer positiven wirtscha... \n",
"3 Das Start-up plante die Massenproduktion mit e... \n",
"4 Das Kartellamt stufte den US-Konzern vergangen... "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from datetime import datetime\n",
"from tqdm import tqdm\n",
"import pandas as pd\n",
"\n",
"\n",
"news = []\n",
"for news_article in tqdm(items):\n",
" info = {\n",
" \"id\": news_article[\"guid\"],\n",
" \"title\": news_article[\"title\"],\n",
" \"date\": datetime.strptime(\n",
" news_article[\"pubDate\"], \"%a, %d %b %Y %H:%M:%S %z\"\n",
" ).strftime(\"%Y-%m-%dT%H:%M:%S%z\"),\n",
" \"source_url\": news_article[\"link\"],\n",
" \"text\": handelsblatt.get_news_details_text(news_article[\"link\"]),\n",
" }\n",
" news.append(info)\n",
"\n",
"df = pd.DataFrame(news)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 50/50 [00:00<00:00, 81.98it/s]\n"
]
}
],
"source": [
"from models.News import News\n",
"\n",
"for article in tqdm(news):\n",
" news_article = News(**article)\n",
" if service.get_by_id(news_article.id) is None:\n",
" service.insert(news_article)"
]
}
],
"metadata": {