mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-06-22 00:14:01 +02:00
Bundesanzeiger preparation, Handelsblatt RSS feed export
This commit is contained in:
@ -596,6 +596,262 @@
|
||||
"source": [
|
||||
"service.get_by_id(\"abc\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Handelsblatt RSS Feed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import xmltodict\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class HandelsblattRSS:\n",
|
||||
" def __init__(self):\n",
|
||||
" self.base_url = \"https://www.handelsblatt.com/contentexport/feed\"\n",
|
||||
"\n",
|
||||
" def get_news_for_category(self, category: str = \"unternehmen\") -> dict:\n",
|
||||
" url = f\"{self.base_url}/{category}\"\n",
|
||||
" result = requests.get(url=url)\n",
|
||||
" if result.status_code == 200:\n",
|
||||
" return xmltodict.parse(result.text)[\"rss\"][\"channel\"][\"item\"]\n",
|
||||
" return None\n",
|
||||
"\n",
|
||||
" def get_news_details_text(self, url: str) -> dict:\n",
|
||||
" content = requests.get(url)\n",
|
||||
" soup = BeautifulSoup(content.text, features=\"html.parser\")\n",
|
||||
"\n",
|
||||
" return \" \".join(\n",
|
||||
" [elem.text.replace(\"\\n\", \" \") for elem in soup.find_all(\"p\")][:]\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"handelsblatt = HandelsblattRSS()\n",
|
||||
"\n",
|
||||
"items = handelsblatt.get_news_for_category()\n",
|
||||
"\n",
|
||||
"from utils.mongodb.mongo import MongoConnector, MongoNewsService\n",
|
||||
"\n",
|
||||
"connector = MongoConnector(\n",
|
||||
" hostname=\"trisnol.tech\",\n",
|
||||
" database=\"transparenzregister\",\n",
|
||||
" username=\"root\",\n",
|
||||
" password=\"pR0R0v2e2\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"service = MongoNewsService(connector)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'2023-06-27T09:20:32+0200'"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"d = items[0][\"pubDate\"]\n",
|
||||
"datetime.strptime(d, \"%a, %d %b %Y %H:%M:%S %z\").strftime(\"%Y-%m-%dT%H:%M:%S%z\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 50/50 [01:04<00:00, 1.30s/it]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>id</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>date</th>\n",
|
||||
" <th>source_url</th>\n",
|
||||
" <th>text</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>https://www.handelsblatt.com/29227224.html</td>\n",
|
||||
" <td>Dieselskandal: Ex-Audi-Chef Rupert Stadler zu ...</td>\n",
|
||||
" <td>2023-06-27T09:20:32+0200</td>\n",
|
||||
" <td>https://www.handelsblatt.com/unternehmen/indus...</td>\n",
|
||||
" <td>Der frühere Audi-Chef wurde wegen Betrugs zu e...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>https://www.handelsblatt.com/29226410.html</td>\n",
|
||||
" <td>Luftfahrt: Größer, reichweitenstärker – aber n...</td>\n",
|
||||
" <td>2023-06-27T16:28:53+0200</td>\n",
|
||||
" <td>https://www.handelsblatt.com/unternehmen/hande...</td>\n",
|
||||
" <td>Honda Aircraft arbeitet an einem Privatflugzeu...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>https://www.handelsblatt.com/29226522.html</td>\n",
|
||||
" <td>Asien: Deutsche Unternehmen wetten auf den Ind...</td>\n",
|
||||
" <td>2023-06-27T00:30:00+0200</td>\n",
|
||||
" <td>https://www.handelsblatt.com/politik/internati...</td>\n",
|
||||
" <td>Unternehmen gehen von einer positiven wirtscha...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>https://www.handelsblatt.com/29228524.html</td>\n",
|
||||
" <td>Elektromobilität: US-Elektroautohersteller Lor...</td>\n",
|
||||
" <td>2023-06-27T18:45:29+0200</td>\n",
|
||||
" <td>https://www.handelsblatt.com/unternehmen/indus...</td>\n",
|
||||
" <td>Das Start-up plante die Massenproduktion mit e...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>https://www.handelsblatt.com/29228272.html</td>\n",
|
||||
" <td>US-Konzern: „Gewaltige Komplexität“ – BGH prüf...</td>\n",
|
||||
" <td>2023-06-27T16:23:03+0200</td>\n",
|
||||
" <td>https://www.handelsblatt.com/unternehmen/hande...</td>\n",
|
||||
" <td>Das Kartellamt stufte den US-Konzern vergangen...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" id \\\n",
|
||||
"0 https://www.handelsblatt.com/29227224.html \n",
|
||||
"1 https://www.handelsblatt.com/29226410.html \n",
|
||||
"2 https://www.handelsblatt.com/29226522.html \n",
|
||||
"3 https://www.handelsblatt.com/29228524.html \n",
|
||||
"4 https://www.handelsblatt.com/29228272.html \n",
|
||||
"\n",
|
||||
" title \\\n",
|
||||
"0 Dieselskandal: Ex-Audi-Chef Rupert Stadler zu ... \n",
|
||||
"1 Luftfahrt: Größer, reichweitenstärker – aber n... \n",
|
||||
"2 Asien: Deutsche Unternehmen wetten auf den Ind... \n",
|
||||
"3 Elektromobilität: US-Elektroautohersteller Lor... \n",
|
||||
"4 US-Konzern: „Gewaltige Komplexität“ – BGH prüf... \n",
|
||||
"\n",
|
||||
" date \\\n",
|
||||
"0 2023-06-27T09:20:32+0200 \n",
|
||||
"1 2023-06-27T16:28:53+0200 \n",
|
||||
"2 2023-06-27T00:30:00+0200 \n",
|
||||
"3 2023-06-27T18:45:29+0200 \n",
|
||||
"4 2023-06-27T16:23:03+0200 \n",
|
||||
"\n",
|
||||
" source_url \\\n",
|
||||
"0 https://www.handelsblatt.com/unternehmen/indus... \n",
|
||||
"1 https://www.handelsblatt.com/unternehmen/hande... \n",
|
||||
"2 https://www.handelsblatt.com/politik/internati... \n",
|
||||
"3 https://www.handelsblatt.com/unternehmen/indus... \n",
|
||||
"4 https://www.handelsblatt.com/unternehmen/hande... \n",
|
||||
"\n",
|
||||
" text \n",
|
||||
"0 Der frühere Audi-Chef wurde wegen Betrugs zu e... \n",
|
||||
"1 Honda Aircraft arbeitet an einem Privatflugzeu... \n",
|
||||
"2 Unternehmen gehen von einer positiven wirtscha... \n",
|
||||
"3 Das Start-up plante die Massenproduktion mit e... \n",
|
||||
"4 Das Kartellamt stufte den US-Konzern vergangen... "
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from datetime import datetime\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"news = []\n",
|
||||
"for news_article in tqdm(items):\n",
|
||||
" info = {\n",
|
||||
" \"id\": news_article[\"guid\"],\n",
|
||||
" \"title\": news_article[\"title\"],\n",
|
||||
" \"date\": datetime.strptime(\n",
|
||||
" news_article[\"pubDate\"], \"%a, %d %b %Y %H:%M:%S %z\"\n",
|
||||
" ).strftime(\"%Y-%m-%dT%H:%M:%S%z\"),\n",
|
||||
" \"source_url\": news_article[\"link\"],\n",
|
||||
" \"text\": handelsblatt.get_news_details_text(news_article[\"link\"]),\n",
|
||||
" }\n",
|
||||
" news.append(info)\n",
|
||||
"\n",
|
||||
"df = pd.DataFrame(news)\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 50/50 [00:00<00:00, 81.98it/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from models.News import News\n",
|
||||
"\n",
|
||||
"for article in tqdm(news):\n",
|
||||
" news_article = News(**article)\n",
|
||||
" if service.get_by_id(news_article.id) is None:\n",
|
||||
" service.insert(news_article)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
Reference in New Issue
Block a user