checkpoint: Resolve error in Handelsblatt text fetch

This commit is contained in:
TrisNol 2023-11-10 14:21:19 +01:00
parent a428eb4432
commit ae41cf61bc
3 changed files with 48 additions and 10 deletions

View File

@ -1,9 +1,6 @@
"""Base class for news extractors."""
import abc
import requests
from bs4 import BeautifulSoup
from aki_prj23_transparenzregister.models.news import News
@ -31,6 +28,7 @@ class BaseNewsExtractor(metaclass=abc.ABCMeta):
list[News] | None: List of news or None if an error occured.
"""
@abc.abstractmethod
def __get_news_details_text__(self, url: str) -> str:
"""Retrieve the text of a news article.
@ -40,9 +38,3 @@ class BaseNewsExtractor(metaclass=abc.ABCMeta):
Returns:
str: Text of the news article.
"""
content = requests.get(url, timeout=60)
soup = BeautifulSoup(content.text, features="html.parser")
return " ".join(
[elem.text.replace("\n", " ") for elem in soup.find_all("p")][:]
)

View File

@ -3,6 +3,8 @@ from datetime import datetime
import requests
import xmltodict
from bs4 import BeautifulSoup
from selenium import webdriver
from aki_prj23_transparenzregister.models.news import News
from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
@ -41,9 +43,36 @@ class HandelsblattRSS(BaseNewsExtractor):
date=datetime.strptime(
article["pubDate"], "%a, %d %b %Y %H:%M:%S %z"
).strftime("%Y-%m-%dT%H:%M:%S%z"),
# FIXME Will now require JS enabled --> Use selenium rather than simple requests
text=self.__get_news_details_text__(article["link"]),
source_url=article["link"],
)
)
break
return news
def __get_news_details_text__(self, url: str) -> str:
    """Retrieve the text of a news article.

    Handelsblatt renders article bodies with JavaScript, so a headless
    Chrome instance is used instead of a plain HTTP request.

    Args:
        url (str): URL of the news article.

    Returns:
        str: Text of the news article (all <p> elements joined).
    """
    options = webdriver.ChromeOptions()
    preferences = {
        "profile.default_content_settings.popups": 0,
        "safebrowsing.enabled": True,
    }
    options.add_argument("--headless=new")
    options.add_experimental_option("prefs", preferences)
    options.add_experimental_option("excludeSwitches", ["enable-logging"])
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        content = driver.page_source
    finally:
        # Always shut the browser down, even if the page load fails —
        # otherwise every call leaks a Chrome process.
        driver.quit()
    soup = BeautifulSoup(content, features="html.parser")
    return " ".join(
        elem.text.replace("\n", " ") for elem in soup.find_all("p")
    ).strip()

View File

@ -1,5 +1,6 @@
"""Tageschau API news extractor."""
import requests
from bs4 import BeautifulSoup
from aki_prj23_transparenzregister.models.news import News
from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
@ -42,3 +43,19 @@ class TagesschauAPI(BaseNewsExtractor):
)
)
return news
def __get_news_details_text__(self, url: str) -> str:
    """Retrieve the text of a news article.

    Args:
        url (str): URL of the news article.

    Returns:
        str: Text of the news article.
    """
    response = requests.get(url, timeout=60)
    page = BeautifulSoup(response.text, features="html.parser")
    # Skip the first <p> element — presumably page boilerplate rather
    # than article text; kept to match existing behaviour.
    fragments = []
    for paragraph in page.find_all("p")[1:]:
        fragments.append(paragraph.text.replace("\n", " "))
    return " ".join(fragments).strip()