diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/base.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/base.py
index ae46bf7..45d655b 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/news/base.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/base.py
@@ -1,9 +1,6 @@
 """Base class for news extractors."""
 import abc
 
-import requests
-from bs4 import BeautifulSoup
-
 from aki_prj23_transparenzregister.models.news import News
 
 
@@ -31,6 +28,7 @@ class BaseNewsExtractor(metaclass=abc.ABCMeta):
             list[News] | None: List of news or None if an error occured.
         """
 
+    @abc.abstractmethod
     def __get_news_details_text__(self, url: str) -> str:
         """Retrieve the text of a news article.
 
@@ -40,9 +38,3 @@ class BaseNewsExtractor(metaclass=abc.ABCMeta):
         Returns:
             str: Text of the news article.
         """
-        content = requests.get(url, timeout=60)
-        soup = BeautifulSoup(content.text, features="html.parser")
-
-        return " ".join(
-            [elem.text.replace("\n", " ") for elem in soup.find_all("p")][:]
-        )
diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py
index 77c35f2..98cba65 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py
@@ -3,6 +3,8 @@ from datetime import datetime
 
 import requests
 import xmltodict
+from bs4 import BeautifulSoup
+from selenium import webdriver
 
 from aki_prj23_transparenzregister.models.news import News
 from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
@@ -41,9 +43,42 @@ class HandelsblattRSS(BaseNewsExtractor):
                         date=datetime.strptime(
                             article["pubDate"], "%a, %d %b %Y %H:%M:%S %z"
                         ).strftime("%Y-%m-%dT%H:%M:%S%z"),
-                        # FIXME Will now require JS enabled --> Use selenium rather than simple requests
                         text=self.__get_news_details_text__(article["link"]),
                         source_url=article["link"],
                     )
                 )
         return news
+
+    def __get_news_details_text__(self, url: str) -> str:
+        """Retrieve the text of a news article.
+
+        Args:
+            url (str): URL of the news article.
+
+        Returns:
+            str: Text of the news article.
+ """ + options = webdriver.ChromeOptions() + preferences = { + "profile.default_content_settings.popups": 0, + "safebrowsing.enabled": True, + } + options.add_argument("--headless=new") + options.add_experimental_option("prefs", preferences) + options.add_experimental_option("excludeSwitches", ["enable-logging"]) + + driver = webdriver.Chrome(options=options) + driver.get(url) + content = driver.page_source + soup = BeautifulSoup(content, features="html.parser") + + return " ".join( + [elem.text.replace("\n", " ") for elem in soup.find_all("p")][:] + ).strip() diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py index 82481b7..43cb322 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py @@ -1,5 +1,6 @@ """Tageschau API news extractor.""" import requests +from bs4 import BeautifulSoup from aki_prj23_transparenzregister.models.news import News from aki_prj23_transparenzregister.utils.data_extraction.news.base import ( @@ -42,3 +43,19 @@ class TagesschauAPI(BaseNewsExtractor): ) ) return news + + def __get_news_details_text__(self, url: str) -> str: + """Retrieve the text of a news article. + + Args: + url (str): URL of the news article. + + Returns: + str: Text of the news article. + """ + content = requests.get(url, timeout=60) + soup = BeautifulSoup(content.text, features="html.parser") + + return " ".join( + [elem.text.replace("\n", " ") for elem in soup.find_all("p")][1:] + ).strip()