mirror of https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-24 17:12:34 +02:00
checkpoint: Resolve error in handelsblatt text fetch

This commit is contained in:
parent a428eb4432
commit ae41cf61bc
@@ -1,9 +1,6 @@
"""Base class for news extractors."""
import abc

import requests
from bs4 import BeautifulSoup

from aki_prj23_transparenzregister.models.news import News

@@ -31,6 +28,7 @@ class BaseNewsExtractor(metaclass=abc.ABCMeta):
            list[News] | None: List of news or None if an error occurred.
        """

    @abc.abstractmethod
    def __get_news_details_text__(self, url: str) -> str:
        """Retrieve the text of a news article.
@@ -40,9 +38,3 @@ class BaseNewsExtractor(metaclass=abc.ABCMeta):
        Returns:
            str: Text of the news article.
        """
        content = requests.get(url, timeout=60)
        soup = BeautifulSoup(content.text, features="html.parser")

        return " ".join(
            [elem.text.replace("\n", " ") for elem in soup.find_all("p")][:]
        )
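For reference, the requests + BeautifulSoup paragraph-join used in this hunk can be sketched as a standalone function (a minimal sketch, not repository code; the function name and example URL are illustrative assumptions):

import requests
from bs4 import BeautifulSoup


def fetch_static_article_text(url: str) -> str:
    """Sketch: fetch a page and join the text of every <p> tag."""
    response = requests.get(url, timeout=60)  # same 60s timeout as the hunk above
    soup = BeautifulSoup(response.text, features="html.parser")
    # Flatten newlines inside each paragraph, then join paragraphs with spaces.
    return " ".join(
        p.get_text().replace("\n", " ") for p in soup.find_all("p")
    ).strip()


print(fetch_static_article_text("https://example.com")[:200])  # illustrative call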
@@ -3,6 +3,8 @@ from datetime import datetime

import requests
import xmltodict
from bs4 import BeautifulSoup
from selenium import webdriver

from aki_prj23_transparenzregister.models.news import News
from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
@@ -41,9 +43,36 @@ class HandelsblattRSS(BaseNewsExtractor):
                    date=datetime.strptime(
                        article["pubDate"], "%a, %d %b %Y %H:%M:%S %z"
                    ).strftime("%Y-%m-%dT%H:%M:%S%z"),
                    # FIXME Will now require JS enabled --> Use selenium rather than simple requests
                    text=self.__get_news_details_text__(article["link"]),
                    source_url=article["link"],
                )
            )
            break
        return news

    def __get_news_details_text__(self, url: str) -> str:
        """Retrieve the text of a news article.

        Args:
            url (str): URL of the news article.

        Returns:
            str: Text of the news article.
        """
        options = webdriver.ChromeOptions()
        preferences = {
            "profile.default_content_settings.popups": 0,
            "safebrowsing.enabled": True,
        }
        options.add_argument("--headless=new")
        options.add_experimental_option("prefs", preferences)
        options.add_experimental_option("excludeSwitches", ["enable-logging"])

        driver = webdriver.Chrome(options=options)
        driver.get(url)
        content = driver.page_source
        soup = BeautifulSoup(content, features="html.parser")

        return " ".join(
            [elem.text.replace("\n", " ") for elem in soup.find_all("p")][:]
        ).strip()
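Two things stand out in this hunk: the break after the first append limits the loop to a single article (presumably a checkpoint artifact), and the method never calls driver.quit(), so every fetch leaves a headless Chrome process behind. A try/finally variant that guarantees cleanup might look like this (a sketch under the same options, not repository code):

from bs4 import BeautifulSoup
from selenium import webdriver


def fetch_rendered_article_text(url: str) -> str:
    """Sketch: render the page with headless Chrome, then join all <p> text."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")  # new headless mode, Chrome 109+
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, features="html.parser")
        return " ".join(
            p.get_text().replace("\n", " ") for p in soup.find_all("p")
        ).strip()
    finally:
        driver.quit()  # always release the browser process, even on errors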
@@ -1,5 +1,6 @@
"""Tagesschau API news extractor."""
import requests
from bs4 import BeautifulSoup

from aki_prj23_transparenzregister.models.news import News
from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
@@ -42,3 +43,19 @@ class TagesschauAPI(BaseNewsExtractor):
                )
            )
        return news

    def __get_news_details_text__(self, url: str) -> str:
        """Retrieve the text of a news article.

        Args:
            url (str): URL of the news article.

        Returns:
            str: Text of the news article.
        """
        content = requests.get(url, timeout=60)
        soup = BeautifulSoup(content.text, features="html.parser")

        return " ".join(
            [elem.text.replace("\n", " ") for elem in soup.find_all("p")][1:]
        ).strip()
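Unlike the Handelsblatt variant, the slice here is [1:], which drops the first <p> tag, presumably a teaser or metadata paragraph on tagesschau.de (an assumption; the source does not say why). A tiny self-contained illustration of the effect:

from bs4 import BeautifulSoup

html = "<p>Stand: 01.01.2024</p><p>First real sentence.</p><p>Second one.</p>"
soup = BeautifulSoup(html, features="html.parser")
paragraphs = [p.get_text() for p in soup.find_all("p")]
print(" ".join(paragraphs[1:]).strip())  # -> First real sentence. Second one.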