mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-25 12:42:33 +02:00
checkpoint: Resolve error in handelsblatt text fetch
This commit is contained in:
parent
a428eb4432
commit
ae41cf61bc
@ -1,9 +1,6 @@
|
|||||||
"""Base class for news extractors."""
|
"""Base class for news extractors."""
|
||||||
import abc
|
import abc
|
||||||
|
|
||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
from aki_prj23_transparenzregister.models.news import News
|
from aki_prj23_transparenzregister.models.news import News
|
||||||
|
|
||||||
|
|
||||||
@ -31,6 +28,7 @@ class BaseNewsExtractor(metaclass=abc.ABCMeta):
|
|||||||
list[News] | None: List of news or None if an error occurred.
|
list[News] | None: List of news or None if an error occurred.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@abc.abstractmethod
|
||||||
def __get_news_details_text__(self, url: str) -> str:
|
def __get_news_details_text__(self, url: str) -> str:
|
||||||
"""Retrieve the text of a news article.
|
"""Retrieve the text of a news article.
|
||||||
|
|
||||||
@ -40,9 +38,3 @@ class BaseNewsExtractor(metaclass=abc.ABCMeta):
|
|||||||
Returns:
|
Returns:
|
||||||
str: Text of the news article.
|
str: Text of the news article.
|
||||||
"""
|
"""
|
||||||
content = requests.get(url, timeout=60)
|
|
||||||
soup = BeautifulSoup(content.text, features="html.parser")
|
|
||||||
|
|
||||||
return " ".join(
|
|
||||||
[elem.text.replace("\n", " ") for elem in soup.find_all("p")][:]
|
|
||||||
)
|
|
||||||
|
@ -3,6 +3,8 @@ from datetime import datetime
|
|||||||
|
|
||||||
import requests
|
import requests
|
||||||
import xmltodict
|
import xmltodict
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from selenium import webdriver
|
||||||
|
|
||||||
from aki_prj23_transparenzregister.models.news import News
|
from aki_prj23_transparenzregister.models.news import News
|
||||||
from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
|
from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
|
||||||
@ -41,9 +43,36 @@ class HandelsblattRSS(BaseNewsExtractor):
|
|||||||
date=datetime.strptime(
|
date=datetime.strptime(
|
||||||
article["pubDate"], "%a, %d %b %Y %H:%M:%S %z"
|
article["pubDate"], "%a, %d %b %Y %H:%M:%S %z"
|
||||||
).strftime("%Y-%m-%dT%H:%M:%S%z"),
|
).strftime("%Y-%m-%dT%H:%M:%S%z"),
|
||||||
# FIXME Will now require JS enabled --> Use selenium rather than simple requests
|
|
||||||
text=self.__get_news_details_text__(article["link"]),
|
text=self.__get_news_details_text__(article["link"]),
|
||||||
source_url=article["link"],
|
source_url=article["link"],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
break
|
||||||
return news
|
return news
|
||||||
|
|
||||||
|
def __get_news_details_text__(self, url: str) -> str:
    """Retrieve the text of a news article.

    The page is loaded in a headless Chrome session driven by Selenium and
    the resulting page source is parsed with BeautifulSoup. The browser is
    always shut down again, even when loading the page fails.

    Args:
        url (str): URL of the news article.

    Returns:
        str: Text of all ``<p>`` elements of the page, joined by single
            spaces, with line breaks replaced by spaces and surrounding
            whitespace stripped.
    """
    options = webdriver.ChromeOptions()
    preferences = {
        "profile.default_content_settings.popups": 0,
        "safebrowsing.enabled": True,
    }
    options.add_argument("--headless=new")
    options.add_experimental_option("prefs", preferences)
    options.add_experimental_option("excludeSwitches", ["enable-logging"])

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        content = driver.page_source
    finally:
        # Without quit() every call leaves a Chrome/chromedriver process
        # behind; this method runs once per article, so they pile up fast.
        driver.quit()

    soup = BeautifulSoup(content, features="html.parser")
    return " ".join(
        elem.text.replace("\n", " ") for elem in soup.find_all("p")
    ).strip()
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
"""Tagesschau API news extractor."""
|
"""Tagesschau API news extractor."""
|
||||||
import requests
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from aki_prj23_transparenzregister.models.news import News
|
from aki_prj23_transparenzregister.models.news import News
|
||||||
from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
|
from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
|
||||||
@ -42,3 +43,19 @@ class TagesschauAPI(BaseNewsExtractor):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
return news
|
return news
|
||||||
|
|
||||||
|
def __get_news_details_text__(self, url: str) -> str:
    """Fetch an article page and return its paragraph text as one string.

    Args:
        url (str): URL of the news article.

    Returns:
        str: Text of the page's ``<p>`` elements joined by single spaces,
            with the first ``<p>`` element left out, line breaks replaced
            by spaces and surrounding whitespace stripped.
    """
    response = requests.get(url, timeout=60)
    parsed_page = BeautifulSoup(response.text, features="html.parser")

    # Everything after the first <p> element makes up the article text.
    paragraphs = parsed_page.find_all("p")[1:]
    flattened = [paragraph.text.replace("\n", " ") for paragraph in paragraphs]
    return " ".join(flattened).strip()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user