From a428eb4432a6d500a6a02174e36fc0069eee49d8 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Fri, 10 Nov 2023 13:58:04 +0100 Subject: [PATCH 1/6] checkpoint: Init news extraction components and main app --- poetry.lock | 13 ++- pyproject.toml | 2 + .../apps/fetch_news.py | 97 +++++++++++++++++++ .../utils/data_extraction/news/__init__.py | 1 + .../utils/data_extraction/news/base.py | 48 +++++++++ .../data_extraction/news/handelsblatt.py | 49 ++++++++++ .../utils/data_extraction/news/tagesschau.py | 44 +++++++++ .../utils/mongo/news_mongo_service.py | 10 +- 8 files changed, 258 insertions(+), 6 deletions(-) create mode 100644 src/aki_prj23_transparenzregister/apps/fetch_news.py create mode 100644 src/aki_prj23_transparenzregister/utils/data_extraction/news/__init__.py create mode 100644 src/aki_prj23_transparenzregister/utils/data_extraction/news/base.py create mode 100644 src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py create mode 100644 src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py diff --git a/poetry.lock b/poetry.lock index c476a34..4aa252b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5556,6 +5556,17 @@ tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"] testing = ["h5py (>=3.7.0)", "huggingface_hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools_rust (>=1.5.2)"] torch = ["safetensors[numpy]", "torch (>=1.10)"] +[[package]] +name = "schedule" +version = "1.2.1" +description = "Job scheduling for humans." +optional = false +python-versions = ">=3.7" +files = [ + {file = "schedule-1.2.1-py2.py3-none-any.whl", hash = "sha256:14cdeb083a596aa1de6dc77639a1b2ac8bf6eaafa82b1c9279d3612823063d01"}, + {file = "schedule-1.2.1.tar.gz", hash = "sha256:843bc0538b99c93f02b8b50e3e39886c06f2d003b24f48e1aa4cadfa3f341279"}, +] + [[package]] name = "scipy" version = "1.11.3" @@ -7368,4 +7379,4 @@ web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "n [metadata] lock-version = "2.0" python-versions = ">=3.11,<3.13" -content-hash = "5ca44ede811dc417faeda6b976c032682be7b4edadc16fc6c81e2ffe3dc4f946" +content-hash = "74469846b7987ea0e7fa202cfa3d2406513f7ef02d849f38f3bfca49dd1d71c9" diff --git a/pyproject.toml b/pyproject.toml index c7da905..be5c77e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,7 @@ pymongo = "^4.6.0" python = ">=3.11,<3.13" python-dotenv = "^1.0.0" rapidfuzz = "^3.5.2" +schedule = "^1.2.1" scipy = "^1.11.3" seaborn = "^0.13.0" selenium = "^4.15.2" @@ -141,6 +142,7 @@ pytest-repeat = "^0.9.1" copy-sql = "aki_prj23_transparenzregister.utils.sql.copy_sql:copy_db_cli" data-processing = "aki_prj23_transparenzregister.utils.data_processing:cli" data-transformation = "aki_prj23_transparenzregister.utils.data_transfer:transfer_data_cli" +fetch-news-schedule = "aki_prj23_transparenzregister.apps.fetch_news:fetch_news_cli" reset-sql = "aki_prj23_transparenzregister.utils.sql.connector:reset_all_tables_cli" webserver = "aki_prj23_transparenzregister.ui.app:main" diff --git a/src/aki_prj23_transparenzregister/apps/fetch_news.py b/src/aki_prj23_transparenzregister/apps/fetch_news.py new file mode 100644 index 0000000..a6d984b --- /dev/null +++ b/src/aki_prj23_transparenzregister/apps/fetch_news.py @@ -0,0 +1,97 @@ +"""Scheduled news article extraction and transfer to MongoDB.""" +import argparse +import sys +import time + +from loguru import logger +from schedule import every, run_pending + +from 
aki_prj23_transparenzregister.config.config_providers import ( + HELP_TEXT_CONFIG, + ConfigProvider, + get_config_provider, +) +from aki_prj23_transparenzregister.utils.data_extraction.news.handelsblatt import ( + HandelsblattRSS, +) +from aki_prj23_transparenzregister.utils.data_extraction.news.tagesschau import ( + TagesschauAPI, +) +from aki_prj23_transparenzregister.utils.logger_config import ( + add_logger_options_to_argparse, + configer_logger, +) +from aki_prj23_transparenzregister.utils.mongo.connector import MongoConnector +from aki_prj23_transparenzregister.utils.mongo.news_mongo_service import ( + MongoNewsService, +) + + +def fetch_news_cli() -> None: + """A cli interface to fetch latest news articles on a schedule.""" + parser = argparse.ArgumentParser( + prog="Process and transform data", + description="Copy data from one SQL database to another.", + epilog="Example: 'data-transformation secrets.json' or 'data-transformation ENV_VARS_'", + ) + parser.add_argument( + "config", + metavar="config", + default="ENV", + help=HELP_TEXT_CONFIG, + ) + add_logger_options_to_argparse(parser) + parsed = parser.parse_args(sys.argv[1:]) + configer_logger(namespace=parsed) + config_provider = get_config_provider(parsed.config) + + every(12).hours.do(schedule, config_provider) + + while True: + run_pending() + time.sleep(30) + + +def schedule(config_provider: ConfigProvider) -> None: + """Scheduled job to fetch news articles and transfer them to MongoDB. + + Args: + config_provider (ConfigProvider): ConfigProvider to get the MongoDB connection string + """ + logger.info("Starting scheduled job") + mongo_news_service = MongoNewsService( + MongoConnector(config_provider.get_mongo_connection_string()) + ) + handelsblatt = HandelsblattRSS() + tagesschau = TagesschauAPI() + + news_handelsblatt = handelsblatt.get_news_for_category() + if news_handelsblatt is None: + logger.error("Error while fetching news from Handelsblatt") + news_handelsblatt = [] + + news_tageschau = tagesschau.get_news_for_category() + if news_tageschau is None: + logger.error("Error while fetching news from Tagesschau") + news_tageschau = [] + + logger.info(f"Found {len(news_handelsblatt)} news articles from Handelsblatt") + logger.info(f"Found {len(news_tageschau)} news articles from Tagesschau") + + news_joined = news_handelsblatt + news_tageschau + + count_new_documents = 0 + count_duplicate_documents = 0 + for news in news_joined: + db_news = mongo_news_service.get_by_id(news.id) + if db_news is None: + mongo_news_service.insert(news) + count_new_documents += 1 + else: + count_duplicate_documents += 1 + logger.info(f"Inserted {count_new_documents} news documents") + logger.info( + f"Found {count_duplicate_documents} duplicate news documents while inserting" + ) + logger.info("Finished scheduled job") + logger.info("=========================================") diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/__init__.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/__init__.py new file mode 100644 index 0000000..b6eeb30 --- /dev/null +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/__init__.py @@ -0,0 +1 @@ +"""Data extraction of news articles from various sources.""" diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/base.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/base.py new file mode 100644 index 0000000..ae46bf7 --- /dev/null +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/base.py @@ -0,0 +1,48 @@ 
+"""Base class for news extractors.""" +import abc + +import requests +from bs4 import BeautifulSoup + +from aki_prj23_transparenzregister.models.news import News + + +class BaseNewsExtractor(metaclass=abc.ABCMeta): + """Base class for news extractors.""" + + base_url: str + + def __init__(self, base_url: str): + """Constructor. + + Args: + base_url (str): Base URL of the API. + """ + self.base_url = base_url + + @abc.abstractmethod + def get_news_for_category(self, category: str) -> list[News] | None: + """Retrieve news for the given category from the API implemented by the subclass. + + Args: + category (str): News category to retrieve. + + Returns: + list[News] | None: List of news or None if an error occured. + """ + + def __get_news_details_text__(self, url: str) -> str: + """Retrieve the text of a news article. + + Args: + url (str): URL of the news article. + + Returns: + str: Text of the news article. + """ + content = requests.get(url, timeout=60) + soup = BeautifulSoup(content.text, features="html.parser") + + return " ".join( + [elem.text.replace("\n", " ") for elem in soup.find_all("p")][:] + ) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py new file mode 100644 index 0000000..77c35f2 --- /dev/null +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py @@ -0,0 +1,49 @@ +"""Handelsblatt RSS news extractor.""" +from datetime import datetime + +import requests +import xmltodict + +from aki_prj23_transparenzregister.models.news import News +from aki_prj23_transparenzregister.utils.data_extraction.news.base import ( + BaseNewsExtractor, +) + + +class HandelsblattRSS(BaseNewsExtractor): + """Handelsblatt RSS news extractor.""" + + def __init__(self) -> None: + """Constructor.""" + super().__init__("https://www.handelsblatt.com/contentexport/feed") + + def get_news_for_category(self, category: str = "unternehmen") -> list[News] | None: + """Retrieve news for the given category from the Handelsblatt RSS feed. + + Args: + category (str, optional): Category to search for. Defaults to "unternehmen". + + Returns: + list[News] | None: List of news or None if an error occured. 
+ """ + url = f"{self.base_url}/{category}" + result = requests.get(url=url, timeout=60) + if not result.ok: + return None + + news: list[News] = [] + items = xmltodict.parse(result.text)["rss"]["channel"]["item"] + for article in items: + news.append( + News( + id=article["guid"], + title=article["title"], + date=datetime.strptime( + article["pubDate"], "%a, %d %b %Y %H:%M:%S %z" + ).strftime("%Y-%m-%dT%H:%M:%S%z"), + # FIXME Will now require JS enabled --> Use selenium rather than simple requests + text=self.__get_news_details_text__(article["link"]), + source_url=article["link"], + ) + ) + return news diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py new file mode 100644 index 0000000..82481b7 --- /dev/null +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py @@ -0,0 +1,44 @@ +"""Tageschau API news extractor.""" +import requests + +from aki_prj23_transparenzregister.models.news import News +from aki_prj23_transparenzregister.utils.data_extraction.news.base import ( + BaseNewsExtractor, +) + + +class TagesschauAPI(BaseNewsExtractor): + """Tageschau API news extractor.""" + + def __init__(self) -> None: + """Constructor.""" + super().__init__("https://www.tagesschau.de/api2") + + def get_news_for_category(self, category: str = "wirtschaft") -> list[News] | None: + """Retrieve news for the given category from the Tageschau API. + + Args: + category (str, optional): Category to search for. Defaults to "wirtschaft". + + Returns: + list[News] | None: List of news or None if an error occured. + """ + url = f"{self.base_url}/news/" + regions = ",".join([str(i) for i in range(1, 16)]) + result = requests.get( + url=url, params={"regions": regions, "ressort": category}, timeout=60 + ) + if not result.ok: + return None + news = [] + for item in result.json()["news"]: + news.append( + News( + id=item["externalId"], + title=item["title"], + date=item["date"], + source_url=item["detailsweb"], + text=self.__get_news_details_text__(item["detailsweb"]), + ) + ) + return news diff --git a/src/aki_prj23_transparenzregister/utils/mongo/news_mongo_service.py b/src/aki_prj23_transparenzregister/utils/mongo/news_mongo_service.py index 1218a51..6143e02 100644 --- a/src/aki_prj23_transparenzregister/utils/mongo/news_mongo_service.py +++ b/src/aki_prj23_transparenzregister/utils/mongo/news_mongo_service.py @@ -20,7 +20,7 @@ class MongoNewsService: """Get all News documents. Returns: - list[News]: _description_ + list[News]: List of News documents """ result = self.collection.find() return [MongoEntryTransformer.transform_outgoing(elem) for elem in result] @@ -29,10 +29,10 @@ class MongoNewsService: """Get a News document by the given id. Args: - id (str): _description_ + id (str): ID of the News document Returns: - News | None: _description_ + News | None: News document or None if not found """ result = list(self.collection.find({"_id": id})) if len(result) == 1: @@ -43,10 +43,10 @@ class MongoNewsService: """Insert a new News document. 
From ae41cf61bcc8555047f1f929723df79530b184a1 Mon Sep 17 00:00:00 2001
From: TrisNol
Date: Fri, 10 Nov 2023 14:21:19 +0100
Subject: [PATCH 2/6] checkpoint: Resolve error in handelsblatt text fetch

---
 .../utils/data_extraction/news/base.py        | 10 +-----
 .../data_extraction/news/handelsblatt.py      | 31 ++++++++++++++++++-
 .../utils/data_extraction/news/tagesschau.py  | 17 ++++++++++
 3 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/base.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/base.py
index ae46bf7..45d655b 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/news/base.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/base.py
@@ -1,9 +1,6 @@
 """Base class for news extractors."""
 import abc
 
-import requests
-from bs4 import BeautifulSoup
-
 from aki_prj23_transparenzregister.models.news import News
 
 
@@ -31,6 +28,7 @@ class BaseNewsExtractor(metaclass=abc.ABCMeta):
             list[News] | None: List of news or None if an error occurred.
         """
 
+    @abc.abstractmethod
     def __get_news_details_text__(self, url: str) -> str:
         """Retrieve the text of a news article.
 
@@ -40,9 +38,3 @@
         Returns:
             str: Text of the news article.
         """
-        content = requests.get(url, timeout=60)
-        soup = BeautifulSoup(content.text, features="html.parser")
-
-        return " ".join(
-            [elem.text.replace("\n", " ") for elem in soup.find_all("p")][:]
-        )
diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py
index 77c35f2..98cba65 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py
@@ -3,6 +3,8 @@ from datetime import datetime
 
 import requests
 import xmltodict
+from bs4 import BeautifulSoup
+from selenium import webdriver
 
 from aki_prj23_transparenzregister.models.news import News
 from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
@@ -41,9 +43,36 @@ class HandelsblattRSS(BaseNewsExtractor):
                     date=datetime.strptime(
                         article["pubDate"], "%a, %d %b %Y %H:%M:%S %z"
                     ).strftime("%Y-%m-%dT%H:%M:%S%z"),
-                    # FIXME Will now require JS enabled --> Use selenium rather than simple requests
                     text=self.__get_news_details_text__(article["link"]),
                     source_url=article["link"],
                 )
             )
+            break
         return news
+
+    def __get_news_details_text__(self, url: str) -> str:
+        """Retrieve the text of a news article.
+
+        Args:
+            url (str): URL of the news article.
+
+        Returns:
+            str: Text of the news article.
+        """
+        options = webdriver.ChromeOptions()
+        preferences = {
+            "profile.default_content_settings.popups": 0,
+            "safebrowsing.enabled": True,
+        }
+        options.add_argument("--headless=new")
+        options.add_experimental_option("prefs", preferences)
+        options.add_experimental_option("excludeSwitches", ["enable-logging"])
+
+        driver = webdriver.Chrome(options=options)
+        driver.get(url)
+        content = driver.page_source
+        soup = BeautifulSoup(content, features="html.parser")
+
+        return " ".join(
+            [elem.text.replace("\n", " ") for elem in soup.find_all("p")][:]
+        ).strip()
diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py
index 82481b7..43cb322 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py
@@ -1,5 +1,6 @@
 """Tagesschau API news extractor."""
 import requests
+from bs4 import BeautifulSoup
 
 from aki_prj23_transparenzregister.models.news import News
 from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
@@ -42,3 +43,19 @@
             )
         )
     return news
+
+    def __get_news_details_text__(self, url: str) -> str:
+        """Retrieve the text of a news article.
+
+        Args:
+            url (str): URL of the news article.
+
+        Returns:
+            str: Text of the news article.
+        """
+        content = requests.get(url, timeout=60)
+        soup = BeautifulSoup(content.text, features="html.parser")
+
+        return " ".join(
+            [elem.text.replace("\n", " ") for elem in soup.find_all("p")][1:]
+        ).strip()
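The Handelsblatt fix above swaps plain `requests` for a headless Chrome render because the article pages now need JavaScript. A condensed sketch of that technique, assuming Selenium can resolve a local Chrome install; unlike the patch, the sketch quits the driver to avoid leaking browser processes:

```python
from bs4 import BeautifulSoup
from selenium import webdriver


def render_paragraph_text(url: str) -> str:
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")  # new headless mode, as in the patch
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)  # lets Chrome execute the page's JavaScript
        html = driver.page_source
    finally:
        driver.quit()
    soup = BeautifulSoup(html, features="html.parser")
    return " ".join(p.text.replace("\n", " ") for p in soup.find_all("p")).strip()
```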
+ """ + options = webdriver.ChromeOptions() + preferences = { + "profile.default_content_settings.popups": 0, + "safebrowsing.enabled": True, + } + options.add_argument("--headless=new") + options.add_experimental_option("prefs", preferences) + options.add_experimental_option("excludeSwitches", ["enable-logging"]) + + driver = webdriver.Chrome(options=options) + driver.get(url) + content = driver.page_source + soup = BeautifulSoup(content, features="html.parser") + + return " ".join( + [elem.text.replace("\n", " ") for elem in soup.find_all("p")][:] + ).strip() diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py index 82481b7..43cb322 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py @@ -1,5 +1,6 @@ """Tageschau API news extractor.""" import requests +from bs4 import BeautifulSoup from aki_prj23_transparenzregister.models.news import News from aki_prj23_transparenzregister.utils.data_extraction.news.base import ( @@ -42,3 +43,19 @@ class TagesschauAPI(BaseNewsExtractor): ) ) return news + + def __get_news_details_text__(self, url: str) -> str: + """Retrieve the text of a news article. + + Args: + url (str): URL of the news article. + + Returns: + str: Text of the news article. + """ + content = requests.get(url, timeout=60) + soup = BeautifulSoup(content.text, features="html.parser") + + return " ".join( + [elem.text.replace("\n", " ") for elem in soup.find_all("p")][1:] + ).strip() From ac6ca3547b996c6f7f6ebeee57812328e882236d Mon Sep 17 00:00:00 2001 From: TrisNol Date: Fri, 10 Nov 2023 16:03:42 +0100 Subject: [PATCH 3/6] test: Add unit test for news api wrapper --- .../data_extraction/news/handelsblatt.py | 1 - .../data_extraction/news/handelsblatt_test.py | 89 +++++++++++++++++++ .../data_extraction/news/tagesschau_test.py | 76 ++++++++++++++++ 3 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 tests/utils/data_extraction/news/handelsblatt_test.py create mode 100644 tests/utils/data_extraction/news/tagesschau_test.py diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py index 98cba65..ada112b 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py @@ -47,7 +47,6 @@ class HandelsblattRSS(BaseNewsExtractor): source_url=article["link"], ) ) - break return news def __get_news_details_text__(self, url: str) -> str: diff --git a/tests/utils/data_extraction/news/handelsblatt_test.py b/tests/utils/data_extraction/news/handelsblatt_test.py new file mode 100644 index 0000000..d91a0c1 --- /dev/null +++ b/tests/utils/data_extraction/news/handelsblatt_test.py @@ -0,0 +1,89 @@ +"""Testing module for Handelsblatt RSS API.""" +from unittest.mock import Mock, patch + +from aki_prj23_transparenzregister.models.news import News +from aki_prj23_transparenzregister.utils.data_extraction.news.handelsblatt import ( + HandelsblattRSS, +) + + +def test_init() -> None: + api = HandelsblattRSS() + assert api is not None + assert api.base_url == "https://www.handelsblatt.com/contentexport/feed" + + +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.news.handelsblatt.requests.get" +) +def 
+
+
+@patch(
+    "aki_prj23_transparenzregister.utils.data_extraction.news.handelsblatt.requests.get"
+)
+def test_get_news_for_category_error(mock_requests_get: Mock) -> None:
+    mock_requests_get.return_value = Mock(ok=False)
+    api = HandelsblattRSS()
+    assert api.get_news_for_category() is None
+
+
+@patch(
+    "aki_prj23_transparenzregister.utils.data_extraction.news.handelsblatt.requests.get"
+)
+@patch(
+    "aki_prj23_transparenzregister.utils.data_extraction.news.handelsblatt.HandelsblattRSS.__get_news_details_text__"
+)
+def test_get_news_for_category(mock_get_details: Mock, mock_requests_get: Mock) -> None:
+    mock_get_details.return_value = (
+        "Es war einmal vor langer Zeit, in einer weit, weit entfernten Galaxis..."
+    )
+    mock_response = """
+    <rss>
+        <channel>
+            <item>
+                <guid>test</guid>
+                <title>Test</title>
+                <pubDate>Fri, 10 Nov 2023 09:10:27 +0100</pubDate>
+                <link>https://www.handelsblatt.com/test</link>
+            </item>
+            <item>
+                <guid>test</guid>
+                <title>Test</title>
+                <pubDate>Fri, 10 Nov 2023 09:10:27 +0100</pubDate>
+                <link>https://www.handelsblatt.com/test</link>
+            </item>
+        </channel>
+    </rss>
+    """
+    mock_requests_get.return_value = Mock(ok=True, text=mock_response)
+
+    api = HandelsblattRSS()
+    assert api.get_news_for_category() == [
+        News(
+            id="test",
+            title="Test",
+            date="2023-11-10T09:10:27+0100",
+            source_url="https://www.handelsblatt.com/test",
+            text="Es war einmal vor langer Zeit, in einer weit, weit entfernten Galaxis...",
+        ),
+        News(
+            id="test",
+            title="Test",
+            date="2023-11-10T09:10:27+0100",
+            source_url="https://www.handelsblatt.com/test",
+            text="Es war einmal vor langer Zeit, in einer weit, weit entfernten Galaxis...",
+        ),
+    ]
+
+
+@patch(
+    "aki_prj23_transparenzregister.utils.data_extraction.news.handelsblatt.webdriver.Chrome"
+)
+def test_get_news_details_text(mock_driver: Mock) -> None:
+    mock_response = """
+    <html>
+        <body>
+            <div>
+                <p>Hallo Welt.</p>
+            </div>
+            <div>
+                <p>Dies ist ein Text.</p>
+            </div>
+        </body>
+    </html>
+    """
+    mock_driver.return_value = Mock(page_source=mock_response)
+
+    api = HandelsblattRSS()
+    assert api.__get_news_details_text__("test") == "Hallo Welt. Dies ist ein Text."
diff --git a/tests/utils/data_extraction/news/tagesschau_test.py b/tests/utils/data_extraction/news/tagesschau_test.py
new file mode 100644
index 0000000..a786203
--- /dev/null
+++ b/tests/utils/data_extraction/news/tagesschau_test.py
@@ -0,0 +1,76 @@
+"""Testing module for Tagesschau API."""
+from unittest.mock import Mock, patch
+
+from aki_prj23_transparenzregister.models.news import News
+from aki_prj23_transparenzregister.utils.data_extraction.news.tagesschau import (
+    TagesschauAPI,
+)
+
+
+def test_init() -> None:
+    api = TagesschauAPI()
+    assert api is not None
+    assert api.base_url == "https://www.tagesschau.de/api2"
+
+
+@patch(
+    "aki_prj23_transparenzregister.utils.data_extraction.news.tagesschau.requests.get"
+)
+def test_get_news_for_category_error(mock_requests_get: Mock) -> None:
+    mock_requests_get.return_value = Mock(ok=False)
+    api = TagesschauAPI()
+    assert api.get_news_for_category() is None
+
+
+@patch(
+    "aki_prj23_transparenzregister.utils.data_extraction.news.tagesschau.requests.get"
+)
+@patch(
+    "aki_prj23_transparenzregister.utils.data_extraction.news.tagesschau.TagesschauAPI.__get_news_details_text__"
+)
+def test_get_news_for_category(mock_get_details: Mock, mock_requests_get: Mock) -> None:
+    mock_get_details.return_value = (
+        "Es war einmal vor langer Zeit, in einer weit, weit entfernten Galaxis..."
+    )
+    mock_response = {
+        "news": [
+            {
+                "externalId": "test",
+                "title": "Test",
+                "date": "2021-07-05",
+                "detailsweb": "https://www.tagesschau.de/test",
+            }
+        ]
+    }
+    mock_requests_get.return_value = Mock(ok=True, json=lambda: mock_response)
+
+    api = TagesschauAPI()
+    assert api.get_news_for_category() == [
+        News(
+            id="test",
+            title="Test",
+            date="2021-07-05",
+            source_url="https://www.tagesschau.de/test",
+            text="Es war einmal vor langer Zeit, in einer weit, weit entfernten Galaxis...",
+        )
+    ]
+
+
+@patch(
+    "aki_prj23_transparenzregister.utils.data_extraction.news.tagesschau.requests.get"
+)
+def test_get_news_details_text(mock_requests_get: Mock) -> None:
+    mock_response = """
+    <html>
+        <body>
+            <p>Title to be ignored</p>
+            <div>
+                <p>Hallo Welt.</p>
+            </div>
+            <div>
+                <p>Dies ist ein Text.</p>
+            </div>
+        </body>
+    </html>
+    """
+    mock_requests_get.return_value = Mock(ok=True, text=mock_response)
+
+    api = TagesschauAPI()
+    assert api.__get_news_details_text__("test") == "Hallo Welt. Dies ist ein Text."
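The next patch tests `schedule()`, whose return value becomes the number of newly inserted articles. The dedup rule those tests exercise is simply "insert only when `get_by_id` misses"; an in-memory sketch of that logic, with a hypothetical stand-in for `MongoNewsService`:

```python
class InMemoryNewsStore:
    """Hypothetical stand-in for MongoNewsService's get_by_id/insert pair."""

    def __init__(self) -> None:
        self._docs: dict[str, dict] = {}

    def get_by_id(self, news_id: str) -> dict | None:
        return self._docs.get(news_id)

    def insert(self, news_id: str) -> None:
        self._docs[news_id] = {"id": news_id}


store = InMemoryNewsStore()
inserted = duplicates = 0
for news_id in ["a", "b", "a"]:  # "a" repeats, like shared article ids
    if store.get_by_id(news_id) is None:
        store.insert(news_id)
        inserted += 1
    else:
        duplicates += 1
assert (inserted, duplicates) == (2, 1)
```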
+ + + + """ + mock_requests_get.return_value = Mock(ok=True, text=mock_response) + + api = TagesschauAPI() + assert api.__get_news_details_text__("test") == "Hallo Welt. Dies ist ein Text." From 170056bf588c8bbdb1a1293fc258a626770aa849 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sat, 11 Nov 2023 11:43:10 +0100 Subject: [PATCH 4/6] test: Cover apps/fetch_news.py with unit tests --- .../apps/fetch_news.py | 14 ++- tests/apps/fetch_news_test.py | 93 +++++++++++++++++++ 2 files changed, 102 insertions(+), 5 deletions(-) create mode 100644 tests/apps/fetch_news_test.py diff --git a/src/aki_prj23_transparenzregister/apps/fetch_news.py b/src/aki_prj23_transparenzregister/apps/fetch_news.py index a6d984b..7f2fd5f 100644 --- a/src/aki_prj23_transparenzregister/apps/fetch_news.py +++ b/src/aki_prj23_transparenzregister/apps/fetch_news.py @@ -27,12 +27,12 @@ from aki_prj23_transparenzregister.utils.mongo.news_mongo_service import ( ) -def fetch_news_cli() -> None: +def fetch_news_cli() -> None: # pragma: no cover """A cli interface to fetch latest news articles on a schedule.""" parser = argparse.ArgumentParser( - prog="Process and transform data", - description="Copy data from one SQL database to another.", - epilog="Example: 'data-transformation secrets.json' or 'data-transformation ENV_VARS_'", + prog="Fetch News on schedule", + description="Fetch latest news articles from various sources on a schedule and transfer them to MongoDB.", + epilog="Example: 'fetch-news-schedule secrets.json' or 'fetchh-news-schedule ENV_VARS_'", ) parser.add_argument( "config", @@ -52,11 +52,14 @@ def fetch_news_cli() -> None: time.sleep(30) -def schedule(config_provider: ConfigProvider) -> None: +def schedule(config_provider: ConfigProvider) -> int: """Scheduled job to fetch news articles and transfer them to MongoDB. 
Args: config_provider (ConfigProvider): ConfigProvider to get the MongoDB connection string + + Returns: + int: Number of new documents inserted into MongoDB """ logger.info("Starting scheduled job") mongo_news_service = MongoNewsService( @@ -95,3 +98,4 @@ def schedule(config_provider: ConfigProvider) -> None: ) logger.info("Finished scheduled job") logger.info("=========================================") + return count_new_documents diff --git a/tests/apps/fetch_news_test.py b/tests/apps/fetch_news_test.py new file mode 100644 index 0000000..81063f3 --- /dev/null +++ b/tests/apps/fetch_news_test.py @@ -0,0 +1,93 @@ +"""Testing apps/fetch_news.py.""" +from unittest.mock import Mock, patch + +from aki_prj23_transparenzregister.apps import fetch_news +from aki_prj23_transparenzregister.models.news import News + + +def test_import() -> None: + assert fetch_news is not None + + +@patch("aki_prj23_transparenzregister.apps.fetch_news.MongoNewsService") +@patch("aki_prj23_transparenzregister.apps.fetch_news.MongoConnector") +@patch("aki_prj23_transparenzregister.apps.fetch_news.HandelsblattRSS") +@patch("aki_prj23_transparenzregister.apps.fetch_news.TagesschauAPI") +def test_schedule( + mock_tagesschau_api: Mock, + mock_handelsblatt_rss: Mock, + mock_mongo_connector: Mock, + mock_mongo_news_service: Mock, +) -> None: + mock_mongo_connector.return_value = Mock() + mock_mongo_news_service.return_value = Mock( + get_by_id=Mock(return_value=None), insert=Mock(return_value=Mock) + ) + + mock_news_handelsblatt = [ + News( + id="test", + title="The oldest and strongest emotion of mankind is fear, and the oldest and strongest kind of fear is fear of the unknown", + date="2023-11-10T09:10:27+0100", + source_url="https://www.handelsblatt.com/test", + text="", + ), + News( + id="test", + title="That is not dead which can eternal lie, And with strange aeons even death may die.", + date="2023-11-10T09:10:27+0100", + source_url="https://www.handelsblatt.com/test", + text="", + ), + ] + mock_news_tagesschau = [ + News( + id="test", + title="I know always that I am an outsider; a stranger in this century and among those who are still men.", + date="2023-11-10T09:10:27+0100", + source_url="https://www.tagesschau.de/test", + text="", + ), + News( + id="test", + title="Ph'nglui mglw'nafh Cthulhu R'lyeh wgah'nagl fhtagn.", + date="2023-11-10T09:10:27+0100", + source_url="https://www.tagesschau.de/test", + text="", + ), + ] + mock_tagesschau_api.return_value = Mock( + get_news_for_category=Mock(return_value=mock_news_tagesschau) + ) + mock_handelsblatt_rss.return_value = Mock( + get_news_for_category=Mock(return_value=mock_news_handelsblatt) + ) + assert fetch_news.schedule(Mock()) == len( + mock_news_handelsblatt + mock_news_tagesschau + ) + + +@patch("aki_prj23_transparenzregister.apps.fetch_news.MongoNewsService") +@patch("aki_prj23_transparenzregister.apps.fetch_news.MongoConnector") +@patch("aki_prj23_transparenzregister.apps.fetch_news.HandelsblattRSS") +@patch("aki_prj23_transparenzregister.apps.fetch_news.TagesschauAPI") +def test_schedule_error( + mock_tagesschau_api: Mock, + mock_handelsblatt_rss: Mock, + mock_mongo_connector: Mock, + mock_mongo_news_service: Mock, +) -> None: + mock_mongo_connector.return_value = Mock() + mock_mongo_news_service.return_value = Mock( + get_by_id=Mock(return_value=None), insert=Mock(return_value=Mock) + ) + + mock_news_handelsblatt = None + mock_news_tagesschau = None + mock_tagesschau_api.return_value = Mock( + get_news_for_category=Mock(return_value=mock_news_tagesschau) + 
) + mock_handelsblatt_rss.return_value = Mock( + get_news_for_category=Mock(return_value=mock_news_handelsblatt) + ) + assert fetch_news.schedule(Mock()) == 0 From 5dcf8ecf55bfffdfa3bbd83ec96c2a655bdd214c Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sat, 11 Nov 2023 13:19:23 +0100 Subject: [PATCH 5/6] build: Dockerize apps/fetch_news.py as ingestor --- Dockerfile | 14 ++++++++++++++ README.md | 4 ++-- docker-compose.yml | 2 +- .../apps/fetch_news.py | 2 +- .../utils/data_extraction/news/handelsblatt.py | 5 +++++ 5 files changed, 23 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2471363..fb93e64 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,9 +24,23 @@ FROM base as ingest LABEL PART="DATA_INGESTOR" +### Install Chrome ### +# Update the package lists +RUN apt-get update + +# Install wget and unzip +RUN apt-get install -y wget unzip + +# Install Google Chrome +RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb +RUN dpkg -i google-chrome-stable_current_amd64.deb; apt-get -fy install + RUN pip install --find-links=dist aki-prj23-transparenzregister[ingest] --no-cache-dir && \ rm dist/ -R +ENTRYPOINT ["fetch-news-schedule", "ENV"] +CMD ["--level", "DEBUG"] + FROM base as data-transformation LABEL PART="DATA-TRANSFORMATION" diff --git a/README.md b/README.md index 55fd484..80193c5 100644 --- a/README.md +++ b/README.md @@ -56,12 +56,12 @@ the following layout: ``` PYTHON_POSTGRES_USERNAME=postgres PYTHON_POSTGRES_PASSWORD=postgres -PYTHON_POSTGRES_HOST=localhost +PYTHON_POSTGRES_HOST=postgres PYTHON_POSTGRES_DATABASE=postgres PYTHON_POSTGRES_PORT=5432 PYTHON_MONGO_USERNAME=username -PYTHON_MONGO_HOST=localhost +PYTHON_MONGO_HOST=mongodb PYTHON_MONGO_PASSWORD=password PYTHON_MONGO_PORT=27017 PYTHON_MONGO_DATABASE=transparenzregister diff --git a/docker-compose.yml b/docker-compose.yml index 5ed7d97..d190528 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,7 +13,7 @@ services: PYTHON_MONGO_PORT: ${PYTHON_MONGO_PORT:-27017} PYTHON_MONGO_DATABASE: ${PYTHON_MONGO_DATABASE:-transparenzregister} deploy: - replicas: 0 + replicas: 1 restart: on-failure:3 mongodb: diff --git a/src/aki_prj23_transparenzregister/apps/fetch_news.py b/src/aki_prj23_transparenzregister/apps/fetch_news.py index 7f2fd5f..b92f559 100644 --- a/src/aki_prj23_transparenzregister/apps/fetch_news.py +++ b/src/aki_prj23_transparenzregister/apps/fetch_news.py @@ -49,7 +49,7 @@ def fetch_news_cli() -> None: # pragma: no cover while True: run_pending() - time.sleep(30) + time.sleep(1) def schedule(config_provider: ConfigProvider) -> int: diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py index ada112b..2cadd92 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py @@ -64,9 +64,14 @@ class HandelsblattRSS(BaseNewsExtractor): "safebrowsing.enabled": True, } options.add_argument("--headless=new") + options.add_argument("--disable-gpu") options.add_experimental_option("prefs", preferences) options.add_experimental_option("excludeSwitches", ["enable-logging"]) + # Arguments required for running Chrome in Docker + options.add_argument("--no-sandbox") + options.add_argument("--disable-dev-shm-usage") + driver = webdriver.Chrome(options=options) driver.get(url) content = driver.page_source From 05ea0fbb3321e2270088b4568a692b6795be1652 Mon Sep 
17 00:00:00 2001
From: TrisNol
Date: Sat, 11 Nov 2023 14:02:00 +0100
Subject: [PATCH 6/6] refactor: Include logger.catch with reraise

---
 .../utils/data_extraction/news/handelsblatt.py | 2 ++
 .../utils/data_extraction/news/tagesschau.py   | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py
index 2cadd92..a1af5d5 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py
@@ -4,6 +4,7 @@ from datetime import datetime
 import requests
 import xmltodict
 from bs4 import BeautifulSoup
+from loguru import logger
 from selenium import webdriver
 
 from aki_prj23_transparenzregister.models.news import News
 from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
@@ -19,6 +20,7 @@ class HandelsblattRSS(BaseNewsExtractor):
         """Constructor."""
         super().__init__("https://www.handelsblatt.com/contentexport/feed")
 
+    @logger.catch(reraise=True)
     def get_news_for_category(self, category: str = "unternehmen") -> list[News] | None:
         """Retrieve news for the given category from the Handelsblatt RSS feed.
diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py
index 43cb322..cabeaee 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/tagesschau.py
@@ -1,6 +1,7 @@
 """Tagesschau API news extractor."""
 import requests
 from bs4 import BeautifulSoup
+from loguru import logger
 
 from aki_prj23_transparenzregister.models.news import News
 from aki_prj23_transparenzregister.utils.data_extraction.news.base import (
@@ -15,6 +16,7 @@ class TagesschauAPI(BaseNewsExtractor):
         """Constructor."""
         super().__init__("https://www.tagesschau.de/api2")
 
+    @logger.catch(reraise=True)
     def get_news_for_category(self, category: str = "wirtschaft") -> list[News] | None:
         """Retrieve news for the given category from the Tagesschau API.