From 5dcf8ecf55bfffdfa3bbd83ec96c2a655bdd214c Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sat, 11 Nov 2023 13:19:23 +0100 Subject: [PATCH] build: Dockerize apps/fetch_news.py as ingestor --- Dockerfile | 14 ++++++++++++++ README.md | 4 ++-- docker-compose.yml | 2 +- .../apps/fetch_news.py | 2 +- .../utils/data_extraction/news/handelsblatt.py | 5 +++++ 5 files changed, 23 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2471363..fb93e64 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,9 +24,23 @@ FROM base as ingest LABEL PART="DATA_INGESTOR" +### Install Chrome ### +# Update the package lists +RUN apt-get update + +# Install wget and unzip +RUN apt-get install -y wget unzip + +# Install Google Chrome +RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb +RUN dpkg -i google-chrome-stable_current_amd64.deb; apt-get -fy install + RUN pip install --find-links=dist aki-prj23-transparenzregister[ingest] --no-cache-dir && \ rm dist/ -R +ENTRYPOINT ["fetch-news-schedule", "ENV"] +CMD ["--level", "DEBUG"] + FROM base as data-transformation LABEL PART="DATA-TRANSFORMATION" diff --git a/README.md b/README.md index 55fd484..80193c5 100644 --- a/README.md +++ b/README.md @@ -56,12 +56,12 @@ the following layout: ``` PYTHON_POSTGRES_USERNAME=postgres PYTHON_POSTGRES_PASSWORD=postgres -PYTHON_POSTGRES_HOST=localhost +PYTHON_POSTGRES_HOST=postgres PYTHON_POSTGRES_DATABASE=postgres PYTHON_POSTGRES_PORT=5432 PYTHON_MONGO_USERNAME=username -PYTHON_MONGO_HOST=localhost +PYTHON_MONGO_HOST=mongodb PYTHON_MONGO_PASSWORD=password PYTHON_MONGO_PORT=27017 PYTHON_MONGO_DATABASE=transparenzregister diff --git a/docker-compose.yml b/docker-compose.yml index 5ed7d97..d190528 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,7 +13,7 @@ services: PYTHON_MONGO_PORT: ${PYTHON_MONGO_PORT:-27017} PYTHON_MONGO_DATABASE: ${PYTHON_MONGO_DATABASE:-transparenzregister} deploy: - replicas: 0 + replicas: 1 restart: on-failure:3 mongodb: diff --git a/src/aki_prj23_transparenzregister/apps/fetch_news.py b/src/aki_prj23_transparenzregister/apps/fetch_news.py index 7f2fd5f..b92f559 100644 --- a/src/aki_prj23_transparenzregister/apps/fetch_news.py +++ b/src/aki_prj23_transparenzregister/apps/fetch_news.py @@ -49,7 +49,7 @@ def fetch_news_cli() -> None: # pragma: no cover while True: run_pending() - time.sleep(30) + time.sleep(1) def schedule(config_provider: ConfigProvider) -> int: diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py index ada112b..2cadd92 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/news/handelsblatt.py @@ -64,9 +64,14 @@ class HandelsblattRSS(BaseNewsExtractor): "safebrowsing.enabled": True, } options.add_argument("--headless=new") + options.add_argument("--disable-gpu") options.add_experimental_option("prefs", preferences) options.add_experimental_option("excludeSwitches", ["enable-logging"]) + # Arguments required for running Chrome in Docker + options.add_argument("--no-sandbox") + options.add_argument("--disable-dev-shm-usage") + driver = webdriver.Chrome(options=options) driver.get(url) content = driver.page_source