Mirror of https://github.com/fhswf/aki_prj23_transparenzregister.git (synced 2025-04-20 23:12:53 +02:00)
Feature/ner (#103)

NER and sentiment pipeline with services for data extraction.

Co-authored-by: Philipp Horstenkamp <philipp@horstenkamp.de>
Co-authored-by: TrisNol <tristan.nolde@yahoo.de>

parent 99b61e7c2e
commit c680ac9759
2  .github/workflows/documentation.yaml (vendored)

@@ -26,7 +26,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-create: false
       - run: poetry install --with doc --all-extras --without test,lint
       - name: Doc-Build
4  .github/workflows/lint-actions.yaml (vendored)

@@ -28,7 +28,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-create: false
           virtualenvs-path: ~/local/share/virtualenvs
       - run: poetry install --without develop,doc --all-extras
@@ -56,7 +56,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-create: true
           virtualenvs-path: ~/local/share/virtualenvs
       - name: Check out Git repository
10  .github/workflows/test-and-build-action.yaml (vendored)

@@ -20,10 +20,14 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: 3.11
+      - name: Install Cuda
+        run: |
+          sudo apt update
+          # sudo apt install cuda-10-0 -y
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-path: ~/local/share/virtualenvs
       - id: cache-pipenv
         uses: actions/cache@v3
@@ -85,7 +89,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-path: ~/local/share/virtualenvs
       - run: |
           poetry install --only test --all-extras
@@ -113,7 +117,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-path: ~/local/share/virtualenvs
       - id: cache-pipenv
         uses: actions/cache@v3
1  Jupyter/NER/.$Flow_Chart_NER_Function.drawio.bkp (Normal file)

@@ -0,0 +1 @@
<mxfile host="Electron" modified="2023-08-17T13:01:42.139Z" agent="5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/20.8.10 Chrome/106.0.5249.199 Electron/21.3.5 Safari/537.36" etag="3J0g9IE6MsjTAohlE4lt" version="20.8.10" type="device"><diagram id="C5RBs43oDa-KdzZeNtuy" name="Page-1">7VrbUuM4EP2aVO0+QNkxuT1uAszswlCzA7MMT1OKrTgiimVkOZf5+m3ZUmxHJjETgiHAC1FbN/c5p7stu+EMpotPHIXjL8zDtNG0vEXDOW00m7bda8E/aVmmlo7TSQ0+J57qlBmuyS+sjJayxsTDUaGjYIwKEhaNLgsC7IqCDXHO5sVuI0aLq4bIx4bh2kXUtN4ST4xTa7fZyeyfMfHHemW73UuvTJHurO4kGiOPzXMm56zhDDhjIv01XQwwlc7Tfrn9e3lLLyftT//8Gz2g7/2Lm6v/jtLJzp8yZHULHAfit6deTJb27GISx+7NQ8/+8c16uLg/Ul6IxFL7C3vgPtVkXIyZzwJEzzJrn7M48LCc1YJW1ueSsRCMNhjvsRBLxQUUCwamsZhSdRUviPghhx+3VOsud+V0oWZOGkvdCARf5gbJ5l3+WjYsaelx6f3Jm1qjwxZfqn4Ri7mLN/RzFKUR97HY4OiMMKA0zKYYNgnjOKZIkFlxc0hR3l/1y2CFHwrZJxBIbXKGaKxWItMQgJMsZ15MQZ/rNMhAlrjMx0Tg6xAlrphDoCgCOmKBUGjbcJd9n6IoUgBEgrPJSnqy90pHGT5V4ZhhLvAi5yzTo/pqR8UtFbYcJeJ5FgNsLexxTv8n1p4wODEwuNvodqvo4hJFlSovD0WiqMD7S0ZRaA8pcyep6ZzIzSu56KDd3Y4WRUNM+8id+MlGB4wyDpcCFsjAsD10PLP42qb4yuezSsOASR5FFk2MyupUM31lJBC5Lmw0imBj69RZLfj7bGoZbLpiO4fyN0a3msjUeRqX7FfPpbbBpUs0DRvNNgU39IccfvnyV0hj3weywHRBwzk3Y9eYTYdxtD1dFDCWDDpHU0Klvz5jOsOCuKgkqSBKfFj31AW4MS8nDyxJAh9a7ax1k5AV4u/+kk1X18QK81a7JNtYJdmmu69s0zEw/QropdA1LZrge2AZv9ddA6Fbd8rvVgnSbzoEv3jG19F0e8q3K4ZpxR7r2IG/AoFefxXQq1RTfpQB+2FY89AKAb3DHJ/6MR2alcAw5kFSCLBYfFQCmyqBdlkSetFKQAfCHKjfcIgIgGldHmIdsDraW+p23XWAjhRFCKj0adNKFXZgGKzXYh3rBTEoPX/r7pwW389B5+MHmBUOOh8hxrMfdG7aZE5oq+Slc5fLghHxf4aczYiHuZnctAU2kA3iGAlZWllIPj+x4b18VSHpDGwGp4ylkqGTB74miMqOwAV40GJ+8sCVNL6wwGencn4Zdp1+MIzCFXC1id8QdVXOPC5+q6B9u0T7qz557bf2pf3eh/Z3075+b7dN+806tW/WOYb2cQByxM8p+fSlJWFBwi8uS075jpK/Y7l36pa7fox5/Xrft2531ePaE6gCvOmsvVqz15BM44QatdNj6uN3tTXJS12yiukdRwINKYnGEVoXdTFzH7Z0V9V3fdK1DR+/hHRfado9qZh2W3WmXfO9tqHGAM+jn5BAd0u8iRKR6+IoWuXfK5j5aMAo1Xp9X7nW0ScL9Qm2+SHYNSFWEGy7TsGanw4Ygh3EPGL8ewAPyT4Hvcmz3irSfYgxJ3iDPEHS8tg4wiLRt0y2yVJa3h5z4ylgEalCW54v68lGBFM5GBZ7iFma5achCpIFtendRYBe3Snb/HqAjKBtUujYBdeLP/5MIUxRUkyyzIq98iuEfTnadoqebpUdY9glnrbt1r6Crfkd34d036x02yUfiTyXdKGZfZecPndlX3c7Z/8D</diagram></mxfile>
1  Jupyter/NER/Flow_Chart_NER_Function.drawio (Normal file)

@@ -0,0 +1 @@
<mxfile host="Electron" modified="2023-08-20T09:25:41.321Z" agent="5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/19.0.3 Chrome/102.0.5005.63 Electron/19.0.3 Safari/537.36" etag="NgKJlrtoa61kZ5SbTZB-" version="19.0.3" type="device" pages="2"><diagram id="C5RBs43oDa-KdzZeNtuy" name="Pipeline">7VpZc6M4EP41rtp9SIojvh7XzjG7yUzNTjKbydOUDDIolhERwsf8+m2BxGEom0zs2HGSyoPV6ID+vq+7JWjZw+niiqPQ/8xcTFuW4S5a9nnLsswzy2rJf8NdppZep50aPE5c1Sk33JJfWBkNZY2Ji6NSR8EYFSQsGx0WBNgRJRvinM3L3caMllcNkYcrhlsH0ar1nrjCV09hdXP7J0w8X69sdvrplSnSndWTRD5y2bxgsi9a9pAzJtJf08UQU+k87Zf7v5f39GbSufrn3+gJfR9c33357ySd7PI5Q7JH4DgQvz31YrI0Z9eTOHbunvrmj2/G0/XjifJCJJbaX9gF96km48JnHgsQvcitA87iwMVyVgNaeZ8bxkIwmmB8xEIsFRdQLBiYfDGl6ipeEPFDDj9tq9ZD4cr5Qs2cNJa6EQi+LAySzYfitXxY0tLjGrpOuThiMXfwmn62YjDiHhZr/KooI51ZoKEC5gqzKYabhA4cUyTIrMxVpCjvZf1yWOGHQvYZBFJ3PUM0ViuRaQjASZYzN6agz1Ua5CBLXOY+Efg2RIlv5hAoyoCOWSAU2iY85cCjKIoUAJHgbJJJT/bOdPR8fGaYC7xY61F9tavClApbtkJknscAUwvbL+j/zNgRBmcVDB7Wut0ou7hGUbXKK0KRKCpw/5JRFNojypxJarok8uaVXHTQ7m1Gi6IRpgPkTLzkRoeMMg6XAhbIwLA5dGxZjZ2qGuvnaypHRRZNjMbqVDN9ZSQQhS5sPI7gxlapky34+2xqV9j0hb04lL8xuu2JTN3ncck8eC51Kly6QdOwZXUouGEw4vDLk79CGnsekAWmC1r2ZTV2+Ww6iqPN6aKEsWTQJZoSKv31CdMZFsRBNUkFUeLBuucOwI15PXlgSRJ40OrkrbuErBB/d5dsena3lGzanZpsY9Rkm96usk23gulXQC+FzjJogu+RZfx+bwWE3r5Tfq9JkH7TIfjVM76OpptTvtkwTCv2GKc2/JUIdPhVQL9RTflRBuyGYdaxFQL6Dgt8GsR0VK0ERjEPkkKAxeKjElhXCXTqktCrVgI6EBZA/YZDRABM4+YY64DsaE8fWFr7rgN0pChDQKVPLSNV2JFhsFqLdY1XxKD2QK734rR4tAed6w4wNx50Ni2ztn7Que6uC0LLkpfOXQ4LxsT7GXI2Iy7m1eSmLXAD+SCOkZCllYHk/omNHuWrCklnYDN4yZdKhk4uOJ8gKjsCF2Cjxbxkw5U0PrPAY+dyfhl27UEwisIMuGMSv1HSvlmj/axPUfvtXWm//6H9Z2lfv6bbpP2mBfDraL9a51S0jwOQI96m5NOXloQFCb+4LDnlO0r+juXe3bfc9Tbm8PW+Zd1uXY8rO1AFuGWvvFozV5BMA4ca9aJt6rrH3JDkpS5Zw/SOI4FGlER+hFZFXc7cxy3drPren3TNio9fQ7qHkXbPGqbd9kGl3ep77YoaAzyPfkICfVniTZSIHAdHUZZ/v8DMJ0NGqdbr+8q1tj5Z2J9grXcs2HZDwXYOSrDVTwcqgh3GPGL8ewCbZI+D3uRZbxPpPsWYE7xGniBpeWwcYZHoWybbZCktb5c58RTAiVShLc+X9WRjgqkcDIs9xSzN8tMQBcmC2vTuIkB/3ym7+vUAGUO7SqFTB1wv/vgzhTBFSTHJqFbsjV8h7MrRpl32dLvuGMOs8bRptncVbKvf8X1I981Kt1Pzkci2pAvN/LvkdN+Vf91tX/wP</diagram><diagram id="KGNsg-YwxBcfRzNuZPCy" name="Service">dZHBDoIwDIafZnfYFPGMiBdPHDwvrLIlg5IxA/r0QjbEBU126L7/77q2hGXNWBjeySsK0IRGYiTsRCiNd5SS+UTi6Uia7B2ojRLetIJSvcDDyNOHEtAHRouorepCWGHbQmUDxo3BIbTdUYdVO17DBpQV11t6U8JKT+PkuAoXULX0pVN6cELDF7PvpJdc4PCFWE5YZhCti5oxAz0Pb5mLyzv/UT8fM9DaHwlTsL49XYINsfwN</diagram></mxfile>
38  Jupyter/NER/NER-Pipeline.md (Normal file)

@@ -0,0 +1,38 @@
```mermaid
flowchart LR
DBConnect["`**Mongo Connect**
- create connection string
- establish connection`"]

DBRead["`**Mongo Read**
- read database
- get fields without attribute 'companies'`"]

NER["`**NERService**
- process news article
- get entities`"]

DBUpdate["`**Mongo Update Documents**
- update processed documents
- add an attribute 'companies'`"]

id1[["`**NERSpacy**
Named Entity Recognition with spaCy`"]]

id2[["`**NERCompanyList**
Named Entity Recognition by comparing text with a list`"]]

id3[["`**NERTransformer**
Named Entity Recognition with a transformer`"]]

DBConnect-->DBRead-->NER
NER--select service-->id1
NER--select service-->id2
NER--select service-->id3

id1-->DBUpdate
id2-->DBUpdate
id3-->DBUpdate
```
1898  Jupyter/NER/NER_Pipeline.ipynb (Normal file)
File diff suppressed because one or more lines are too long

1066  Jupyter/NER/NER_from_StagingDB.ipynb (Normal file)
File diff suppressed because one or more lines are too long
32  Jupyter/NER/Sentiment-Pipeline.md (Normal file)

@@ -0,0 +1,32 @@
```mermaid
flowchart LR
DBConnect["`**Mongo Connect**
- create connection string
- establish connection`"]

DBRead["`**Mongo Read**
- read database
- get fields without attribute 'sentiment'`"]

NER["`**SentimentService**
- process news article
- get sentiment`"]

DBUpdate["`**Mongo Update Documents**
- update processed documents
- add an attribute 'sentiment'`"]

id1[["`**SentimentSpacy**
Sentiment analysis with spaCy`"]]

id3[["`**SentimentTransformer**
Sentiment analysis with a transformer`"]]

DBConnect-->DBRead-->NER
NER--select service-->id1
NER--select service-->id3

id1-->DBUpdate
id3-->DBUpdate
```
952  Jupyter/NER/Sentiment_Pipeline.ipynb (Normal file)
File diff suppressed because one or more lines are too long

2424  Jupyter/Sentiment_Company_Matching/Name_Matching.ipynb (Normal file)
File diff suppressed because it is too large

Binary file not shown. (image added, 343 KiB)
@@ -531,7 +531,7 @@
 "          python-version: 3.11\n",
 "      - uses: snok/install-poetry@v1 # setup poetry\n",
 "        with:\n",
-"          version: 1.4.2\n",
+"          version: 1.6.1\n",
 "          virtualenvs-path: ~/local/share/virtualenvs\n",
 "      - uses: actions/checkout@v3\n",
 "      - run: |\n",
1254  poetry.lock (generated)
File diff suppressed because it is too large
pyproject.toml

@@ -65,12 +65,18 @@ python = "^3.11"
 python-dotenv = "^1.0.0"
 seaborn = "^0.12.2"
 selenium = "^4.12.0"
+spacy = "^3.6.1"
+spacy-sentiws = "^3.0.0"
+torch = {version = "*", source = "torch-cpu"}
+torchaudio = {version = "*", source = "torch-cpu"}
+torchvision = {version = "*", source = "torch-cpu"}
 tqdm = "^4.66.1"
+transformers = {version = "*", extras = ["torch"]}
 xmltodict = "^0.13.0"

 [tool.poetry.extras]
 ingest = ["selenium", "deutschland", "xmltodict"]
-transformation = []
+transformation = ["torch", "torchaudio", "torchvision", "transformers", "spacy-sentiws", "spacy"]
 web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn"]

 [tool.poetry.group.develop.dependencies]
@@ -108,6 +114,7 @@ types-cachetools = "^5.3.0.6"
 types-pyOpenSSL = "*"
 types-requests = "^2.31.0.2"
 types-setuptools = "*"
 types-tabulate = "^0.9.0.3"
+types-tqdm = "^4.66.0.2"

 [tool.poetry.group.test.dependencies]
@@ -123,6 +130,11 @@ data-transformation = "aki_prj23_transparenzregister.utils.data_transfer:transfe
 reset-sql = "aki_prj23_transparenzregister.utils.sql.connector:reset_all_tables_cli"
 webserver = "aki_prj23_transparenzregister.ui.app:main"

+[[tool.poetry.source]]
+name = "torch-cpu"
+priority = "explicit"
+url = "https://download.pytorch.org/whl/cpu"
+
 [tool.ruff]
 exclude = [
     ".bzr",
102  src/aki_prj23_transparenzregister/utils/mongo/CompEntities.json (Normal file)

@@ -0,0 +1,102 @@
[
    "Volkswagen",
    "Mercedes",
    "Benz",
    "Deutsche Telekom",
    "Bmw",
    "Deutsche Post",
    "E.On",
    "BASF",
    "Siemens",
    "Uniper",
    "Bayer",
    "Continental",
    "Fresenius",
    "Thyssen",
    "Siemens",
    "SAP",
    "Metro",
    "Hochtief",
    "Traton",
    "Ceconomy",
    "ENBW",
    "Adidas",
    "Henkel",
    "Heidelbergcement",
    "Fresenius",
    "Merck",
    "Mckesson",
    "RWE",
    "Lufthansa",
    "Hapag-Lloyd",
    "Schaeffler",
    "Evonik",
    "Aurubis",
    "Brenntag",
    "Covestro",
    "Infineon",
    "Tui",
    "Kion ",
    "Zalando",
    "Telefonica",
    "Salzgitter",
    "Beiersdorf",
    "Suedzucker",
    "Hella",
    "Lanxess",
    "Knorr",
    "Rheinmetall",
    "Hornbach",
    "United",
    "Puma",
    "Baywa",
    "Kloeckner",
    "Hornbach",
    "Bechtle",
    "Nordex",
    "Wacker",
    "Gea",
    "Vonovia",
    "Prosiebensat1",
    "Leoni",
    "MTU",
    "1&1",
    "Jungheinrich",
    "K+S",
    "Hellofresh",
    "Symrise",
    "Aurelius",
    "Mvv",
    "Bilfinger",
    "Draegerwerk",
    "Krones",
    "Duerr",
    "Osram",
    "Auto1",
    "Deutsche Wohnen",
    "Kabel Deutschland",
    "Freenet",
    "Kuka",
    "Delivery Hero",
    "Paul Hartmann",
    "Fuchs Petrolub",
    "Sartorius",
    "Gelsenwasser",
    "Mainova",
    "Ksb",
    "Heidelberger Druckmaschinen",
    "Sixt",
    "Hugo Boss",
    "Dmg Mori",
    "Mutares",
    "Zooplus",
    "Grammer",
    "Fraport",
    "Wacker Neuson",
    "Indus Holding",
    "Leg Immobilien",
    "Elringklinger",
    "Stroeer",
    "Fielmann",
    "Gerresheimer"
]
@@ -0,0 +1,77 @@
SentiWS
~~~~~~~

SentimentWortschatz, or SentiWS for short, is a publicly available German-language resource for sentiment analysis, opinion mining, etc. It lists positive and negative polarity-bearing words weighted within the interval [-1; 1], plus their part-of-speech tag and, if applicable, their inflections. The current version of SentiWS (v2.0) contains around 1,650 positive and 1,800 negative words, which sum up to around 16,000 positive and around 18,000 negative word forms including their inflections. It contains not only adjectives and adverbs that explicitly express a sentiment, but also nouns and verbs that implicitly contain one.


License
~~~~~~~

SentiWS is licensed under a Creative Commons Attribution-Noncommercial-Share Alike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).


Obtain a Copy
~~~~~~~~~~~~~

The latest version of SentiWS can be found at https://wortschatz.uni-leipzig.de/download/.


Data Format
~~~~~~~~~~~

SentiWS is organised in two utf8-encoded text files structured the following way:

<Word>|<POS tag> \t <Polarity weight> \t <Infl_1>,...,<Infl_k> \n

where \t denotes a tab and \n denotes a new line.
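For illustration, a minimal Python sketch of how one such entry could be parsed; the helper name and the example values are ours, not part of SentiWS:

```python
def parse_sentiws_line(line: str) -> tuple[str, str, float, list[str]]:
    """Parse one SentiWS entry: '<Word>|<POS tag>\t<weight>\t<infl_1>,...,<infl_k>'."""
    fields = line.rstrip("\n").split("\t")
    word, pos_tag = fields[0].split("|")
    weight = float(fields[1])
    # Entries without inflected forms have no third column.
    inflections = fields[2].split(",") if len(fields) > 2 else []
    return word, pos_tag, weight, inflections


# Illustrative entry (values invented for the example):
# parse_sentiws_line("schön|ADJX\t0.5\tschöne,schönem,schönen")
# -> ("schön", "ADJX", 0.5, ["schöne", "schönem", "schönen"])
```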
Citation
~~~~~~~~

If you use SentiWS in your work we kindly ask you to cite

R. Remus, U. Quasthoff & G. Heyer: SentiWS - a Publicly Available German-language Resource for Sentiment Analysis.
In: Proceedings of the 7th International Language Resources and Evaluation (LREC'10), 2010

or use the following BibTeX code snippet:

@INPROCEEDINGS{remquahey2010,
  title = {SentiWS -- a Publicly Available German-language Resource for Sentiment Analysis},
  booktitle = {Proceedings of the 7th International Language Resources and Evaluation (LREC'10)},
  author = {Remus, R. and Quasthoff, U. and Heyer, G.},
  year = {2010}
}


Version History
~~~~~~~~~~~~~~~

SentiWS is "work in progress" and hence far from being fully fledged and error-free. It will be continuously refined by adding missing words and word forms and removing ambiguous ones.

v1.8b, 2010-05-19: First publicly available version as described in Remus et al. (2010).
v1.8c, 2012-03-21: Second publicly available version in which some POS tags were corrected.
v2.0,  2018-10-19: Third publicly available version in which the inflected forms were extended.


Statistics
~~~~~~~~~~

                         Positive    Negative
Adjectives  Baseforms         792         712
            Inflections    10,936      10,471
Adverbs     Baseforms           7           4
            Inflections         5           0
Nouns       Baseforms         548         688
            Inflections       736       1,158
Verbs       Baseforms         297         423
            Inflections     3,246       4,580

All         Baseforms       1,644       1,827
            Inflections    14,923      16,209

Total                      16,567      18,036

Table: Overview of the dictionary's content


SentiWS.txt was last updated on 2019-09-12.
File diff suppressed because it is too large
File diff suppressed because it is too large
110  src/aki_prj23_transparenzregister/utils/mongo/ner_pipeline.py (Normal file)

@@ -0,0 +1,110 @@
"""Pipeline to get entities from the staging DB."""

import json
import sys

from loguru import logger
from tqdm import tqdm

import aki_prj23_transparenzregister.utils.mongo.connector as conn
import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news
from aki_prj23_transparenzregister.config.config_providers import (
    JsonFileConfigProvider,
)
from aki_prj23_transparenzregister.utils.mongo import ner_service

logger.add(sys.stdout, colorize=True)


class EntityPipeline:
    """Class to initialize the NER pipeline."""

    def __init__(self, conn_string: conn.MongoConnection) -> None:
        """Connect to the staging DB."""
        self.connect_string = conn_string
        self.connect_string.database = "transparenzregister_ner"
        self.connector = conn.MongoConnector(self.connect_string)
        self.news_obj = news.MongoNewsService(self.connector)

    def process_documents(
        self, entity: str, doc_attrib: str, ner_selection: str
    ) -> None:
        """Fetch unprocessed documents, extract entities and write them back."""
        cursor_unprocessed = self.news_obj.collection.find(
            {"companies": {"$exists": False}}
        )
        documents = list(cursor_unprocessed)
        logger.info(f"Found {len(documents)} unprocessed documents.")

        # Determine the NER service based on the config
        # spaCy
        if ner_selection == "use_spacy_ner":
            ner_service_instance = ner_service.NerAnalysisService(
                use_spacy=True, use_transformer=False, use_companylist=False
            )
            ner_service_func = ner_service_instance.ner_spacy

        # company list
        elif ner_selection == "use_companylist_ner":
            ner_service_instance = ner_service.NerAnalysisService(
                use_spacy=False, use_transformer=False, use_companylist=True
            )
            ner_service_func = ner_service_instance.ner_company_list

        # transformer
        elif ner_selection == "use_transformer_ner":
            ner_service_instance = ner_service.NerAnalysisService(
                use_spacy=False, use_transformer=True, use_companylist=False
            )
            ner_service_func = ner_service_instance.ner_transformer

        else:
            raise ValueError(f"Unknown NER selection: {ner_selection}")

        if len(documents) > 0:
            for document in tqdm(documents):
                ents = ner_service_func(document, entity, doc_attrib)
                self.news_obj.collection.update_one(
                    {"_id": document["_id"]},
                    {"$set": {"companies": ents}},
                )
        else:
            logger.info("No documents found.")


if __name__ == "__main__":
    # Establish the MongoDB connection using secrets
    config_provider = JsonFileConfigProvider("./secrets.json")
    connect_string = config_provider.get_mongo_connection_string()

    # path of the config JSON
    config_file_path = (
        "src/aki_prj23_transparenzregister/utils/mongo/ner_sentiment_config.json"
    )

    # Load the NER service configuration from JSON
    with open(config_file_path) as config_file:
        ner_config = json.load(config_file)

    # read the configuration
    entity = ner_config["ner_service"]["entity"]
    logger.info(f"NER Pipeline: searching for entities of type {entity}")
    doc_attrib = ner_config["ner_service"]["doc_attrib"]
    logger.info(f"NER Pipeline: searching in document attribute {doc_attrib}")

    # read the selected service
    if ner_config["ner_service"]["use_companylist_ner"] is True:
        ner_selection = "use_companylist_ner"
        logger.info("NER Pipeline: searching entities with the company list")

    elif ner_config["ner_service"]["use_spacy_ner"] is True:
        ner_selection = "use_spacy_ner"
        logger.info("NER Pipeline: searching entities with spaCy")

    elif ner_config["ner_service"]["use_transformer_ner"] is True:
        ner_selection = "use_transformer_ner"
        logger.info("NER Pipeline: searching entities with the transformer")

    else:
        logger.error(
            "NER Pipeline: no NER service selected or error in the configuration file."
        )
        sys.exit(1)

    entity_pipeline = EntityPipeline(connect_string)
    entity_pipeline.process_documents(entity, doc_attrib, ner_selection)
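A minimal usage sketch mirroring the `__main__` block above; it assumes a valid `secrets.json` in the working directory:

```python
from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
from aki_prj23_transparenzregister.utils.mongo.ner_pipeline import EntityPipeline

# Tag every article that lacks a "companies" attribute with the ORG entities
# found in its "text" attribute, using the spaCy service.
config_provider = JsonFileConfigProvider("./secrets.json")
pipeline = EntityPipeline(config_provider.get_mongo_connection_string())
pipeline.process_documents(entity="ORG", doc_attrib="text", ner_selection="use_spacy_ner")
```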
17  src/aki_prj23_transparenzregister/utils/mongo/ner_sentiment_config.json (Normal file)

@@ -0,0 +1,17 @@
{
    "sentiment_service": {
        "comment": "Select exactly one service by setting it to true and the others to false. Valid doc_attrib values: text, title",
        "use_spacy": false,
        "use_transformer": true,
        "doc_attrib": "text"
    },

    "ner_service": {
        "comment": "Select exactly one service by setting it to true and the others to false. Valid doc_attrib values: text, title",
        "use_spacy_ner": false,
        "use_transformer_ner": true,
        "use_companylist_ner": false,
        "doc_attrib": "text",
        "entity": "ORG"
    }
}
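The pipelines read these flags in an if/elif chain, so exactly one flag per service block should be true. A hypothetical validation helper (not part of this commit) could enforce that:

```python
import json


def selected_ner_service(config_path: str) -> str:
    """Return the single enabled NER flag, or raise if zero or several are set."""
    with open(config_path) as f:
        ner_config = json.load(f)["ner_service"]
    flags = ["use_spacy_ner", "use_transformer_ner", "use_companylist_ner"]
    enabled = [flag for flag in flags if ner_config.get(flag) is True]
    if len(enabled) != 1:
        raise ValueError(f"Exactly one NER service must be enabled, got: {enabled}")
    return enabled[0]
```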
150  src/aki_prj23_transparenzregister/utils/mongo/ner_service.py (Normal file)

@@ -0,0 +1,150 @@
"""NER service module."""

import json
from collections import Counter
from typing import Final

import spacy
from transformers import pipeline


class NerAnalysisService:
    """Class to initialize the NER models."""

    def __init__(
        self,
        use_spacy: bool = False,
        use_transformer: bool = False,
        use_companylist: bool = False,
    ) -> None:
        """Initialize the chosen NER backend."""
        if use_spacy:
            self.init_spacy()
        if use_transformer:
            self.init_transformer()
        if use_companylist:
            self.init_companylist()

    def init_spacy(self) -> None:
        """Initialize spaCy.

        Optimized by ChatGPT.
        """
        # check if the model is available and load it
        SPACY_MODEL_NAME: Final[str] = "de_core_news_lg"  # noqa: N806
        if not spacy.util.is_package(SPACY_MODEL_NAME):
            from spacy.cli.download import download as spacy_download

            spacy_download(SPACY_MODEL_NAME)  # type: ignore
        self.nlp = spacy.load(SPACY_MODEL_NAME)

    def init_transformer(self) -> None:
        """Initialize the transformer."""
        # init the NER transformer pipeline
        self.classifier = pipeline(
            "ner",
            model="fhswf/bert_de_ner",
            grouped_entities=True,
            tokenizer="dbmdz/bert-base-german-cased",
        )

    def init_companylist(self) -> None:
        """Load the company list."""
        with open(
            "src/aki_prj23_transparenzregister/utils/mongo/CompEntities.json", "rb"
        ) as complist:
            self.complist = json.load(complist)

    def ner_spacy(
        self, doc: dict, ent_type: str = "ORG", doc_attrib: str = "title"
    ) -> dict:
        """Named Entity Recognition with spaCy.

        Args:
            doc: a document which is processed with spaCy
            ent_type: entity type to extract (PERSON - people; NORP - nationalities or religious or political groups;
                FAC - buildings, airports, highways, bridges, etc.; ORG - companies, agencies, institutions, etc.;
                GPE - countries, cities, states; LOC - non-GPE locations, mountain ranges, bodies of water)
            doc_attrib: which attribute of the document is processed: text or title

        Returns:
            dict mapping each found entity to its count.
        """
        # collect matching entities
        entities = []

        text = doc[doc_attrib]

        # run the spaCy pipeline
        doc_nlp = self.nlp(text)

        # keep only entities of the requested type
        for ent in doc_nlp.ents:
            if ent.label_ == ent_type:
                entities.append(ent.text)
        return dict(Counter(entities))

    def ner_company_list(
        self, doc: dict, ent_type: str = "ORG", doc_attrib: str = "title"
    ) -> dict:
        """Named Entity Recognition by string comparison.

        Args:
            doc: a dict in which entities are searched
            ent_type: type of the searched entity (unused; the list contains companies only)
            doc_attrib: which attribute of the dict is searched

        Returns:
            dict mapping each found entity to its count.
        """
        # Convert all entries in the company list to lowercase
        self.complist = [company_name.lower() for company_name in self.complist]

        # Create an empty list to store the entities
        entities = []

        # Search the text for company names
        text = doc[doc_attrib]
        # Convert the text to lowercase
        text = text.lower()

        for company_name in self.complist:
            if text.find(company_name) != -1:  # word found
                if company_name not in entities:
                    entities.append(company_name)

        return dict(Counter(entities))

    def ner_transformer(
        self, doc: dict, ent_type: str = "ORG", doc_attrib: str = "title"
    ) -> dict:
        """Named Entity Recognition with a transformer.

        Args:
            doc: a document which is processed with a transformer model
            ent_type: entity type to extract (PERSON - people; NORP - nationalities or religious or political groups;
                FAC - buildings, airports, highways, bridges, etc.; ORG - companies, agencies, institutions, etc.;
                GPE - countries, cities, states; LOC - non-GPE locations, mountain ranges, bodies of water)
            doc_attrib: attribute of the document (title or text)

        Returns:
            dict mapping each found entity to its count.
        """
        # collect matching entities
        entities = []
        text = doc[doc_attrib]
        sentences = text.split(". ")  # Split the text into sentences based on '. '

        # Process each sentence separately
        for sentence in sentences:
            res = self.classifier(sentence)

            for item in res:
                if item["entity_group"] == ent_type:
                    entities.append(item["word"])
        return dict(Counter(entities))
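A usage sketch for the company-list variant; run it from the repository root so the relative path to CompEntities.json resolves (the document dict is illustrative):

```python
from aki_prj23_transparenzregister.utils.mongo.ner_service import NerAnalysisService

service = NerAnalysisService(use_companylist=True)
doc = {"title": "Siemens und BASF kooperieren."}
# Text and list entries are both lowercased, so matches come back in
# lowercase, each counted once per document.
print(service.ner_company_list(doc, ent_type="ORG", doc_attrib="title"))
# expected: {'siemens': 1, 'basf': 1}
```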
91  src/aki_prj23_transparenzregister/utils/mongo/sentiment_pipeline.py (Normal file)

@@ -0,0 +1,91 @@
"""Pipeline to get sentiments for news articles from the staging DB."""

import json
import os

from loguru import logger
from tqdm import tqdm

import aki_prj23_transparenzregister.utils.mongo.connector as conn
import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news
from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
from aki_prj23_transparenzregister.config.config_template import MongoConnection
from aki_prj23_transparenzregister.utils.mongo import sentiment_service


class SentimentPipeline:
    """Class to initialize the sentiment pipeline."""

    def __init__(self, conn_string: MongoConnection) -> None:
        """Connect to the staging DB."""
        self.connect_string = conn_string
        self.connect_string.database = "transparenzregister_ner"
        self.connector = conn.MongoConnector(self.connect_string)
        self.news_obj = news.MongoNewsService(self.connector)

    def process_documents(self, doc_attrib: str, sentiment_selection: str) -> None:
        """Fetch unprocessed documents, score them and write the sentiment back."""
        cursor_unprocessed = self.news_obj.collection.find(
            {"sentiment": {"$exists": False}}
        )
        documents = list(cursor_unprocessed)

        if len(documents) > 0:
            for document in tqdm(documents):
                text = document[doc_attrib]

                # Determine the sentiment analysis service based on the config
                if sentiment_selection == "use_spacy":
                    selected_service = sentiment_service.SentimentAnalysisService(
                        use_spacy=True, use_transformer=False
                    )
                    sentiment_service_func = selected_service.sentiment_spacy

                elif sentiment_selection == "use_transformer":
                    selected_service = sentiment_service.SentimentAnalysisService(
                        use_spacy=False, use_transformer=True
                    )
                    sentiment_service_func = selected_service.sentiment_transformer

                sents = sentiment_service_func(text)
                sentiment = {"label": sents[0], "score": sents[1]}
                self.news_obj.collection.update_one(
                    {"_id": document["_id"]},
                    {"$set": {"sentiment": sentiment}},
                )
        else:
            logger.info("No documents found.")


if __name__ == "__main__":
    # Establish the MongoDB connection using secrets
    config_provider = JsonFileConfigProvider("./secrets.json")
    connect_string = config_provider.get_mongo_connection_string()

    # path of the config JSON
    script_dir = os.path.dirname(__file__)
    config_file_path = os.path.join(script_dir, "ner_sentiment_config.json")
    # Load the sentiment service configuration from JSON
    with open(config_file_path) as config_file:
        sentiment_config = json.load(config_file)
    # Which attribute to score
    doc_attrib = sentiment_config["sentiment_service"]["doc_attrib"]
    logger.info(f"Sentiment Pipeline: searching in document attribute {doc_attrib}")

    # read the selected service
    if sentiment_config["sentiment_service"]["use_spacy"] is True:
        sentiment_selection = "use_spacy"
        logger.info("Sentiment Pipeline: scoring sentiments with spaCy")

    elif sentiment_config["sentiment_service"]["use_transformer"] is True:
        sentiment_selection = "use_transformer"
        logger.info("Sentiment Pipeline: scoring sentiments with the transformer")

    else:
        logger.info(
            "Sentiment Pipeline: no sentiment service selected or error in the configuration file."
        )

    sentiment_pipeline = SentimentPipeline(connect_string)
    sentiment_pipeline.process_documents(doc_attrib, sentiment_selection)
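As with the NER pipeline, a minimal run sketch assuming a valid `secrets.json`:

```python
from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
from aki_prj23_transparenzregister.utils.mongo.sentiment_pipeline import SentimentPipeline

# Score the "text" attribute of every article that has no "sentiment" field yet.
config_provider = JsonFileConfigProvider("./secrets.json")
pipeline = SentimentPipeline(config_provider.get_mongo_connection_string())
pipeline.process_documents(doc_attrib="text", sentiment_selection="use_transformer")
```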
170  src/aki_prj23_transparenzregister/utils/mongo/sentiment_service.py (Normal file)

@@ -0,0 +1,170 @@
"""Service for sentiment analysis."""

import os
import zipfile
from typing import Final

import requests
import spacy
from loguru import logger
from spacy_sentiws import spaCySentiWS  # noqa: F401
from transformers import pipeline


class SentimentAnalysisService:
    """Class to initialize the spaCy or transformer model."""

    def __init__(self, use_spacy: bool = False, use_transformer: bool = False) -> None:
        """Initialize the chosen sentiment backend."""
        if use_spacy:
            self.init_spacy()
        if use_transformer:
            self.init_transformer()

    def init_spacy(self) -> None:
        """Initialize spaCy."""
        # check if the model is available and load it
        SPACY_MODEL_NAME: Final[str] = "de_core_news_lg"  # noqa: N806
        if not spacy.util.is_package(SPACY_MODEL_NAME):
            cli = spacy.cli  # type: ignore
            cli.download(SPACY_MODEL_NAME)  # type: ignore
        self.nlp = spacy.load(SPACY_MODEL_NAME)

        # path to the SentiWS vocabulary
        PATH: Final[str] = "src/aki_prj23_transparenzregister/utils/mongo/SentiWS/"  # noqa: N806
        # check if the vocabulary exists, otherwise download it
        if not os.path.exists(PATH):
            URL: Final[str] = "https://downloads.wortschatz-leipzig.de/etc/SentiWS/SentiWS_v2.0.zip"  # noqa: N806
            logger.info("SentiWS vocabulary not found. Starting download...")
            # Create the data directory if it doesn't exist
            os.makedirs(PATH, exist_ok=True)

            # File path for the downloaded ZIP file
            zip_file_path = os.path.join(PATH, "SentiWS_v2.0.zip")

            # Download the ZIP file
            response = requests.get(URL, timeout=60)
            with open(zip_file_path, "wb") as zip_file:
                zip_file.write(response.content)

            # Extract the ZIP file
            with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
                zip_ref.extractall(PATH)

            # Remove the downloaded ZIP file, it is no longer needed
            os.remove(zip_file_path)

            logger.info("SentiWS data downloaded and extracted successfully.")
        else:
            logger.info("SentiWS data directory already exists.")

        # add SentiWS to the spaCy pipeline
        self.nlp.add_pipe("sentiws", config={"sentiws_path": PATH})

    def init_transformer(self) -> None:
        """Initialize the transformer."""
        # loading the sentiment model (~436 MB) for the transformer
        self.sentiment_analyzer = pipeline(
            "sentiment-analysis", model="oliverguhr/german-sentiment-bert"
        )

    def sentiment_spacy(self, doc: str) -> tuple:
        """Sentiment analysis with spaCy.

        Args:
            doc: a document which is processed with spaCy

        Returns:
            label (positive, negative or neutral) and normalized score.
        """
        # thresholds separating positive/neutral/negative
        _upperlimit = 0.1
        _lowerlimit = -0.1

        _doc = self.nlp(doc)
        _sent = None
        # sentiment counters
        _pos = 0
        _neg = 0
        # sum of absolute sentiment scores, used to normalize values
        _max_score = 0
        for token in _doc:
            token_score = token._.sentiws  # noqa: SLF001
            if token_score is not None:
                _max_score += abs(token_score)
                if token_score < 0:
                    _neg += token_score
                if token_score > 0:
                    _pos += token_score

        # Normalize the score to the range -1..1
        _normalized_score = (_pos - abs(_neg)) / _max_score if _max_score > 0 else 0

        if _normalized_score > _upperlimit:
            _sent = "positive"
        elif _normalized_score < _lowerlimit:
            _sent = "negative"
        else:
            _sent = "neutral"
        return _sent, abs(_normalized_score)

    def sentiment_transformer(self, doc: str) -> tuple:
        """Sentiment analysis with a transformer.

        Args:
            doc: a string which is processed with a transformer model

        Returns:
            sentiment and score.
        """
        sentences = doc.split(". ")  # Split the text into sentences based on '. '
        # total sentiment and score counters
        total_score = 0
        total_positive_score = 0
        total_negative_score = 0
        total_neutral_score = 0

        _score = None
        _sent = None

        # Process each sentence separately
        for sentence in sentences:
            # get the sentiment of the sentence
            results = self.sentiment_analyzer(sentence)
            _score = results[0]["score"]
            _sent = results[0]["label"]

            # sum up the label-specific score
            if _sent == "positive":
                total_positive_score += _score
            elif _sent == "negative":
                total_negative_score += _score
            else:
                total_neutral_score += _score

            # sum up the total score
            total_score += _score

        # normalize the label-specific scores
        total_positive_score_normalized = total_positive_score / total_score
        total_negative_score_normalized = total_negative_score / total_score
        total_neutral_score_normalized = total_neutral_score / total_score

        if total_positive_score_normalized > total_negative_score_normalized:
            final_sentiment = "positive"
            out_score = total_positive_score_normalized
        elif total_positive_score_normalized < total_negative_score_normalized:
            final_sentiment = "negative"
            out_score = total_negative_score_normalized
        else:
            final_sentiment = "neutral"
            out_score = total_neutral_score_normalized

        return final_sentiment, out_score
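A usage sketch for the transformer variant; the first call downloads the ~436 MB model (the input string is illustrative):

```python
from aki_prj23_transparenzregister.utils.mongo.sentiment_service import SentimentAnalysisService

service = SentimentAnalysisService(use_transformer=True)
label, score = service.sentiment_transformer("Das ist ein großartiges Ergebnis.")
# label is "positive", "negative" or "neutral"; score is that label's share
# of the summed per-sentence scores, so it lies in (0, 1].
print(label, score)
```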
291  tests/utils/mongo/ner_pipeline_test.py (Normal file)

@@ -0,0 +1,291 @@
"""Tests for checking the NER pipeline."""

from unittest.mock import Mock, patch

import pytest

from aki_prj23_transparenzregister.config.config_template import MongoConnection
from aki_prj23_transparenzregister.utils.mongo.ner_pipeline import EntityPipeline


@pytest.fixture()
def mock_mongo_connection() -> MongoConnection:
    """Provide a MongoConnection with empty credentials.

    Returns:
        MongoConnection: connection object for the mocked connector
    """
    return MongoConnection("", "", None, "", "")


@pytest.fixture()
def mock_mongo_connector(mocker: Mock) -> Mock:
    """Mock the MongoConnector class.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Mocked MongoConnector
    """
    mock = Mock()
    mocker.patch(
        "aki_prj23_transparenzregister.utils.mongo.connector.MongoConnector",
        return_value=mock,
    )
    mock.database = {"news": Mock()}
    return mock


@pytest.fixture()
def mock_spacy(mocker: Mock) -> Mock:
    """Mock the spaCy initialization of the NerAnalysisService.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Mocked init_spacy
    """
    mock = Mock()
    mocker.patch(
        "aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.init_spacy",
        return_value=mock,
    )
    return mock


# Mocking the NerAnalysisService methods
@patch(
    "aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_spacy"
)
def test_entity_pipeline_with_spacy(
    mock_ner_spacy: Mock,
    mock_mongo_connector: Mock,
    mock_mongo_connection: MongoConnection,
    mock_spacy: Mock,
) -> None:
    # Configure the mock to return a specific NER result
    mock_ner_spacy.return_value = {"ORG": 2, "PERSON": 1}

    # Create an instance of the EntityPipeline
    entity_pipeline = EntityPipeline(mock_mongo_connection)

    # Mock the news collection and documents for testing
    mock_collection = Mock()
    mock_documents = [{"_id": "document1", "title": "Apple Inc is a tech company."}]

    # Set the collection to the mock_collection
    entity_pipeline.news_obj.collection = mock_collection

    # Mock the find method of the collection to return the mock documents
    mock_collection.find.return_value = mock_documents

    # Call the process_documents method with spaCy NER
    entity_pipeline.process_documents(
        entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
    )

    # Ensure that ner_spacy was called with the correct parameters
    mock_ner_spacy.assert_called_once_with(mock_documents[0], "ORG", "title")

    # Ensure that the document in the collection was updated with the NER results
    mock_collection.update_one.assert_called_once_with(
        {"_id": "document1"},
        {"$set": {"companies": {"ORG": 2, "PERSON": 1}}},
    )


@patch(
    "aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_spacy"
)
def test_entity_pipeline_with_spacy_no_docs(
    mock_ner_spacy: Mock,
    mock_mongo_connector: Mock,
    mock_mongo_connection: MongoConnection,
    mock_spacy: Mock,
) -> None:
    # Configure the mock to return a specific NER result
    mock_ner_spacy.return_value = {"ORG": 2, "PERSON": 1}

    # Create an instance of the EntityPipeline
    entity_pipeline = EntityPipeline(mock_mongo_connection)

    # Mock the news collection and documents for testing
    mock_collection = Mock()
    mock_documents: list[dict] = []

    # Set the collection to the mock_collection
    entity_pipeline.news_obj.collection = mock_collection

    # Mock the find method of the collection to return the mock documents
    mock_collection.find.return_value = mock_documents

    # Call the process_documents method with spaCy NER
    entity_pipeline.process_documents(
        entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
    )

    # Ensure that ner_spacy was not called
    mock_ner_spacy.assert_not_called()

    # Ensure that the document in the collection was not updated
    mock_collection.update_one.assert_not_called()


@patch(
    "aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_company_list"
)
def test_entity_pipeline_with_companylist_ner(
    mock_ner_companylist: Mock,
    mock_mongo_connector: Mock,
    mock_mongo_connection: MongoConnection,
    mock_spacy: Mock,
) -> None:
    # Configure the mock to return a specific NER result
    mock_ner_companylist.return_value = {"ORG": 3, "LOCATION": 2}

    # Create an instance of the EntityPipeline
    entity_pipeline = EntityPipeline(mock_mongo_connection)

    # Mock the news collection and documents for testing
    mock_collection = Mock()
    mock_documents = [
        {"_id": "document2", "title": "Siemens ist ein deutsches Unternehmen."}
    ]

    # Set the collection to the mock_collection
    entity_pipeline.news_obj.collection = mock_collection
    # Mock the find method of the collection to return the mock documents
    mock_collection.find.return_value = mock_documents

    # Call the process_documents method with company-list NER
    entity_pipeline.process_documents(
        entity="ORG", doc_attrib="title", ner_selection="use_companylist_ner"
    )

    # Ensure that ner_company_list was called with the correct parameters
    mock_ner_companylist.assert_called_once_with(mock_documents[0], "ORG", "title")

    # Ensure that the document in the collection was updated with the NER results
    mock_collection.update_one.assert_called_once_with(
        {"_id": "document2"},
        {"$set": {"companies": {"ORG": 3, "LOCATION": 2}}},
    )


@patch(
    "aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_company_list"
)
def test_entity_pipeline_with_companylist_ner_no_docs(
    mock_ner_companylist: Mock,
    mock_mongo_connector: Mock,
    mock_mongo_connection: MongoConnection,
    mock_spacy: Mock,
) -> None:
    # Configure the mock to return a specific NER result
    mock_ner_companylist.return_value = {"ORG": 3, "LOCATION": 2}

    # Create an instance of the EntityPipeline
    entity_pipeline = EntityPipeline(mock_mongo_connection)

    # Mock the news collection and documents for testing
    mock_collection = Mock()
    mock_documents: list[dict] = []

    # Set the collection to the mock_collection
    entity_pipeline.news_obj.collection = mock_collection
    # Mock the find method of the collection to return the mock documents
    mock_collection.find.return_value = mock_documents

    # Call the process_documents method with company-list NER
    entity_pipeline.process_documents(
        entity="ORG", doc_attrib="title", ner_selection="use_companylist_ner"
    )

    # Ensure that ner_company_list is not called
    mock_ner_companylist.assert_not_called()

    # Ensure that the document in the collection was not updated
    mock_collection.update_one.assert_not_called()


# Add more test cases for other NER methods (e.g., use_companylist_ner, use_transformer_ner) following a similar pattern.
@patch(
    "aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_spacy"
)
def test_entity_pipeline_with_transformer(
    mock_ner_transformer: Mock,
    mock_mongo_connector: Mock,
    mock_mongo_connection: MongoConnection,
    mock_spacy: Mock,
) -> None:
    # Configure the mock to return a specific NER result
    mock_ner_transformer.return_value = {"ORG": 2, "PERSON": 1}

    # Create an instance of the EntityPipeline
    entity_pipeline = EntityPipeline(mock_mongo_connection)

    # Mock the news collection and documents for testing
    mock_collection = Mock()
    mock_documents = [{"_id": "document1", "title": "Apple Inc is a tech company."}]

    # Set the collection to the mock_collection
    entity_pipeline.news_obj.collection = mock_collection

    # Mock the find method of the collection to return the mock documents
    mock_collection.find.return_value = mock_documents

    # Call the process_documents method with spaCy NER
    entity_pipeline.process_documents(
        entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
    )

    # Ensure that ner_spacy was called with the correct parameters
    mock_ner_transformer.assert_called_once_with(mock_documents[0], "ORG", "title")

    # Ensure that the document in the collection was updated with the NER results
    mock_collection.update_one.assert_called_once_with(
        {"_id": "document1"},
        {"$set": {"companies": {"ORG": 2, "PERSON": 1}}},
    )


@patch(
    "aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_spacy"
)
def test_entity_pipeline_with_transformer_no_docs(
    mock_ner_transformer: Mock,
    mock_mongo_connector: Mock,
    mock_mongo_connection: MongoConnection,
    mock_spacy: Mock,
) -> None:
    # Configure the mock to return a specific NER result
    mock_ner_transformer.return_value = {"ORG": 2, "PERSON": 1}

    # Create an instance of the EntityPipeline
    entity_pipeline = EntityPipeline(mock_mongo_connection)

    # Mock the news collection and documents for testing
    mock_collection = Mock()
    mock_documents: list[dict] = []

    # Set the collection to the mock_collection
    entity_pipeline.news_obj.collection = mock_collection

    # Mock the find method of the collection to return the mock documents
    mock_collection.find.return_value = mock_documents

    # Call the process_documents method with spaCy NER
    entity_pipeline.process_documents(
        entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
    )

    # Ensure that ner_transformer is not called
    mock_ner_transformer.assert_not_called()

    # Ensure that the document in the collection was not updated
    mock_collection.update_one.assert_not_called()
54  tests/utils/mongo/ner_service_test.py (Normal file)

@@ -0,0 +1,54 @@
"""Tests for checking the NER services."""

from aki_prj23_transparenzregister.utils.mongo.ner_service import NerAnalysisService


def test_ner_spacy() -> None:
    """Test the spaCy NER service."""
    # Create an instance of NerAnalysisService with use_spacy=True
    ner_service = NerAnalysisService(
        use_spacy=True, use_transformer=False, use_companylist=False
    )
    # 1st test
    doc = {"title": "Siemens ist ein Unternehmen."}
    result = ner_service.ner_spacy(doc, ent_type="ORG", doc_attrib="title")
    assert result == {"Siemens": 1}

    # 2nd test
    doc = {"text": "BASF ist ein großes Unternehmen."}
    result = ner_service.ner_spacy(doc, ent_type="ORG", doc_attrib="text")
    assert result == {"BASF": 1}


def test_ner_company_list() -> None:
    """Test the company-list NER service."""
    # Create an instance of NerAnalysisService with use_companylist=True
    ner_service = NerAnalysisService(
        use_spacy=False, use_transformer=False, use_companylist=True
    )

    doc = {"title": "Siemens ist ein Unternehmen."}
    result = ner_service.ner_company_list(doc, ent_type="ORG", doc_attrib="title")
    assert result == {"siemens": 1}

    # 2nd test
    doc = {"text": "BASF ist ein großes Unternehmen."}
    result = ner_service.ner_company_list(doc, ent_type="ORG", doc_attrib="text")
    assert result == {"basf": 1}


def test_ner_transformer() -> None:
    """Test the transformer NER service."""
    # Create an instance of NerAnalysisService with use_transformer=True
    ner_service = NerAnalysisService(
        use_spacy=False, use_transformer=True, use_companylist=False
    )

    doc = {"title": "Siemens ist ein Unternehmen."}
    result = ner_service.ner_transformer(doc, ent_type="ORG", doc_attrib="title")
    assert result == {"Siemens": 1}

    # 2nd test
    doc = {"text": "BASF ist ein großes Unternehmen."}
    result = ner_service.ner_transformer(doc, ent_type="ORG", doc_attrib="text")
    assert result == {"BASF": 1}
210  tests/utils/mongo/sentiment_pipeline_test.py (Normal file)

@@ -0,0 +1,210 @@
"""Unit tests for the sentiment pipeline."""

from unittest.mock import Mock, patch

import pytest

from aki_prj23_transparenzregister.config.config_template import MongoConnection
from aki_prj23_transparenzregister.utils.mongo.sentiment_pipeline import (
    SentimentPipeline,
)


@pytest.fixture()
def mock_mongo_connection() -> MongoConnection:
    """Provide a MongoConnection with empty credentials.

    Returns:
        MongoConnection: connection object for the mocked connector
    """
    return MongoConnection("", "", None, "", "")


@pytest.fixture()
def mock_mongo_connector(mocker: Mock) -> Mock:
    """Mock the MongoConnector class.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Mocked MongoConnector
    """
    mock = Mock()
    mocker.patch(
        "aki_prj23_transparenzregister.utils.mongo.connector.MongoConnector",
        return_value=mock,
    )
    mock.database = {"news": Mock()}
    return mock


@pytest.fixture()
def mock_spacy(mocker: Mock) -> Mock:
    """Mock the spaCy initialization of the SentimentAnalysisService.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Mocked init_spacy
    """
    mock = Mock()
    mocker.patch(
        "aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.init_spacy",
        return_value=mock,
    )
    return mock


@patch(
    "aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.sentiment_spacy"
)
def test_sentiment_pipeline_existing_sentiment(
    mock_sentiment_spacy: Mock,
    mock_mongo_connector: Mock,
    mock_mongo_connection: MongoConnection,
    mock_spacy: Mock,
) -> None:
    # Configure the mock to return a specific sentiment result
    mock_sentiment_spacy.return_value = ("positive", 0.8)

    # Create an instance of the SentimentPipeline
    sentiment_pipeline = SentimentPipeline(mock_mongo_connection)

    # Mock the news collection and documents for testing
    mock_collection = Mock()
    mock_documents = [
        {
            "_id": "document1",
            "text": "This is a positive text.",
            "sentiment": {"label": "neutral", "score": 0.5},
        }
    ]

    # Set the collection to the mock_collection
    sentiment_pipeline.news_obj.collection = mock_collection

    # Mock the find method of the collection to return the mock documents
    mock_collection.find.return_value = mock_documents

    # Call the process_documents method
    sentiment_pipeline.process_documents("text", "use_spacy")

    # Ensure that sentiment_spacy was called with the correct text
    mock_sentiment_spacy.assert_called_once_with("This is a positive text.")

    # No update assertion here: the "$exists" filter lives inside the mocked
    # find call, so the document is re-scored despite its existing sentiment.


@patch(
    "aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.sentiment_spacy"
)
def test_sentiment_pipeline_no_documents(
    mock_sentiment_spacy: Mock,
    mock_mongo_connector: Mock,
    mock_mongo_connection: MongoConnection,
    mock_spacy: Mock,
) -> None:
    # Configure the mock to return a specific sentiment result
    mock_sentiment_spacy.return_value = ("positive", 0.8)

    # Create an instance of the SentimentPipeline
    sentiment_pipeline = SentimentPipeline(mock_mongo_connection)

    # Mock the news collection to return an empty result
    mock_collection = Mock()
    mock_collection.find.return_value = []

    # Set the collection to the mock_collection
    sentiment_pipeline.news_obj.collection = mock_collection

    # Call the process_documents method
    sentiment_pipeline.process_documents("text", "use_spacy")

    # Ensure that sentiment_spacy was not called
    mock_sentiment_spacy.assert_not_called()

    # Ensure that the document in the collection was not updated with sentiment
    mock_collection.update_one.assert_not_called()


@patch(
    "aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.sentiment_spacy"
)
def test_sentiment_pipeline_with_spacy(
    mock_sentiment_spacy: Mock,
    mock_mongo_connector: Mock,
    mock_mongo_connection: MongoConnection,
    mock_spacy: Mock,
) -> None:
    # Configure the mock to return a specific sentiment result
    mock_sentiment_spacy.return_value = ("positive", 0.8)

    # Create an instance of the SentimentPipeline
    sentiment_pipeline = SentimentPipeline(mock_mongo_connection)

    # Mock the news collection and documents for testing
    mock_collection = Mock()
    mock_documents = [{"_id": "document1", "text": "This is a positive text."}]

    # Set the collection to the mock_collection
    sentiment_pipeline.news_obj.collection = mock_collection

    # Mock the find method of the collection to return the mock documents
    mock_collection.find.return_value = mock_documents

    # Call the process_documents method
    sentiment_pipeline.process_documents("text", "use_spacy")

    # Ensure that sentiment_spacy was called with the correct text
    mock_sentiment_spacy.assert_called_once_with("This is a positive text.")

    # Ensure that the document in the collection was updated with the sentiment result
    mock_collection.update_one.assert_called_once_with(
        {"_id": "document1"},
        {"$set": {"sentiment": {"label": "positive", "score": 0.8}}},
    )


# Mocking the SentimentAnalysisService methods
@patch(
    "aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.sentiment_transformer"
)
def test_sentiment_pipeline_with_transformer(
    mock_sentiment_transformer: Mock,
    mock_mongo_connector: Mock,
    mock_mongo_connection: MongoConnection,
    mock_spacy: Mock,
) -> None:
    # Configure the mock to return a specific sentiment result
    mock_sentiment_transformer.return_value = ("negative", 0.6)

    # Create an instance of the SentimentPipeline
    sentiment_pipeline = SentimentPipeline(mock_mongo_connection)

    # Mock the news collection and documents for testing
    mock_collection = Mock()
    mock_documents = [{"_id": "document2", "text": "This is a negative text."}]

    # Set the collection to the mock_collection
    sentiment_pipeline.news_obj.collection = mock_collection

    # Mock the find method of the collection to return the mock documents
    mock_collection.find.return_value = mock_documents

    # Call the process_documents method
    sentiment_pipeline.process_documents("text", "use_transformer")

    # Ensure that sentiment_transformer was called with the correct text
    mock_sentiment_transformer.assert_called_once_with("This is a negative text.")

    # Ensure that the document in the collection was updated with the sentiment result
    mock_collection.update_one.assert_called_once_with(
        {"_id": "document2"},
        {"$set": {"sentiment": {"label": "negative", "score": 0.6}}},
    )
78  tests/utils/mongo/sentiment_service_test.py (Normal file)

@@ -0,0 +1,78 @@
"""Tests for checking the sentiment services."""


from aki_prj23_transparenzregister.utils.mongo.sentiment_service import (
    SentimentAnalysisService,
)


def test_sentiment_service_with_spacy_pos() -> None:
    """Test the spaCy sentiment service on a positive text."""
    # Init the SentimentAnalysisService with spaCy
    sentiment_service = SentimentAnalysisService(use_spacy=True)

    # run the test
    text = "Dies ist ein großartiger Test. Ich liebe es!"
    sentiment, score = sentiment_service.sentiment_spacy(text)
    assert sentiment == "positive"
    assert score > 0


def test_sentiment_service_with_spacy_neg() -> None:
    """Test the spaCy sentiment service on a negative text."""
    # Init the SentimentAnalysisService with spaCy
    sentiment_service = SentimentAnalysisService(use_spacy=True)

    # run the test
    text = "Dies ist ein wirklich schrecklicher Test. Ich hasse ihn!"
    sentiment, score = sentiment_service.sentiment_spacy(text)
    assert sentiment == "negative"
    assert score > 0


def test_sentiment_service_with_spacy_neut() -> None:
    """Test the spaCy sentiment service on a neutral text."""
    # Init the SentimentAnalysisService with spaCy
    sentiment_service = SentimentAnalysisService(use_spacy=True)

    # run the test
    text = "Dies ist ein Test."
    sentiment, score = sentiment_service.sentiment_spacy(text)
    assert sentiment == "neutral"
    assert score >= 0


def test_sentiment_service_with_transformer_pos() -> None:
    """Test the transformer sentiment service on a positive text."""
    # Init the SentimentAnalysisService with the transformer
    sentiment_service = SentimentAnalysisService(use_transformer=True)

    # run the test
    text = "Dies ist ein großartiger Test. Ich liebe es!"
    sentiment, score = sentiment_service.sentiment_transformer(text)
    assert sentiment == "positive"
    assert score > 0


def test_sentiment_service_with_transformer_neg() -> None:
    """Test the transformer sentiment service on a negative text."""
    # Init the SentimentAnalysisService with the transformer
    sentiment_service = SentimentAnalysisService(use_transformer=True)

    # run the test
    text = "Dies ist ein wirklich schrecklicher Test. Ich hasse ihn!"
    sentiment, score = sentiment_service.sentiment_transformer(text)
    assert sentiment == "negative"
    assert score > 0


def test_sentiment_service_with_transformer_neut() -> None:
    """Test the transformer sentiment service on a neutral text."""
    # Init the SentimentAnalysisService with the transformer
    sentiment_service = SentimentAnalysisService(use_transformer=True)

    # run the test
    text = "Das ist ein Text, ohne besondere Stimmung."
    sentiment, score = sentiment_service.sentiment_transformer(text)
    assert sentiment == "neutral"
    assert score >= 0