Feature/ner (#103)

NER and sentiment pipeline with services for data extraction.

---------

Co-authored-by: Philipp Horstenkamp <philipp@horstenkamp.de>
Co-authored-by: TrisNol <tristan.nolde@yahoo.de>
This commit is contained in:
Sebastian 2023-10-16 19:54:24 +02:00 committed by GitHub
parent 99b61e7c2e
commit c680ac9759
28 changed files with 12509 additions and 10 deletions

View File

@@ -26,7 +26,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-create: false
       - run: poetry install --with doc --all-extras --without test,lint
       - name: Doc-Build

View File

@@ -28,7 +28,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-create: false
           virtualenvs-path: ~/local/share/virtualenvs
       - run: poetry install --without develop,doc --all-extras
@@ -56,7 +56,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-create: true
           virtualenvs-path: ~/local/share/virtualenvs
       - name: Check out Git repository

View File

@@ -20,10 +20,14 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: 3.11
+      - name: Install Cuda
+        run: |
+          sudo apt update
+          # sudo apt install cuda-10-0 -y
      - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-path: ~/local/share/virtualenvs
       - id: cache-pipenv
         uses: actions/cache@v3
@@ -85,7 +89,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-path: ~/local/share/virtualenvs
       - run: |
           poetry install --only test --all-extras
@@ -113,7 +117,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-path: ~/local/share/virtualenvs
       - id: cache-pipenv
         uses: actions/cache@v3

View File

@@ -0,0 +1 @@
(new draw.io diagram: single-line compressed mxfile XML, page "Page-1"; content not human-readable)

View File

@@ -0,0 +1 @@
(new draw.io diagram: single-line compressed mxfile XML, pages "Pipeline" and "Service"; content not human-readable)

View File

@@ -0,0 +1,38 @@
```mermaid
flowchart LR
DBConnect["`**Mongo Connect**
- create connection string
- establish connection`"]
DBRead["`**Mongo Read**
- read database
- get fields without attribute 'companies'`"]
NER["`**NERService**
- process news article
- get entities`"]
DBUpdate["`**Mongo Update Documents**
- update processed documents
- add an attribute 'companies'`"]
id1[["`**NERSpacy**
Named Entitiy Recognition with spaCy`"]]
id2[["`**NERCompanyList**
Named Entitiy Recognition by comparing text with list`"]]
id3[["`**NERTransformer**
Named Entitiy Recognition with transformer`"]]
DBConnect-->DBRead-->NER
NER--select service-->id1
NER--select service-->id2
NER--select service-->id3
id1-->DBUpdate
id2-->DBUpdate
id3-->DBUpdate
```
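The "select service" step in the chart maps a configuration flag to one of the three NER implementations. A minimal sketch of that dispatch, assuming the `NerAnalysisService` API introduced later in this change (the dict-based mapping is an illustration, not the shipped code, which uses an if/elif chain):

```python
from aki_prj23_transparenzregister.utils.mongo.ner_service import NerAnalysisService

# Map each config flag to a factory returning the matching service method.
NER_DISPATCH = {
    "use_spacy_ner": lambda: NerAnalysisService(use_spacy=True).ner_spacy,
    "use_companylist_ner": lambda: NerAnalysisService(use_companylist=True).ner_company_list,
    "use_transformer_ner": lambda: NerAnalysisService(use_transformer=True).ner_transformer,
}

ner_func = NER_DISPATCH["use_spacy_ner"]()  # the flag would come from the config file
```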

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,32 @@
```mermaid
flowchart LR
DBConnect["`**Mongo Connect**
- create connection string
- establish connection`"]
DBRead["`**Mongo Read**
- read database
- get fields without attribute 'companies'`"]
NER["`**SentimentService**
- process news article
- get sentiment`"]
DBUpdate["`**Mongo Update Documents**
- update processed documents
- add an attribute 'sentiment'`"]
id1[["`**SentimentSpacy**
Sentiment analysis with spaCy`"]]
id3[["`**SentimentTransformer**
Sentiment analysis with a transformer`"]]
DBConnect-->DBRead-->NER
NER--select service-->id1
NER--select service-->id3
id1-->DBUpdate
id3-->DBUpdate
```

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

Binary file not shown. (new image: 343 KiB)

View File

@@ -531,7 +531,7 @@
 " python-version: 3.11\n",
 " - uses: snok/install-poetry@v1 # setup poetry\n",
 " with:\n",
-" version: 1.4.2\n",
+" version: 1.6.1\n",
 " virtualenvs-path: ~/local/share/virtualenvs\n",
 " - uses: actions/checkout@v3\n",
 " - run: |\n",

poetry.lock (generated, 1254 changed lines)

File diff suppressed because it is too large Load Diff

View File

@ -65,12 +65,18 @@ python = "^3.11"
python-dotenv = "^1.0.0" python-dotenv = "^1.0.0"
seaborn = "^0.12.2" seaborn = "^0.12.2"
selenium = "^4.12.0" selenium = "^4.12.0"
spacy = "^3.6.1"
spacy-sentiws = "^3.0.0"
torch = {version = "*", source = "torch-cpu"}
torchaudio = {version = "*", source = "torch-cpu"}
torchvision = {version = "*", source = "torch-cpu"}
tqdm = "^4.66.1" tqdm = "^4.66.1"
transformers = {version = "*", extras = ["torch"]}
xmltodict = "^0.13.0" xmltodict = "^0.13.0"
[tool.poetry.extras] [tool.poetry.extras]
ingest = ["selenium", "deutschland", "xmltodict"] ingest = ["selenium", "deutschland", "xmltodict"]
transformation = [] transformation = ["torch", "torchaudio", "torchvision", "transformers", "spacy-sentiws", "spacy"]
web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn"] web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn"]
[tool.poetry.group.develop.dependencies] [tool.poetry.group.develop.dependencies]
@ -108,6 +114,7 @@ types-cachetools = "^5.3.0.6"
types-pyOpenSSL = "*" types-pyOpenSSL = "*"
types-requests = "^2.31.0.2" types-requests = "^2.31.0.2"
types-setuptools = "*" types-setuptools = "*"
types-tabulate = "^0.9.0.3"
types-tqdm = "^4.66.0.2" types-tqdm = "^4.66.0.2"
[tool.poetry.group.test.dependencies] [tool.poetry.group.test.dependencies]
@ -123,6 +130,11 @@ data-transformation = "aki_prj23_transparenzregister.utils.data_transfer:transfe
reset-sql = "aki_prj23_transparenzregister.utils.sql.connector:reset_all_tables_cli" reset-sql = "aki_prj23_transparenzregister.utils.sql.connector:reset_all_tables_cli"
webserver = "aki_prj23_transparenzregister.ui.app:main" webserver = "aki_prj23_transparenzregister.ui.app:main"
[[tool.poetry.source]]
name = "torch-cpu"
priority = "explicit"
url = "https://download.pytorch.org/whl/cpu"
[tool.ruff] [tool.ruff]
exclude = [ exclude = [
".bzr", ".bzr",

View File

@@ -0,0 +1,102 @@
[
"Volkswagen",
"Mercedes",
"Benz",
"Deutsche Telekom",
"Bmw",
"Deutsche Post",
"E.On",
"BASF",
"Siemens",
"Uniper",
"Bayer",
"Continental",
"Fresenius",
"Thyssen",
"Siemens",
"SAP",
"Metro",
"Hochtief",
"Traton",
"Ceconomy",
"ENBW",
"Adidas",
"Henkel",
"Heidelbergcement",
"Fresenius",
"Merck",
"Mckesson",
"RWE",
"Lufthansa",
"Hapag-Lloyd",
"Schaeffler",
"Evonik",
"Aurubis",
"Brenntag",
"Covestro",
"Infineon",
"Tui",
"Kion ",
"Zalando",
"Telefonica",
"Salzgitter",
"Beiersdorf",
"Suedzucker",
"Hella",
"Lanxess",
"Knorr",
"Rheinmetall",
"Hornbach",
"United",
"Puma",
"Baywa",
"Kloeckner",
"Hornbach",
"Bechtle",
"Nordex",
"Wacker",
"Gea",
"Vonovia",
"Prosiebensat1",
"Leoni",
"MTU",
"1&1",
"Jungheinrich",
"K+S",
"Hellofresh",
"Symrise",
"Aurelius",
"Mvv",
"Bilfinger",
"Draegerwerk",
"Krones",
"Duerr",
"Osram",
"Auto1",
"Deutsche Wohnen",
"Kabel Deutschland",
"Freenet",
"Kuka",
"Delivery Hero",
"Paul Hartmann",
"Fuchs Petrolub",
"Sartorius",
"Gelsenwasser",
"Mainova",
"Ksb",
"Heidelberger Druckmaschinen",
"Sixt",
"Hugo Boss",
"Dmg Mori",
"Mutares",
"Zooplus",
"Grammer",
"Fraport",
"Wacker Neuson",
"Indus Holding",
"Leg Immobilien",
"Elringklinger",
"Stroeer",
"Fielmann",
"Gerresheimer"
]

View File

@@ -0,0 +1,77 @@
SentiWS
~~~~~~~
SentimentWortschatz, or SentiWS for short, is a publicly available German-language resource for sentiment analysis, opinion mining etc. It lists positive and negative polarity bearing words weighted within the interval of [-1; 1] plus their part of speech tag, and if applicable, their inflections. The current version of SentiWS (v2.0) contains around 1,650 positive and 1,800 negative words, which sum up to around 16,000 positive and around 18,000 negative word forms incl. their inflections, respectively. It not only contains adjectives and adverbs explicitly expressing a sentiment, but also nouns and verbs implicitly containing one.
License
~~~~~~~
SentiWS is licensed under a Creative Commons Attribution-Noncommercial-Share Alike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).
Obtain a Copy
~~~~~~~~~~~~~
The latest version of SentiWS can be found at https://wortschatz.uni-leipzig.de/download/.
Data Format
~~~~~~~~~~~
SentiWS is organised in two utf8-encoded text files structured the following way:
<Word>|<POS tag> \t <Polarity weight> \t <Infl_1>,...,<Infl_k> \n
where \t denotes a tab, and \n denotes a new line.
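For illustration, a minimal parser sketch for this format (hypothetical, not part of SentiWS itself; the handling of the optional inflection column follows the description above):

```python
def parse_sentiws(path: str) -> dict[str, float]:
    """Map each word form (base form and inflections) to its polarity weight."""
    weights: dict[str, float] = {}
    with open(path, encoding="utf-8") as file:
        for line in file:
            head, weight, *inflections = line.rstrip("\n").split("\t")
            word, _pos_tag = head.split("|")  # e.g. "Abmachung|NN"
            polarity = float(weight)
            weights[word] = polarity
            if inflections:  # the inflection column may be absent
                for form in inflections[0].split(","):
                    weights[form] = polarity
    return weights
```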
Citation
~~~~~~~~
If you use SentiWS in your work we kindly ask you to cite
R. Remus, U. Quasthoff & G. Heyer: SentiWS - a Publicly Available German-language Resource for Sentiment Analysis.
In: Proceedings of the 7th International Language Resources and Evaluation (LREC'10), 2010
or use the following BibTeX-code snippet:
@INPROCEEDINGS{remquahey2010,
title = {SentiWS -- a Publicly Available German-language Resource for Sentiment Analysis},
booktitle = {Proceedings of the 7th International Language Resources and Evaluation (LREC'10)},
author = {Remus, R. and Quasthoff, U. and Heyer, G.},
year = {2010}
}
Version History
~~~~~~~~~~~~~~~
SentiWS is "work in progress" and hence far from being fully-fledged and error-free. It will be continuously refined by adding missing words and word forms and removing ambiguous ones.
v1.8b, 2010-05-19: First publicly available version as described in Remus et al. (2010).
v1.8c, 2012-03-21: Second publicly available version in which some POS tags were corrected.
v2.0, 2018-10-19: Third publicly available version in which the inflected forms were extended.
Statistics
~~~~~~~~~~
                        Positive   Negative
Adjectives   Baseforms        792        712
             Inflections   10,936     10,471
Adverbs      Baseforms          7          4
             Inflections        5          0
Nouns        Baseforms        548        688
             Inflections      736      1,158
Verbs        Baseforms        297        423
             Inflections    3,246      4,580
All          Baseforms      1,644      1,827
             Inflections   14,923     16,209
Total                      16,567     18,036

Table: Overview of the dictionary's content
SentiWS.txt was last updated on 2019-09-12.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,110 @@
"""Pipeline to get Entities from Staging DB."""
import json
import sys
from loguru import logger
from tqdm import tqdm
import aki_prj23_transparenzregister.utils.mongo.connector as conn
import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news
from aki_prj23_transparenzregister.config.config_providers import (
JsonFileConfigProvider,
)
from aki_prj23_transparenzregister.utils.mongo import ner_service
logger.add(sys.stdout, colorize=True)
class EntityPipeline:
"""Class to initialize NER Pipeline."""
def __init__(self, conn_string: conn.MongoConnection) -> None:
"""Method to connect to StagingDB."""
self.connect_string = conn_string
self.connect_string.database = "transparenzregister_ner"
self.connector = conn.MongoConnector(self.connect_string)
self.news_obj = news.MongoNewsService(self.connector)
def process_documents(
self, entity: str, doc_attrib: str, ner_selection: str
) -> None:
"""Method to check documents, get entities and write them to document."""
CursorUnprogressed = self.news_obj.collection.find( # noqa: N806
{"companies": {"$exists": False}}
)
documents = list(CursorUnprogressed)
logger.info("Dokumente: ", str(CursorUnprogressed))
# Determine NER service based on config
# spaCy
if ner_selection == "use_spacy_ner":
ner_service_instance = ner_service.NerAnalysisService(
use_spacy=True, use_transformer=False, use_companylist=False
)
ner_service_func = ner_service_instance.ner_spacy
# company list
elif ner_selection == "use_companylist_ner":
ner_service_instance = ner_service.NerAnalysisService(
use_spacy=False, use_transformer=False, use_companylist=True
)
ner_service_func = ner_service_instance.ner_company_list
# transformer
elif ner_selection == "use_transformer_ner":
ner_service_instance = ner_service.NerAnalysisService(
use_spacy=False, use_transformer=True, use_companylist=False
)
ner_service_func = ner_service_instance.ner_transformer
if len(documents) > 0:
for document in tqdm(documents):
ents = ner_service_func(document, entity, doc_attrib)
self.news_obj.collection.update_one(
{"_id": document["_id"]},
{"$set": {"companies": ents}},
)
else:
logger.info("No documents found.")
if __name__ == "__main__":
# Establish MongoDB Connection using secrets
config_provider = JsonFileConfigProvider("./secrets.json")
connect_string = config_provider.get_mongo_connection_string()
# dir of config json
config_file_path = (
"src/aki_prj23_transparenzregister/utils/mongo/ner_sentiment_config.json"
)
# Load NER service configuration from JSON
with open(config_file_path) as config_file:
ner_config = json.load(config_file)
# read configuration
entity = ner_config["ner_service"]["entity"]
logger.info("NER Pipeline: searching for entity of type", str(entity))
doc_attrib = ner_config["ner_service"]["doc_attrib"]
logger.info("NER Pipeline: searching in document attribute ", str(doc_attrib))
# read selected service
if ner_config["ner_service"]["use_companylist_ner"] is True:
ner_selection = "use_companylist_ner"
logger.info("NER Pipeline: Searching entities with company list")
elif ner_config["ner_service"]["use_spacy_ner"] is True:
ner_selection = "use_spacy_ner"
logger.info("NER Pipeline: Searching entities with spaCy")
elif ner_config["ner_service"]["use_transformer_ner"] is True:
ner_selection = "use_transformer_ner"
logger.info("NER Pipeline: Searching entities with transformer")
else:
logger.info(
"NER Pipeline: No NER services selected or error in configuration file."
)
entity_pipeline = EntityPipeline(connect_string)
entity_pipeline.process_documents(entity, doc_attrib, ner_selection)
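For reference, the pipeline can also be driven directly instead of via the `__main__` block; a minimal usage sketch based on the class as defined above:

```python
from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
from aki_prj23_transparenzregister.utils.mongo.ner_pipeline import EntityPipeline

# Build the connection the same way the __main__ block does.
connect_string = JsonFileConfigProvider("./secrets.json").get_mongo_connection_string()
pipeline = EntityPipeline(connect_string)
pipeline.process_documents(entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner")
```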

View File

@@ -0,0 +1,17 @@
{
"sentiment_service": {
"comment": "Select only one service by setting true and deselect the other with false. Valid doc_attrib: text, title",
"use_spacy": false,
"use_transformer": true,
"doc_attrib": "text"
},
"ner_service": {
"comment": "Select only one service by setting true and deselect the other with false. Valid doc_attrib: text, title",
"use_spacy_ner": false,
"use_transformer_ner": true,
"use_companylist_ner":false,
"doc_attrib": "text",
"entity":"ORG"
}
}
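Nothing in the file itself prevents enabling two services at once, in which case the pipelines silently take the first matching branch. A small validation sketch (hypothetical, not part of this commit) that enforces the rule stated in the comment fields:

```python
import json

with open("ner_sentiment_config.json") as file:  # path is illustrative
    config = json.load(file)

ner_flags = ["use_spacy_ner", "use_transformer_ner", "use_companylist_ner"]
selected = [flag for flag in ner_flags if config["ner_service"][flag]]
if len(selected) != 1:
    raise ValueError(f"Select exactly one NER service, got: {selected}")
```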

View File

@@ -0,0 +1,150 @@
"""NER Service module."""
import json
from collections import Counter
from typing import Final
import spacy
from transformers import pipeline
class NerAnalysisService:
"""Class to initialize NER model."""
def __init__(
self,
use_spacy: bool = False,
use_transformer: bool = False,
use_companylist: bool = False,
) -> None:
"""Method to check which sentiment model is chosen."""
if use_spacy:
self.init_spacy()
if use_transformer:
self.init_transformer()
if use_companylist:
self.init_companylist()
def init_spacy(self) -> None:
"""Method to initialize spaCy.
Optimized by ChatGPT.
"""
# check if model is available and load it
SPACY_MODEL_NAME: Final[str] = "de_core_news_lg" # noqa: N806
if not spacy.util.is_package(SPACY_MODEL_NAME):
from spacy.cli.download import download as spacy_download
spacy_download(SPACY_MODEL_NAME) # type: ignore
self.nlp = spacy.load(SPACY_MODEL_NAME)
def init_transformer(self) -> None:
"""Method to initialize transformer."""
# init NER Transformer
self.classifier = pipeline(
"ner",
model="fhswf/bert_de_ner",
grouped_entities=True,
tokenizer="dbmdz/bert-base-german-cased",
)
def init_companylist(self) -> None:
"""Method to initialize company list."""
with open(
"src/aki_prj23_transparenzregister/utils/mongo/CompEntities.json", "rb"
) as complist:
self.complist = json.load(complist)
def ner_spacy(
self, doc: dict, ent_type: str = "ORG", doc_attrib: str = "title"
) -> dict:
"""Named Entity Recognition with Spacy.
Args:
doc: a document which is processed with spacy
ent_type: string with specific entity (LOC - Location; PERSON - People; NORP - Nationalities or religious or political groups;
FAC - Buildings, airports, highways, bridges, etc.; ORG - Companies, agencies, institutions, etc.;
GPE - Countries, cities, states.; LOC - Non-GPE locations, mountain ranges, bodies of water)
doc_attrib: which attribute of the document has to be processed: text or title
Returns:
dict of entities and their counts.
"""
# init list for entities
entities = []
text = doc[doc_attrib]
# get entities
doc_nlp = self.nlp(text)
# select company
for ent in doc_nlp.ents:
if ent.label_ == ent_type:
entities.append(ent.text)
return dict(Counter(entities))
def ner_company_list(
self, doc: dict, ent_type: str = "ORG", doc_attrib: str = "title"
) -> dict:
"""Named Entity Recognition by String comparision.
Args:
doc: a dict from where entities are searched
ent_type: type of searched entity
doc_attrib: which attribute of the dict is searched
Returns:
dict with entities and their counts.
"""
# Convert all entries in the company_list to lowercase
self.complist = [company_name.lower() for company_name in self.complist]
# Create an empty list to store the entities
entities = []
# Search the text for company names
text = doc[doc_attrib]
# Convert title to lowercase
text = text.lower()
for company_name in self.complist:
start_idx = text.find(company_name)
if start_idx != -1: # word found
entity = company_name
if entity not in entities:
entities.append(entity)
return dict(Counter(entities))
def ner_transformer(
self, doc: dict, ent_type: str = "ORG", doc_attrib: str = "title"
) -> dict:
"""Named Entity Recognition with Transformer.
Args:
doc: a string which is processed with a transformer model
ent_type: string with specific entity (PERSON - People; NORP - Nationalities or religious or political groups;
FAC - Buildings, airports, highways, bridges, etc.; ORG - Companies, agencies, institutions, etc.;
GPE - Countries, cities, states.; LOC - Non-GPE locations, mountain ranges, bodies of water).
doc_attrib: Attribute of the document (title or text)
Returns:
dict of entities and their counts.
"""
# init list for entities
entities = []
text = doc[doc_attrib]
sentences = text.split(". ") # Split text into sentences based on '. '
# Process each sentence separately
for sentence in sentences:
res = self.classifier(
sentence
) # Assuming 'classifier' processes a single sentence at a time
for i in range(len(res)):
if res[i]["entity_group"] == ent_type:
entities.append(res[i]["word"])
return dict(Counter(entities))
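A minimal usage sketch for the service above; the sample document and expected result mirror the unit tests further down in this change:

```python
from aki_prj23_transparenzregister.utils.mongo.ner_service import NerAnalysisService

service = NerAnalysisService(use_spacy=True)  # downloads de_core_news_lg on first use
doc = {"title": "Siemens ist ein Unternehmen."}
print(service.ner_spacy(doc, ent_type="ORG", doc_attrib="title"))
# -> {"Siemens": 1}
```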

View File

@@ -0,0 +1,91 @@
"""Pipeline to get sentiments from Staging DB nes articles."""
import json
import os
from loguru import logger
from tqdm import tqdm
import aki_prj23_transparenzregister.utils.mongo.connector as conn
import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news
from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
from aki_prj23_transparenzregister.config.config_template import MongoConnection
from aki_prj23_transparenzregister.utils.mongo import sentiment_service
class SentimentPipeline:
"""Class to initialize sentiment Pipeline."""
def __init__(self, conn_string: MongoConnection) -> None:
"""Method to connect to StagingDB."""
self.connect_string = conn_string
self.connect_string.database = "transparenzregister_ner"
self.connector = conn.MongoConnector(self.connect_string)
self.news_obj = news.MongoNewsService(self.connector)
def process_documents(self, doc_attrib: str, sentiment_selection: str) -> None:
"""Method to check documents, get entities and write them to document."""
CursorUnprogressed = self.news_obj.collection.find( # noqa: N806
{"sentiment": {"$exists": False}}
)
documents = list(CursorUnprogressed)
if len(documents) > 0:
for document in tqdm(documents):
text = document[doc_attrib]
# Determine sentiment analysis service based on config
if sentiment_selection == "use_spacy":
selected_service = sentiment_service.SentimentAnalysisService(
use_spacy=True, use_transformer=False
)
sentiment_service_func = selected_service.sentiment_spacy
elif sentiment_selection == "use_transformer":
selected_service = sentiment_service.SentimentAnalysisService(
use_spacy=False, use_transformer=True
)
sentiment_service_func = selected_service.sentiment_transformer
# sents = selected_service.sentiment_spacy(text)
sents = sentiment_service_func(text)
sentiment = {"label": sents[0], "score": sents[1]}
self.news_obj.collection.update_one(
{"_id": document["_id"]},
{"$set": {"sentiment": sentiment}},
)
else:
logger.info("No documents found.")
if __name__ == "__main__":
# Establish MongoDB Connection using secrets
config_provider = JsonFileConfigProvider("./secrets.json")
connect_string = config_provider.get_mongo_connection_string()
# dir of config json
script_dir = os.path.dirname(__file__)
config_file_path = os.path.join(script_dir, "ner_sentiment_config.json")
# Load sentiment service configuration from JSON
with open(config_file_path) as config_file:
sentiment_config = json.load(config_file)
# Where to search the sentiment
doc_attrib = sentiment_config["sentiment_service"]["doc_attrib"]
logger.info("Sentiment Pipeline: searching in document attribute ", str(doc_attrib))
# read selected service
if sentiment_config["sentiment_service"]["use_spacy"] is True:
sentiment_selection = "use_spacy"
logger.info("Sentiment Pipleline: Searching sentiments with spaCy")
elif sentiment_config["sentiment_service"]["use_transformer"] is True:
sentiment_selection = "use_transformer"
logger.info("Sentiment Pipleline: Searching sentiments with transformer")
else:
logger.info(
"Sentiment Pipleline: No Sentiment services selected or error in configuration file."
)
sentiment_pipeline = SentimentPipeline(connect_string)
sentiment_pipeline.process_documents(doc_attrib, sentiment_selection)
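As with the NER pipeline, a minimal direct-invocation sketch based on the class above:

```python
from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
from aki_prj23_transparenzregister.utils.mongo.sentiment_pipeline import SentimentPipeline

connect_string = JsonFileConfigProvider("./secrets.json").get_mongo_connection_string()
pipeline = SentimentPipeline(connect_string)
pipeline.process_documents("text", "use_transformer")
```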

View File

@@ -0,0 +1,170 @@
"""Service for Sentiment analysis."""
import os
import zipfile
from typing import Final
import requests
import spacy
from loguru import logger
from spacy_sentiws import spaCySentiWS # noqa: F401
from transformers import pipeline
class SentimentAnalysisService:
"""Class to initialize spaCy or Transformer model."""
def __init__(self, use_spacy: bool = False, use_transformer: bool = False) -> None:
"""Method to check which sentiment model is chosen."""
if use_spacy:
self.init_spacy()
if use_transformer:
self.init_transformer()
def init_spacy(self) -> None:
"""Method to initialize spaCy."""
# check if model is available and load it
SPACY_MODEL_NAME: Final[str] = "de_core_news_lg" # noqa: N806
if not spacy.util.is_package(SPACY_MODEL_NAME):
cli = spacy.cli # type: ignore
cli.download(SPACY_MODEL_NAME) # type: ignore
self.nlp = spacy.load(SPACY_MODEL_NAME)
# path to spaCy vocabulary
PATH: Final[ # noqa: N806
str
] = "src/aki_prj23_transparenzregister/utils/mongo/SentiWS/"
# check if the vocabulary exists, otherwise download it
if not os.path.exists(PATH):
URL: Final[ # noqa: N806
str
] = "https://downloads.wortschatz-leipzig.de/etc/SentiWS/SentiWS_v2.0.zip"
logger.info("SentiWS vocabulary not found. Starting download...")
# Create the data directory if it doesn't exist
os.makedirs(PATH, exist_ok=True)
# File path for the downloaded ZIP file
zip_file_path = os.path.join(PATH, "SentiWS_v2.0.zip")
# Download the ZIP file
response = requests.get(URL) # noqa: S113
with open(zip_file_path, "wb") as zip_file:
zip_file.write(response.content)
# Extract the ZIP file
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
zip_ref.extractall(PATH)
# Remove the downloaded ZIP file if it's no longer needed
os.remove(zip_file_path)
logger.info("SentiWS data downloaded and extracted successfully.")
else:
logger.info("SentiWS data directory already exists.")
# create spaCy pipeline
self.nlp.add_pipe("sentiws", config={"sentiws_path": PATH})
def init_transformer(self) -> None:
"""Method to initialize transformer."""
# loading the sentiment model(~ 436MB) for transformer
self.sentiment_analyzer = pipeline(
"sentiment-analysis", model="oliverguhr/german-sentiment-bert"
)
def sentiment_spacy(self, doc: str) -> tuple:
"""Sentiment Analytics with Spacy.
Args:
doc: a document which is processed with spaCy
Returns:
tuple of label (positive, negative or neutral) and score.
"""
# set limits for sentiments
_upperlimit = 0.1
_lowerlimit = -0.1
_doc = self.nlp(doc)
_score = None
_sent = None
# init a sentiment counter
_pos = 0
_neg = 0
# init a summarizer for maximum sentiment score to normalize values
_max_score = 0
for token in _doc:
token_score = token._.sentiws # noqa: SLF001
if token_score is not None:
_max_score += abs(token_score)
if token_score < 0:
_neg += token_score
if token_score > 0:
_pos += token_score
# Normalize the score to the range 0..1
_normalized_score = (_pos - abs(_neg)) / _max_score if _max_score > 0 else 0
if _normalized_score > _upperlimit:
_sent = "positive"
elif _normalized_score < _lowerlimit:
_sent = "negative"
else:
_sent = "neutral"
return _sent, abs(_normalized_score)
def sentiment_transformer(self, doc: str) -> tuple:
"""Sentiment Analysis with Transformer.
Args:
doc: a string which is processed with a transformer model
Returns:
sentiment and score.
"""
sentences = doc.split(". ") # Split text into sentences based on '. '
# init total sentiment and score counter
total_score = 0
total_positive_score = 0
total_negative_score = 0
total_neutral_score = 0
_score = None
_sent = None
# Process each sentence separately
for sentence in sentences:
# get sentiment
results = self.sentiment_analyzer(sentence)
_score = results[0]["score"]
_sent = results[0]["label"]
# sum up specific score
if _sent == "positive":
total_positive_score += _score
elif _sent == "negative":
total_negative_score += _score
else:
total_neutral_score += _score
# sum up total score
total_score += _score
# total specific score
total_positive_score_normalized = total_positive_score / total_score
total_negative_score_normalized = total_negative_score / total_score
total_neutral_score_normalized = total_neutral_score / total_score
if total_positive_score_normalized > total_negative_score_normalized:
final_sentiment = "positive"
out_score = total_positive_score_normalized
elif total_positive_score_normalized < total_negative_score_normalized:
final_sentiment = "negative"
out_score = total_negative_score_normalized
else:
final_sentiment = "neutral"
out_score = total_neutral_score_normalized
return final_sentiment, out_score
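A minimal usage sketch, mirroring the transformer tests below (the roughly 436 MB model is downloaded on first initialization):

```python
from aki_prj23_transparenzregister.utils.mongo.sentiment_service import (
    SentimentAnalysisService,
)

service = SentimentAnalysisService(use_transformer=True)
label, score = service.sentiment_transformer(
    "Dies ist ein großartiger Test. Ich liebe es!"
)
print(label, score)  # expected label: "positive"
```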

View File

@@ -0,0 +1,291 @@
"""Tests for checking NER Pipeline."""
from unittest.mock import Mock, patch
import pytest
from aki_prj23_transparenzregister.config.config_template import MongoConnection
from aki_prj23_transparenzregister.utils.mongo.ner_pipeline import EntityPipeline
@pytest.fixture()
def mock_mongo_connection() -> MongoConnection:
"""Mock MongoConnector class.
Args:
mocker (any): Library mocker
Returns:
Mock: Mocked MongoConnector
"""
return MongoConnection("", "", None, "" "", "")
@pytest.fixture()
def mock_mongo_connector(mocker: Mock) -> Mock:
"""Mock MongoConnector class.
Args:
mocker (any): Library mocker
Returns:
Mock: Mocked MongoConnector
"""
mock = Mock()
mocker.patch(
"aki_prj23_transparenzregister.utils.mongo.connector.MongoConnector",
return_value=mock,
)
mock.database = {"news": Mock()}
return mock
@pytest.fixture()
def mock_spacy(mocker: Mock) -> Mock:
"""Mock MongoConnector class.
Args:
mocker (any): Library mocker
Returns:
Mock: Mocked MongoConnector
"""
mock = Mock()
mocker.patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.init_spacy",
return_value=mock,
)
return mock
# Mocking the NerAnalysisService methods
@patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_spacy"
)
def test_entity_pipeline_with_spacy(
mock_ner_spacy: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_spacy.return_value = {"ORG": 2, "PERSON": 1}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents = [{"_id": "document1", "title": "Apple Inc is a tech company."}]
# Set the collection to the mock_collection
entity_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method with spaCy NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
)
# Ensure that ner_spacy was called with the correct parameters
mock_ner_spacy.assert_called_once_with(mock_documents[0], "ORG", "title")
# Ensure that the document in the collection was updated with the NER results
mock_collection.update_one.assert_called_once_with(
{"_id": "document1"},
{"$set": {"companies": {"ORG": 2, "PERSON": 1}}},
)
@patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_spacy"
)
def test_entity_pipeline_with_spacy_no_docs(
mock_ner_spacy: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_spacy.return_value = {"ORG": 2, "PERSON": 1}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents: list[dict] = []
# Set the collection to the mock_collection
entity_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method with spaCy NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
)
# Ensure that ner_spacy was not called
mock_ner_spacy.assert_not_called()
# Ensure that the document in the collection was not updated
mock_collection.update_one.assert_not_called()
@patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_company_list"
)
def test_entity_pipeline_with_companylist_ner(
mock_ner_companylist: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_companylist.return_value = {"ORG": 3, "LOCATION": 2}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents = [
{"_id": "document2", "title": "Siemens ist ein deutsches Unternehmen."}
]
# Set the collection to the mock_collection
entity_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method with Company List NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_companylist_ner"
)
# Ensure that ner_company_list was called with the correct parameters
mock_ner_companylist.assert_called_once_with(mock_documents[0], "ORG", "title")
# Ensure that the document in the collection was updated with the NER results
mock_collection.update_one.assert_called_once_with(
{"_id": "document2"},
{"$set": {"companies": {"ORG": 3, "LOCATION": 2}}},
)
@patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_company_list"
)
def test_entity_pipeline_with_companylist_ner_no_docs(
mock_ner_companylist: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_companylist.return_value = {"ORG": 3, "LOCATION": 2}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents: list[dict] = []
# Set the collection to the mock_collection
entity_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method with Company List NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_companylist_ner"
)
# Ensure that ner_company_list is not called
mock_ner_companylist.assert_not_called()
# Ensure that the document in the collection was not updated
mock_collection.update_one.assert_not_called()
# Add more test cases for other NER methods (e.g., use_companylist_ner, use_transformer_ner) following a similar pattern.
@patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_spacy"
)
def test_entity_pipeline_with_transformer(
mock_ner_transformer: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_transformer.return_value = {"ORG": 2, "PERSON": 1}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents = [{"_id": "document1", "title": "Apple Inc is a tech company."}]
# Set the collection to the mock_collection
entity_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method with spaCy NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
)
# Ensure that ner_spacy was called with the correct parameters
mock_ner_transformer.assert_called_once_with(mock_documents[0], "ORG", "title")
# Ensure that the document in the collection was updated with the NER results
mock_collection.update_one.assert_called_once_with(
{"_id": "document1"},
{"$set": {"companies": {"ORG": 2, "PERSON": 1}}},
)
@patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_spacy"
)
def test_entity_pipeline_with_transformer_no_docs(
mock_ner_transformer: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_transformer.return_value = {"ORG": 2, "PERSON": 1}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents: list[dict] = []
# Set the collection to the mock_collection
entity_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method with spaCy NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
)
# Ensure that ner_transformer is not called
mock_ner_transformer.assert_not_called()
# Ensure that the document in the collection was not updated
mock_collection.update_one.assert_not_called()

View File

@@ -0,0 +1,54 @@
"""Tests for checking NER Services."""
from aki_prj23_transparenzregister.utils.mongo.ner_service import NerAnalysisService
def test_ner_spacy() -> None:
"""Mock TestNerService."""
# Create instance of NerAnalysisService with use_spacy=True
ner_service = NerAnalysisService(
use_spacy=True, use_transformer=False, use_companylist=False
)
# 1st testing
doc = {"title": "Siemens ist ein Unternehmen."}
result = ner_service.ner_spacy(doc, ent_type="ORG", doc_attrib="title")
assert result == {"Siemens": 1}
# 2nd testing
doc = {"text": "BASF ist ein großes Unternehmen."}
result = ner_service.ner_spacy(doc, ent_type="ORG", doc_attrib="text")
assert result == {"BASF": 1}
def test_ner_company_list() -> None:
"""Mock test_ner_company."""
# Create instance of NerAnalysisService with use_use_companylist=True
ner_service = NerAnalysisService(
use_spacy=False, use_transformer=False, use_companylist=True
)
doc = {"title": "Siemens ist ein Unternehmen."}
result = ner_service.ner_company_list(doc, ent_type="ORG", doc_attrib="title")
assert result == {"siemens": 1}
# 2nd testing
doc = {"text": "BASF ist ein großes Unternehmen."}
result = ner_service.ner_company_list(doc, ent_type="ORG", doc_attrib="text")
assert result == {"basf": 1}
def test_ner_transformer() -> None:
"""Mock test_ner_company."""
# Create instance of NerAnalysisService with use_use_companylist=True
ner_service = NerAnalysisService(
use_spacy=False, use_transformer=True, use_companylist=False
)
doc = {"title": "Siemens ist ein Unternehmen."}
result = ner_service.ner_transformer(doc, ent_type="ORG", doc_attrib="title")
assert result == {"Siemens": 1}
# 2nd testing
doc = {"text": "BASF ist ein großes Unternehmen."}
result = ner_service.ner_transformer(doc, ent_type="ORG", doc_attrib="text")
assert result == {"BASF": 1}

View File

@@ -0,0 +1,210 @@
"""Unit test for sentiment pipeline."""
from unittest.mock import Mock, patch
import pytest
from aki_prj23_transparenzregister.config.config_template import MongoConnection
from aki_prj23_transparenzregister.utils.mongo.sentiment_pipeline import (
SentimentPipeline,
)
@pytest.fixture()
def mock_mongo_connection() -> MongoConnection:
"""Mock MongoConnector class.
Args:
mocker (any): Library mocker
Returns:
Mock: Mocked MongoConnector
"""
return MongoConnection("", "", None, "" "", "")
@pytest.fixture()
def mock_mongo_connector(mocker: Mock) -> Mock:
"""Mock MongoConnector class.
Args:
mocker (any): Library mocker
Returns:
Mock: Mocked MongoConnector
"""
mock = Mock()
mocker.patch(
"aki_prj23_transparenzregister.utils.mongo.connector.MongoConnector",
return_value=mock,
)
mock.database = {"news": Mock()}
return mock
@pytest.fixture()
def mock_spacy(mocker: Mock) -> Mock:
"""Mock MongoConnector class.
Args:
mocker (any): Library mocker
Returns:
Mock: Mocked MongoConnector
"""
mock = Mock()
mocker.patch(
"aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.init_spacy",
return_value=mock,
)
return mock
@patch(
"aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.sentiment_spacy"
)
def test_sentiment_pipeline_existing_sentiment(
mock_sentiment_spacy: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific sentiment result
mock_sentiment_spacy.return_value = ("positive", 0.8)
# Create an instance of the SentimentPipeline
sentiment_pipeline = SentimentPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents = [
{
"_id": "document1",
"text": "This is a positive text.",
"sentiment": {"label": "neutral", "score": 0.5},
}
]
# Set the collection to the mock_collection
sentiment_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method
sentiment_pipeline.process_documents("text", "use_spacy")
# Ensure that sentiment_spacy was called with the correct text
mock_sentiment_spacy.assert_called_once_with("This is a positive text.")
# Note: the mocked find() returns the document regardless of the
# {"sentiment": {"$exists": False}} filter, so update_one is still called here.
@patch(
"aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.sentiment_spacy"
)
def test_sentiment_pipeline_no_documents(
mock_sentiment_spacy: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific sentiment result
mock_sentiment_spacy.return_value = ("positive", 0.8)
# Create an instance of the SentimentPipeline
sentiment_pipeline = SentimentPipeline(mock_mongo_connection)
# Mock the news collection to return an empty result
mock_collection = Mock()
mock_collection.find.return_value = []
# Set the collection to the mock_collection
sentiment_pipeline.news_obj.collection = mock_collection
# Call the process_documents method
sentiment_pipeline.process_documents("text", "use_spacy")
# Ensure that sentiment_spacy was not called
mock_sentiment_spacy.assert_not_called()
# Ensure that the document in the collection was not updated with sentiment
mock_collection.update_one.assert_not_called()
@patch(
"aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.sentiment_spacy"
)
def test_sentiment_pipeline_with_spacy(
mock_sentiment_spacy: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific sentiment result
mock_sentiment_spacy.return_value = ("positive", 0.8)
# Create an instance of the SentimentPipeline
sentiment_pipeline = SentimentPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents = [{"_id": "document1", "text": "This is a positive text."}]
# Set the collection to the mock_collection
sentiment_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method
sentiment_pipeline.process_documents("text", "use_spacy")
# Ensure that sentiment_spacy was called with the correct text
mock_sentiment_spacy.assert_called_once_with("This is a positive text.")
# Ensure that the document in the collection was updated with the sentiment result
mock_collection.update_one.assert_called_once_with(
{"_id": "document1"},
{"$set": {"sentiment": {"label": "positive", "score": 0.8}}},
)
# Mocking the SentimentAnalysisService methods
@patch(
"aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.sentiment_transformer"
)
def test_sentiment_pipeline_with_transformer(
mock_sentiment_transformer: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific sentiment result
mock_sentiment_transformer.return_value = ("negative", 0.6)
# Create an instance of the SentimentPipeline
sentiment_pipeline = SentimentPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents = [{"_id": "document2", "text": "This is a negative text."}]
# Set the collection to the mock_collection
sentiment_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method
sentiment_pipeline.process_documents("text", "use_transformer")
# Ensure that sentiment_transformer was called with the correct text
mock_sentiment_transformer.assert_called_once_with("This is a negative text.")
# Ensure that the document in the collection was updated with the sentiment result
mock_collection.update_one.assert_called_once_with(
{"_id": "document2"},
{"$set": {"sentiment": {"label": "negative", "score": 0.6}}},
)

View File

@@ -0,0 +1,78 @@
"""Tests for checking Sentiment Services."""
from aki_prj23_transparenzregister.utils.mongo.sentiment_service import (
SentimentAnalysisService,
)
def test_sentiment_service_with_spacy_pos() -> None:
"""Mock testing spaCy Sentiment Service with positive sentiment."""
# Init SentimentAnalysisService with spaCy
sentiment_service = SentimentAnalysisService(use_spacy=True)
# run the test
text = "Dies ist ein großartiger Test. Ich liebe es!"
sentiment, score = sentiment_service.sentiment_spacy(text)
assert sentiment == "positive"
assert score > 0
def test_sentiment_service_with_spacy_neg() -> None:
"""Mock testing spaCy Sentiment Service with negative sentiment."""
# Init SentimentAnalysisService with spaCy
sentiment_service = SentimentAnalysisService(use_spacy=True)
# run the test
text = "Dies ist ein wirklich schrecklicher Test. Ich hasse ihn!"
sentiment, score = sentiment_service.sentiment_spacy(text)
assert sentiment == "negative"
assert score > 0
def test_sentiment_service_with_spacy_neut() -> None:
"""Mock testing spaCy Sentiment Service with neutral sentiment."""
# Init SentimentAnalysisService with spaCy
sentiment_service = SentimentAnalysisService(use_spacy=True)
# run the test
text = "Dies ist ein Test."
sentiment, score = sentiment_service.sentiment_spacy(text)
assert sentiment == "neutral"
assert score >= 0
def test_sentiment_service_with_transformer_pos() -> None:
"""Mock testing Transformer Sentiment Service with positive Sentiment."""
# Init SentimentAnalysisService with Transformer
sentiment_service = SentimentAnalysisService(use_transformer=True)
# run the test
text = "Dies ist ein großartiger Test. Ich liebe es!"
sentiment, score = sentiment_service.sentiment_transformer(text)
assert sentiment == "positive"
assert score > 0
def test_sentiment_service_with_transformer_neg() -> None:
"""Mock testing Transformer Sentiment Service with negative Sentiment."""
# Init SentimentAnalysisService with Transformer
sentiment_service = SentimentAnalysisService(use_transformer=True)
# run the test
text = "Dies ist ein wirklich schrecklicher Test. Ich hasse ihn!"
sentiment, score = sentiment_service.sentiment_transformer(text)
assert sentiment == "negative"
assert score > 0
def test_sentiment_service_with_transformer_neut() -> None:
"""Mock testing Transformer Sentiment Service with neutral Sentiment."""
# Init SentimentAnalysisService with Transformer
sentiment_service = SentimentAnalysisService(use_transformer=True)
# run the test
text = "Das ist ein Text, ohne besondere Stimmung."
sentiment, score = sentiment_service.sentiment_transformer(text)
assert sentiment == "neutral"
assert score >= 0