Feature/ner (#103)

NER and sentiment pipeline with services for data extraction.

---------

Co-authored-by: Philipp Horstenkamp <philipp@horstenkamp.de>
Co-authored-by: TrisNol <tristan.nolde@yahoo.de>
This commit is contained in:
Sebastian 2023-10-16 19:54:24 +02:00 committed by GitHub
parent 99b61e7c2e
commit c680ac9759
28 changed files with 12509 additions and 10 deletions

View File

@@ -26,7 +26,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-create: false
       - run: poetry install --with doc --all-extras --without test,lint
       - name: Doc-Build

View File

@@ -28,7 +28,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-create: false
           virtualenvs-path: ~/local/share/virtualenvs
       - run: poetry install --without develop,doc --all-extras
@@ -56,7 +56,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-create: true
           virtualenvs-path: ~/local/share/virtualenvs
       - name: Check out Git repository

View File

@@ -20,10 +20,14 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: 3.11
+      - name: Install Cuda
+        run: |
+          sudo apt update
+          # sudo apt install cuda-10-0 -y
      - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-path: ~/local/share/virtualenvs
       - id: cache-pipenv
         uses: actions/cache@v3
@@ -85,7 +89,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-path: ~/local/share/virtualenvs
       - run: |
           poetry install --only test --all-extras
@@ -113,7 +117,7 @@ jobs:
       - name: Install and configure Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-path: ~/local/share/virtualenvs
       - id: cache-pipenv
         uses: actions/cache@v3

View File

@@ -0,0 +1 @@
(new draw.io diagram: single-line compressed mxfile XML, page "Page-1"; content not human-readable)

View File

@@ -0,0 +1 @@
(new draw.io diagram: single-line compressed mxfile XML, pages "Pipeline" and "Service"; content not human-readable)

View File

@@ -0,0 +1,38 @@
```mermaid
flowchart LR
DBConnect["`**Mongo Connect**
- create connection string
- establish connection`"]
DBRead["`**Mongo Read**
- read database
- get fields without attribute 'companies'`"]
NER["`**NERService**
- process news article
- get entities`"]
DBUpdate["`**Mongo Update Documents**
- update processed documents
- add an attribute 'companies'`"]
id1[["`**NERSpacy**
Named Entitiy Recognition with spaCy`"]]
id2[["`**NERCompanyList**
Named Entitiy Recognition by comparing text with list`"]]
id3[["`**NERTransformer**
Named Entitiy Recognition with transformer`"]]
DBConnect-->DBRead-->NER
NER--select service-->id1
NER--select service-->id2
NER--select service-->id3
id1-->DBUpdate
id2-->DBUpdate
id3-->DBUpdate
```
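The "select service" step in the chart maps a configuration flag to one of the three NER implementations. A minimal sketch of that dispatch, assuming the `NerAnalysisService` API introduced later in this change (the dict-based mapping is an illustration, not the shipped code, which uses an if/elif chain):

```python
from aki_prj23_transparenzregister.utils.mongo.ner_service import NerAnalysisService

# Map each config flag to a factory returning the matching service method.
NER_DISPATCH = {
    "use_spacy_ner": lambda: NerAnalysisService(use_spacy=True).ner_spacy,
    "use_companylist_ner": lambda: NerAnalysisService(use_companylist=True).ner_company_list,
    "use_transformer_ner": lambda: NerAnalysisService(use_transformer=True).ner_transformer,
}

ner_func = NER_DISPATCH["use_spacy_ner"]()  # the flag would come from the config file
```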

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,32 @@
```mermaid
flowchart LR
DBConnect["`**Mongo Connect**
- create connection string
- establish connection`"]
DBRead["`**Mongo Read**
- read database
- get fields without attribute 'companies'`"]
NER["`**SentimentService**
- process news article
- get sentiment`"]
DBUpdate["`**Mongo Update Documents**
- update processed documents
- add an attribute 'sentiment'`"]
id1[["`**SentimentSpacy**
Sentiment analysis with spaCy`"]]
id3[["`**SentimentTransformer**
Sentiment analysis with a transformer`"]]
DBConnect-->DBRead-->NER
NER--select service-->id1
NER--select service-->id3
id1-->DBUpdate
id3-->DBUpdate
```

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

Binary file not shown. (new image: 343 KiB)

View File

@@ -531,7 +531,7 @@
 " python-version: 3.11\n",
 " - uses: snok/install-poetry@v1 # setup poetry\n",
 " with:\n",
-" version: 1.4.2\n",
+" version: 1.6.1\n",
 " virtualenvs-path: ~/local/share/virtualenvs\n",
 " - uses: actions/checkout@v3\n",
 " - run: |\n",

poetry.lock (generated, 1254 changed lines)

File diff suppressed because it is too large Load Diff

View File

@ -65,12 +65,18 @@ python = "^3.11"
python-dotenv = "^1.0.0" python-dotenv = "^1.0.0"
seaborn = "^0.12.2" seaborn = "^0.12.2"
selenium = "^4.12.0" selenium = "^4.12.0"
spacy = "^3.6.1"
spacy-sentiws = "^3.0.0"
torch = {version = "*", source = "torch-cpu"}
torchaudio = {version = "*", source = "torch-cpu"}
torchvision = {version = "*", source = "torch-cpu"}
tqdm = "^4.66.1" tqdm = "^4.66.1"
transformers = {version = "*", extras = ["torch"]}
xmltodict = "^0.13.0" xmltodict = "^0.13.0"
[tool.poetry.extras] [tool.poetry.extras]
ingest = ["selenium", "deutschland", "xmltodict"] ingest = ["selenium", "deutschland", "xmltodict"]
transformation = [] transformation = ["torch", "torchaudio", "torchvision", "transformers", "spacy-sentiws", "spacy"]
web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn"] web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn"]
[tool.poetry.group.develop.dependencies] [tool.poetry.group.develop.dependencies]
@ -108,6 +114,7 @@ types-cachetools = "^5.3.0.6"
types-pyOpenSSL = "*" types-pyOpenSSL = "*"
types-requests = "^2.31.0.2" types-requests = "^2.31.0.2"
types-setuptools = "*" types-setuptools = "*"
types-tabulate = "^0.9.0.3"
types-tqdm = "^4.66.0.2" types-tqdm = "^4.66.0.2"
[tool.poetry.group.test.dependencies] [tool.poetry.group.test.dependencies]
@ -123,6 +130,11 @@ data-transformation = "aki_prj23_transparenzregister.utils.data_transfer:transfe
reset-sql = "aki_prj23_transparenzregister.utils.sql.connector:reset_all_tables_cli" reset-sql = "aki_prj23_transparenzregister.utils.sql.connector:reset_all_tables_cli"
webserver = "aki_prj23_transparenzregister.ui.app:main" webserver = "aki_prj23_transparenzregister.ui.app:main"
[[tool.poetry.source]]
name = "torch-cpu"
priority = "explicit"
url = "https://download.pytorch.org/whl/cpu"
[tool.ruff] [tool.ruff]
exclude = [ exclude = [
".bzr", ".bzr",

View File

@@ -0,0 +1,102 @@
[
"Volkswagen",
"Mercedes",
"Benz",
"Deutsche Telekom",
"Bmw",
"Deutsche Post",
"E.On",
"BASF",
"Siemens",
"Uniper",
"Bayer",
"Continental",
"Fresenius",
"Thyssen",
"Siemens",
"SAP",
"Metro",
"Hochtief",
"Traton",
"Ceconomy",
"ENBW",
"Adidas",
"Henkel",
"Heidelbergcement",
"Fresenius",
"Merck",
"Mckesson",
"RWE",
"Lufthansa",
"Hapag-Lloyd",
"Schaeffler",
"Evonik",
"Aurubis",
"Brenntag",
"Covestro",
"Infineon",
"Tui",
"Kion ",
"Zalando",
"Telefonica",
"Salzgitter",
"Beiersdorf",
"Suedzucker",
"Hella",
"Lanxess",
"Knorr",
"Rheinmetall",
"Hornbach",
"United",
"Puma",
"Baywa",
"Kloeckner",
"Hornbach",
"Bechtle",
"Nordex",
"Wacker",
"Gea",
"Vonovia",
"Prosiebensat1",
"Leoni",
"MTU",
"1&1",
"Jungheinrich",
"K+S",
"Hellofresh",
"Symrise",
"Aurelius",
"Mvv",
"Bilfinger",
"Draegerwerk",
"Krones",
"Duerr",
"Osram",
"Auto1",
"Deutsche Wohnen",
"Kabel Deutschland",
"Freenet",
"Kuka",
"Delivery Hero",
"Paul Hartmann",
"Fuchs Petrolub",
"Sartorius",
"Gelsenwasser",
"Mainova",
"Ksb",
"Heidelberger Druckmaschinen",
"Sixt",
"Hugo Boss",
"Dmg Mori",
"Mutares",
"Zooplus",
"Grammer",
"Fraport",
"Wacker Neuson",
"Indus Holding",
"Leg Immobilien",
"Elringklinger",
"Stroeer",
"Fielmann",
"Gerresheimer"
]

View File

@@ -0,0 +1,77 @@
SentiWS
~~~~~~~
SentimentWortschatz, or SentiWS for short, is a publicly available German-language resource for sentiment analysis, opinion mining etc. It lists positive and negative polarity bearing words weighted within the interval of [-1; 1] plus their part of speech tag, and if applicable, their inflections. The current version of SentiWS (v2.0) contains around 1,650 positive and 1,800 negative words, which sum up to around 16,000 positive and around 18,000 negative word forms incl. their inflections, respectively. It not only contains adjectives and adverbs explicitly expressing a sentiment, but also nouns and verbs implicitly containing one.
License
~~~~~~~
SentiWS is licensed under a Creative Commons Attribution-Noncommercial-Share Alike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).
Obtain a Copy
~~~~~~~~~~~~~
The latest version of SentiWS can be found at https://wortschatz.uni-leipzig.de/download/.
Data Format
~~~~~~~~~~~
SentiWS is organised in two utf8-encoded text files structured the following way:
<Word>|<POS tag> \t <Polarity weight> \t <Infl_1>,...,<Infl_k> \n
where \t denotes a tab, and \n denotes a new line.
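For illustration, a minimal parser sketch for this format (hypothetical, not part of SentiWS itself; the handling of the optional inflection column follows the description above):

```python
def parse_sentiws(path: str) -> dict[str, float]:
    """Map each word form (base form and inflections) to its polarity weight."""
    weights: dict[str, float] = {}
    with open(path, encoding="utf-8") as file:
        for line in file:
            head, weight, *inflections = line.rstrip("\n").split("\t")
            word, _pos_tag = head.split("|")  # e.g. "Abmachung|NN"
            polarity = float(weight)
            weights[word] = polarity
            if inflections:  # the inflection column may be absent
                for form in inflections[0].split(","):
                    weights[form] = polarity
    return weights
```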
Citation
~~~~~~~~
If you use SentiWS in your work we kindly ask you to cite
R. Remus, U. Quasthoff & G. Heyer: SentiWS - a Publicly Available German-language Resource for Sentiment Analysis.
In: Proceedings of the 7th International Language Resources and Evaluation (LREC'10), 2010
or use the following BibTeX-code snippet:
@INPROCEEDINGS{remquahey2010,
title = {SentiWS -- a Publicly Available German-language Resource for Sentiment Analysis},
booktitle = {Proceedings of the 7th International Language Resources and Evaluation (LREC'10)},
author = {Remus, R. and Quasthoff, U. and Heyer, G.},
year = {2010}
}
Version History
~~~~~~~~~~~~~~~
SentiWS is "work in progress" and hence far from being fully-fledged and error-free. It will be continuously refined by adding missing words and word forms and removing ambiguous ones.
v1.8b, 2010-05-19: First publicly available version as described in Remus et al. (2010).
v1.8c, 2012-03-21: Second publicly available version in which some POS tags were corrected.
v2.0, 2018-10-19: Third publicly available version in which the inflected forms were extended.
Statistics
~~~~~~~~~~
                        Positive   Negative
Adjectives   Baseforms        792        712
             Inflections   10,936     10,471
Adverbs      Baseforms          7          4
             Inflections        5          0
Nouns        Baseforms        548        688
             Inflections      736      1,158
Verbs        Baseforms        297        423
             Inflections    3,246      4,580
All          Baseforms      1,644      1,827
             Inflections   14,923     16,209
Total                      16,567     18,036

Table: Overview of the dictionary's content
SentiWS.txt was last updated on 2019-09-12.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,110 @@
"""Pipeline to get Entities from Staging DB."""
import json
import sys
from loguru import logger
from tqdm import tqdm
import aki_prj23_transparenzregister.utils.mongo.connector as conn
import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news
from aki_prj23_transparenzregister.config.config_providers import (
JsonFileConfigProvider,
)
from aki_prj23_transparenzregister.utils.mongo import ner_service
logger.add(sys.stdout, colorize=True)
class EntityPipeline:
"""Class to initialize NER Pipeline."""
def __init__(self, conn_string: conn.MongoConnection) -> None:
"""Method to connect to StagingDB."""
self.connect_string = conn_string
self.connect_string.database = "transparenzregister_ner"
self.connector = conn.MongoConnector(self.connect_string)
self.news_obj = news.MongoNewsService(self.connector)
def process_documents(
self, entity: str, doc_attrib: str, ner_selection: str
) -> None:
"""Method to check documents, get entities and write them to document."""
CursorUnprogressed = self.news_obj.collection.find( # noqa: N806
{"companies": {"$exists": False}}
)
documents = list(CursorUnprogressed)
logger.info("Dokumente: ", str(CursorUnprogressed))
# Determine NER service based on config
# spaCy
if ner_selection == "use_spacy_ner":
ner_service_instance = ner_service.NerAnalysisService(
use_spacy=True, use_transformer=False, use_companylist=False
)
ner_service_func = ner_service_instance.ner_spacy
# company list
elif ner_selection == "use_companylist_ner":
ner_service_instance = ner_service.NerAnalysisService(
use_spacy=False, use_transformer=False, use_companylist=True
)
ner_service_func = ner_service_instance.ner_company_list
# transformer
elif ner_selection == "use_transformer_ner":
ner_service_instance = ner_service.NerAnalysisService(
use_spacy=False, use_transformer=True, use_companylist=False
)
ner_service_func = ner_service_instance.ner_transformer
if len(documents) > 0:
for document in tqdm(documents):
ents = ner_service_func(document, entity, doc_attrib)
self.news_obj.collection.update_one(
{"_id": document["_id"]},
{"$set": {"companies": ents}},
)
else:
logger.info("No documents found.")
if __name__ == "__main__":
# Establish MongoDB Connection using secrets
config_provider = JsonFileConfigProvider("./secrets.json")
connect_string = config_provider.get_mongo_connection_string()
# dir of config json
config_file_path = (
"src/aki_prj23_transparenzregister/utils/mongo/ner_sentiment_config.json"
)
# Load NER service configuration from JSON
with open(config_file_path) as config_file:
ner_config = json.load(config_file)
# read configuration
entity = ner_config["ner_service"]["entity"]
logger.info("NER Pipeline: searching for entity of type", str(entity))
doc_attrib = ner_config["ner_service"]["doc_attrib"]
logger.info("NER Pipeline: searching in document attribute ", str(doc_attrib))
# read selected service
if ner_config["ner_service"]["use_companylist_ner"] is True:
ner_selection = "use_companylist_ner"
logger.info("NER Pipeline: Searching entities with company list")
elif ner_config["ner_service"]["use_spacy_ner"] is True:
ner_selection = "use_spacy_ner"
logger.info("NER Pipeline: Searching entities with spaCy")
elif ner_config["ner_service"]["use_transformer_ner"] is True:
ner_selection = "use_transformer_ner"
logger.info("NER Pipeline: Searching entities with transformer")
else:
logger.info(
"NER Pipeline: No NER services selected or error in configuration file."
)
entity_pipeline = EntityPipeline(connect_string)
entity_pipeline.process_documents(entity, doc_attrib, ner_selection)
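For reference, the pipeline can also be driven directly instead of via the `__main__` block; a minimal usage sketch based on the class as defined above:

```python
from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
from aki_prj23_transparenzregister.utils.mongo.ner_pipeline import EntityPipeline

# Build the connection the same way the __main__ block does.
connect_string = JsonFileConfigProvider("./secrets.json").get_mongo_connection_string()
pipeline = EntityPipeline(connect_string)
pipeline.process_documents(entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner")
```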

View File

@@ -0,0 +1,17 @@
{
"sentiment_service": {
"comment": "Select only one service by setting true and deselect the other with false. Valid doc_attrib: text, title",
"use_spacy": false,
"use_transformer": true,
"doc_attrib": "text"
},
"ner_service": {
"comment": "Select only one service by setting true and deselect the other with false. Valid doc_attrib: text, title",
"use_spacy_ner": false,
"use_transformer_ner": true,
"use_companylist_ner":false,
"doc_attrib": "text",
"entity":"ORG"
}
}
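Nothing in the file itself prevents enabling two services at once, in which case the pipelines silently take the first matching branch. A small validation sketch (hypothetical, not part of this commit) that enforces the rule stated in the comment fields:

```python
import json

with open("ner_sentiment_config.json") as file:  # path is illustrative
    config = json.load(file)

ner_flags = ["use_spacy_ner", "use_transformer_ner", "use_companylist_ner"]
selected = [flag for flag in ner_flags if config["ner_service"][flag]]
if len(selected) != 1:
    raise ValueError(f"Select exactly one NER service, got: {selected}")
```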

View File

@@ -0,0 +1,150 @@
"""NER Service module."""
import json
from collections import Counter
from typing import Final
import spacy
from transformers import pipeline
class NerAnalysisService:
"""Class to initialize NER model."""
def __init__(
self,
use_spacy: bool = False,
use_transformer: bool = False,
use_companylist: bool = False,
) -> None:
"""Method to check which sentiment model is chosen."""
if use_spacy:
self.init_spacy()
if use_transformer:
self.init_transformer()
if use_companylist:
self.init_companylist()
def init_spacy(self) -> None:
"""Method to initialize spaCy.
Optimized by ChatGPT.
"""
# check if model is available and load it
SPACY_MODEL_NAME: Final[str] = "de_core_news_lg" # noqa: N806
if not spacy.util.is_package(SPACY_MODEL_NAME):
from spacy.cli.download import download as spacy_download
spacy_download(SPACY_MODEL_NAME) # type: ignore
self.nlp = spacy.load(SPACY_MODEL_NAME)
def init_transformer(self) -> None:
"""Method to initialize transformer."""
# init NER Transformer
self.classifier = pipeline(
"ner",
model="fhswf/bert_de_ner",
grouped_entities=True,
tokenizer="dbmdz/bert-base-german-cased",
)
def init_companylist(self) -> None:
"""Method to initialize company list."""
with open(
"src/aki_prj23_transparenzregister/utils/mongo/CompEntities.json", "rb"
) as complist:
self.complist = json.load(complist)
def ner_spacy(
self, doc: dict, ent_type: str = "ORG", doc_attrib: str = "title"
) -> dict:
"""Named Entity Recognition with Spacy.
Args:
doc: a document which is processed with spacy
ent_type: string with specific entity (LOC - Location; PERSON - People; NORP - Nationalities or religious or political groups;
FAC - Buildings, airports, highways, bridges, etc.; ORG - Companies, agencies, institutions, etc.;
GPE - Countries, cities, states.; LOC - Non-GPE locations, mountain ranges, bodies of water)
doc_attrib: which attribute of the document has to be processed: text or title
Returns:
dict of entities and their counts.
"""
# init list for entities
entities = []
text = doc[doc_attrib]
# get entities
doc_nlp = self.nlp(text)
# select company
for ent in doc_nlp.ents:
if ent.label_ == ent_type:
entities.append(ent.text)
return dict(Counter(entities))
def ner_company_list(
self, doc: dict, ent_type: str = "ORG", doc_attrib: str = "title"
) -> dict:
"""Named Entity Recognition by String comparision.
Args:
doc: a dict from where entities are searched
ent_type: type of searched entity
doc_attrib: which attribute of the dict is searched
Returns:
dict with entities and their counts.
"""
# Convert all entries in the company_list to lowercase
self.complist = [company_name.lower() for company_name in self.complist]
# Create an empty list to store the entities
entities = []
# Search the text for company names
text = doc[doc_attrib]
# Convert title to lowercase
text = text.lower()
for company_name in self.complist:
start_idx = text.find(company_name)
if start_idx != -1: # word found
entity = company_name
if entity not in entities:
entities.append(entity)
return dict(Counter(entities))
def ner_transformer(
self, doc: dict, ent_type: str = "ORG", doc_attrib: str = "title"
) -> dict:
"""Named Entity Recognition with Transformer.
Args:
doc: a string which is processed with a transformer model
ent_type: string with specific entity (PERSON - People; NORP - Nationalities or religious or political groups;
FAC - Buildings, airports, highways, bridges, etc.; ORG - Companies, agencies, institutions, etc.;
GPE - Countries, cities, states.; LOC - Non-GPE locations, mountain ranges, bodies of water).
doc_attrib: Attribute of the document (title or text)
Returns:
dict of entities and their counts.
"""
# init list for entities
entities = []
text = doc[doc_attrib]
sentences = text.split(". ") # Split text into sentences based on '. '
# Process each sentence separately
for sentence in sentences:
res = self.classifier(
sentence
) # Assuming 'classifier' processes a single sentence at a time
for i in range(len(res)):
if res[i]["entity_group"] == ent_type:
entities.append(res[i]["word"])
return dict(Counter(entities))
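A minimal usage sketch for the service above; the sample document and expected result mirror the unit tests further down in this change:

```python
from aki_prj23_transparenzregister.utils.mongo.ner_service import NerAnalysisService

service = NerAnalysisService(use_spacy=True)  # downloads de_core_news_lg on first use
doc = {"title": "Siemens ist ein Unternehmen."}
print(service.ner_spacy(doc, ent_type="ORG", doc_attrib="title"))
# -> {"Siemens": 1}
```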

View File

@@ -0,0 +1,91 @@
"""Pipeline to get sentiments from Staging DB nes articles."""
import json
import os
from loguru import logger
from tqdm import tqdm
import aki_prj23_transparenzregister.utils.mongo.connector as conn
import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news
from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
from aki_prj23_transparenzregister.config.config_template import MongoConnection
from aki_prj23_transparenzregister.utils.mongo import sentiment_service
class SentimentPipeline:
"""Class to initialize sentiment Pipeline."""
def __init__(self, conn_string: MongoConnection) -> None:
"""Method to connect to StagingDB."""
self.connect_string = conn_string
self.connect_string.database = "transparenzregister_ner"
self.connector = conn.MongoConnector(self.connect_string)
self.news_obj = news.MongoNewsService(self.connector)
def process_documents(self, doc_attrib: str, sentiment_selection: str) -> None:
"""Method to check documents, get entities and write them to document."""
CursorUnprogressed = self.news_obj.collection.find( # noqa: N806
{"sentiment": {"$exists": False}}
)
documents = list(CursorUnprogressed)
if len(documents) > 0:
for document in tqdm(documents):
text = document[doc_attrib]
# Determine sentiment analysis service based on config
if sentiment_selection == "use_spacy":
selected_service = sentiment_service.SentimentAnalysisService(
use_spacy=True, use_transformer=False
)
sentiment_service_func = selected_service.sentiment_spacy
elif sentiment_selection == "use_transformer":
selected_service = sentiment_service.SentimentAnalysisService(
use_spacy=False, use_transformer=True
)
sentiment_service_func = selected_service.sentiment_transformer
# sents = selected_service.sentiment_spacy(text)
sents = sentiment_service_func(text)
sentiment = {"label": sents[0], "score": sents[1]}
self.news_obj.collection.update_one(
{"_id": document["_id"]},
{"$set": {"sentiment": sentiment}},
)
else:
logger.info("No documents found.")
if __name__ == "__main__":
# Establish MongoDB Connection using secrets
config_provider = JsonFileConfigProvider("./secrets.json")
connect_string = config_provider.get_mongo_connection_string()
# dir of config json
script_dir = os.path.dirname(__file__)
config_file_path = os.path.join(script_dir, "ner_sentiment_config.json")
# Load sentiment service configuration from JSON
with open(config_file_path) as config_file:
sentiment_config = json.load(config_file)
# Where to search the sentiment
doc_attrib = sentiment_config["sentiment_service"]["doc_attrib"]
logger.info("Sentiment Pipeline: searching in document attribute ", str(doc_attrib))
# read selected service
if sentiment_config["sentiment_service"]["use_spacy"] is True:
sentiment_selection = "use_spacy"
logger.info("Sentiment Pipleline: Searching sentiments with spaCy")
elif sentiment_config["sentiment_service"]["use_transformer"] is True:
sentiment_selection = "use_transformer"
logger.info("Sentiment Pipleline: Searching sentiments with transformer")
else:
logger.info(
"Sentiment Pipleline: No Sentiment services selected or error in configuration file."
)
sentiment_pipeline = SentimentPipeline(connect_string)
sentiment_pipeline.process_documents(doc_attrib, sentiment_selection)
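As with the NER pipeline, a minimal direct-invocation sketch based on the class above:

```python
from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
from aki_prj23_transparenzregister.utils.mongo.sentiment_pipeline import SentimentPipeline

connect_string = JsonFileConfigProvider("./secrets.json").get_mongo_connection_string()
pipeline = SentimentPipeline(connect_string)
pipeline.process_documents("text", "use_transformer")
```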

View File

@@ -0,0 +1,170 @@
"""Service for Sentiment analysis."""
import os
import zipfile
from typing import Final
import requests
import spacy
from loguru import logger
from spacy_sentiws import spaCySentiWS # noqa: F401
from transformers import pipeline
class SentimentAnalysisService:
"""Class to initialize spaCy or Transformer model."""
def __init__(self, use_spacy: bool = False, use_transformer: bool = False) -> None:
"""Method to check which sentiment model is chosen."""
if use_spacy:
self.init_spacy()
if use_transformer:
self.init_transformer()
def init_spacy(self) -> None:
"""Method to initialize spaCy."""
# check if model is available and load it
SPACY_MODEL_NAME: Final[str] = "de_core_news_lg" # noqa: N806
if not spacy.util.is_package(SPACY_MODEL_NAME):
cli = spacy.cli # type: ignore
cli.download(SPACY_MODEL_NAME) # type: ignore
self.nlp = spacy.load(SPACY_MODEL_NAME)
# path to spaCy vocabulary
PATH: Final[ # noqa: N806
str
] = "src/aki_prj23_transparenzregister/utils/mongo/SentiWS/"
# check if the vocabulary exists, otherwise download it
if not os.path.exists(PATH):
URL: Final[ # noqa: N806
str
] = "https://downloads.wortschatz-leipzig.de/etc/SentiWS/SentiWS_v2.0.zip"
logger.info("SentiWS vocabulary not found. Starting download...")
# Create the data directory if it doesn't exist
os.makedirs(PATH, exist_ok=True)
# File path for the downloaded ZIP file
zip_file_path = os.path.join(PATH, "SentiWS_v2.0.zip")
# Download the ZIP file
response = requests.get(URL) # noqa: S113
with open(zip_file_path, "wb") as zip_file:
zip_file.write(response.content)
# Extract the ZIP file
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
zip_ref.extractall(PATH)
# Remove the downloaded ZIP file if it's no longer needed
os.remove(zip_file_path)
logger.info("SentiWS data downloaded and extracted successfully.")
else:
logger.info("SentiWS data directory already exists.")
# create spaCy pipeline
self.nlp.add_pipe("sentiws", config={"sentiws_path": PATH})
def init_transformer(self) -> None:
"""Method to initialize transformer."""
# loading the sentiment model(~ 436MB) for transformer
self.sentiment_analyzer = pipeline(
"sentiment-analysis", model="oliverguhr/german-sentiment-bert"
)
def sentiment_spacy(self, doc: str) -> tuple:
"""Sentiment Analytics with Spacy.
Args:
doc: a document which is processed with spaCy
Returns:
tuple of label (positive, negative or neutral) and score.
"""
# set limits for sentiments
_upperlimit = 0.1
_lowerlimit = -0.1
_doc = self.nlp(doc)
_score = None
_sent = None
# init a sentiment counter
_pos = 0
_neg = 0
# init a summarizer for maximum sentiment score to normalize values
_max_score = 0
for token in _doc:
token_score = token._.sentiws # noqa: SLF001
if token_score is not None:
_max_score += abs(token_score)
if token_score < 0:
_neg += token_score
if token_score > 0:
_pos += token_score
# Normalize the score to the range 0..1
_normalized_score = (_pos - abs(_neg)) / _max_score if _max_score > 0 else 0
if _normalized_score > _upperlimit:
_sent = "positive"
elif _normalized_score < _lowerlimit:
_sent = "negative"
else:
_sent = "neutral"
return _sent, abs(_normalized_score)
def sentiment_transformer(self, doc: str) -> tuple:
"""Sentiment Analysis with Transformer.
Args:
doc: a string which is processed with a transformer model
Returns:
sentiment and score.
"""
sentences = doc.split(". ") # Split text into sentences based on '. '
# init total sentiment and score counter
total_score = 0
total_positive_score = 0
total_negative_score = 0
total_neutral_score = 0
_score = None
_sent = None
# Process each sentence separately
for sentence in sentences:
# get sentiment
results = self.sentiment_analyzer(sentence)
_score = results[0]["score"]
_sent = results[0]["label"]
# sum up specific score
if _sent == "positive":
total_positive_score += _score
elif _sent == "negative":
total_negative_score += _score
else:
total_neutral_score += _score
# sum up total score
total_score += _score
# total specific score
total_positive_score_normalized = total_positive_score / total_score
total_negative_score_normalized = total_negative_score / total_score
total_neutral_score_normalized = total_neutral_score / total_score
if total_positive_score_normalized > total_negative_score_normalized:
final_sentiment = "positive"
out_score = total_positive_score_normalized
elif total_positive_score_normalized < total_negative_score_normalized:
final_sentiment = "negative"
out_score = total_negative_score_normalized
else:
final_sentiment = "neutral"
out_score = total_neutral_score_normalized
return final_sentiment, out_score
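A minimal usage sketch, mirroring the transformer tests below (the roughly 436 MB model is downloaded on first initialization):

```python
from aki_prj23_transparenzregister.utils.mongo.sentiment_service import (
    SentimentAnalysisService,
)

service = SentimentAnalysisService(use_transformer=True)
label, score = service.sentiment_transformer(
    "Dies ist ein großartiger Test. Ich liebe es!"
)
print(label, score)  # expected label: "positive"
```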

View File

@@ -0,0 +1,291 @@
"""Tests for checking NER Pipeline."""
from unittest.mock import Mock, patch
import pytest
from aki_prj23_transparenzregister.config.config_template import MongoConnection
from aki_prj23_transparenzregister.utils.mongo.ner_pipeline import EntityPipeline
@pytest.fixture()
def mock_mongo_connection() -> MongoConnection:
"""Mock MongoConnector class.
Args:
mocker (any): Library mocker
Returns:
Mock: Mocked MongoConnector
"""
return MongoConnection("", "", None, "" "", "")
@pytest.fixture()
def mock_mongo_connector(mocker: Mock) -> Mock:
"""Mock MongoConnector class.
Args:
mocker (any): Library mocker
Returns:
Mock: Mocked MongoConnector
"""
mock = Mock()
mocker.patch(
"aki_prj23_transparenzregister.utils.mongo.connector.MongoConnector",
return_value=mock,
)
mock.database = {"news": Mock()}
return mock
@pytest.fixture()
def mock_spacy(mocker: Mock) -> Mock:
"""Mock MongoConnector class.
Args:
mocker (any): Library mocker
Returns:
Mock: Mocked MongoConnector
"""
mock = Mock()
mocker.patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.init_spacy",
return_value=mock,
)
return mock
# Mocking the NerAnalysisService methods
@patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_spacy"
)
def test_entity_pipeline_with_spacy(
mock_ner_spacy: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_spacy.return_value = {"ORG": 2, "PERSON": 1}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents = [{"_id": "document1", "title": "Apple Inc is a tech company."}]
# Set the collection to the mock_collection
entity_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method with spaCy NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
)
# Ensure that ner_spacy was called with the correct parameters
mock_ner_spacy.assert_called_once_with(mock_documents[0], "ORG", "title")
# Ensure that the document in the collection was updated with the NER results
mock_collection.update_one.assert_called_once_with(
{"_id": "document1"},
{"$set": {"companies": {"ORG": 2, "PERSON": 1}}},
)
@patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_spacy"
)
def test_entity_pipeline_with_spacy_no_docs(
mock_ner_spacy: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_spacy.return_value = {"ORG": 2, "PERSON": 1}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents: list[dict] = []
# Set the collection to the mock_collection
entity_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method with spaCy NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
)
# Ensure that ner_spacy was not called
mock_ner_spacy.assert_not_called()
# Ensure that the document in the collection was not updated
mock_collection.update_one.assert_not_called()
@patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_company_list"
)
def test_entity_pipeline_with_companylist_ner(
mock_ner_companylist: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_companylist.return_value = {"ORG": 3, "LOCATION": 2}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents = [
{"_id": "document2", "title": "Siemens ist ein deutsches Unternehmen."}
]
# Set the collection to the mock_collection
entity_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method with Company List NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_companylist_ner"
)
# Ensure that ner_company_list was called with the correct parameters
mock_ner_companylist.assert_called_once_with(mock_documents[0], "ORG", "title")
# Ensure that the document in the collection was updated with the NER results
mock_collection.update_one.assert_called_once_with(
{"_id": "document2"},
{"$set": {"companies": {"ORG": 3, "LOCATION": 2}}},
)
@patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_company_list"
)
def test_entity_pipeline_with_companylist_ner_no_docs(
mock_ner_companylist: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_companylist.return_value = {"ORG": 3, "LOCATION": 2}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents: list[dict] = []
# Set the collection to the mock_collection
entity_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method with Company List NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_companylist_ner"
)
# Ensure that ner_company_list is not called
mock_ner_companylist.assert_not_called()
# Ensure that the document in the collection was not updated
mock_collection.update_one.assert_not_called()
# Add more test cases for other NER methods (e.g., use_companylist_ner, use_transformer_ner) following a similar pattern.
@patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_spacy"
)
def test_entity_pipeline_with_transformer(
mock_ner_transformer: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_transformer.return_value = {"ORG": 2, "PERSON": 1}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents = [{"_id": "document1", "title": "Apple Inc is a tech company."}]
# Set the collection to the mock_collection
entity_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method with spaCy NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
)
# Ensure that ner_spacy was called with the correct parameters
mock_ner_transformer.assert_called_once_with(mock_documents[0], "ORG", "title")
# Ensure that the document in the collection was updated with the NER results
mock_collection.update_one.assert_called_once_with(
{"_id": "document1"},
{"$set": {"companies": {"ORG": 2, "PERSON": 1}}},
)
@patch(
"aki_prj23_transparenzregister.utils.mongo.ner_service.NerAnalysisService.ner_spacy"
)
def test_entity_pipeline_with_transformer_no_docs(
mock_ner_transformer: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_transformer.return_value = {"ORG": 2, "PERSON": 1}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents: list[dict] = []
# Set the collection to the mock_collection
entity_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method with spaCy NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
)
# Ensure that ner_transformer is not called
mock_ner_transformer.assert_not_called()
# Ensure that the document in the collection was not updated
mock_collection.update_one.assert_not_called()

View File

@@ -0,0 +1,54 @@
"""Tests for checking NER Services."""
from aki_prj23_transparenzregister.utils.mongo.ner_service import NerAnalysisService
def test_ner_spacy() -> None:
"""Mock TestNerService."""
# Create instance of NerAnalysisService with use_spacy=True
ner_service = NerAnalysisService(
use_spacy=True, use_transformer=False, use_companylist=False
)
# 1st testing
doc = {"title": "Siemens ist ein Unternehmen."}
result = ner_service.ner_spacy(doc, ent_type="ORG", doc_attrib="title")
assert result == {"Siemens": 1}
# 2nd testing
doc = {"text": "BASF ist ein großes Unternehmen."}
result = ner_service.ner_spacy(doc, ent_type="ORG", doc_attrib="text")
assert result == {"BASF": 1}
def test_ner_company_list() -> None:
"""Mock test_ner_company."""
# Create instance of NerAnalysisService with use_use_companylist=True
ner_service = NerAnalysisService(
use_spacy=False, use_transformer=False, use_companylist=True
)
doc = {"title": "Siemens ist ein Unternehmen."}
result = ner_service.ner_company_list(doc, ent_type="ORG", doc_attrib="title")
assert result == {"siemens": 1}
# 2nd testing
doc = {"text": "BASF ist ein großes Unternehmen."}
result = ner_service.ner_company_list(doc, ent_type="ORG", doc_attrib="text")
assert result == {"basf": 1}
def test_ner_transformer() -> None:
"""Mock test_ner_company."""
# Create instance of NerAnalysisService with use_use_companylist=True
ner_service = NerAnalysisService(
use_spacy=False, use_transformer=True, use_companylist=False
)
doc = {"title": "Siemens ist ein Unternehmen."}
result = ner_service.ner_transformer(doc, ent_type="ORG", doc_attrib="title")
assert result == {"Siemens": 1}
# 2nd testing
doc = {"text": "BASF ist ein großes Unternehmen."}
result = ner_service.ner_transformer(doc, ent_type="ORG", doc_attrib="text")
assert result == {"BASF": 1}

View File

@@ -0,0 +1,210 @@
"""Unit test for sentiment pipeline."""
from unittest.mock import Mock, patch
import pytest
from aki_prj23_transparenzregister.config.config_template import MongoConnection
from aki_prj23_transparenzregister.utils.mongo.sentiment_pipeline import (
SentimentPipeline,
)
@pytest.fixture()
def mock_mongo_connection() -> MongoConnection:
"""Mock MongoConnector class.
Args:
mocker (any): Library mocker
Returns:
Mock: Mocked MongoConnector
"""
return MongoConnection("", "", None, "" "", "")
@pytest.fixture()
def mock_mongo_connector(mocker: Mock) -> Mock:
"""Mock MongoConnector class.
Args:
mocker (any): Library mocker
Returns:
Mock: Mocked MongoConnector
"""
mock = Mock()
mocker.patch(
"aki_prj23_transparenzregister.utils.mongo.connector.MongoConnector",
return_value=mock,
)
mock.database = {"news": Mock()}
return mock
@pytest.fixture()
def mock_spacy(mocker: Mock) -> Mock:
"""Mock MongoConnector class.
Args:
mocker (any): Library mocker
Returns:
Mock: Mocked MongoConnector
"""
mock = Mock()
mocker.patch(
"aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.init_spacy",
return_value=mock,
)
return mock
@patch(
"aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.sentiment_spacy"
)
def test_sentiment_pipeline_existing_sentiment(
mock_sentiment_spacy: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific sentiment result
mock_sentiment_spacy.return_value = ("positive", 0.8)
# Create an instance of the SentimentPipeline
sentiment_pipeline = SentimentPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents = [
{
"_id": "document1",
"text": "This is a positive text.",
"sentiment": {"label": "neutral", "score": 0.5},
}
]
# Set the collection to the mock_collection
sentiment_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method
sentiment_pipeline.process_documents("text", "use_spacy")
# Ensure that sentiment_spacy was called with the correct text
mock_sentiment_spacy.assert_called_once_with("This is a positive text.")
# Note: the mocked find() returns the document regardless of the
# {"sentiment": {"$exists": False}} filter, so update_one is still called here.
@patch(
"aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.sentiment_spacy"
)
def test_sentiment_pipeline_no_documents(
mock_sentiment_spacy: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific sentiment result
mock_sentiment_spacy.return_value = ("positive", 0.8)
# Create an instance of the SentimentPipeline
sentiment_pipeline = SentimentPipeline(mock_mongo_connection)
# Mock the news collection to return an empty result
mock_collection = Mock()
mock_collection.find.return_value = []
# Set the collection to the mock_collection
sentiment_pipeline.news_obj.collection = mock_collection
# Call the process_documents method
sentiment_pipeline.process_documents("text", "use_spacy")
# Ensure that sentiment_spacy was not called
mock_sentiment_spacy.assert_not_called()
# Ensure that the document in the collection was not updated with sentiment
mock_collection.update_one.assert_not_called()
@patch(
"aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.sentiment_spacy"
)
def test_sentiment_pipeline_with_spacy(
mock_sentiment_spacy: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific sentiment result
mock_sentiment_spacy.return_value = ("positive", 0.8)
# Create an instance of the SentimentPipeline
sentiment_pipeline = SentimentPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents = [{"_id": "document1", "text": "This is a positive text."}]
# Set the collection to the mock_collection
sentiment_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method
sentiment_pipeline.process_documents("text", "use_spacy")
# Ensure that sentiment_spacy was called with the correct text
mock_sentiment_spacy.assert_called_once_with("This is a positive text.")
# Ensure that the document in the collection was updated with the sentiment result
mock_collection.update_one.assert_called_once_with(
{"_id": "document1"},
{"$set": {"sentiment": {"label": "positive", "score": 0.8}}},
)
# Mocking the SentimentAnalysisService methods
@patch(
"aki_prj23_transparenzregister.utils.mongo.sentiment_service.SentimentAnalysisService.sentiment_transformer"
)
def test_sentiment_pipeline_with_transformer(
mock_sentiment_transformer: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific sentiment result
mock_sentiment_transformer.return_value = ("negative", 0.6)
# Create an instance of the SentimentPipeline
sentiment_pipeline = SentimentPipeline(mock_mongo_connection)
# Mock the news collection and documents for testing
mock_collection = Mock()
mock_documents = [{"_id": "document2", "text": "This is a negative text."}]
# Set the collection to the mock_collection
sentiment_pipeline.news_obj.collection = mock_collection
# Mock the find method of the collection to return the mock documents
mock_collection.find.return_value = mock_documents
# Call the process_documents method
sentiment_pipeline.process_documents("text", "use_transformer")
# Ensure that sentiment_transformer was called with the correct text
mock_sentiment_transformer.assert_called_once_with("This is a negative text.")
# Ensure that the document in the collection was updated with the sentiment result
mock_collection.update_one.assert_called_once_with(
{"_id": "document2"},
{"$set": {"sentiment": {"label": "negative", "score": 0.6}}},
)

View File

@@ -0,0 +1,78 @@
"""Tests for checking Sentiment Services."""
from aki_prj23_transparenzregister.utils.mongo.sentiment_service import (
SentimentAnalysisService,
)
def test_sentiment_service_with_spacy_pos() -> None:
"""Mock testing spaCy Sentiment Service with positive sentiment."""
# Init SentimentAnalysisService with spaCy
sentiment_service = SentimentAnalysisService(use_spacy=True)
# run the test
text = "Dies ist ein großartiger Test. Ich liebe es!"
sentiment, score = sentiment_service.sentiment_spacy(text)
assert sentiment == "positive"
assert score > 0
def test_sentiment_service_with_spacy_neg() -> None:
"""Mock testing spaCy Sentiment Service with negative sentiment."""
# Init SentimentAnalysisService with spaCy
sentiment_service = SentimentAnalysisService(use_spacy=True)
# run the test
text = "Dies ist ein wirklich schrecklicher Test. Ich hasse ihn!"
sentiment, score = sentiment_service.sentiment_spacy(text)
assert sentiment == "negative"
assert score > 0
def test_sentiment_service_with_spacy_neut() -> None:
"""Mock testing spaCy Sentiment Service with neutral sentiment."""
# Init SentimentAnalysisService with spaCy
sentiment_service = SentimentAnalysisService(use_spacy=True)
# run the test
text = "Dies ist ein Test."
sentiment, score = sentiment_service.sentiment_spacy(text)
assert sentiment == "neutral"
assert score >= 0
def test_sentiment_service_with_transformer_pos() -> None:
"""Mock testing Transformer Sentiment Service with positive Sentiment."""
# Init SentimentAnalysisService with Transformer
sentiment_service = SentimentAnalysisService(use_transformer=True)
# run the test
text = "Dies ist ein großartiger Test. Ich liebe es!"
sentiment, score = sentiment_service.sentiment_transformer(text)
assert sentiment == "positive"
assert score > 0
def test_sentiment_service_with_transformer_neg() -> None:
"""Mock testing Transformer Sentiment Service with negative Sentiment."""
# Init SentimentAnalysisService with Transformer
sentiment_service = SentimentAnalysisService(use_transformer=True)
# run the test
text = "Dies ist ein wirklich schrecklicher Test. Ich hasse ihn!"
sentiment, score = sentiment_service.sentiment_transformer(text)
assert sentiment == "negative"
assert score > 0
def test_sentiment_service_with_transformer_neut() -> None:
"""Mock testing Transformer Sentiment Service with neutral Sentiment."""
# Init SentimentAnalysisService with Transformer
sentiment_service = SentimentAnalysisService(use_transformer=True)
# run the test
text = "Das ist ein Text, ohne besondere Stimmung."
sentiment, score = sentiment_service.sentiment_transformer(text)
assert sentiment == "neutral"
assert score >= 0