mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-05-13 19:18:47 +02:00
Improved model for sentiment pipeline (#434)
Austausch des Sentiment-Modells.
This commit is contained in:
parent
6890562a18
commit
35016ba5f3
154
Jupyter/Sentiment_Company_Matching/Drop_News_from_Mongo.ipynb
Normal file
154
Jupyter/Sentiment_Company_Matching/Drop_News_from_Mongo.ipynb
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Notebook to drop news articles from MongoDB because these articles have no match"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import json\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import aki_prj23_transparenzregister.utils.mongo.connector as conn\n",
|
||||||
|
"from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider\n",
|
||||||
|
"import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news\n",
|
||||||
|
"import aki_prj23_transparenzregister.utils.mongo.company_mongo_service as comps\n",
|
||||||
|
"from tqdm import tqdm\n",
|
||||||
|
"import pickle"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"202"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Optional: load the ObjectIDs\n",
|
||||||
|
"with open(\"ObjectID2\", \"rb\") as fp: # Unpickling\n",
|
||||||
|
" compListLoaded = pickle.load(fp)\n",
|
||||||
|
"\n",
|
||||||
|
"len(compListLoaded)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Mongo Connect: create connection string and connect\n",
|
||||||
|
"config_provider = JsonFileConfigProvider(\"../../secrets.json\")\n",
|
||||||
|
"engine = config_provider.get_mongo_connection_string()\n",
|
||||||
|
"engine.database = \"transparenzregister\"\n",
|
||||||
|
"connector = conn.MongoConnector(engine)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Process all documents in the news collection and check whether the attribute 'companies' exists\n",
|
||||||
|
"\n",
|
||||||
|
"# Read data from database\n",
|
||||||
|
"NERObj = news.MongoNewsService(connector)\n",
|
||||||
|
"allNER = NERObj.get_all()\n",
|
||||||
|
"\n",
|
||||||
|
"# Create a cursor over all articles that have the attribute 'companies' (i.e. already processed articles)\n",
|
||||||
|
"CursorNERNames = NERObj.collection.find({\"companies\": {\"$exists\": True}})\n",
|
||||||
|
"documentsNER = list(CursorNERNames)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"3055"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(documentsNER)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"100%|██████████| 3055/3055 [00:04<00:00, 744.27it/s]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Drop all documents that have no match from Rapidfuzz\n",
|
||||||
|
"if len(documentsNER) > 0:\n",
|
||||||
|
" for document in tqdm(documentsNER):\n",
|
||||||
|
" if document[\"_id\"] not in compListLoaded:\n",
|
||||||
|
" # print(\"Doc found not in list!!!\")\n",
|
||||||
|
"\n",
|
||||||
|
" NERObj.collection.delete_one({\"_id\": document[\"_id\"]})\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(\"No documents found.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "aki-prj23-transparenzregister-eMuJN1BX-py3.11",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
@ -70,7 +70,7 @@ class SentimentAnalysisService:
|
|||||||
# loading the sentiment model(~ 436MB) for transformer
|
# loading the sentiment model(~ 436MB) for transformer
|
||||||
self.sentiment_analyzer = pipeline(
|
self.sentiment_analyzer = pipeline(
|
||||||
"sentiment-analysis",
|
"sentiment-analysis",
|
||||||
model="oliverguhr/german-sentiment-bert",
|
model="bardsai/finance-sentiment-de-base",
|
||||||
truncation=True,
|
truncation=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user