Created pipeline to run NER, sentiment, and SQL ingest (#314)

Created a data processing pipeline that enriches the raw mined data with
organisation extraction and sentiment analysis prior to moving the data
to the SQL database.
The transfer of the matched data is done afterwards.

---------

Co-authored-by: SeZett <zeleny.sebastian@fh-swf.de>
Committed: 2023-11-11 14:28:12 +01:00 (via GitHub)
Commit: 066800123d (parent a6d486209a)
12 changed files with 206 additions and 132 deletions
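
The diff hunks below only show the updated call sites. For orientation, the enrich-then-transfer flow described in the commit message might be driven roughly as follows. This is a minimal sketch: the EntityPipeline constructor, both process_documents signatures, and the transfer_news / data_processing module paths are taken from this diff, while the remaining import paths, the SentimentPipeline class name, and the connection arguments are assumptions.

# Illustrative sketch only - not the actual entry point added in this commit.
# The process_documents() signatures and the transfer_news module path match
# the tests below; the pipeline import paths, the SentimentPipeline name, and
# the connection objects are assumptions.
from aki_prj23_transparenzregister.ai.ner_pipeline import EntityPipeline  # assumed path
from aki_prj23_transparenzregister.ai.sentiment_pipeline import SentimentPipeline  # assumed name and path
from aki_prj23_transparenzregister.utils import transfer_news

def run_data_processing(mongo_connection, sql_session) -> None:
    """Enrich the raw mined documents, then move the matched data to SQL."""
    # 1) Organisation extraction on the mined article titles.
    EntityPipeline(mongo_connection).process_documents(doc_attrib="title", ner_method="spacy")
    # 2) Sentiment analysis on the article text.
    SentimentPipeline(mongo_connection).process_documents("text", "spacy")
    # 3) Transfer the matched data to the SQL database afterwards
    #    (signature as exercised in the test below; the test passes None as connector).
    transfer_news._transfer_news_to_sql(mongo_connection, sql_session)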


@ -83,9 +83,7 @@ def test_entity_pipeline_with_spacy(
mock_collection.find.return_value = mock_documents
# Call the process_documents method with spaCy NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
)
entity_pipeline.process_documents(doc_attrib="title", ner_method="spacy")
# Ensure that ner_spacy was called with the correct parameters
mock_ner_spacy.assert_called_once_with(mock_documents[0], "ORG", "title")
@ -121,9 +119,7 @@ def test_entity_pipeline_with_spacy_no_docs(
mock_collection.find.return_value = mock_documents
# Call the process_documents method with spaCy NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
)
entity_pipeline.process_documents(doc_attrib="title", ner_method="spacy")
# Ensure that sentiment_spacy was not called
mock_ner_spacy.assert_not_called()
@ -135,14 +131,14 @@ def test_entity_pipeline_with_spacy_no_docs(
@patch(
"aki_prj23_transparenzregister.ai.ner_service.NerAnalysisService.ner_company_list"
)
def test_entity_pipeline_with_companylist_ner(
mock_ner_companylist: Mock,
def test_entity_pipeline_with_company_list_ner(
mock_ner_company_list: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock object to return a specific NER result
mock_ner_companylist.return_value = {"ORG": 3, "LOCATION": 2}
mock_ner_company_list.return_value = {"ORG": 3, "LOCATION": 2}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
@ -159,12 +155,10 @@ def test_entity_pipeline_with_companylist_ner(
mock_collection.find.return_value = mock_documents
# Call the process_documents method with Company List NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_companylist_ner"
)
entity_pipeline.process_documents(doc_attrib="title", ner_method="company_list")
# Verify that ner_company_list was called with the correct parameters
mock_ner_companylist.assert_called_once_with(mock_documents[0], "ORG", "title")
mock_ner_company_list.assert_called_once_with(mock_documents[0], "ORG", "title")
# Verify that the document in the collection was updated with the NER results
mock_collection.update_one.assert_called_once_with(
@ -176,14 +170,14 @@ def test_entity_pipeline_with_companylist_ner(
@patch(
"aki_prj23_transparenzregister.ai.ner_service.NerAnalysisService.ner_company_list"
)
def test_entity_pipeline_with_companylist_ner_no_docs(
mock_ner_companylist: Mock,
def test_entity_pipeline_with_company_list_ner_no_docs(
mock_ner_company_list: Mock,
mock_mongo_connector: Mock,
mock_mongo_connection: MongoConnection,
mock_spacy: Mock,
) -> None:
# Configure the mock to return a specific NER result
mock_ner_companylist.return_value = {"ORG": 3, "LOCATION": 2}
mock_ner_company_list.return_value = {"ORG": 3, "LOCATION": 2}
# Create an instance of the EntityPipeline
entity_pipeline = EntityPipeline(mock_mongo_connection)
@ -198,18 +192,15 @@ def test_entity_pipeline_with_companylist_ner_no_docs(
mock_collection.find.return_value = mock_documents
# Call the process_documents method with Company List NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_companylist_ner"
)
entity_pipeline.process_documents(doc_attrib="title", ner_method="company_list")
# Ensure that ner_company_list is not called
mock_ner_companylist.assert_not_called()
mock_ner_company_list.assert_not_called()
# Ensure that the document in the collection was not updated
mock_collection.update_one.assert_not_called()
# Add more test cases for other NER methods (e.g., use_companylist_ner, use_transformer_ner) following a similar pattern.
@patch("aki_prj23_transparenzregister.ai.ner_service.NerAnalysisService.ner_spacy")
def test_entity_pipeline_with_transformer(
mock_ner_transformer: Mock,
@ -234,9 +225,7 @@ def test_entity_pipeline_with_transformer(
mock_collection.find.return_value = mock_documents
# Call the process_documents method with spaCy NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
)
entity_pipeline.process_documents(doc_attrib="title", ner_method="spacy")
# Ensure that ner_spacy was called with the correct parameters
mock_ner_transformer.assert_called_once_with(mock_documents[0], "ORG", "title")
@ -272,9 +261,7 @@ def test_entity_pipeline_with_transformer_no_docs(
mock_collection.find.return_value = mock_documents
# Call the process_documents method with spaCy NER
entity_pipeline.process_documents(
entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
)
entity_pipeline.process_documents(doc_attrib="title", ner_method="spacy")
# Ensure that ner_transformer is not called
mock_ner_transformer.assert_not_called()
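
Taken together, the hunks above change the EntityPipeline call shape. A minimal sketch of the old versus new usage; the import path and the mongo_connection argument are assumptions, the call signatures are copied from the diff.

# Sketch of the new EntityPipeline API exercised by the tests above.
from aki_prj23_transparenzregister.ai.ner_pipeline import EntityPipeline  # assumed import path

def run_entity_ner(mongo_connection) -> None:
    """Sketch of the new call shape; mongo_connection is a MongoConnection as in the fixtures."""
    entity_pipeline = EntityPipeline(mongo_connection)
    # Old call shape (removed in this commit):
    #   entity_pipeline.process_documents(
    #       entity="ORG", doc_attrib="title", ner_selection="use_spacy_ner"
    #   )
    # New call shape: the caller no longer passes the entity label (the tests
    # still assert that "ORG" is forwarded internally) and selects the backend
    # via ner_method.
    entity_pipeline.process_documents(doc_attrib="title", ner_method="spacy")
    entity_pipeline.process_documents(doc_attrib="title", ner_method="company_list")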


@ -92,7 +92,7 @@ def test_sentiment_pipeline_existing_sentiment(
mock_collection.find.return_value = mock_documents
# Call the process_documents method
sentiment_pipeline.process_documents("text", "use_spacy")
sentiment_pipeline.process_documents("text", "spacy")
# Ensure that sentiment_spacy was called with the correct text
mock_sentiment_spacy.assert_called_once_with("This is a positive text.")
@ -124,7 +124,7 @@ def test_sentiment_pipeline_no_documents(
sentiment_pipeline.news_obj.collection = mock_collection
# Call the process_documents method
sentiment_pipeline.process_documents("text", "use_spacy")
sentiment_pipeline.process_documents("text", "spacy")
# Ensure that sentiment_spacy was not called
mock_sentiment_spacy.assert_not_called()
@ -159,7 +159,7 @@ def test_sentiment_pipeline_with_spacy(
mock_collection.find.return_value = mock_documents
# Call the process_documents method
sentiment_pipeline.process_documents("text", "use_spacy")
sentiment_pipeline.process_documents("text", "spacy")
# Ensure that sentiment_spacy was called with the correct text
mock_sentiment_spacy.assert_called_once_with("This is a positive text.")
@ -198,7 +198,7 @@ def test_sentiment_pipeline_with_transformer(
mock_collection.find.return_value = mock_documents
# Call the process_documents method
sentiment_pipeline.process_documents("text", "use_transformer")
sentiment_pipeline.process_documents("text", "transformer")
# Ensure that sentiment_transformer was called with the correct text
mock_sentiment_transformer.assert_called_once_with("This is a negative text.")
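
The sentiment hunks make the analogous change to the method selector strings. A minimal sketch of the new usage; the SentimentPipeline class name, import path, and constructor are assumptions, the positional arguments match the tests.

from aki_prj23_transparenzregister.ai.sentiment_pipeline import SentimentPipeline  # assumed name and path

def run_sentiment(mongo_connection) -> None:
    """Sketch of the renamed sentiment selectors exercised above."""
    sentiment_pipeline = SentimentPipeline(mongo_connection)  # assumed constructor
    # The old selector strings "use_spacy" / "use_transformer" are replaced by
    # plain method names, mirroring ner_method in the entity pipeline.
    sentiment_pipeline.process_documents("text", "spacy")
    sentiment_pipeline.process_documents("text", "transformer")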


@ -0,0 +1,7 @@
"""Tests for the data processing module."""
from aki_prj23_transparenzregister.utils import data_processing
def test_import() -> None:
"""Tests if the data processing module can be imported."""
assert data_processing


@ -130,7 +130,7 @@ def test_transfer_news_to_sql(full_db: Session, monkeypatch: MonkeyPatch) -> Non
"aki_prj23_transparenzregister.utils.transfer_news.get_all_news",
lambda _: NEWS_TEXTS,
)
transfer_news.transfer_news_to_sql(None, full_db) # type: ignore
transfer_news._transfer_news_to_sql(None, full_db) # type: ignore
articles = pd.read_sql_table(entities.News.__tablename__, full_db.bind) # type: ignore
assert "text" in articles.columns
del articles["text"]
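
The leading underscore suggests the transfer step is now an internal helper driven by the new data_processing module rather than a public entry point. A hypothetical wrapper might look like this; only the private function name and the transfer_news module path come from the test, the wrapper itself and its first argument are assumptions.

from aki_prj23_transparenzregister.utils import transfer_news

def transfer_matched_news(connector, sql_session) -> None:
    """Hypothetical public wrapper around the now-private SQL transfer step."""
    # The test above calls the helper directly with (None, full_db); here the
    # first argument is assumed to be the news connector/config object.
    transfer_news._transfer_news_to_sql(connector, sql_session)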