300 ner service crash (#301)

- Bugfix: Wenn das Dokument aus der MongoDB keinen Eintrag für Text oder Titel hatte, ist die Pipeline abgestürzt. --> Eine Abfrage wurde hinzugefügt.
- Das Überschreiben des Datenbanknamens wurde entfernt.
- Ein Attribut der Transformers-Pipeline hat sich geändert (`grouped_entities=True` --> `aggregation_strategy="simple"`).

---------

Co-authored-by: Philipp Horstenkamp <philipp@horstenkamp.de>
2025-07-12 15:40:10 +02:00 · 2023-11-02 07:55:38 +01:00
parent 7620efc6da
commit a96ebe916c
2 changed files with 34 additions and 29 deletions
--- a/src/aki_prj23_transparenzregister/utils/mongo/ner_pipeline.py
+++ b/src/aki_prj23_transparenzregister/utils/mongo/ner_pipeline.py
@ -22,7 +22,6 @@ class EntityPipeline:
    def __init__(self, conn_string: conn.MongoConnection) -> None:
        """Method to connect to StagingDB."""
        self.connect_string = conn_string
-        self.connect_string.database = "transparenzregister_ner"
        self.connector = conn.MongoConnector(self.connect_string)
        self.news_obj = news.MongoNewsService(self.connector)

--- a/src/aki_prj23_transparenzregister/utils/mongo/ner_service.py
+++ b/src/aki_prj23_transparenzregister/utils/mongo/ner_service.py
@ -44,7 +44,7 @@ class NerAnalysisService:
        self.classifier = pipeline(
            "ner",
            model="fhswf/bert_de_ner",
-            grouped_entities=True,
+            aggregation_strategy="simple",
            tokenizer="dbmdz/bert-base-german-cased",
        )

@ -72,15 +72,16 @@ class NerAnalysisService:
        # init list for entities
        entities = []

-        text = doc[doc_attrib]
+        text = doc[doc_attrib].strip()
+        # check if text is a string and not empty
+        if isinstance(text, str) and text:
+            # get entities
+            doc_nlp = self.nlp(text)

-        # get entities
-        doc_nlp = self.nlp(text)
-
-        # select company
-        for ent in doc_nlp.ents:
-            if ent.label_ == ent_type:
-                entities.append(ent.text)
+            # select company
+            for ent in doc_nlp.ents:
+                if ent.label_ == ent_type:
+                    entities.append(ent.text)
        return dict(Counter(entities))

    def ner_company_list(
@ -104,17 +105,19 @@ class NerAnalysisService:
        entities = []

        # Search the text for company names
-        text = doc[doc_attrib]
-        # Convert title to lowercase
-        text = text.lower()
+        text = doc[doc_attrib].strip()
+        # check if text is a string and not empty
+        if isinstance(text, str) and text:
+            # Convert title to lowercase
+            text = text.lower()

-        for company_name in self.complist:
-            start_idx = text.find(company_name)
-            if start_idx != -1:  # Wort gefunden
-                start_idx + len(company_name)
-                entity = company_name
-                if entity not in entities:
-                    entities.append(entity)
+            for company_name in self.complist:
+                start_idx = text.find(company_name)
+                if start_idx != -1:  # Wort gefunden
+                    start_idx + len(company_name)
+                    entity = company_name
+                    if entity not in entities:
+                        entities.append(entity)

        return dict(Counter(entities))

@ -136,15 +139,18 @@ class NerAnalysisService:
        # init list for entities
        entities = []
        text = doc[doc_attrib]
-        sentences = text.split(". ")  # Split text into sentences based on '. '
+        # check if text is a string and not empty
+        if isinstance(text, str) and text:
+            sentences = text.split(". ")  # Split text into sentences based on '. '

-        # Process each sentence separately
-        for sentence in sentences:
-            res = self.classifier(
-                sentence
-            )  # Assuming 'classifier' processes a single sentence at a time
+            # Process each sentence separately
+            for sentence in sentences:
+                res = self.classifier(
+                    sentence
+                )  # Assuming 'classifier' processes a single sentence at a time
+
+                for _ in res:
+                    if _["entity_group"] == ent_type:
+                        entities.append(_["word"])

-            for i in range(len(res)):
-                if res[i]["entity_group"] == ent_type:
-                    entities.append(res[i]["word"])
        return dict(Counter(entities))