300 ner service crash (#301)

- Bugfix: Wenn ein Dokument aus der MongoDB keinen Eintrag für Text
oder Titel hatte, ist die Pipeline abgestürzt. --> Es wurde eine Prüfung
hinzugefügt, ob der Text ein nicht-leerer String ist.
- Das fest codierte Überschreiben des Datenbanknamens (`transparenzregister_ner`) in der `EntityPipeline` wurde entfernt.
- Ein Parameter der Transformer-Pipeline hat sich geändert: `grouped_entities=True` wurde durch `aggregation_strategy="simple"` ersetzt.

---------

Co-authored-by: Philipp Horstenkamp <philipp@horstenkamp.de>
This commit is contained in:
Sebastian 2023-11-02 07:55:38 +01:00 committed by GitHub
parent 7620efc6da
commit a96ebe916c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 34 additions and 29 deletions

View File

@ -22,7 +22,6 @@ class EntityPipeline:
def __init__(self, conn_string: conn.MongoConnection) -> None:
"""Method to connect to StagingDB."""
self.connect_string = conn_string
self.connect_string.database = "transparenzregister_ner"
self.connector = conn.MongoConnector(self.connect_string)
self.news_obj = news.MongoNewsService(self.connector)

View File

@ -44,7 +44,7 @@ class NerAnalysisService:
self.classifier = pipeline(
"ner",
model="fhswf/bert_de_ner",
grouped_entities=True,
aggregation_strategy="simple",
tokenizer="dbmdz/bert-base-german-cased",
)
@ -72,15 +72,16 @@ class NerAnalysisService:
# init list for entities
entities = []
text = doc[doc_attrib]
text = doc[doc_attrib].strip()
# check if text is a string and not empty
if isinstance(text, str) and text:
# get entities
doc_nlp = self.nlp(text)
# get entities
doc_nlp = self.nlp(text)
# select company
for ent in doc_nlp.ents:
if ent.label_ == ent_type:
entities.append(ent.text)
# select company
for ent in doc_nlp.ents:
if ent.label_ == ent_type:
entities.append(ent.text)
return dict(Counter(entities))
def ner_company_list(
@ -104,17 +105,19 @@ class NerAnalysisService:
entities = []
# Search the text for company names
text = doc[doc_attrib]
# Convert title to lowercase
text = text.lower()
text = doc[doc_attrib].strip()
# check if text is a string and not empty
if isinstance(text, str) and text:
# Convert title to lowercase
text = text.lower()
for company_name in self.complist:
start_idx = text.find(company_name)
if start_idx != -1: # Wort gefunden
start_idx + len(company_name)
entity = company_name
if entity not in entities:
entities.append(entity)
for company_name in self.complist:
start_idx = text.find(company_name)
if start_idx != -1: # Wort gefunden
start_idx + len(company_name)
entity = company_name
if entity not in entities:
entities.append(entity)
return dict(Counter(entities))
@ -136,15 +139,18 @@ class NerAnalysisService:
# init list for entities
entities = []
text = doc[doc_attrib]
sentences = text.split(". ") # Split text into sentences based on '. '
# check if text is a string and not empty
if isinstance(text, str) and text:
sentences = text.split(". ") # Split text into sentences based on '. '
# Process each sentence separately
for sentence in sentences:
res = self.classifier(
sentence
) # Assuming 'classifier' processes a single sentence at a time
# Process each sentence separately
for sentence in sentences:
res = self.classifier(
sentence
) # Assuming 'classifier' processes a single sentence at a time
for _ in res:
if _["entity_group"] == ent_type:
entities.append(_["word"])
for i in range(len(res)):
if res[i]["entity_group"] == ent_type:
entities.append(res[i]["word"])
return dict(Counter(entities))