Mirror of https://github.com/fhswf/aki_prj23_transparenzregister.git (synced 2025-05-14 05:18:46 +02:00)
300 ner service crash (#301)
- Bug fix: when a document from the Mongo staging DB had no entry for text or title, the pipeline crashed --> a check was added.
- The override of the database name was removed.
- An attribute of the transformers pipeline has changed.
---------
Co-authored-by: Philipp Horstenkamp <philipp@horstenkamp.de>
This commit is contained in:
parent 7620efc6da
commit a96ebe916c
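The crash described in the message came from documents whose text or title field was missing or empty. A minimal sketch of the guard pattern the fix introduces (the document dict, attribute name, and helper function are illustrative, not part of the commit):

    from collections import Counter

    def count_entities(doc: dict, doc_attrib: str) -> dict:
        """Count NER hits, skipping documents without usable text."""
        entities: list[str] = []
        text = doc.get(doc_attrib)  # may be None if the Mongo document lacks the field
        # Guard: only analyse non-empty strings (this is what the fix adds).
        if isinstance(text, str) and text.strip():
            ...  # run the NER model here
        return dict(Counter(entities))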
@@ -22,7 +22,6 @@ class EntityPipeline:
     def __init__(self, conn_string: conn.MongoConnection) -> None:
         """Method to connect to StagingDB."""
         self.connect_string = conn_string
-        self.connect_string.database = "transparenzregister_ner"
         self.connector = conn.MongoConnector(self.connect_string)
         self.news_obj = news.MongoNewsService(self.connector)

@@ -44,7 +44,7 @@ class NerAnalysisService:
         self.classifier = pipeline(
             "ner",
             model="fhswf/bert_de_ner",
-            grouped_entities=True,
+            aggregation_strategy="simple",
             tokenizer="dbmdz/bert-base-german-cased",
         )

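In current transformers releases, `grouped_entities=True` is deprecated and is equivalent to `aggregation_strategy="simple"`, so this change keeps the same grouping behaviour while silencing the deprecation warning. A usage sketch (the sample sentence and the printed output shape are illustrative):

    from transformers import pipeline

    classifier = pipeline(
        "ner",
        model="fhswf/bert_de_ner",
        tokenizer="dbmdz/bert-base-german-cased",
        aggregation_strategy="simple",  # replaces the deprecated grouped_entities=True
    )

    # Aggregated hits expose 'entity_group' and 'word' keys, e.g.
    # [{'entity_group': 'ORG', 'word': 'Siemens AG', 'score': 0.99, ...}]
    print(classifier("Die Siemens AG hat ihren Sitz in München."))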
@@ -72,15 +72,16 @@ class NerAnalysisService:
         # init list for entities
         entities = []

-        text = doc[doc_attrib]
+        text = doc[doc_attrib].strip()
+        # check if text is a string and not empty
+        if isinstance(text, str) and text:
+            # get entities
+            doc_nlp = self.nlp(text)

-        # get entities
-        doc_nlp = self.nlp(text)
-
-        # select company
-        for ent in doc_nlp.ents:
-            if ent.label_ == ent_type:
-                entities.append(ent.text)
+            # select company
+            for ent in doc_nlp.ents:
+                if ent.label_ == ent_type:
+                    entities.append(ent.text)

         return dict(Counter(entities))

     def ner_company_list(
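For reference, the spaCy-based counting in this hunk boils down to the following standalone sketch; the model name is an assumption, since the diff does not show how `self.nlp` is initialised:

    from collections import Counter

    import spacy

    nlp = spacy.load("de_core_news_lg")  # assumed German model; not shown in the diff

    def count_ents(text: str, ent_type: str = "ORG") -> dict:
        """Count entities of one label, skipping empty or non-string input."""
        entities: list[str] = []
        if isinstance(text, str) and text.strip():
            entities = [ent.text for ent in nlp(text).ents if ent.label_ == ent_type]
        return dict(Counter(entities))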
@@ -104,17 +105,19 @@ class NerAnalysisService:
         entities = []

         # Search the text for company names
-        text = doc[doc_attrib]
-        # Convert title to lowercase
-        text = text.lower()
+        text = doc[doc_attrib].strip()
+        # check if text is a string and not empty
+        if isinstance(text, str) and text:
+            # Convert title to lowercase
+            text = text.lower()

         for company_name in self.complist:
             start_idx = text.find(company_name)
             if start_idx != -1:  # word found
                 start_idx + len(company_name)
                 entity = company_name
                 if entity not in entities:
                     entities.append(entity)

         return dict(Counter(entities))

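Note that `start_idx + len(company_name)` computes an end index but never stores it, so the loop is effectively a case-insensitive substring test with de-duplication; because each name is appended at most once, every count in the returned dict is 1. A standalone sketch with a hypothetical company list:

    from collections import Counter

    complist = ["siemens ag", "bmw ag"]  # hypothetical lowercase company list

    def find_companies(text: str) -> dict:
        """Case-insensitive substring match against a known company list."""
        entities: list[str] = []
        if isinstance(text, str) and text.strip():
            text = text.lower()
            for company_name in complist:
                if text.find(company_name) != -1 and company_name not in entities:
                    entities.append(company_name)
        return dict(Counter(entities))

    print(find_companies("Die Siemens AG meldet Rekordgewinne."))  # {'siemens ag': 1}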
@@ -136,15 +139,18 @@ class NerAnalysisService:
         # init list for entities
         entities = []
         text = doc[doc_attrib]
-        sentences = text.split(". ")  # Split text into sentences based on '. '
+        # check if text is a string and not empty
+        if isinstance(text, str) and text:
+            sentences = text.split(". ")  # Split text into sentences based on '. '

-        # Process each sentence separately
-        for sentence in sentences:
-            res = self.classifier(
-                sentence
-            )  # Assuming 'classifier' processes a single sentence at a time
+            # Process each sentence separately
+            for sentence in sentences:
+                res = self.classifier(
+                    sentence
+                )  # Assuming 'classifier' processes a single sentence at a time

-            for i in range(len(res)):
-                if res[i]["entity_group"] == ent_type:
-                    entities.append(res[i]["word"])
+                for _ in res:
+                    if _["entity_group"] == ent_type:
+                        entities.append(_["word"])

         return dict(Counter(entities))

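The rewritten loop iterates over the classifier hits directly instead of indexing with `range(len(res))`. A sketch of the whole sentence-wise flow, reusing the `classifier` from the pipeline sketch above (the underscore loop variable is renamed `hit` for readability):

    from collections import Counter

    def count_transformer_ents(text: str, ent_type: str = "ORG") -> dict:
        """Sentence-wise NER, mirroring the patched method."""
        entities: list[str] = []
        if isinstance(text, str) and text:
            for sentence in text.split(". "):  # naive sentence split, as in the diff
                for hit in classifier(sentence):
                    if hit["entity_group"] == ent_type:
                        entities.append(hit["word"])
        return dict(Counter(entities))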