In [ ]:
import json
import pandas as pd
import aki_prj23_transparenzregister.utils.mongo.connector as conn
from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news
import aki_prj23_transparenzregister.utils.mongo.company_mongo_service as comps
In [ ]:
class ABSA:
    """Bundles the MongoDB connection setup for the ABSA pipeline (defined here for reference; the connection below is built inline)."""

    def __init__(self):
        self.config_provider = JsonFileConfigProvider("./secrets.json")
        self.connect_string = self.config_provider.get_mongo_connection_string()
        self.connect_string.database = "transparenzregister_ner"
        self.connector = conn.MongoConnector(self.connect_string)
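The class is not used again below; if it were, instantiation would look like this short sketch, assuming secrets.json sits next to the notebook:
In [ ]:
# Sketch: using the ABSA helper class instead of wiring the connection by hand
absa = ABSA()
absa_connector = absa.connector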
In [ ]:
# Mongo Connect: create connection string and connect
config_provider = JsonFileConfigProvider("../../secrets.json")
engine = config_provider.get_mongo_connection_string()
engine.database = "transparenzregister_ner"
connector = conn.MongoConnector(engine)
In [ ]:
# Read all company documents and check whether the attribute 'name' exists
CompsObj = comps.CompanyMongoService(connector)
allComps = CompsObj.get_all()
# Create a cursor over all company documents that have a 'name' attribute
CursorCompNames = CompsObj.collection.find({"name": {"$exists": True}})
documents = list(CursorCompNames)
In [ ]:
# Build a list of all company names
compList = []
if len(documents) > 0:
    for document in documents:
        # ents=NERService.NERCompanyList(company_list,document)
        compList.append(document["name"])
        # add a new attribute 'companies' to the document
        # newsObj.collection.update_one(
        #     {"_id": document["_id"]},  # filter for the matching document
        #     {"$set": {"companies": ents}},  # add the new attribute
        # )
else:
    print("No documents found.")
In [ ]:
# Read all news documents and check whether the attribute 'companies' exists
NERObj = news.MongoNewsService(connector)
allNER = NERObj.get_all()
# Create a cursor over all articles that already carry a 'companies' attribute (NER output)
CursorNERNames = NERObj.collection.find({"companies": {"$exists": True}})
documentsNER = list(CursorNERNames)
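For contrast, articles that have not been processed yet (no 'companies' attribute) could be selected with $exists set to False; a minimal sketch against the same collection:
In [ ]:
# Sketch: count the articles that still lack NER annotations
unprocessed = list(NERObj.collection.find({"companies": {"$exists": False}}))
print(len(unprocessed), "articles without a 'companies' attribute")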
In [ ]:
# install and import rapidfuzz
# pip install rapidfuzz
from rapidfuzz import process
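As a quick reference, process.extractOne returns a (match, score, index) tuple with a 0-100 score from the default WRatio scorer; the threshold check result[1] >= 95 in the matching loop below relies on this shape. The names here are made up:
In [ ]:
# Hypothetical example of the extractOne return value
demo_choices = ["Siemens AG", "BASF SE", "Volkswagen AG"]
process.extractOne("Siemens", demo_choices)
# -> e.g. ('Siemens AG', 90.0, 0): best match, similarity score, index in the list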
In [ ]:
if len(documentsNER) > 0:
    for document in documentsNER:
        resList = []  # result list for matched names (not used further; matches are only printed)
        for entity_name, frequency in document["companies"].items():
            if len(entity_name) > 2:
                result = process.extractOne(entity_name, compList)
                if result is not None:
                    # A similar name was found; adjust the similarity threshold as needed
                    if result[1] >= 95:
                        similar_name = result[0]
                        # print(f"Similar name found: {entity_name} (similarity: {result[1]})")
                        # print(f"Similar to: {similar_name}")
                        # print(f"Frequency: {frequency}")
                        print(
                            f"NER entity: {entity_name} matches {similar_name} at {result[1]}% with {frequency} occurrences"
                        )
                        # ents=NERService.NERCompanyList(company_list,document)
                        # compList.append(document['name'])
                        # add a new attribute 'companies' to the document
                        # newsObj.collection.update_one(
                        #     {"_id": document["_id"]},  # filter for the matching document
                        #     {"$set": {"companies": ents}},  # add the new attribute
                        # )
else:
    print("No documents found.")
In [ ]:
documentsNER[1]["companies"].items()
In [ ]:
compList
In [ ]:
import re

# Remove company-type suffixes and legal-form additions from a name
def remove_legal_additions(name):
    # Use a regular expression to strip common suffixes such as "GmbH" and "AG"
    cleaned_name = re.sub(
        r"\b(GmbH|AG|KG|SE|& Co\. KGaA|& Co\.|e\.K\.|mbH|mbH & Co\. KG)\b", "", name
    )
    # Strip leading and trailing whitespace
    cleaned_name = cleaned_name.strip()
    return cleaned_name

# Clean the list of company names
complist2 = [remove_legal_additions(name) for name in compList]
complist2
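A quick sanity check on a made-up name (not from the database) shows what the cleaning does:
In [ ]:
# Hypothetical example: the legal-form suffix is stripped and whitespace trimmed
remove_legal_additions("Muster Energie GmbH")  # -> 'Muster Energie'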
In [ ]:
# Keep only the first token of each cleaned name (if it has at least 3 characters), then deduplicate;
# the word.split() guard skips names that became empty after cleaning
complist3 = [
    word.split()[0] for word in complist2 if word.split() and len(word.split()[0]) >= 3
]
complist4 = list(set(complist3))
complist4
In [ ]:
# Certain terms that are not company names must be removed from complist4
non_company_terms = [
    "Deutsche", "Hamburg", "Hamburger", "Union", "Energy", "Hugo", "Pro",
    "OTC", "web", "Kabel", "Club", "The", "United", "Frankfurter", "CMC",
    "Bayern", "Haus", "Gesellschaft", "Delivery", "Aachener", "Group",
    "Retail", "Media", "European", "Fuels",
]
for term in non_company_terms:
    if term in complist4:
        complist4.remove(term)
In [ ]:
# Merge the two lists complist2 and complist4
complist5 = complist2 + complist4
complist5
In [ ]:
df1 = pd.DataFrame(documentsNER)
df1
In [ ]:
# Function to extract company names
def extract_company_names(company_dict):
return list(company_dict.keys())
# Apply the function to the 'companies' column
df1["companies"] = df1["companies"].apply(lambda x: extract_company_names(x))
df1
In [ ]:
df1["companies_filtered"] = df1["companies"].apply(
lambda x: [company for company in x if company in complist5]
)
df1
In [ ]:
# Keep only the first five entries of "companies_filtered" to save compute later on.
def split_list1(row):
return pd.Series(row["companies_filtered"][:5])
# Apply the function and concatenate the result with the original DataFrame
df2 = df1.apply(split_list1, axis=1).merge(df1, left_index=True, right_index=True)
# Drop the original column with lists
df2 = df2.drop("companies", axis=1)
df2 = df2.drop("companies_filtered", axis=1)
df2
In [ ]:
df2.rename(
    columns={0: "company1", 1: "company2", 2: "company3", 3: "company4", 4: "company5"},
    inplace=True,
)
df2
In [ ]:
# cell10 = df2.loc[3, 'company1']
# print(cell10)
In [ ]:
df2.dropna(
subset=["company1", "company2", "company3", "company4", "company5"],
how="all",
inplace=True,
)
df2
In [ ]:
# Reset the index; otherwise the following steps raise errors (iterating over i would fail).
df2 = df2.reset_index(drop=True)
df2
In [ ]:
def filter_sentences(row, company_col):
    """Extract the sentences from 'text' that mention the company in company_col."""
    target_word = row[company_col]
    # Check if target_word is NaN
    if pd.isna(target_word):
        return None
    # Split on '.', ':' and '>' as rough sentence boundaries
    sentences = re.split(r"[.:>]", row["text"])
    # Keep only the sentences containing the target word
    filtered_sentences = [
        sentence.strip() for sentence in sentences if target_word in sentence
    ]
    # Join the sentences with dots and cap the total length at 200 characters
    concatenated_sentences = ". ".join(filtered_sentences)[:200]
    # Return None if no sentence contains the target word
    return concatenated_sentences if concatenated_sentences else None

df2["text_company1"] = df2.apply(filter_sentences, axis=1, company_col="company1")
In [ ]:
# Reuse the shared filter_sentences helper for company2
df2["text_company2"] = df2.apply(filter_sentences, axis=1, company_col="company2")
In [ ]:
# Reuse the shared filter_sentences helper for company3
df2["text_company3"] = df2.apply(filter_sentences, axis=1, company_col="company3")
In [ ]:
# Reuse the shared filter_sentences helper for company4
df2["text_company4"] = df2.apply(filter_sentences, axis=1, company_col="company4")
In [ ]:
# Reuse the shared filter_sentences helper for company5
df2["text_company5"] = df2.apply(filter_sentences, axis=1, company_col="company5")
df2
In [ ]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Note: the model handles at most 512 tokens per input; truncation is requested at
# tokenization time (see translate_sentiment below) so longer inputs are cut down
# instead of raising an error.
translation_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-de-en")
translation_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-de-en")
In [ ]:
# Test whether the machine translation works on a sample sentence.
def translate_sentiment(text: str) -> str:
    # truncation=True caps overlong inputs at the model's 512-token limit
    input_tokens = translation_tokenizer([text], return_tensors="pt", truncation=True)
    generated_ids = translation_model.generate(**input_tokens)
    return translation_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[
        0
    ]
headline = "Pelabresib: Biotech-Unternehmen Morphosys hofft mit neuem Krebs-Medikament endlich auf den Durchbruch. "
tf = translate_sentiment(headline)
tf
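Translating row by row, as the following cells do, pays the full tokenizer and model overhead on every call; as a sketch outside the original pipeline, the model also accepts padded batches, which is usually faster. batch_size is an illustrative parameter:
In [ ]:
# Sketch: batched translation; not used below, shown as a possible speed-up
def translate_batch(texts, batch_size=16):
    translations = []
    for start in range(0, len(texts), batch_size):
        # Convert each entry to string (None becomes "None", matching the loop below)
        batch = [str(t) for t in texts[start : start + batch_size]]
        tokens = translation_tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True
        )
        generated_ids = translation_model.generate(**tokens)
        translations.extend(
            translation_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        )
    return translations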
In [ ]:
def translate_column(df, column):
    """Translate one text column row by row and return it as a single-column DataFrame."""
    translated = []
    for i in range(len(df)):
        # Convert to string first (this is very important: None becomes "None")
        text = str(df[column].loc[i])
        translated.append(translate_sentiment(text))
    return pd.DataFrame({f"{column}_eng": translated})

dftrans_company1 = translate_column(df2, "text_company1")
dftrans_company1
In [ ]:
dftrans_company2 = translate_column(df2, "text_company2")
dftrans_company2
In [ ]:
dftrans_company3 = translate_column(df2, "text_company3")
dftrans_company3
In [ ]:
dftrans_company4 = translate_column(df2, "text_company4")
dftrans_company4
In [ ]:
dftrans_company5 = translate_column(df2, "text_company5")
dftrans_company5
In [ ]:
df3 = df2[
[
"_id",
"title",
"text",
"company1",
"text_company1",
"company2",
"text_company2",
"company3",
"text_company3",
"company4",
"text_company4",
"company5",
"text_company5",
]
]
df3
In [ ]:
df3.insert(4, "text_company1_eng", dftrans_company1.iloc[:, 0])
df3.insert(7, "text_company2_eng", dftrans_company2.iloc[:, 0])
df3.insert(10, "text_company3_eng", dftrans_company3.iloc[:, 0])
df3.insert(13, "text_company4_eng", dftrans_company4.iloc[:, 0])
df3.insert(16, "text_company5_eng", dftrans_company5.iloc[:, 0])
df3
In [ ]:
# df3.to_csv('df3_20231213.csv')
In [ ]:
# Replace NaN values with None (DataFrame.map requires pandas >= 2.1; use applymap on older versions)
df3 = df3.map(lambda x: None if pd.isna(x) else x)
df3
In [ ]:
# Test ABSA
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load the ABSA model and tokenizer
model_name = "yangheng/deberta-v3-base-absa-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
In [ ]:
# Test with a sample sentence
for aspect in ["Siemens"]:
print(
aspect,
classifier(
"Siemens is doing great",
text_pair=aspect,
),
)
In [ ]:
# Test with a sample sentence, part 2
for aspect in [df2.loc[1, "company1"]]:
print(
aspect,
classifier(
"Siemens is doing great",
text_pair=aspect,
),
)
In [ ]:
# Pattern for ABSA on a single company column
def run_absa(df, n):
    """Classify the sentiment towards company n and post-process the pipeline output."""
    col = f"company{n}"
    results = []
    for i in range(len(df)):
        aspect = df[col].loc[i]
        results.append((aspect, classifier(df[f"text_{col}_eng"].loc[i], text_pair=aspect)))
    out = pd.DataFrame(results)
    out.rename(columns={0: f"{col}_new", 1: f"{col}_ABSA_v1"}, inplace=True)
    out[f"{col}_ABSA_v1"] = out[f"{col}_ABSA_v1"].astype(str)
    # Slice the sentiment label out of the stringified pipeline output
    out[f"{col}_ABSA"] = out[f"{col}_ABSA_v1"].str[12:19]
    # Regular expression that matches the numeric confidence score
    pattern = r"(\d+(\.\d+)?)"
    # Extract the numeric score via the regular expression
    out[f"{col}_numABSA"] = out[f"{col}_ABSA_v1"].apply(
        lambda x: re.search(pattern, str(x)).group(1)
        if re.search(pattern, str(x))
        else None
    )
    # Convert the extracted values to floats where possible
    out[f"{col}_numABSA"] = pd.to_numeric(out[f"{col}_numABSA"], errors="coerce")
    return out.drop(f"{col}_ABSA_v1", axis=1)

dfcompany1 = run_absa(df3, 1)
dfcompany1
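Slicing the stringified pipeline output and re-parsing the score with a regex is brittle; since the text-classification pipeline returns a list of dicts such as [{'label': 'Positive', 'score': 0.99}], label and score can also be read directly. A sketch of that alternative (hypothetical helper, not used below):
In [ ]:
# Sketch: read label and score straight from the pipeline output
def run_absa_direct(df, n):
    col = f"company{n}"
    rows = []
    for i in range(len(df)):
        aspect = df[col].loc[i]
        prediction = classifier(df[f"text_{col}_eng"].loc[i], text_pair=aspect)[0]
        rows.append(
            {
                f"{col}_new": aspect,
                f"{col}_ABSA": prediction["label"],
                f"{col}_numABSA": float(prediction["score"]),
            }
        )
    return pd.DataFrame(rows)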
In [ ]:
def handle_none(row, n):
    """Blank out the ABSA columns for company n when no company name is present."""
    if row[f"company{n}_new"] is None:
        row[f"company{n}_ABSA"] = None
        row[f"company{n}_numABSA"] = None
    return row

# Apply the custom function to each row
dfcompany1new = dfcompany1.apply(handle_none, axis=1, n=1)
dfcompany1new
In [ ]:
# Test on a single row; note the guard compares against the string "None"
# (produced by str(None) during translation), not the None object.
for aspect in [df3.loc[9, "company2"]]:
if df3["text_company2_eng"].loc[9] != "None":
print(
aspect,
classifier(
df3["text_company2_eng"].loc[9],
text_pair=aspect,
),
)
else:
print(None)
In [ ]:
dfcompany2 = run_absa(df3, 2)
dfcompany2
In [ ]:
# Apply the shared handle_none helper for company2
dfcompany2new = dfcompany2.apply(handle_none, axis=1, n=2)
dfcompany2new
In [ ]:
dfcompany3 = run_absa(df3, 3)
dfcompany3
In [ ]:
# Apply the shared handle_none helper for company3
dfcompany3new = dfcompany3.apply(handle_none, axis=1, n=3)
dfcompany3new
In [ ]:
dfcompany4 = run_absa(df3, 4)
dfcompany4
In [ ]:
# Apply the shared handle_none helper for company4
dfcompany4new = dfcompany4.apply(handle_none, axis=1, n=4)
dfcompany4new
In [ ]:
dfcompany5 = run_absa(df3, 5)
dfcompany5
In [ ]:
# Apply the shared handle_none helper for company5
dfcompany5new = dfcompany5.apply(handle_none, axis=1, n=5)
dfcompany5new
In [ ]:
dftotal1 = pd.concat(
[df3, dfcompany1new, dfcompany2new, dfcompany3new, dfcompany4new, dfcompany5new],
axis=1,
join="outer",
)
columns_to_drop = ["company1", "company2", "company3", "company4", "company5"]
dftotal1.drop(columns=columns_to_drop, inplace=True)
dftotal1
In [ ]:
# Combine the name, label, and score columns for one company into a nested dict
def combine_to_dict_for(row, n):
    return {
        row[f"company{n}_new"]: {
            "ABSA": row[f"company{n}_ABSA"],
            "numABSA": row[f"company{n}_numABSA"],
        }
    }

# Apply the custom function to each row and create a new column 'Combined'
dftotal1["company1_Combined"] = dftotal1.apply(combine_to_dict_for, axis=1, n=1)
# columns_to_drop = ["company1_new", "company1_ABSA", "company1_numABSA"]
# dftotal1.drop(columns=columns_to_drop, inplace=True)
dftotal1
In [ ]:
# Apply the shared helper for company2 and create a new column 'Combined'
dftotal1["company2_Combined"] = dftotal1.apply(combine_to_dict_for, axis=1, n=2)
# columns_to_drop = ["company2_new", "company2_ABSA", "company2_numABSA"]
# dftotal1.drop(columns=columns_to_drop, inplace=True)
dftotal1
In [ ]:
# Apply the shared helper for company3 and create a new column 'Combined'
dftotal1["company3_Combined"] = dftotal1.apply(combine_to_dict_for, axis=1, n=3)
# columns_to_drop = ["company3_new", "company3_ABSA", "company3_numABSA"]
# dftotal1.drop(columns=columns_to_drop, inplace=True)
dftotal1
In [ ]:
# Apply the shared helper for company4 and create a new column 'Combined'
dftotal1["company4_Combined"] = dftotal1.apply(combine_to_dict_for, axis=1, n=4)
# columns_to_drop = ["company4_new", "company4_ABSA", "company4_numABSA"]
# dftotal1.drop(columns=columns_to_drop, inplace=True)
dftotal1
In [ ]:
# Apply the shared helper for company5 and create a new column 'Combined'
dftotal1["company5_Combined"] = dftotal1.apply(combine_to_dict_for, axis=1, n=5)
# columns_to_drop = ["company5_new", "company5_ABSA", "company5_numABSA"]
# dftotal1.drop(columns=columns_to_drop, inplace=True)
dftotal1
In [ ]:
# Combine the ABSA results from company1-5 into one dict, skipping entries whose
# company name is None (companies are filled left to right, so those are trailing)
def combine_to_dict(row):
    combined = {}
    for n in range(1, 6):
        company_name = row[f"company{n}_new"]
        # Skip companies that were not found in this article
        if company_name is None:
            continue
        combined[company_name] = {
            "ABSA": row[f"company{n}_ABSA"],
            "numABSA": row[f"company{n}_numABSA"],
        }
    return combined

# Apply the custom function to each row and create a new column 'Combined_ABSA'
dftotal1["Combined_ABSA"] = dftotal1.apply(combine_to_dict, axis=1)
dftotal1
In [ ]:
# dftotal1.to_csv('dftotal1_20231217_v6.csv')
In [ ]:
# Run this at the very end
# Create an instance of the news service (replace with your actual collection if needed)
newsObj = news.MongoNewsService(connector)
if len(dftotal1) > 0:
    for i in range(len(dftotal1)):
        # add the new attribute 'Combined_ABSA' to the matching document
        newsObj.collection.update_one(
            {"_id": dftotal1["_id"].iloc[i]},  # filter for the matching document
            {
                "$set": {"Combined_ABSA": dftotal1["Combined_ABSA"].iloc[i]}
            },  # add the new attribute
        )
else:
    print("No documents found.")