aki_prj23_transparenzregister/Jupyter/ABSA/ABSA_v6.ipynb
In [ ]:
import json
import pandas as pd
import aki_prj23_transparenzregister.utils.mongo.connector as conn
from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news
import aki_prj23_transparenzregister.utils.mongo.company_mongo_service as comps
In [ ]:
class ABSA:
    def __init__(self):
        self.config_provider = JsonFileConfigProvider("./secrets.json")
        self.connect_string = self.config_provider.get_mongo_connection_string()
        self.connect_string.database = "transparenzregister_ner"
        self.connector = conn.MongoConnector(self.connect_string)
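In [ ]:
# Hypothetical usage of the ABSA helper class above (it is not used again in
# this notebook; the next cell sets up the same connection inline):
# absa = ABSA()
# absa.connector  # MongoConnector bound to the "transparenzregister_ner" database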
In [ ]:
# Mongo Connect: create connection string and connect
config_provider = JsonFileConfigProvider("../../secrets.json")
engine = config_provider.get_mongo_connection_string()
engine.database = "transparenzregister_ner"
connector = conn.MongoConnector(engine)
In [ ]:
# Process all company documents and check whether the attribute 'name' exists
# Read data from database
CompsObj = comps.CompanyMongoService(connector)
allComps = CompsObj.get_all()

# Create a cursor over all company documents that have the attribute 'name'
CursorCompNames = CompsObj.collection.find({"name": {"$exists": True}})
documents = list(CursorCompNames)
In [ ]:
# create a list with all company names
compList = []

if len(documents) > 0:
    for document in documents:
        compList.append(document["name"])

else:
    print("No documents found.")
In [ ]:
# Process all documents in the news collection and check whether the attribute 'companies' exists

# Read data from database
NERObj = news.MongoNewsService(connector)
allNER = NERObj.get_all()

# Create a cursor over all articles that already have the attribute 'companies'
CursorNERNames = NERObj.collection.find({"companies": {"$exists": True}})
documentsNER = list(CursorNERNames)
In [ ]:
# install and import rapidfuzz
# pip install rapidfuzz
from rapidfuzz import process
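In [ ]:
# Quick illustrative check of rapidfuzz (made-up names, not from the data):
# process.extractOne returns a (match, score, index) tuple; the threshold of 95
# used below therefore keeps only near-exact matches.
process.extractOne("Simens", ["Siemens AG", "BASF SE", "Bayer AG"])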
In [ ]:
if len(documentsNER) > 0:
    for document in documentsNER:
        resList = []  # result list with matched names
        for entity_name, frequency in document["companies"].items():
            if len(entity_name) > 2:
                result = process.extractOne(entity_name, compList)
                if result is not None:
                    # A similar name was found
                    if result[1] >= 95:
                        # Adjust the similarity score threshold as needed
                        similar_name = result[0]
                        print(
                            f"NER entity: {entity_name} matches: {similar_name} at {result[1]}% with {frequency} occurrences"
                        )

else:
    print("No documents found.")
In [ ]:
documentsNER[1]["companies"].items()
In [ ]:
compList
In [ ]:
import re


# Function to remove company types and legal-form suffixes
def remove_legal_additions(name):
    # Use a regular expression to remove common suffixes such as "GmbH" and "AG"
    cleaned_name = re.sub(
        r"\b(GmbH|AG|KG|SE|& Co\. KGaA|& Co\.|e\.K\.|mbH|mbH & Co\. KG)\b", "", name
    )
    # Remove leading and trailing whitespace
    cleaned_name = cleaned_name.strip()
    return cleaned_name


# Clean the list of company names
complist2 = [remove_legal_additions(name) for name in compList]
complist2
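In [ ]:
# Illustrative check of the cleaning function on a made-up example:
remove_legal_additions("Siemens AG")  # expected: "Siemens"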
In [ ]:
# Keep only the first word of each cleaned name (if it has at least 3 characters) and deduplicate
complist3 = [word.split()[0] for word in complist2 if len(word.split()[0]) >= 3]
complist4 = list(set(complist3))
complist4
In [ ]:
# Certain terms that are not company names have to be removed from complist4
non_company_terms = [
    "Deutsche", "Hamburg", "Hamburger", "Union", "Energy", "Hugo", "Pro",
    "OTC", "web", "Kabel", "Club", "The", "United", "Frankfurter", "CMC",
    "Bayern", "Haus", "Gesellschaft", "Delivery", "Aachener", "Group",
    "Retail", "Media", "European", "Fuels",
]

for term in non_company_terms:
    if term in complist4:
        complist4.remove(term)
In [ ]:
# Merge the two lists complist2 and complist4
complist5 = complist2 + complist4
complist5
In [ ]:
df1 = pd.DataFrame(documentsNER)
df1
In [ ]:
# Function to extract company names
def extract_company_names(company_dict):
    return list(company_dict.keys())


# Apply the function to the 'companies' column
df1["companies"] = df1["companies"].apply(extract_company_names)
df1
In [ ]:
df1["companies_filtered"] = df1["companies"].apply(
    lambda x: [company for company in x if company in complist5]
)
df1
In [ ]:
# Keep only the first five entries of "companies_filtered" per article to save compute later.
def split_list1(row):
    return pd.Series(row["companies_filtered"][:5])


# Apply the function and concatenate the result with the original DataFrame
df2 = df1.apply(split_list1, axis=1).merge(df1, left_index=True, right_index=True)

# Drop the original column with lists
df2 = df2.drop("companies", axis=1)
df2 = df2.drop("companies_filtered", axis=1)
df2
In [ ]:
df2.rename(columns={0: "company1"}, inplace=True)
df2.rename(columns={1: "company2"}, inplace=True)
df2.rename(columns={2: "company3"}, inplace=True)
df2.rename(columns={3: "company4"}, inplace=True)
df2.rename(columns={4: "company5"}, inplace=True)
df2
In [ ]:
df2.dropna(
    subset=["company1", "company2", "company3", "company4", "company5"],
    how="all",
    inplace=True,
)
df2
In [ ]:
# The indices must be reset; otherwise the following steps raise errors (iterating over i would not work).
df2 = df2.reset_index(drop=True)
df2
In [ ]:
def filter_sentences(row, company_col):
    target_word = row[company_col]

    # Check if target_word is NaN (no company in this slot)
    if pd.isna(target_word):
        return None

    # Split the article text on sentence-like boundaries
    sentences = re.split(r"[.:>]", row["text"])

    # Extract sentences containing the target word
    filtered_sentences = [
        sentence.strip() for sentence in sentences if target_word in sentence
    ]

    # Concatenate the sentences with ". " and limit the total length to 200 characters
    concatenated_sentences = ". ".join(filtered_sentences)[:200]

    # Return None if no sentences contain the target word
    return concatenated_sentences if concatenated_sentences else None


# Apply the same filter for company1 through company5
for n in range(1, 6):
    df2[f"text_company{n}"] = df2.apply(
        filter_sentences, axis=1, company_col=f"company{n}"
    )
df2
In [ ]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Note: truncation must be enabled when the tokenizer is called (see
# translate_sentiment below) so that inputs longer than the model's maximum of
# 512 tokens are cut off instead of raising an error.
translation_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-de-en")

translation_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-de-en")
In [ ]:
# Test whether the machine translation works on an example sentence.
def translate_sentiment(text: str) -> str:
    # truncation=True caps inputs at the model's maximum length (512 tokens)
    input_tokens = translation_tokenizer([text], return_tensors="pt", truncation=True)
    generated_ids = translation_model.generate(**input_tokens)
    return translation_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[
        0
    ]


headline = "Pelabresib: Biotech-Unternehmen Morphosys hofft mit neuem Krebs-Medikament endlich auf den Durchbruch. "
tf = translate_sentiment(headline)
tf
In [ ]:
# Translate the snippets for company1 through company5 to English.
# str() matters here: None entries become the literal string "None", which
# keeps the rows aligned with df2.
translations = {}

for n in range(1, 6):
    col = f"text_company{n}"
    translated = [translate_sentiment(str(df2[col].loc[i])) for i in range(len(df2))]
    translations[n] = pd.DataFrame({f"{col}_eng": translated})

translations[1]
In [ ]:
df3 = df2[
    [
        "_id",
        "title",
        "text",
        "company1",
        "text_company1",
        "company2",
        "text_company2",
        "company3",
        "text_company3",
        "company4",
        "text_company4",
        "company5",
        "text_company5",
    ]
]
df3
In [ ]:
df3.insert(4, "text_company1_eng", dftrans_company1.iloc[:, 0])
df3.insert(7, "text_company2_eng", dftrans_company2.iloc[:, 0])
df3.insert(10, "text_company3_eng", dftrans_company3.iloc[:, 0])
df3.insert(13, "text_company4_eng", dftrans_company4.iloc[:, 0])
df3.insert(16, "text_company5_eng", dftrans_company5.iloc[:, 0])
df3
In [ ]:
# df3.to_csv('df3_20231213.csv')
In [ ]:
# Replace NaN values with None across the whole frame (pandas >= 2.1 DataFrame.map)
df3 = df3.map(lambda x: None if pd.isna(x) else x)
df3
In [ ]:
# test ABSA
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load and define the ABSA model and tokenizer
model_name = "yangheng/deberta-v3-base-absa-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
In [ ]:
# Test using an example sentence
for aspect in ["Siemens"]:
    print(
        aspect,
        classifier(
            "Siemens is doing great",
            text_pair=aspect,
        ),
    )
In [ ]:
# Test using an example sentence, part 2
for aspect in [df2.loc[1, "company1"]]:
    print(
        aspect,
        classifier(
            "Siemens is doing great",
            text_pair=aspect,
        ),
    )
In [ ]:
# ABSA for company1 through company5: classify each English snippet with the
# company name as the aspect, then extract the sentiment label and numeric score.

# Regular expression pattern for finding numeric values
pattern = r"(\d+(\.\d+)?)"

absa_results = {}

for n in range(1, 6):
    result_list = []
    for i in range(len(df3)):
        aspect = df3[f"company{n}"].loc[i]
        result_list.append(
            (aspect, classifier(df3[f"text_company{n}_eng"].loc[i], text_pair=aspect))
        )

    dfcompany = pd.DataFrame(result_list)
    dfcompany.rename(
        columns={0: f"company{n}_new", 1: f"company{n}_ABSA_v1"}, inplace=True
    )

    dfcompany[f"company{n}_ABSA_v1"] = dfcompany[f"company{n}_ABSA_v1"].astype(str)
    # Slice the (possibly truncated) sentiment label out of the stringified result
    dfcompany[f"company{n}_ABSA"] = dfcompany[f"company{n}_ABSA_v1"].str[12:19]

    # Extract the numeric score using the regular expression
    dfcompany[f"company{n}_numABSA"] = dfcompany[f"company{n}_ABSA_v1"].apply(
        lambda x: re.search(pattern, str(x)).group(1)
        if re.search(pattern, str(x))
        else None
    )

    # Convert the extracted values to floats where possible
    dfcompany[f"company{n}_numABSA"] = pd.to_numeric(
        dfcompany[f"company{n}_numABSA"], errors="coerce"
    )
    absa_results[n] = dfcompany.drop(f"company{n}_ABSA_v1", axis=1)

absa_results[1]
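In [ ]:
# Aside: instead of stringifying the pipeline output and parsing it with string
# slicing and a regex (as above), the label and score can be read directly from
# the returned list of dicts:
result = classifier("Siemens is doing great", text_pair="Siemens")[0]
result["label"], result["score"]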
In [ ]:
def handle_none(row, n):
    # Blank out label and score for rows where no company was present in this slot
    if row[f"company{n}_new"] is None:
        row[f"company{n}_ABSA"] = None
        row[f"company{n}_numABSA"] = None
    return row


# Apply the custom function for company1 through company5
for n in range(1, 6):
    absa_results[n] = absa_results[n].apply(handle_none, axis=1, n=n)

absa_results[1]
In [ ]:
# Test using an example sentence, part 2
for aspect in [df3.loc[9, "company2"]]:
    if df3["text_company2_eng"].loc[9] != "None":
        print(
            aspect,
            classifier(
                df3["text_company2_eng"].loc[9],
                text_pair=aspect,
            ),
        )
    else:
        print(None)
In [ ]:
dftotal1 = pd.concat(
    [df3] + [absa_results[n] for n in range(1, 6)],
    axis=1,
    join="outer",
)
columns_to_drop = ["company1", "company2", "company3", "company4", "company5"]
dftotal1.drop(columns=columns_to_drop, inplace=True)
dftotal1
In [ ]:
# Custom function to combine the name, label, and score columns into one dict
def combine_to_list_company(row, n):
    return {
        row[f"company{n}_new"]: {
            "ABSA": row[f"company{n}_ABSA"],
            "numABSA": row[f"company{n}_numABSA"],
        }
    }


# Apply the custom function for company1 through company5
for n in range(1, 6):
    dftotal1[f"company{n}_Combined"] = dftotal1.apply(
        combine_to_list_company, axis=1, n=n
    )
dftotal1
In [ ]:
# Combine the ABSAs from company1/2/3/4/5 into one combined ABSA, eliminating the "Nones"


def combine_to_dict(row):
    combined = {}
    for n in range(1, 6):
        name = row[f"company{n}_new"]
        # Skip empty company slots so that no None keys end up in the result
        if name is not None:
            combined[name] = {
                "ABSA": row[f"company{n}_ABSA"],
                "numABSA": row[f"company{n}_numABSA"],
            }
    return combined


# Apply the custom function to each row and create a new column 'Combined'
dftotal1["Combined_ABSA"] = dftotal1.apply(combine_to_dict, axis=1)
dftotal1
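In [ ]:
# Inspect the combined result for the first article; the expected shape is
# {"<company name>": {"ABSA": "<label>", "numABSA": <score>}, ...}
dftotal1["Combined_ABSA"].iloc[0]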
In [ ]:
# dftotal1.to_csv('dftotal1_20231217_v6.csv')
In [ ]:
# Keep this cell at the very end
# Create a MongoNewsService instance for the news collection
newsObj = news.MongoNewsService(connector)

if len(dftotal1) > 0:
    for i in range(len(dftotal1)):
        # Add the new attribute 'Combined_ABSA' to the corresponding document
        newsObj.collection.update_one(
            {"_id": dftotal1["_id"].iloc[i]},  # filter for the matching document
            {"$set": {"Combined_ABSA": dftotal1["Combined_ABSA"].iloc[i]}},
        )

else:
    print("No documents found.")