mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-22 05:02:53 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd\n",
"import aki_prj23_transparenzregister.utils.mongo.connector as conn\n",
"from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider\n",
"import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news\n",
"import aki_prj23_transparenzregister.utils.mongo.company_mongo_service as comps"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class ABSA:\n",
"    def __init__(self):\n",
"        self.config_provider = JsonFileConfigProvider(\"./secrets.json\")\n",
"        self.connect_string = self.config_provider.get_mongo_connection_string()\n",
"        self.connect_string.database = \"transparenzregister_ner\"\n",
"        self.connector = conn.MongoConnector(self.connect_string)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Mongo Connect: create connection string and connect\n",
"config_provider = JsonFileConfigProvider(\"../../secrets.json\")\n",
"engine = config_provider.get_mongo_connection_string()\n",
"engine.database = \"transparenzregister_ner\"\n",
"connector = conn.MongoConnector(engine)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Process all documents and check whether the attribute 'name' exists\n",
"# Read data from database\n",
"CompsObj = comps.CompanyMongoService(connector)\n",
"allComps = CompsObj.get_all()\n",
"\n",
"# Create a cursor over all company documents that have the attribute 'name'\n",
"CursorCompNames = CompsObj.collection.find({\"name\": {\"$exists\": True}})\n",
"documents = list(CursorCompNames)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a list with all company names\n",
"compList = []\n",
"\n",
"if len(documents) > 0:\n",
"    for document in documents:\n",
"        # ents=NERService.NERCompanyList(company_list,document)\n",
"        compList.append(document[\"name\"])\n",
"        # add a new attribute 'companies' to the document\n",
"        # newsObj.collection.update_one(\n",
"        #     {\"_id\": document[\"_id\"]},  # Filter for the matching document\n",
"        #     {\"$set\": {\"companies\": ents}},  # Add the new attribute, initialized with an empty list\n",
"        # )\n",
"\n",
"else:\n",
"    print(\"No documents found.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Process all documents in the news collection and check whether the attribute 'companies' exists\n",
"\n",
"# Read data from database\n",
"NERObj = news.MongoNewsService(connector)\n",
"allNER = NERObj.get_all()\n",
"\n",
"# Create a cursor over all articles that already have the attribute 'companies'\n",
"CursorNERNames = NERObj.collection.find({\"companies\": {\"$exists\": True}})\n",
"documentsNER = list(CursorNERNames)"
]
},
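{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that `{\"$exists\": True}` selects articles that already carry the attribute. A minimal sketch (using the same `NERObj` collection) of the complementary query for still unprocessed articles:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: articles that do NOT yet have the 'companies' attribute\n",
"cursor_unprocessed = NERObj.collection.find({\"companies\": {\"$exists\": False}})\n",
"len(list(cursor_unprocessed))"
]
},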
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# install and import rapidfuzz\n",
"# pip install rapidfuzz\n",
"from rapidfuzz import process"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if len(documentsNER) > 0:\n",
"    for document in documentsNER:\n",
"        resList = []  # result list with matched names\n",
"        for entity_name, frequency in document[\"companies\"].items():\n",
"            if len(entity_name) > 2:\n",
"                result = process.extractOne(entity_name, compList)\n",
"                if result is not None:\n",
"                    # If a similar name was found\n",
"                    if result[1] >= 95:\n",
"                        # Adjust the similarity score threshold as needed\n",
"                        similar_name = result[0]\n",
"                        # print(f\"Similar name found: {entity_name} (similarity: {result[1]})\")\n",
"                        # print(f\"Similar to: {similar_name}\")\n",
"                        # print(f\"Frequency: {frequency}\")\n",
"                        print(\n",
"                            f\"NER entity: {entity_name} matches {similar_name} at {result[1]}% with {frequency} occurrences\"\n",
"                        )\n",
"\n",
"        # ents=NERService.NERCompanyList(company_list,document)\n",
"        # compList.append(document['name'])\n",
"        # add a new attribute 'companies' to the document\n",
"        # newsObj.collection.update_one(\n",
"        #     {\"_id\": document[\"_id\"]},  # Filter for the matching document\n",
"        #     {\"$set\": {\"companies\": ents}},  # Add the new attribute, initialized with an empty list\n",
"        # )\n",
"\n",
"else:\n",
"    print(\"No documents found.\")"
]
},
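{
"cell_type": "markdown",
"metadata": {},
"source": [
"`process.extractOne` returns a `(match, score, index)` tuple or `None`. A minimal sketch of the same lookup using rapidfuzz's built-in `score_cutoff`, which makes the manual `result[1] >= 95` check unnecessary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: extractOne returns None when no candidate reaches score_cutoff\n",
"match = process.extractOne(\"Siemens Aktiengesellschaft\", compList, score_cutoff=95)\n",
"if match is not None:\n",
"    similar_name, score, _ = match\n",
"    print(f\"Matched {similar_name} at {score}%\")"
]
},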
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"documentsNER[1][\"companies\"].items()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"compList"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"\n",
"# Function to remove company types and legal suffixes\n",
"def remove_legal_additions(name):\n",
"    # Use a regular expression to remove common suffixes such as \"GmbH\" and \"AG\"\n",
"    cleaned_name = re.sub(\n",
"        r\"\\b(GmbH|AG|KG|SE|& Co\\. KGaA|& Co\\.|e\\.K\\.|mbH|mbH & Co\\. KG)\\b\", \"\", name\n",
"    )\n",
"    # Strip leading and trailing whitespace\n",
"    cleaned_name = cleaned_name.strip()\n",
"    return cleaned_name\n",
"\n",
"\n",
"# Clean the list of company names\n",
"complist2 = [remove_legal_additions(name) for name in compList]\n",
"complist2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"complist3 = [word.split()[0] for word in complist2 if len(word.split()[0]) >= 3]\n",
"complist4 = list(set(complist3))\n",
"complist4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Certain terms that are not company names have to be removed from complist4\n",
"terms_to_remove = [\n",
"    \"Deutsche\",\n",
"    \"Hamburg\",\n",
"    \"Hamburger\",\n",
"    \"Union\",\n",
"    \"Energy\",\n",
"    \"Hugo\",\n",
"    \"Pro\",\n",
"    \"OTC\",\n",
"    \"web\",\n",
"    \"Kabel\",\n",
"    \"Club\",\n",
"    \"The\",\n",
"    \"United\",\n",
"    \"Frankfurter\",\n",
"    \"CMC\",\n",
"    \"Bayern\",\n",
"    \"Haus\",\n",
"    \"Gesellschaft\",\n",
"    \"Delivery\",\n",
"    \"Aachener\",\n",
"    \"Group\",\n",
"    \"Retail\",\n",
"    \"Media\",\n",
"    \"European\",\n",
"    \"Fuels\",\n",
"]\n",
"complist4 = [name for name in complist4 if name not in terms_to_remove]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Merge the two lists complist2 and complist4\n",
"complist5 = complist2 + complist4\n",
"complist5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df1 = pd.DataFrame(documentsNER)\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Function to extract company names\n",
"def extract_company_names(company_dict):\n",
"    return list(company_dict.keys())\n",
"\n",
"\n",
"# Apply the function to the 'companies' column\n",
"df1[\"companies\"] = df1[\"companies\"].apply(lambda x: extract_company_names(x))\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df1[\"companies_filtered\"] = df1[\"companies\"].apply(\n",
"    lambda x: [company for company in x if company in complist5]\n",
")\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the first five entries of \"companies_filtered\" to save compute later on.\n",
"def split_list1(row):\n",
"    return pd.Series(row[\"companies_filtered\"][:5])\n",
"\n",
"\n",
"# Apply the function and concatenate the result with the original DataFrame\n",
"df2 = df1.apply(split_list1, axis=1).merge(df1, left_index=True, right_index=True)\n",
"\n",
"# Drop the original columns with lists\n",
"df2 = df2.drop(\"companies\", axis=1)\n",
"df2 = df2.drop(\"companies_filtered\", axis=1)\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df2.rename(\n",
"    columns={0: \"company1\", 1: \"company2\", 2: \"company3\", 3: \"company4\", 4: \"company5\"},\n",
"    inplace=True,\n",
")\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# cell10 = df2.loc[3, 'company1']\n",
"# print(cell10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df2.dropna(\n",
"    subset=[\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"],\n",
"    how=\"all\",\n",
"    inplace=True,\n",
")\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The indices have to be reset, otherwise the following steps raise errors (iterating over i would not work).\n",
"df2 = df2.reset_index(drop=True)\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def filter_sentences_company1(row):\n",
"    target_word = row[\"company1\"]\n",
"\n",
"    # Check if target_word is NaN\n",
"    if pd.isna(target_word):\n",
"        return None\n",
"\n",
"    sentences = re.split(r\"[.:>]\", row[\"text\"])  # Split on '.', ':' and '>'\n",
"\n",
"    # Extract sentences containing the target word\n",
"    filtered_sentences = [\n",
"        sentence.strip() for sentence in sentences if target_word in sentence\n",
"    ]\n",
"\n",
"    # Concatenate the sentences with dots in between and limit the total length to 200 characters\n",
"    concatenated_sentences = \". \".join(filtered_sentences)[:200]\n",
"\n",
"    # Return None if no sentences contain the target word\n",
"    return concatenated_sentences if concatenated_sentences else None\n",
"\n",
"\n",
"df2[\"text_company1\"] = df2.apply(filter_sentences_company1, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def filter_sentences_company2(row):\n",
"    target_word = row[\"company2\"]\n",
"\n",
"    # Check if target_word is NaN\n",
"    if pd.isna(target_word):\n",
"        return None\n",
"\n",
"    sentences = re.split(r\"[.:>]\", row[\"text\"])  # Split on '.', ':' and '>'\n",
"\n",
"    # Extract sentences containing the target word\n",
"    filtered_sentences = [\n",
"        sentence.strip() for sentence in sentences if target_word in sentence\n",
"    ]\n",
"\n",
"    # Concatenate the sentences with dots in between and limit the total length to 200 characters\n",
"    concatenated_sentences = \". \".join(filtered_sentences)[:200]\n",
"\n",
"    # Return None if no sentences contain the target word\n",
"    return concatenated_sentences if concatenated_sentences else None\n",
"\n",
"\n",
"df2[\"text_company2\"] = df2.apply(filter_sentences_company2, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def filter_sentences_company3(row):\n",
"    target_word = row[\"company3\"]\n",
"\n",
"    # Check if target_word is NaN\n",
"    if pd.isna(target_word):\n",
"        return None\n",
"\n",
"    sentences = re.split(r\"[.:>]\", row[\"text\"])  # Split on '.', ':' and '>'\n",
"\n",
"    # Extract sentences containing the target word\n",
"    filtered_sentences = [\n",
"        sentence.strip() for sentence in sentences if target_word in sentence\n",
"    ]\n",
"\n",
"    # Concatenate the sentences with dots in between and limit the total length to 200 characters\n",
"    concatenated_sentences = \". \".join(filtered_sentences)[:200]\n",
"\n",
"    # Return None if no sentences contain the target word\n",
"    return concatenated_sentences if concatenated_sentences else None\n",
"\n",
"\n",
"df2[\"text_company3\"] = df2.apply(filter_sentences_company3, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def filter_sentences_company4(row):\n",
"    target_word = row[\"company4\"]\n",
"\n",
"    # Check if target_word is NaN\n",
"    if pd.isna(target_word):\n",
"        return None\n",
"\n",
"    sentences = re.split(r\"[.:>]\", row[\"text\"])  # Split on '.', ':' and '>'\n",
"\n",
"    # Extract sentences containing the target word\n",
"    filtered_sentences = [\n",
"        sentence.strip() for sentence in sentences if target_word in sentence\n",
"    ]\n",
"\n",
"    # Concatenate the sentences with dots in between and limit the total length to 200 characters\n",
"    concatenated_sentences = \". \".join(filtered_sentences)[:200]\n",
"\n",
"    # Return None if no sentences contain the target word\n",
"    return concatenated_sentences if concatenated_sentences else None\n",
"\n",
"\n",
"df2[\"text_company4\"] = df2.apply(filter_sentences_company4, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def filter_sentences_company5(row):\n",
"    target_word = row[\"company5\"]\n",
"\n",
"    # Check if target_word is NaN\n",
"    if pd.isna(target_word):\n",
"        return None\n",
"\n",
"    sentences = re.split(r\"[.:>]\", row[\"text\"])  # Split on '.', ':' and '>'\n",
"\n",
"    # Extract sentences containing the target word\n",
"    filtered_sentences = [\n",
"        sentence.strip() for sentence in sentences if target_word in sentence\n",
"    ]\n",
"\n",
"    # Concatenate the sentences with dots in between and limit the total length to 200 characters\n",
"    concatenated_sentences = \". \".join(filtered_sentences)[:200]\n",
"\n",
"    # Return None if no sentences contain the target word\n",
"    return concatenated_sentences if concatenated_sentences else None\n",
"\n",
"\n",
"df2[\"text_company5\"] = df2.apply(filter_sentences_company5, axis=1)\n",
"df2"
]
},
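{
"cell_type": "markdown",
"metadata": {},
"source": [
"The five filter functions above differ only in the column they read. A minimal sketch of a parameterized variant (an optional refactor, not used by the cells below) that could replace all five:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional refactor sketch: one parameterized filter instead of five copies\n",
"def filter_sentences(row, company_col):\n",
"    target_word = row[company_col]\n",
"    if pd.isna(target_word):\n",
"        return None\n",
"    sentences = re.split(r\"[.:>]\", row[\"text\"])\n",
"    filtered = [s.strip() for s in sentences if target_word in s]\n",
"    concatenated = \". \".join(filtered)[:200]\n",
"    return concatenated if concatenated else None\n",
"\n",
"\n",
"# Usage sketch, equivalent to the five cells above:\n",
"# for n in range(1, 6):\n",
"#     df2[f\"text_company{n}\"] = df2.apply(filter_sentences, axis=1, company_col=f\"company{n}\")"
]
},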
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"\n",
"# Note: with truncation=True, inputs that exceed the model's 512-token limit are truncated instead of causing an error during machine translation.\n",
"translation_tokenizer = AutoTokenizer.from_pretrained(\n",
"    \"Helsinki-NLP/opus-mt-de-en\", truncation=True\n",
")\n",
"\n",
"translation_model = AutoModelForSeq2SeqLM.from_pretrained(\"Helsinki-NLP/opus-mt-de-en\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test whether the machine translation works on an example sentence.\n",
"def translate_sentiment(text: str) -> str:\n",
"    input_tokens = translation_tokenizer([text], return_tensors=\"pt\")\n",
"    generated_ids = translation_model.generate(**input_tokens)\n",
"    return translation_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[\n",
"        0\n",
"    ]\n",
"\n",
"\n",
"headline = \"Pelabresib: Biotech-Unternehmen Morphosys hofft mit neuem Krebs-Medikament endlich auf den Durchbruch. \"\n",
"tf = translate_sentiment(headline)\n",
"tf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"translate_list_company1 = []\n",
"\n",
"for i in range(len(df2)):\n",
"    text = str(\n",
"        df2[\"text_company1\"].loc[i]\n",
"    )  # Convert to string (this is very important)\n",
"    texttrans = translate_sentiment(text)\n",
"    translate_list_company1.append(texttrans)\n",
"\n",
"dftrans_company1 = pd.DataFrame(translate_list_company1)\n",
"dftrans_company1.rename(columns={0: \"text_company1_eng\"}, inplace=True)\n",
"dftrans_company1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"translate_list_company2 = []\n",
"\n",
"for i in range(len(df2)):\n",
"    text = str(\n",
"        df2[\"text_company2\"].loc[i]\n",
"    )  # Convert to string (this is very important)\n",
"    texttrans = translate_sentiment(text)\n",
"    translate_list_company2.append(texttrans)\n",
"\n",
"dftrans_company2 = pd.DataFrame(translate_list_company2)\n",
"dftrans_company2.rename(columns={0: \"text_company2_eng\"}, inplace=True)\n",
"dftrans_company2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"translate_list_company3 = []\n",
"\n",
"for i in range(len(df2)):\n",
"    text = str(\n",
"        df2[\"text_company3\"].loc[i]\n",
"    )  # Convert to string (this is very important)\n",
"    texttrans = translate_sentiment(text)\n",
"    translate_list_company3.append(texttrans)\n",
"\n",
"dftrans_company3 = pd.DataFrame(translate_list_company3)\n",
"dftrans_company3.rename(columns={0: \"text_company3_eng\"}, inplace=True)\n",
"dftrans_company3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"translate_list_company4 = []\n",
"\n",
"for i in range(len(df2)):\n",
"    text = str(\n",
"        df2[\"text_company4\"].loc[i]\n",
"    )  # Convert to string (this is very important)\n",
"    texttrans = translate_sentiment(text)\n",
"    translate_list_company4.append(texttrans)\n",
"\n",
"dftrans_company4 = pd.DataFrame(translate_list_company4)\n",
"dftrans_company4.rename(columns={0: \"text_company4_eng\"}, inplace=True)\n",
"dftrans_company4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"translate_list_company5 = []\n",
"\n",
"for i in range(len(df2)):\n",
"    text = str(\n",
"        df2[\"text_company5\"].loc[i]\n",
"    )  # Convert to string (this is very important)\n",
"    texttrans = translate_sentiment(text)\n",
"    translate_list_company5.append(texttrans)\n",
"\n",
"dftrans_company5 = pd.DataFrame(translate_list_company5)\n",
"dftrans_company5.rename(columns={0: \"text_company5_eng\"}, inplace=True)\n",
"dftrans_company5"
]
},
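{
"cell_type": "markdown",
"metadata": {},
"source": [
"The five translation loops above call the model once per row. A minimal sketch (assuming the tokenizer and model loaded earlier) of translating a whole column in padded batches instead, which is usually considerably faster:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: translate a list of texts in padded batches\n",
"def translate_batch(texts, batch_size=16):\n",
"    results = []\n",
"    for start in range(0, len(texts), batch_size):\n",
"        chunk = [str(t) for t in texts[start : start + batch_size]]\n",
"        tokens = translation_tokenizer(\n",
"            chunk, return_tensors=\"pt\", padding=True, truncation=True\n",
"        )\n",
"        generated = translation_model.generate(**tokens)\n",
"        results.extend(\n",
"            translation_tokenizer.batch_decode(generated, skip_special_tokens=True)\n",
"        )\n",
"    return results\n",
"\n",
"\n",
"# Usage sketch, equivalent to the loops above:\n",
"# dftrans_company1 = pd.DataFrame(\n",
"#     translate_batch(df2[\"text_company1\"].tolist()), columns=[\"text_company1_eng\"]\n",
"# )"
]
},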
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df3 = df2[\n",
"    [\n",
"        \"_id\",\n",
"        \"title\",\n",
"        \"text\",\n",
"        \"company1\",\n",
"        \"text_company1\",\n",
"        \"company2\",\n",
"        \"text_company2\",\n",
"        \"company3\",\n",
"        \"text_company3\",\n",
"        \"company4\",\n",
"        \"text_company4\",\n",
"        \"company5\",\n",
"        \"text_company5\",\n",
"    ]\n",
"]\n",
"df3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df3.insert(4, \"text_company1_eng\", dftrans_company1.iloc[:, 0])\n",
"df3.insert(7, \"text_company2_eng\", dftrans_company2.iloc[:, 0])\n",
"df3.insert(10, \"text_company3_eng\", dftrans_company3.iloc[:, 0])\n",
"df3.insert(13, \"text_company4_eng\", dftrans_company4.iloc[:, 0])\n",
"df3.insert(16, \"text_company5_eng\", dftrans_company5.iloc[:, 0])\n",
"df3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# df3.to_csv('df3_20231213.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df3 = df3.map(lambda x: None if pd.isna(x) else x)\n",
"df3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# test ABSA\n",
"from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline\n",
"\n",
"# Load the ABSA model and tokenizer\n",
"model_name = \"yangheng/deberta-v3-base-absa-v1.1\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModelForSequenceClassification.from_pretrained(model_name)\n",
"\n",
"classifier = pipeline(\"text-classification\", model=model, tokenizer=tokenizer)"
]
},
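{
"cell_type": "markdown",
"metadata": {},
"source": [
"The pipeline returns a list with one dict per input, e.g. `[{'label': 'Positive', 'score': 0.98}]`. A minimal sketch of reading label and score directly from that structure, instead of slicing and regex-parsing the stringified output as done further below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: read the pipeline output directly instead of via string slicing\n",
"example = classifier(\"Siemens is doing great\", text_pair=\"Siemens\")\n",
"label = example[0][\"label\"]  # e.g. 'Positive'\n",
"score = example[0][\"score\"]  # float between 0 and 1\n",
"print(label, score)"
]
},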
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test with an example sentence\n",
"for aspect in [\"Siemens\"]:\n",
"    print(\n",
"        aspect,\n",
"        classifier(\n",
"            \"Siemens is doing great\",\n",
"            text_pair=aspect,\n",
"        ),\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test with an example sentence, part 2\n",
"for aspect in [df2.loc[1, \"company1\"]]:\n",
"    print(\n",
"        aspect,\n",
"        classifier(\n",
"            \"Siemens is doing great\",\n",
"            text_pair=aspect,\n",
"        ),\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Template for ABSA on companyX\n",
"\n",
"result_list_company1 = []\n",
"\n",
"for i in range(len(df3)):\n",
"    aspect = df3[\"company1\"].loc[i]\n",
"    textfinal = aspect, classifier(df3[\"text_company1_eng\"].loc[i], text_pair=aspect)\n",
"    result_list_company1.append(textfinal)\n",
"\n",
"dfcompany1 = pd.DataFrame(result_list_company1)\n",
"dfcompany1.rename(columns={0: \"company1_new\"}, inplace=True)\n",
"dfcompany1.rename(columns={1: \"company1_ABSA_v1\"}, inplace=True)\n",
"\n",
"dfcompany1[\"company1_ABSA_v1\"] = dfcompany1[\"company1_ABSA_v1\"].astype(str)\n",
"dfcompany1[\"company1_ABSA\"] = dfcompany1[\"company1_ABSA_v1\"].str[12:19]\n",
"\n",
"# Define a regular expression pattern to find numeric values ('re' was imported above).\n",
"pattern = r\"(\\d+(\\.\\d+)?)\"\n",
"\n",
"# Extract the numeric values using the regular expression.\n",
"dfcompany1[\"company1_numABSA\"] = dfcompany1[\"company1_ABSA_v1\"].apply(\n",
"    lambda x: re.search(pattern, str(x)).group(1)\n",
"    if re.search(pattern, str(x))\n",
"    else None\n",
")\n",
"\n",
"# Convert the extracted values to floats where needed.\n",
"dfcompany1[\"company1_numABSA\"] = pd.to_numeric(\n",
"    dfcompany1[\"company1_numABSA\"], errors=\"coerce\"\n",
")\n",
"dfcompany1 = dfcompany1.drop(\"company1_ABSA_v1\", axis=1)\n",
"dfcompany1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def handle_none(row):\n",
"    if row[\"company1_new\"] is None:\n",
"        row[\"company1_ABSA\"] = None\n",
"        row[\"company1_numABSA\"] = None\n",
"    return row\n",
"\n",
"\n",
"# Apply the custom function to each row\n",
"dfcompany1new = dfcompany1.apply(handle_none, axis=1)\n",
"dfcompany1new"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test with an example sentence, part 2\n",
"for aspect in [df3.loc[9, \"company2\"]]:\n",
"    if df3[\"text_company2_eng\"].loc[9] != \"None\":\n",
"        print(\n",
"            aspect,\n",
"            classifier(\n",
"                df3[\"text_company2_eng\"].loc[9],\n",
"                text_pair=aspect,\n",
"            ),\n",
"        )\n",
"    else:\n",
"        print(None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_list_company2 = []\n",
"\n",
"for i in range(len(df3)):\n",
"    aspect = df3[\"company2\"].loc[i]\n",
"    textfinal = aspect, classifier(df3[\"text_company2_eng\"].loc[i], text_pair=aspect)\n",
"    result_list_company2.append(textfinal)\n",
"\n",
"dfcompany2 = pd.DataFrame(result_list_company2)\n",
"dfcompany2.rename(columns={0: \"company2_new\"}, inplace=True)\n",
"dfcompany2.rename(columns={1: \"company2_ABSA_v1\"}, inplace=True)\n",
"\n",
"dfcompany2[\"company2_ABSA_v1\"] = dfcompany2[\"company2_ABSA_v1\"].astype(str)\n",
"dfcompany2[\"company2_ABSA\"] = dfcompany2[\"company2_ABSA_v1\"].str[12:19]\n",
"\n",
"# Define a regular expression pattern to find numeric values.\n",
"pattern = r\"(\\d+(\\.\\d+)?)\"\n",
"\n",
"# Extract the numeric values using the regular expression.\n",
"dfcompany2[\"company2_numABSA\"] = dfcompany2[\"company2_ABSA_v1\"].apply(\n",
"    lambda x: re.search(pattern, str(x)).group(1)\n",
"    if re.search(pattern, str(x))\n",
"    else None\n",
")\n",
"\n",
"# Convert the extracted values to floats where needed.\n",
"dfcompany2[\"company2_numABSA\"] = pd.to_numeric(\n",
"    dfcompany2[\"company2_numABSA\"], errors=\"coerce\"\n",
")\n",
"dfcompany2 = dfcompany2.drop(\"company2_ABSA_v1\", axis=1)\n",
"dfcompany2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def handle_none(row):\n",
"    if row[\"company2_new\"] is None:\n",
"        row[\"company2_ABSA\"] = None\n",
"        row[\"company2_numABSA\"] = None\n",
"    return row\n",
"\n",
"\n",
"# Apply the custom function to each row\n",
"dfcompany2new = dfcompany2.apply(handle_none, axis=1)\n",
"dfcompany2new"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_list_company3 = []\n",
"\n",
"for i in range(len(df3)):\n",
"    aspect = df3[\"company3\"].loc[i]\n",
"    textfinal = aspect, classifier(df3[\"text_company3_eng\"].loc[i], text_pair=aspect)\n",
"    result_list_company3.append(textfinal)\n",
"\n",
"dfcompany3 = pd.DataFrame(result_list_company3)\n",
"dfcompany3.rename(columns={0: \"company3_new\"}, inplace=True)\n",
"dfcompany3.rename(columns={1: \"company3_ABSA_v1\"}, inplace=True)\n",
"\n",
"dfcompany3[\"company3_ABSA_v1\"] = dfcompany3[\"company3_ABSA_v1\"].astype(str)\n",
"dfcompany3[\"company3_ABSA\"] = dfcompany3[\"company3_ABSA_v1\"].str[12:19]\n",
"\n",
"# Define a regular expression pattern to find numeric values.\n",
"pattern = r\"(\\d+(\\.\\d+)?)\"\n",
"\n",
"# Extract the numeric values using the regular expression.\n",
"dfcompany3[\"company3_numABSA\"] = dfcompany3[\"company3_ABSA_v1\"].apply(\n",
"    lambda x: re.search(pattern, str(x)).group(1)\n",
"    if re.search(pattern, str(x))\n",
"    else None\n",
")\n",
"\n",
"# Convert the extracted values to floats where needed.\n",
"dfcompany3[\"company3_numABSA\"] = pd.to_numeric(\n",
"    dfcompany3[\"company3_numABSA\"], errors=\"coerce\"\n",
")\n",
"dfcompany3 = dfcompany3.drop(\"company3_ABSA_v1\", axis=1)\n",
"dfcompany3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def handle_none(row):\n",
"    if row[\"company3_new\"] is None:\n",
"        row[\"company3_ABSA\"] = None\n",
"        row[\"company3_numABSA\"] = None\n",
"    return row\n",
"\n",
"\n",
"# Apply the custom function to each row\n",
"dfcompany3new = dfcompany3.apply(handle_none, axis=1)\n",
"dfcompany3new"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_list_company4 = []\n",
"\n",
"for i in range(len(df3)):\n",
"    aspect = df3[\"company4\"].loc[i]\n",
"    textfinal = aspect, classifier(df3[\"text_company4_eng\"].loc[i], text_pair=aspect)\n",
"    result_list_company4.append(textfinal)\n",
"\n",
"dfcompany4 = pd.DataFrame(result_list_company4)\n",
"dfcompany4.rename(columns={0: \"company4_new\"}, inplace=True)\n",
"dfcompany4.rename(columns={1: \"company4_ABSA_v1\"}, inplace=True)\n",
"\n",
"dfcompany4[\"company4_ABSA_v1\"] = dfcompany4[\"company4_ABSA_v1\"].astype(str)\n",
"dfcompany4[\"company4_ABSA\"] = dfcompany4[\"company4_ABSA_v1\"].str[12:19]\n",
"\n",
"# Define a regular expression pattern to find numeric values.\n",
"pattern = r\"(\\d+(\\.\\d+)?)\"\n",
"\n",
"# Extract the numeric values using the regular expression.\n",
"dfcompany4[\"company4_numABSA\"] = dfcompany4[\"company4_ABSA_v1\"].apply(\n",
"    lambda x: re.search(pattern, str(x)).group(1)\n",
"    if re.search(pattern, str(x))\n",
"    else None\n",
")\n",
"\n",
"# Convert the extracted values to floats where needed.\n",
"dfcompany4[\"company4_numABSA\"] = pd.to_numeric(\n",
"    dfcompany4[\"company4_numABSA\"], errors=\"coerce\"\n",
")\n",
"dfcompany4 = dfcompany4.drop(\"company4_ABSA_v1\", axis=1)\n",
"dfcompany4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def handle_none(row):\n",
"    if row[\"company4_new\"] is None:\n",
"        row[\"company4_ABSA\"] = None\n",
"        row[\"company4_numABSA\"] = None\n",
"    return row\n",
"\n",
"\n",
"# Apply the custom function to each row\n",
"dfcompany4new = dfcompany4.apply(handle_none, axis=1)\n",
"dfcompany4new"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_list_company5 = []\n",
"\n",
"for i in range(len(df3)):\n",
"    aspect = df3[\"company5\"].loc[i]\n",
"    textfinal = aspect, classifier(df3[\"text_company5_eng\"].loc[i], text_pair=aspect)\n",
"    result_list_company5.append(textfinal)\n",
"\n",
"dfcompany5 = pd.DataFrame(result_list_company5)\n",
"dfcompany5.rename(columns={0: \"company5_new\"}, inplace=True)\n",
"dfcompany5.rename(columns={1: \"company5_ABSA_v1\"}, inplace=True)\n",
"\n",
"dfcompany5[\"company5_ABSA_v1\"] = dfcompany5[\"company5_ABSA_v1\"].astype(str)\n",
"dfcompany5[\"company5_ABSA\"] = dfcompany5[\"company5_ABSA_v1\"].str[12:19]\n",
"\n",
"# Define a regular expression pattern to find numeric values.\n",
"pattern = r\"(\\d+(\\.\\d+)?)\"\n",
"\n",
"# Extract the numeric values using the regular expression.\n",
"dfcompany5[\"company5_numABSA\"] = dfcompany5[\"company5_ABSA_v1\"].apply(\n",
"    lambda x: re.search(pattern, str(x)).group(1)\n",
"    if re.search(pattern, str(x))\n",
"    else None\n",
")\n",
"\n",
"# Convert the extracted values to floats where needed.\n",
"dfcompany5[\"company5_numABSA\"] = pd.to_numeric(\n",
"    dfcompany5[\"company5_numABSA\"], errors=\"coerce\"\n",
")\n",
"dfcompany5 = dfcompany5.drop(\"company5_ABSA_v1\", axis=1)\n",
"dfcompany5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def handle_none(row):\n",
"    if row[\"company5_new\"] is None:\n",
"        row[\"company5_ABSA\"] = None\n",
"        row[\"company5_numABSA\"] = None\n",
"    return row\n",
"\n",
"\n",
"# Apply the custom function to each row\n",
"dfcompany5new = dfcompany5.apply(handle_none, axis=1)\n",
"dfcompany5new"
]
},
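{
"cell_type": "markdown",
"metadata": {},
"source": [
"The five ABSA blocks above repeat the same steps per company column. A minimal sketch (an optional refactor, assuming `df3` and `classifier` from above) of a single loop over the five columns that reads label and score directly from the pipeline output:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: run the ABSA scoring once per company column\n",
"def absa_for_column(n):\n",
"    rows = []\n",
"    for i in range(len(df3)):\n",
"        aspect = df3[f\"company{n}\"].loc[i]\n",
"        result = classifier(df3[f\"text_company{n}_eng\"].loc[i], text_pair=aspect)\n",
"        rows.append((aspect, result[0][\"label\"], result[0][\"score\"]))\n",
"    return pd.DataFrame(\n",
"        rows, columns=[f\"company{n}_new\", f\"company{n}_ABSA\", f\"company{n}_numABSA\"]\n",
"    )\n",
"\n",
"\n",
"# Usage sketch, equivalent to the five blocks above:\n",
"# frames = [absa_for_column(n) for n in range(1, 6)]"
]
},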
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dftotal1 = pd.concat(\n",
"    [df3, dfcompany1new, dfcompany2new, dfcompany3new, dfcompany4new, dfcompany5new],\n",
"    axis=1,\n",
"    join=\"outer\",\n",
")\n",
"columns_to_drop = [\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Custom function to combine the three company1 columns into a dict\n",
"def combine_to_list_company1(row):\n",
"    return {\n",
"        \"company\": row[\"company1_new\"],\n",
"        \"ABSA\": row[\"company1_ABSA\"],\n",
"        \"numABSA\": row[\"company1_numABSA\"],\n",
"    }\n",
"\n",
"\n",
"# Apply the custom function to each row and create a new column 'company1_Combined'\n",
"dftotal1[\"company1_Combined\"] = dftotal1.apply(combine_to_list_company1, axis=1)\n",
"columns_to_drop = [\"company1_new\", \"company1_ABSA\", \"company1_numABSA\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Custom function to combine the three company2 columns into a dict\n",
"def combine_to_list_company2(row):\n",
"    return {\n",
"        \"company\": row[\"company2_new\"],\n",
"        \"ABSA\": row[\"company2_ABSA\"],\n",
"        \"numABSA\": row[\"company2_numABSA\"],\n",
"    }\n",
"\n",
"\n",
"# Apply the custom function to each row and create a new column 'company2_Combined'\n",
"dftotal1[\"company2_Combined\"] = dftotal1.apply(combine_to_list_company2, axis=1)\n",
"columns_to_drop = [\"company2_new\", \"company2_ABSA\", \"company2_numABSA\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Custom function to combine the three company3 columns into a dict\n",
"def combine_to_list_company3(row):\n",
"    return {\n",
"        \"company\": row[\"company3_new\"],\n",
"        \"ABSA\": row[\"company3_ABSA\"],\n",
"        \"numABSA\": row[\"company3_numABSA\"],\n",
"    }\n",
"\n",
"\n",
"# Apply the custom function to each row and create a new column 'company3_Combined'\n",
"dftotal1[\"company3_Combined\"] = dftotal1.apply(combine_to_list_company3, axis=1)\n",
"columns_to_drop = [\"company3_new\", \"company3_ABSA\", \"company3_numABSA\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Custom function to combine the three company4 columns into a dict\n",
"def combine_to_list_company4(row):\n",
"    return {\n",
"        \"company\": row[\"company4_new\"],\n",
"        \"ABSA\": row[\"company4_ABSA\"],\n",
"        \"numABSA\": row[\"company4_numABSA\"],\n",
"    }\n",
"\n",
"\n",
"# Apply the custom function to each row and create a new column 'company4_Combined'\n",
"dftotal1[\"company4_Combined\"] = dftotal1.apply(combine_to_list_company4, axis=1)\n",
"columns_to_drop = [\"company4_new\", \"company4_ABSA\", \"company4_numABSA\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Custom function to combine the three company5 columns into a dict\n",
"def combine_to_list_company5(row):\n",
"    return {\n",
"        \"company\": row[\"company5_new\"],\n",
"        \"ABSA\": row[\"company5_ABSA\"],\n",
"        \"numABSA\": row[\"company5_numABSA\"],\n",
"    }\n",
"\n",
"\n",
"# Apply the custom function to each row and create a new column 'company5_Combined'\n",
"dftotal1[\"company5_Combined\"] = dftotal1.apply(combine_to_list_company5, axis=1)\n",
"columns_to_drop = [\"company5_new\", \"company5_ABSA\", \"company5_numABSA\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dftotal1.rename(\n",
"    columns={\n",
"        \"company1_Combined\": \"company1\",\n",
"        \"company2_Combined\": \"company2\",\n",
"        \"company3_Combined\": \"company3\",\n",
"        \"company4_Combined\": \"company4\",\n",
"        \"company5_Combined\": \"company5\",\n",
"    },\n",
"    inplace=True,\n",
")\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Custom function to combine the five company columns into a dictionary\n",
"def combine_to_dict(row):\n",
"    return {\n",
"        \"company1\": row[\"company1\"],\n",
"        \"company2\": row[\"company2\"],\n",
"        \"company3\": row[\"company3\"],\n",
"        \"company4\": row[\"company4\"],\n",
"        \"company5\": row[\"company5\"],\n",
"    }\n",
"\n",
"\n",
"# Apply the custom function to each row and create a new column 'Combined_ABSA'\n",
"dftotal1[\"Combined_ABSA\"] = dftotal1.apply(combine_to_dict, axis=1)\n",
"columns_to_drop = [\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# dftotal1.to_csv('dftotal1_20231214_v4.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Put this at the very end\n",
"# Create an instance of the news service on the current collection\n",
"newsObj = news.MongoNewsService(connector)\n",
"\n",
"if len(dftotal1) > 0:\n",
"    for i in range(len(dftotal1)):\n",
"        # ents=NERService.NERCompanyList(company_list,document)\n",
"        # add the new attribute 'Combined_ABSA' to the document\n",
"        newsObj.collection.update_one(\n",
"            {\"_id\": dftotal1[\"_id\"].iloc[i]},  # Filter for the matching document\n",
"            {\n",
"                \"$set\": {\"Combined_ABSA\": dftotal1[\"Combined_ABSA\"].iloc[i]}\n",
"            },  # Add the new attribute\n",
"        )\n",
"\n",
"else:\n",
"    print(\"No documents found.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aki-prj23-transparenzregister-z8SxnVl_-py3.11",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}