{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import pandas as pd\n", "import aki_prj23_transparenzregister.utils.mongo.connector as conn\n", "from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider\n", "import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news\n", "import aki_prj23_transparenzregister.utils.mongo.company_mongo_service as comps" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class ABSA:\n", " def __init__(self):\n", " self.config_provider = JsonFileConfigProvider(\"./secrets.json\")\n", " self.connect_string = self.config_provider.get_mongo_connection_string()\n", " self.connect_string.database = \"transparenzregister_ner\"\n", " self.connector = conn.MongoConnector(self.connect_string)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Mongo Connect: create connection string and connect\n", "config_provider = JsonFileConfigProvider(\"../../secrets.json\")\n", "engine = config_provider.get_mongo_connection_string()\n", "engine.database = \"transparenzregister_ner\"\n", "connector = conn.MongoConnector(engine)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Process all documents and check if attribute 'name' is existing\n", "# Read data from database\n", "CompsObj = comps.CompanyMongoService(connector)\n", "allComps = CompsObj.get_all()\n", "\n", "# Create a cursor which has all unprogressed articles; articles without the attribute 'companies'\n", "CursorCompNames = CompsObj.collection.find({\"name\": {\"$exists\": True}})\n", "documents = list(CursorCompNames)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# create a list with all company names\n", "compList = []\n", "\n", "if len(documents) > 0:\n", " for document in documents:\n", " # ents=NERService.NERCompanyList(company_list,document)\n", " compList.append(document[\"name\"])\n", " # add a new attribute 'companies' to document\n", " # newsObj.collection.update_one(\n", " # {\"_id\": document[\"_id\"]}, # Filter für das entsprechende Dokument\n", " # {\"$set\": {\"companies\": ents}} # Neues Attribut hinzufügen, initialisiert mit einer leeren Liste\n", " # )\n", "\n", "else:\n", " print(\"No documents found.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Process all documents in news collection and check if attribute 'companies' is existing\n", "\n", "# Read data from database\n", "NERObj = news.MongoNewsService(connector)\n", "allNER = NERObj.get_all()\n", "\n", "# Create a cursor which has all unprogressed articles; articles without the attribute 'companies'\n", "CursorNERNames = NERObj.collection.find({\"companies\": {\"$exists\": True}})\n", "documentsNER = list(CursorNERNames)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# install and import rapidfuzz\n", "# pip install rapidfuzz\n", "from rapidfuzz import process" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Process all documents in news collection and check if attribute 'companies' is existing\n", "\n", "# Read data from database\n", "NERObj = news.MongoNewsService(connector)\n", "allNER = NERObj.get_all()\n", "\n", "# Create a cursor which has all unprogressed articles; articles without the attribute 'companies'\n", "CursorNERNames = NERObj.collection.find({\"companies\": {\"$exists\": True}})\n", "documentsNER = list(CursorNERNames)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if len(documentsNER) > 0:\n", " for document in documentsNER:\n", " resList = [] # result list with matched names\n", " for entity_name, frequency in document[\"companies\"].items():\n", " if len(entity_name) > 2:\n", " result = process.extractOne(entity_name, compList)\n", " if result is not None:\n", " # Wenn ein ähnlicher Name gefunden wurde\n", " if result[1] >= 95:\n", " # Passen Sie die Ähnlichkeitsbewertungsschwelle nach Bedarf an\n", " similar_name = result[0]\n", " # print(f\"Ähnlicher Name gefunden: {entity_name} (Ähnlichkeit: {result[1]})\")\n", " # print(f\"Ähnlichkeit mit: {similar_name}\")\n", " # print(f\"Häufigkeit: {frequency}\")\n", " print(\n", " f\"NER Entität: {entity_name} passt zu:{similar_name} zu: {result[1]}% und {frequency} Matches \"\n", " )\n", "\n", " # ents=NERService.NERCompanyList(company_list,document)\n", " # compList.append(document['name'])\n", " # add a new attribute 'companies' to document\n", " # newsObj.collection.update_one(\n", " # {\"_id\": document[\"_id\"]}, # Filter für das entsprechende Dokument\n", " # {\"$set\": {\"companies\": ents}} # Neues Attribut hinzufügen, initialisiert mit einer leeren Liste\n", " # )\n", "\n", "else:\n", " print(\"No documents found.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "documentsNER[1][\"companies\"].items()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "compList" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "\n", "# Funktion zum Entfernen von Unternehmenstypen und rechtlichen Zusätzen\n", "def remove_legal_additions(name):\n", " # Verwenden Sie einen regulären Ausdruck, um gängige Zusätze wie \"GmbH\" und \"AG\" zu entfernen\n", " cleaned_name = re.sub(\n", " r\"\\b(GmbH|AG|KG|SE|& Co\\. KGaA|& Co\\.|e\\.K\\.|mbH|mbH & Co\\. KG)\\b\", \"\", name\n", " )\n", " # Entfernen Sie führende und nachfolgende Leerzeichen\n", " cleaned_name = cleaned_name.strip()\n", " return cleaned_name\n", "\n", "\n", "# Bereinigen Sie die Liste von Unternehmensnamen\n", "complist2 = [remove_legal_additions(name) for name in compList]\n", "complist2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "complist3 = [word.split()[0] for word in complist2 if len(word.split()[0]) >= 3]\n", "complist4 = list(set(complist3))\n", "complist4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Bestimmte Begriffe, die keine Firmennamen sind, müssen aus complist4 entfernt werden\n", "if (\"Deutsche\") in complist4:\n", " complist4.remove(\"Deutsche\")\n", "else:\n", " pass\n", "\n", "if (\"Hamburg\") in complist4:\n", " complist4.remove(\"Hamburg\")\n", "else:\n", " pass\n", "\n", "if (\"Hamburger\") in complist4:\n", " complist4.remove(\"Hamburger\")\n", "else:\n", " pass\n", "\n", "if (\"Union\") in complist4:\n", " complist4.remove(\"Union\")\n", "else:\n", " pass\n", "\n", "if (\"Energy\") in complist4:\n", " complist4.remove(\"Energy\")\n", "else:\n", " pass\n", "\n", "if (\"Hugo\") in complist4:\n", " complist4.remove(\"Hugo\")\n", "else:\n", " pass\n", "\n", "if (\"Pro\") in complist4:\n", " complist4.remove(\"Pro\")\n", "else:\n", " pass\n", "\n", "if (\"OTC\") in complist4:\n", " complist4.remove(\"OTC\")\n", "else:\n", " pass\n", "\n", "if (\"web\") in complist4:\n", " complist4.remove(\"web\")\n", "else:\n", " pass\n", "\n", "if (\"Kabel\") in complist4:\n", " complist4.remove(\"Kabel\")\n", "else:\n", " pass\n", "\n", "if (\"Club\") in complist4:\n", " complist4.remove(\"Club\")\n", "else:\n", " pass\n", "\n", "if (\"The\") in complist4:\n", " complist4.remove(\"The\")\n", "else:\n", " pass\n", "\n", "if (\"United\") in complist4:\n", " complist4.remove(\"United\")\n", "else:\n", " pass\n", "\n", "if (\"Frankfurter\") in complist4:\n", " complist4.remove(\"Frankfurter\")\n", "else:\n", " pass\n", "\n", "if (\"CMC\") in complist4:\n", " complist4.remove(\"CMC\")\n", "else:\n", " pass\n", "\n", "if (\"Bayern\") in complist4:\n", " complist4.remove(\"Bayern\")\n", "else:\n", " pass\n", "\n", "if (\"Haus\") in complist4:\n", " complist4.remove(\"Haus\")\n", "else:\n", " pass\n", "\n", "if (\"Gesellschaft\") in complist4:\n", " complist4.remove(\"Gesellschaft\")\n", "else:\n", " pass\n", "\n", "if (\"Delivery\") in complist4:\n", " complist4.remove(\"Delivery\")\n", "else:\n", " pass\n", "\n", "if (\"Aachener\") in complist4:\n", " complist4.remove(\"Aachener\")\n", "else:\n", " pass\n", "\n", "if (\"Group\") in complist4:\n", " complist4.remove(\"Group\")\n", "else:\n", " pass\n", "\n", "if (\"Retail\") in complist4:\n", " complist4.remove(\"Retail\")\n", "else:\n", " pass\n", "\n", "if (\"Media\") in complist4:\n", " complist4.remove(\"Media\")\n", "else:\n", " pass\n", "\n", "if (\"European\") in complist4:\n", " complist4.remove(\"European\")\n", "else:\n", " pass\n", "\n", "if (\"Fuels\") in complist4:\n", " complist4.remove(\"Fuels\")\n", "else:\n", " pass" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Zusammenführung der beiden Listen complist2 und complist4\n", "complist5 = complist2 + complist4\n", "complist5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df1 = pd.DataFrame(documentsNER)\n", "df1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Function to extract company names\n", "def extract_company_names(company_dict):\n", " return list(company_dict.keys())\n", "\n", "\n", "# Apply the function to the 'companies' column\n", "df1[\"companies\"] = df1[\"companies\"].apply(lambda x: extract_company_names(x))\n", "df1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df1[\"companies_filtered\"] = df1[\"companies\"].apply(\n", " lambda x: [company for company in x if company in complist5]\n", ")\n", "df1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Nur Auswahl der ersten fünf Spalten von \"companies_filtered\", um später Rechenressourcen zu ersparen.\n", "def split_list1(row):\n", " return pd.Series(row[\"companies_filtered\"][:5])\n", "\n", "\n", "# Apply the function and concatenate the result with the original DataFrame\n", "df2 = df1.apply(split_list1, axis=1).merge(df1, left_index=True, right_index=True)\n", "\n", "# Drop the original column with lists\n", "df2 = df2.drop(\"companies\", axis=1)\n", "df2 = df2.drop(\"companies_filtered\", axis=1)\n", "df2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df2.rename(columns={0: \"company1\"}, inplace=True)\n", "df2.rename(columns={1: \"company2\"}, inplace=True)\n", "df2.rename(columns={2: \"company3\"}, inplace=True)\n", "df2.rename(columns={3: \"company4\"}, inplace=True)\n", "df2.rename(columns={4: \"company5\"}, inplace=True)\n", "df2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# cell10 = df2.loc[3, 'company1']\n", "# print(cell10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df2.dropna(\n", " subset=[\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"],\n", " how=\"all\",\n", " inplace=True,\n", ")\n", "df2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Die Indizes müssen resetted werden, da es ansonsten bei den nachfolgenden Schritte Fehlermeldungen gibt (Iterieren über i würde ansonsten nicht funktionieren).\n", "df2 = df2.reset_index(drop=True)\n", "df2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def filter_sentences_company1(row):\n", " target_word = row[\"company1\"]\n", "\n", " # Check if target_word is NaN\n", " if pd.isna(target_word):\n", " return None\n", "\n", " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", "\n", " # Extract sentences containing target word\n", " filtered_sentences = [\n", " sentence.strip() for sentence in sentences if target_word in sentence\n", " ]\n", "\n", " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", " concatenated_sentences = \". \".join(filtered_sentences)[:200]\n", "\n", " # Return None if no sentences contain the target word\n", " return concatenated_sentences if concatenated_sentences else None\n", "\n", "\n", "df2[\"text_company1\"] = df2.apply(filter_sentences_company1, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def filter_sentences_company2(row):\n", " target_word = row[\"company2\"]\n", "\n", " # Check if target_word is NaN\n", " if pd.isna(target_word):\n", " return None\n", "\n", " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", "\n", " # Extract sentences containing target word\n", " filtered_sentences = [\n", " sentence.strip() for sentence in sentences if target_word in sentence\n", " ]\n", "\n", " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", " concatenated_sentences = \". \".join(filtered_sentences)[:200]\n", "\n", " # Return None if no sentences contain the target word\n", " return concatenated_sentences if concatenated_sentences else None\n", "\n", "\n", "df2[\"text_company2\"] = df2.apply(filter_sentences_company2, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def filter_sentences_company3(row):\n", " target_word = row[\"company3\"]\n", "\n", " # Check if target_word is NaN\n", " if pd.isna(target_word):\n", " return None\n", "\n", " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", "\n", " # Extract sentences containing target word\n", " filtered_sentences = [\n", " sentence.strip() for sentence in sentences if target_word in sentence\n", " ]\n", "\n", " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", " concatenated_sentences = \". \".join(filtered_sentences)[:200]\n", "\n", " # Return None if no sentences contain the target word\n", " return concatenated_sentences if concatenated_sentences else None\n", "\n", "\n", "df2[\"text_company3\"] = df2.apply(filter_sentences_company3, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def filter_sentences_company4(row):\n", " target_word = row[\"company4\"]\n", "\n", " # Check if target_word is NaN\n", " if pd.isna(target_word):\n", " return None\n", "\n", " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", "\n", " # Extract sentences containing target word\n", " filtered_sentences = [\n", " sentence.strip() for sentence in sentences if target_word in sentence\n", " ]\n", "\n", " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", " concatenated_sentences = \". \".join(filtered_sentences)[:200]\n", "\n", " # Return None if no sentences contain the target word\n", " return concatenated_sentences if concatenated_sentences else None\n", "\n", "\n", "df2[\"text_company4\"] = df2.apply(filter_sentences_company4, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def filter_sentences_company5(row):\n", " target_word = row[\"company5\"]\n", "\n", " # Check if target_word is NaN\n", " if pd.isna(target_word):\n", " return None\n", "\n", " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", "\n", " # Extract sentences containing target word\n", " filtered_sentences = [\n", " sentence.strip() for sentence in sentences if target_word in sentence\n", " ]\n", "\n", " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", " concatenated_sentences = \". \".join(filtered_sentences)[:200]\n", "\n", " # Return None if no sentences contain the target word\n", " return concatenated_sentences if concatenated_sentences else None\n", "\n", "\n", "df2[\"text_company5\"] = df2.apply(filter_sentences_company5, axis=1)\n", "df2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel\n", "\n", "# Hinweis: Durch den zusätzlichen Code \"truncation=True\" kann die Beschränkung auf 512 Zeichen bei der maschinen Übersetzung ausgeschaltet werden.\n", "translation_tokenizer = AutoTokenizer.from_pretrained(\n", " \"Helsinki-NLP/opus-mt-de-en\", truncation=True\n", ")\n", "\n", "translation_model = AutoModelForSeq2SeqLM.from_pretrained(\"Helsinki-NLP/opus-mt-de-en\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Testen, ob die maschinelle Übersetzung bei einem Beispielssatz funkioniert.\n", "def translate_sentiment(text: str) -> str:\n", " input_tokens = translation_tokenizer([text], return_tensors=\"pt\")\n", " generated_ids = translation_model.generate(**input_tokens)\n", " return translation_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[\n", " 0\n", " ]\n", "\n", "\n", "headline = \"Pelabresib: Biotech-Unternehmen Morphosys hofft mit neuem Krebs-Medikament endlich auf den Durchbruch. \"\n", "tf = translate_sentiment(headline)\n", "tf" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "translate_list_company1 = []\n", "\n", "for i in range(len(df2)):\n", " text = str(\n", " df2[\"text_company1\"].loc[i]\n", " ) # Convert to string (this is very important)\n", " texttrans = translate_sentiment(text)\n", " translate_list_company1.append(texttrans)\n", "\n", "dftrans_company1 = pd.DataFrame(translate_list_company1)\n", "dftrans_company1.rename(columns={0: \"text_company1_eng\"}, inplace=True)\n", "dftrans_company1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "translate_list_company2 = []\n", "\n", "for i in range(len(df2)):\n", " text = str(\n", " df2[\"text_company2\"].loc[i]\n", " ) # Convert to string (this is very important)\n", " texttrans = translate_sentiment(text)\n", " translate_list_company2.append(texttrans)\n", "\n", "dftrans_company2 = pd.DataFrame(translate_list_company2)\n", "dftrans_company2.rename(columns={0: \"text_company2_eng\"}, inplace=True)\n", "dftrans_company2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "translate_list_company3 = []\n", "\n", "for i in range(len(df2)):\n", " text = str(\n", " df2[\"text_company3\"].loc[i]\n", " ) # Convert to string (this is very important)\n", " texttrans = translate_sentiment(text)\n", " translate_list_company3.append(texttrans)\n", "\n", "dftrans_company3 = pd.DataFrame(translate_list_company3)\n", "dftrans_company3.rename(columns={0: \"text_company3_eng\"}, inplace=True)\n", "dftrans_company3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "translate_list_company4 = []\n", "\n", "for i in range(len(df2)):\n", " text = str(\n", " df2[\"text_company4\"].loc[i]\n", " ) # Convert to string (this is very important)\n", " texttrans = translate_sentiment(text)\n", " translate_list_company4.append(texttrans)\n", "\n", "dftrans_company4 = pd.DataFrame(translate_list_company4)\n", "dftrans_company4.rename(columns={0: \"text_company4_eng\"}, inplace=True)\n", "dftrans_company4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "translate_list_company5 = []\n", "\n", "for i in range(len(df2)):\n", " text = str(\n", " df2[\"text_company5\"].loc[i]\n", " ) # Convert to string (this is very important)\n", " texttrans = translate_sentiment(text)\n", " translate_list_company5.append(texttrans)\n", "\n", "dftrans_company5 = pd.DataFrame(translate_list_company5)\n", "dftrans_company5.rename(columns={0: \"text_company5_eng\"}, inplace=True)\n", "dftrans_company5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df3 = df2[\n", " [\n", " \"_id\",\n", " \"title\",\n", " \"text\",\n", " \"company1\",\n", " \"text_company1\",\n", " \"company2\",\n", " \"text_company2\",\n", " \"company3\",\n", " \"text_company3\",\n", " \"company4\",\n", " \"text_company4\",\n", " \"company5\",\n", " \"text_company5\",\n", " ]\n", "]\n", "df3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df3.insert(4, \"text_company1_eng\", dftrans_company1.iloc[:, 0])\n", "df3.insert(7, \"text_company2_eng\", dftrans_company2.iloc[:, 0])\n", "df3.insert(10, \"text_company3_eng\", dftrans_company3.iloc[:, 0])\n", "df3.insert(13, \"text_company4_eng\", dftrans_company4.iloc[:, 0])\n", "df3.insert(16, \"text_company5_eng\", dftrans_company5.iloc[:, 0])\n", "df3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# df3.to_csv('df3_20231213.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df3 = df3.map(lambda x: None if pd.isna(x) else x)\n", "df3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# test ABSA\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline\n", "\n", "# Das ABSA-Modell und der Tokenizer werden geladen bzw. definiert\n", "model_name = \"yangheng/deberta-v3-base-absa-v1.1\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n", "\n", "classifier = pipeline(\"text-classification\", model=model, tokenizer=tokenizer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Test anhand eines Beispielsatzes\n", "for aspect in [\"Siemens\"]:\n", " print(\n", " aspect,\n", " classifier(\n", " \"Siemens is doing great\",\n", " text_pair=aspect,\n", " ),\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Test anhand eines Beispielsatzes Teil 2\n", "for aspect in [df2.loc[1, \"company1\"]]:\n", " print(\n", " aspect,\n", " classifier(\n", " \"Siemens is doing great\",\n", " text_pair=aspect,\n", " ),\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Muster für ABSA CompanyX\n", "\n", "result_list_company1 = []\n", "\n", "for i in range(len(df3)):\n", " aspect = df3[\"company1\"].loc[i]\n", " textfinal = aspect, classifier(df3[\"text_company1_eng\"].loc[i], text_pair=aspect)\n", " result_list_company1.append(textfinal)\n", "\n", "dfcompany1 = pd.DataFrame(result_list_company1)\n", "dfcompany1.rename(columns={0: \"company1_new\"}, inplace=True)\n", "dfcompany1.rename(columns={1: \"company1_ABSA_v1\"}, inplace=True)\n", "\n", "dfcompany1[\"company1_ABSA_v1\"] = dfcompany1[\"company1_ABSA_v1\"].astype(str)\n", "dfcompany1[\"company1_ABSA\"] = dfcompany1[\"company1_ABSA_v1\"].str[12:19]\n", "\n", "import re\n", "\n", "# Es wird ein Schema für einen \"Regulären Ausdruck\" definiert, um numerische Werte zu finden.\n", "pattern = r\"(\\d+(\\.\\d+)?)\"\n", "\n", "# Die numerischen Werte werden mittels des \"Regulären Ausdrucks\" extrahiert.\n", "dfcompany1[\"company1_numABSA\"] = dfcompany1[\"company1_ABSA_v1\"].apply(\n", " lambda x: re.search(pattern, str(x)).group(1)\n", " if re.search(pattern, str(x))\n", " else None\n", ")\n", "\n", "# Die extrahierten Werte werden bei Bedarf in einen Float-Wert konvertiert.\n", "dfcompany1[\"company1_numABSA\"] = pd.to_numeric(\n", " dfcompany1[\"company1_numABSA\"], errors=\"coerce\"\n", ")\n", "dfcompany1 = dfcompany1.drop(\"company1_ABSA_v1\", axis=1)\n", "dfcompany1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def handle_none(row):\n", " if row[\"company1_new\"] is None:\n", " row[\"company1_ABSA\"] = None\n", " row[\"company1_numABSA\"] = None\n", " return row\n", "\n", "\n", "# Apply the custom function to each row\n", "dfcompany1new = dfcompany1.apply(handle_none, axis=1)\n", "dfcompany1new" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Test anhand eines Beispielsatzes Teil 2\n", "for aspect in [df3.loc[9, \"company2\"]]:\n", " if df3[\"text_company2_eng\"].loc[9] != \"None\":\n", " print(\n", " aspect,\n", " classifier(\n", " df3[\"text_company2_eng\"].loc[9],\n", " text_pair=aspect,\n", " ),\n", " )\n", " else:\n", " print(None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "result_list_company2 = []\n", "\n", "for i in range(len(df3)):\n", " aspect = df3[\"company2\"].loc[i]\n", " textfinal = aspect, classifier(df3[\"text_company2_eng\"].loc[i], text_pair=aspect)\n", " result_list_company2.append(textfinal)\n", "\n", "dfcompany2 = pd.DataFrame(result_list_company2)\n", "dfcompany2.rename(columns={0: \"company2_new\"}, inplace=True)\n", "dfcompany2.rename(columns={1: \"company2_ABSA_v1\"}, inplace=True)\n", "\n", "dfcompany2[\"company2_ABSA_v1\"] = dfcompany2[\"company2_ABSA_v1\"].astype(str)\n", "dfcompany2[\"company2_ABSA\"] = dfcompany2[\"company2_ABSA_v1\"].str[12:19]\n", "\n", "import re\n", "\n", "# Es wird ein Schema für einen \"Regulären Ausdruck\" definiert, um numerische Werte zu finden.\n", "pattern = r\"(\\d+(\\.\\d+)?)\"\n", "\n", "# Die numerischen Werte werden mittels des \"Regulären Ausdrucks\" extrahiert.\n", "dfcompany2[\"company2_numABSA\"] = dfcompany2[\"company2_ABSA_v1\"].apply(\n", " lambda x: re.search(pattern, str(x)).group(1)\n", " if re.search(pattern, str(x))\n", " else None\n", ")\n", "\n", "# Die extrahierten Werte werden bei Bedarf in einen Float-Wert konvertiert.\n", "dfcompany2[\"company2_numABSA\"] = pd.to_numeric(\n", " dfcompany2[\"company2_numABSA\"], errors=\"coerce\"\n", ")\n", "dfcompany2 = dfcompany2.drop(\"company2_ABSA_v1\", axis=1)\n", "dfcompany2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def handle_none(row):\n", " if row[\"company2_new\"] is None:\n", " row[\"company2_ABSA\"] = None\n", " row[\"company2_numABSA\"] = None\n", " return row\n", "\n", "\n", "# Apply the custom function to each row\n", "dfcompany2new = dfcompany2.apply(handle_none, axis=1)\n", "dfcompany2new" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "result_list_company3 = []\n", "\n", "for i in range(len(df3)):\n", " aspect = df3[\"company3\"].loc[i]\n", " textfinal = aspect, classifier(df3[\"text_company3_eng\"].loc[i], text_pair=aspect)\n", " result_list_company3.append(textfinal)\n", "\n", "dfcompany3 = pd.DataFrame(result_list_company3)\n", "dfcompany3.rename(columns={0: \"company3_new\"}, inplace=True)\n", "dfcompany3.rename(columns={1: \"company3_ABSA_v1\"}, inplace=True)\n", "\n", "dfcompany3[\"company3_ABSA_v1\"] = dfcompany3[\"company3_ABSA_v1\"].astype(str)\n", "dfcompany3[\"company3_ABSA\"] = dfcompany3[\"company3_ABSA_v1\"].str[12:19]\n", "\n", "import re\n", "\n", "# Es wird ein Schema für einen \"Regulären Ausdruck\" definiert, um numerische Werte zu finden.\n", "pattern = r\"(\\d+(\\.\\d+)?)\"\n", "\n", "# Die numerischen Werte werden mittels des \"Regulären Ausdrucks\" extrahiert.\n", "dfcompany3[\"company3_numABSA\"] = dfcompany3[\"company3_ABSA_v1\"].apply(\n", " lambda x: re.search(pattern, str(x)).group(1)\n", " if re.search(pattern, str(x))\n", " else None\n", ")\n", "\n", "# Die extrahierten Werte werden bei Bedarf in einen Float-Wert konvertiert.\n", "dfcompany3[\"company3_numABSA\"] = pd.to_numeric(\n", " dfcompany3[\"company3_numABSA\"], errors=\"coerce\"\n", ")\n", "dfcompany3 = dfcompany3.drop(\"company3_ABSA_v1\", axis=1)\n", "dfcompany3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def handle_none(row):\n", " if row[\"company3_new\"] is None:\n", " row[\"company3_ABSA\"] = None\n", " row[\"company3_numABSA\"] = None\n", " return row\n", "\n", "\n", "# Apply the custom function to each row\n", "dfcompany3new = dfcompany3.apply(handle_none, axis=1)\n", "dfcompany3new" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "result_list_company4 = []\n", "\n", "for i in range(len(df3)):\n", " aspect = df3[\"company4\"].loc[i]\n", " textfinal = aspect, classifier(df3[\"text_company4_eng\"].loc[i], text_pair=aspect)\n", " result_list_company4.append(textfinal)\n", "\n", "dfcompany4 = pd.DataFrame(result_list_company4)\n", "dfcompany4.rename(columns={0: \"company4_new\"}, inplace=True)\n", "dfcompany4.rename(columns={1: \"company4_ABSA_v1\"}, inplace=True)\n", "\n", "dfcompany4[\"company4_ABSA_v1\"] = dfcompany4[\"company4_ABSA_v1\"].astype(str)\n", "dfcompany4[\"company4_ABSA\"] = dfcompany4[\"company4_ABSA_v1\"].str[12:19]\n", "\n", "import re\n", "\n", "# Es wird ein Schema für einen \"Regulären Ausdruck\" definiert, um numerische Werte zu finden.\n", "pattern = r\"(\\d+(\\.\\d+)?)\"\n", "\n", "# Die numerischen Werte werden mittels des \"Regulären Ausdrucks\" extrahiert.\n", "dfcompany4[\"company4_numABSA\"] = dfcompany4[\"company4_ABSA_v1\"].apply(\n", " lambda x: re.search(pattern, str(x)).group(1)\n", " if re.search(pattern, str(x))\n", " else None\n", ")\n", "\n", "# Die extrahierten Werte werden bei Bedarf in einen Float-Wert konvertiert.\n", "dfcompany4[\"company4_numABSA\"] = pd.to_numeric(\n", " dfcompany4[\"company4_numABSA\"], errors=\"coerce\"\n", ")\n", "dfcompany4 = dfcompany4.drop(\"company4_ABSA_v1\", axis=1)\n", "dfcompany4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def handle_none(row):\n", " if row[\"company4_new\"] is None:\n", " row[\"company4_ABSA\"] = None\n", " row[\"company4_numABSA\"] = None\n", " return row\n", "\n", "\n", "# Apply the custom function to each row\n", "dfcompany4new = dfcompany4.apply(handle_none, axis=1)\n", "dfcompany4new" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "result_list_company5 = []\n", "\n", "for i in range(len(df3)):\n", " aspect = df3[\"company5\"].loc[i]\n", " textfinal = aspect, classifier(df3[\"text_company5_eng\"].loc[i], text_pair=aspect)\n", " result_list_company5.append(textfinal)\n", "\n", "dfcompany5 = pd.DataFrame(result_list_company5)\n", "dfcompany5.rename(columns={0: \"company5_new\"}, inplace=True)\n", "dfcompany5.rename(columns={1: \"company5_ABSA_v1\"}, inplace=True)\n", "\n", "dfcompany5[\"company5_ABSA_v1\"] = dfcompany5[\"company5_ABSA_v1\"].astype(str)\n", "dfcompany5[\"company5_ABSA\"] = dfcompany5[\"company5_ABSA_v1\"].str[12:19]\n", "\n", "import re\n", "\n", "# Es wird ein Schema für einen \"Regulären Ausdruck\" definiert, um numerische Werte zu finden.\n", "pattern = r\"(\\d+(\\.\\d+)?)\"\n", "\n", "# Die numerischen Werte werden mittels des \"Regulären Ausdrucks\" extrahiert.\n", "dfcompany5[\"company5_numABSA\"] = dfcompany5[\"company5_ABSA_v1\"].apply(\n", " lambda x: re.search(pattern, str(x)).group(1)\n", " if re.search(pattern, str(x))\n", " else None\n", ")\n", "\n", "# Die extrahierten Werte werden bei Bedarf in einen Float-Wert konvertiert.\n", "dfcompany5[\"company5_numABSA\"] = pd.to_numeric(\n", " dfcompany5[\"company5_numABSA\"], errors=\"coerce\"\n", ")\n", "dfcompany5 = dfcompany5.drop(\"company5_ABSA_v1\", axis=1)\n", "dfcompany5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def handle_none(row):\n", " if row[\"company5_new\"] is None:\n", " row[\"company5_ABSA\"] = None\n", " row[\"company5_numABSA\"] = None\n", " return row\n", "\n", "\n", "# Apply the custom function to each row\n", "dfcompany5new = dfcompany5.apply(handle_none, axis=1)\n", "dfcompany5new" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dftotal1 = pd.concat(\n", " [df3, dfcompany1new, dfcompany2new, dfcompany3new, dfcompany4new, dfcompany5new],\n", " axis=1,\n", " join=\"outer\",\n", ")\n", "columns_to_drop = [\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"]\n", "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", "dftotal1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Custom function to combine two columns into a list\n", "def combine_to_list_company1(row):\n", " return {\n", " \"company\": row[\"company1_new\"],\n", " \"ABSA\": row[\"company1_ABSA\"],\n", " \"numABSA\": row[\"company1_numABSA\"],\n", " }\n", "\n", "\n", "# Apply the custom function to each row and create a new column 'Combined'\n", "dftotal1[\"company1_Combined\"] = dftotal1.apply(combine_to_list_company1, axis=1)\n", "columns_to_drop = [\"company1_new\", \"company1_ABSA\", \"company1_numABSA\"]\n", "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", "dftotal1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Custom function to combine two columns into a list\n", "def combine_to_list_company2(row):\n", " return {\n", " \"company\": row[\"company2_new\"],\n", " \"ABSA\": row[\"company2_ABSA\"],\n", " \"numABSA\": row[\"company2_numABSA\"],\n", " }\n", "\n", "\n", "# Apply the custom function to each row and create a new column 'Combined'\n", "dftotal1[\"company2_Combined\"] = dftotal1.apply(combine_to_list_company2, axis=1)\n", "columns_to_drop = [\"company2_new\", \"company2_ABSA\", \"company2_numABSA\"]\n", "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", "dftotal1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Custom function to combine two columns into a list\n", "def combine_to_list_company3(row):\n", " return {\n", " \"company\": row[\"company3_new\"],\n", " \"ABSA\": row[\"company3_ABSA\"],\n", " \"numABSA\": row[\"company3_numABSA\"],\n", " }\n", "\n", "\n", "# Apply the custom function to each row and create a new column 'Combined'\n", "dftotal1[\"company3_Combined\"] = dftotal1.apply(combine_to_list_company3, axis=1)\n", "columns_to_drop = [\"company3_new\", \"company3_ABSA\", \"company3_numABSA\"]\n", "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", "dftotal1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Custom function to combine two columns into a list\n", "def combine_to_list_company4(row):\n", " return {\n", " \"company\": row[\"company4_new\"],\n", " \"ABSA\": row[\"company4_ABSA\"],\n", " \"numABSA\": row[\"company4_numABSA\"],\n", " }\n", "\n", "\n", "# Apply the custom function to each row and create a new column 'Combined'\n", "dftotal1[\"company4_Combined\"] = dftotal1.apply(combine_to_list_company4, axis=1)\n", "columns_to_drop = [\"company4_new\", \"company4_ABSA\", \"company4_numABSA\"]\n", "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", "dftotal1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Custom function to combine two columns into a list\n", "def combine_to_list_company5(row):\n", " return {\n", " \"company\": row[\"company5_new\"],\n", " \"ABSA\": row[\"company5_ABSA\"],\n", " \"numABSA\": row[\"company5_numABSA\"],\n", " }\n", "\n", "\n", "# Apply the custom function to each row and create a new column 'Combined'\n", "dftotal1[\"company5_Combined\"] = dftotal1.apply(combine_to_list_company5, axis=1)\n", "columns_to_drop = [\"company5_new\", \"company5_ABSA\", \"company5_numABSA\"]\n", "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", "dftotal1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dftotal1.rename(columns={\"company1_Combined\": \"company1\"}, inplace=True)\n", "dftotal1.rename(columns={\"company2_Combined\": \"company2\"}, inplace=True)\n", "dftotal1.rename(columns={\"company3_Combined\": \"company3\"}, inplace=True)\n", "dftotal1.rename(columns={\"company4_Combined\": \"company4\"}, inplace=True)\n", "dftotal1.rename(columns={\"company5_Combined\": \"company5\"}, inplace=True)\n", "dftotal1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Custom function to combine two columns into a dictionary\n", "def combine_to_dict(row):\n", " return {\n", " \"company1\": row[\"company1\"],\n", " \"company2\": row[\"company2\"],\n", " \"company3\": row[\"company3\"],\n", " \"company4\": row[\"company4\"],\n", " \"company5\": row[\"company5\"],\n", " }\n", "\n", "\n", "# Apply the custom function to each row and create a new column 'Combined'\n", "dftotal1[\"Combined_ABSA\"] = dftotal1.apply(combine_to_dict, axis=1)\n", "columns_to_drop = [\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"]\n", "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", "dftotal1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# dftotal1.to_csv('dftotal1_20231214_v4.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Ganz am Ende packen\n", "# Create an instance of NewsObject (replace 'your_collection' with your actual collection)\n", "newsObj = news.MongoNewsService(connector)\n", "\n", "if len(dftotal1) > 0:\n", " for i in range(len(dftotal1)):\n", " # ents=NERService.NERCompanyList(company_list,document)\n", " # add a new attribute 'Combined_ABSA' to document\n", " newsObj.collection.update_one(\n", " {\"_id\": dftotal1[\"_id\"].iloc[i]}, # Filter für das entsprechende Dokument\n", " {\n", " \"$set\": {\"Combined_ABSA\": dftotal1[\"Combined_ABSA\"].iloc[i]}\n", " }, # Neues Attribut hinzufügen, initialisiert mit einer leeren Liste\n", " )\n", "\n", "else:\n", " print(\"No documents found.\")" ] } ], "metadata": { "kernelspec": { "display_name": "aki-prj23-transparenzregister-z8SxnVl_-py3.11", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }