mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-22 05:02:53 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd\n",
"import aki_prj23_transparenzregister.utils.mongo.connector as conn\n",
"from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider\n",
"import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news\n",
"import aki_prj23_transparenzregister.utils.mongo.company_mongo_service as comps"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class ABSA:\n",
"    def __init__(self):\n",
"        self.config_provider = JsonFileConfigProvider(\"./secrets.json\")\n",
"        self.connect_string = self.config_provider.get_mongo_connection_string()\n",
"        self.connect_string.database = \"transparenzregister_ner\"\n",
"        self.connector = conn.MongoConnector(self.connect_string)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Mongo Connect: create connection string and connect\n",
"config_provider = JsonFileConfigProvider(\"../../secrets.json\")\n",
"engine = config_provider.get_mongo_connection_string()\n",
"engine.database = \"transparenzregister_ner\"\n",
"connector = conn.MongoConnector(engine)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Process all documents and check whether the attribute 'name' exists\n",
"# Read data from database\n",
"CompsObj = comps.CompanyMongoService(connector)\n",
"allComps = CompsObj.get_all()\n",
"\n",
"# Create a cursor over all company documents that have the attribute 'name'\n",
"CursorCompNames = CompsObj.collection.find({\"name\": {\"$exists\": True}})\n",
"documents = list(CursorCompNames)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a list with all company names\n",
"compList = []\n",
"\n",
"if len(documents) > 0:\n",
"    for document in documents:\n",
"        # ents=NERService.NERCompanyList(company_list,document)\n",
"        compList.append(document[\"name\"])\n",
"        # add a new attribute 'companies' to the document\n",
"        # newsObj.collection.update_one(\n",
"        #     {\"_id\": document[\"_id\"]},  # Filter for the matching document\n",
"        #     {\"$set\": {\"companies\": ents}},  # Add the new attribute, initialized with an empty list\n",
"        # )\n",
"\n",
"else:\n",
"    print(\"No documents found.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Process all documents in the news collection and check whether the attribute 'companies' exists\n",
"\n",
"# Read data from database\n",
"NERObj = news.MongoNewsService(connector)\n",
"allNER = NERObj.get_all()\n",
"\n",
"# Create a cursor over all articles that already have the attribute 'companies'\n",
"CursorNERNames = NERObj.collection.find({\"companies\": {\"$exists\": True}})\n",
"documentsNER = list(CursorNERNames)"
]
},
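{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that `{\"$exists\": True}` selects articles that already carry the attribute. A minimal sketch (using the same `NERObj` collection) of the complementary query for still unprocessed articles:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: articles that do NOT yet have the 'companies' attribute\n",
"cursor_unprocessed = NERObj.collection.find({\"companies\": {\"$exists\": False}})\n",
"len(list(cursor_unprocessed))"
]
},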
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# install and import rapidfuzz\n",
"# pip install rapidfuzz\n",
"from rapidfuzz import process"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if len(documentsNER) > 0:\n",
"    for document in documentsNER:\n",
"        resList = []  # result list with matched names\n",
"        for entity_name, frequency in document[\"companies\"].items():\n",
"            if len(entity_name) > 2:\n",
"                result = process.extractOne(entity_name, compList)\n",
"                if result is not None:\n",
"                    # If a similar name was found\n",
"                    if result[1] >= 95:\n",
"                        # Adjust the similarity score threshold as needed\n",
"                        similar_name = result[0]\n",
"                        # print(f\"Similar name found: {entity_name} (similarity: {result[1]})\")\n",
"                        # print(f\"Similar to: {similar_name}\")\n",
"                        # print(f\"Frequency: {frequency}\")\n",
"                        print(\n",
"                            f\"NER entity: {entity_name} matches {similar_name} at {result[1]}% with {frequency} occurrences\"\n",
"                        )\n",
"\n",
"        # ents=NERService.NERCompanyList(company_list,document)\n",
"        # compList.append(document['name'])\n",
"        # add a new attribute 'companies' to the document\n",
"        # newsObj.collection.update_one(\n",
"        #     {\"_id\": document[\"_id\"]},  # Filter for the matching document\n",
"        #     {\"$set\": {\"companies\": ents}},  # Add the new attribute, initialized with an empty list\n",
"        # )\n",
"\n",
"else:\n",
"    print(\"No documents found.\")"
]
},
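{
"cell_type": "markdown",
"metadata": {},
"source": [
"`process.extractOne` returns a `(match, score, index)` tuple or `None`. A minimal sketch of the same lookup using rapidfuzz's built-in `score_cutoff`, which makes the manual `result[1] >= 95` check unnecessary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: extractOne returns None when no candidate reaches score_cutoff\n",
"match = process.extractOne(\"Siemens Aktiengesellschaft\", compList, score_cutoff=95)\n",
"if match is not None:\n",
"    similar_name, score, _ = match\n",
"    print(f\"Matched {similar_name} at {score}%\")"
]
},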
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"documentsNER[1][\"companies\"].items()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"compList"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"\n",
"# Function to remove company types and legal suffixes\n",
"def remove_legal_additions(name):\n",
"    # Use a regular expression to remove common suffixes such as \"GmbH\" and \"AG\"\n",
"    cleaned_name = re.sub(\n",
"        r\"\\b(GmbH|AG|KG|SE|& Co\\. KGaA|& Co\\.|e\\.K\\.|mbH|mbH & Co\\. KG)\\b\", \"\", name\n",
"    )\n",
"    # Strip leading and trailing whitespace\n",
"    cleaned_name = cleaned_name.strip()\n",
"    return cleaned_name\n",
"\n",
"\n",
"# Clean the list of company names\n",
"complist2 = [remove_legal_additions(name) for name in compList]\n",
"complist2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"complist3 = [word.split()[0] for word in complist2 if len(word.split()[0]) >= 3]\n",
"complist4 = list(set(complist3))\n",
"complist4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Certain terms that are not company names have to be removed from complist4\n",
"terms_to_remove = [\n",
"    \"Deutsche\",\n",
"    \"Hamburg\",\n",
"    \"Hamburger\",\n",
"    \"Union\",\n",
"    \"Energy\",\n",
"    \"Hugo\",\n",
"    \"Pro\",\n",
"    \"OTC\",\n",
"    \"web\",\n",
"    \"Kabel\",\n",
"    \"Club\",\n",
"    \"The\",\n",
"    \"United\",\n",
"    \"Frankfurter\",\n",
"    \"CMC\",\n",
"    \"Bayern\",\n",
"    \"Haus\",\n",
"    \"Gesellschaft\",\n",
"    \"Delivery\",\n",
"    \"Aachener\",\n",
"    \"Group\",\n",
"    \"Retail\",\n",
"    \"Media\",\n",
"    \"European\",\n",
"    \"Fuels\",\n",
"]\n",
"complist4 = [name for name in complist4 if name not in terms_to_remove]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Merge the two lists complist2 and complist4\n",
"complist5 = complist2 + complist4\n",
"complist5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df1 = pd.DataFrame(documentsNER)\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Function to extract company names\n",
"def extract_company_names(company_dict):\n",
"    return list(company_dict.keys())\n",
"\n",
"\n",
"# Apply the function to the 'companies' column\n",
"df1[\"companies\"] = df1[\"companies\"].apply(lambda x: extract_company_names(x))\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df1[\"companies_filtered\"] = df1[\"companies\"].apply(\n",
"    lambda x: [company for company in x if company in complist5]\n",
")\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the first five entries of \"companies_filtered\" to save compute later on.\n",
"def split_list1(row):\n",
"    return pd.Series(row[\"companies_filtered\"][:5])\n",
"\n",
"\n",
"# Apply the function and concatenate the result with the original DataFrame\n",
"df2 = df1.apply(split_list1, axis=1).merge(df1, left_index=True, right_index=True)\n",
"\n",
"# Drop the original columns with lists\n",
"df2 = df2.drop(\"companies\", axis=1)\n",
"df2 = df2.drop(\"companies_filtered\", axis=1)\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df2.rename(\n",
"    columns={0: \"company1\", 1: \"company2\", 2: \"company3\", 3: \"company4\", 4: \"company5\"},\n",
"    inplace=True,\n",
")\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# cell10 = df2.loc[3, 'company1']\n",
"# print(cell10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df2.dropna(\n",
"    subset=[\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"],\n",
"    how=\"all\",\n",
"    inplace=True,\n",
")\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The indices have to be reset, otherwise the following steps raise errors (iterating over i would not work).\n",
"df2 = df2.reset_index(drop=True)\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def filter_sentences_company1(row):\n",
"    target_word = row[\"company1\"]\n",
"\n",
"    # Check if target_word is NaN\n",
"    if pd.isna(target_word):\n",
"        return None\n",
"\n",
"    sentences = re.split(r\"[.:>]\", row[\"text\"])  # Split on '.', ':' and '>'\n",
"\n",
"    # Extract sentences containing the target word\n",
"    filtered_sentences = [\n",
"        sentence.strip() for sentence in sentences if target_word in sentence\n",
"    ]\n",
"\n",
"    # Concatenate the sentences with dots in between and limit the total length to 200 characters\n",
"    concatenated_sentences = \". \".join(filtered_sentences)[:200]\n",
"\n",
"    # Return None if no sentences contain the target word\n",
"    return concatenated_sentences if concatenated_sentences else None\n",
"\n",
"\n",
"df2[\"text_company1\"] = df2.apply(filter_sentences_company1, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def filter_sentences_company2(row):\n",
"    target_word = row[\"company2\"]\n",
"\n",
"    # Check if target_word is NaN\n",
"    if pd.isna(target_word):\n",
"        return None\n",
"\n",
"    sentences = re.split(r\"[.:>]\", row[\"text\"])  # Split on '.', ':' and '>'\n",
"\n",
"    # Extract sentences containing the target word\n",
"    filtered_sentences = [\n",
"        sentence.strip() for sentence in sentences if target_word in sentence\n",
"    ]\n",
"\n",
"    # Concatenate the sentences with dots in between and limit the total length to 200 characters\n",
"    concatenated_sentences = \". \".join(filtered_sentences)[:200]\n",
"\n",
"    # Return None if no sentences contain the target word\n",
"    return concatenated_sentences if concatenated_sentences else None\n",
"\n",
"\n",
"df2[\"text_company2\"] = df2.apply(filter_sentences_company2, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def filter_sentences_company3(row):\n",
"    target_word = row[\"company3\"]\n",
"\n",
"    # Check if target_word is NaN\n",
"    if pd.isna(target_word):\n",
"        return None\n",
"\n",
"    sentences = re.split(r\"[.:>]\", row[\"text\"])  # Split on '.', ':' and '>'\n",
"\n",
"    # Extract sentences containing the target word\n",
"    filtered_sentences = [\n",
"        sentence.strip() for sentence in sentences if target_word in sentence\n",
"    ]\n",
"\n",
"    # Concatenate the sentences with dots in between and limit the total length to 200 characters\n",
"    concatenated_sentences = \". \".join(filtered_sentences)[:200]\n",
"\n",
"    # Return None if no sentences contain the target word\n",
"    return concatenated_sentences if concatenated_sentences else None\n",
"\n",
"\n",
"df2[\"text_company3\"] = df2.apply(filter_sentences_company3, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def filter_sentences_company4(row):\n",
"    target_word = row[\"company4\"]\n",
"\n",
"    # Check if target_word is NaN\n",
"    if pd.isna(target_word):\n",
"        return None\n",
"\n",
"    sentences = re.split(r\"[.:>]\", row[\"text\"])  # Split on '.', ':' and '>'\n",
"\n",
"    # Extract sentences containing the target word\n",
"    filtered_sentences = [\n",
"        sentence.strip() for sentence in sentences if target_word in sentence\n",
"    ]\n",
"\n",
"    # Concatenate the sentences with dots in between and limit the total length to 200 characters\n",
"    concatenated_sentences = \". \".join(filtered_sentences)[:200]\n",
"\n",
"    # Return None if no sentences contain the target word\n",
"    return concatenated_sentences if concatenated_sentences else None\n",
"\n",
"\n",
"df2[\"text_company4\"] = df2.apply(filter_sentences_company4, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def filter_sentences_company5(row):\n",
"    target_word = row[\"company5\"]\n",
"\n",
"    # Check if target_word is NaN\n",
"    if pd.isna(target_word):\n",
"        return None\n",
"\n",
"    sentences = re.split(r\"[.:>]\", row[\"text\"])  # Split on '.', ':' and '>'\n",
"\n",
"    # Extract sentences containing the target word\n",
"    filtered_sentences = [\n",
"        sentence.strip() for sentence in sentences if target_word in sentence\n",
"    ]\n",
"\n",
"    # Concatenate the sentences with dots in between and limit the total length to 200 characters\n",
"    concatenated_sentences = \". \".join(filtered_sentences)[:200]\n",
"\n",
"    # Return None if no sentences contain the target word\n",
"    return concatenated_sentences if concatenated_sentences else None\n",
"\n",
"\n",
"df2[\"text_company5\"] = df2.apply(filter_sentences_company5, axis=1)\n",
"df2"
]
},
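{
"cell_type": "markdown",
"metadata": {},
"source": [
"The five filter functions above differ only in the column they read. A minimal sketch of a parameterized variant (an optional refactor, not used by the cells below) that could replace all five:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional refactor sketch: one parameterized filter instead of five copies\n",
"def filter_sentences(row, company_col):\n",
"    target_word = row[company_col]\n",
"    if pd.isna(target_word):\n",
"        return None\n",
"    sentences = re.split(r\"[.:>]\", row[\"text\"])\n",
"    filtered = [s.strip() for s in sentences if target_word in s]\n",
"    concatenated = \". \".join(filtered)[:200]\n",
"    return concatenated if concatenated else None\n",
"\n",
"\n",
"# Usage sketch, equivalent to the five cells above:\n",
"# for n in range(1, 6):\n",
"#     df2[f\"text_company{n}\"] = df2.apply(filter_sentences, axis=1, company_col=f\"company{n}\")"
]
},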
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"\n",
"# Note: with truncation=True, inputs that exceed the model's 512-token limit are truncated instead of causing an error during machine translation.\n",
"translation_tokenizer = AutoTokenizer.from_pretrained(\n",
"    \"Helsinki-NLP/opus-mt-de-en\", truncation=True\n",
")\n",
"\n",
"translation_model = AutoModelForSeq2SeqLM.from_pretrained(\"Helsinki-NLP/opus-mt-de-en\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test whether the machine translation works on an example sentence.\n",
"def translate_sentiment(text: str) -> str:\n",
"    input_tokens = translation_tokenizer([text], return_tensors=\"pt\")\n",
"    generated_ids = translation_model.generate(**input_tokens)\n",
"    return translation_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[\n",
"        0\n",
"    ]\n",
"\n",
"\n",
"headline = \"Pelabresib: Biotech-Unternehmen Morphosys hofft mit neuem Krebs-Medikament endlich auf den Durchbruch. \"\n",
"tf = translate_sentiment(headline)\n",
"tf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"translate_list_company1 = []\n",
"\n",
"for i in range(len(df2)):\n",
"    text = str(\n",
"        df2[\"text_company1\"].loc[i]\n",
"    )  # Convert to string (this is very important)\n",
"    texttrans = translate_sentiment(text)\n",
"    translate_list_company1.append(texttrans)\n",
"\n",
"dftrans_company1 = pd.DataFrame(translate_list_company1)\n",
"dftrans_company1.rename(columns={0: \"text_company1_eng\"}, inplace=True)\n",
"dftrans_company1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"translate_list_company2 = []\n",
"\n",
"for i in range(len(df2)):\n",
"    text = str(\n",
"        df2[\"text_company2\"].loc[i]\n",
"    )  # Convert to string (this is very important)\n",
"    texttrans = translate_sentiment(text)\n",
"    translate_list_company2.append(texttrans)\n",
"\n",
"dftrans_company2 = pd.DataFrame(translate_list_company2)\n",
"dftrans_company2.rename(columns={0: \"text_company2_eng\"}, inplace=True)\n",
"dftrans_company2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"translate_list_company3 = []\n",
"\n",
"for i in range(len(df2)):\n",
"    text = str(\n",
"        df2[\"text_company3\"].loc[i]\n",
"    )  # Convert to string (this is very important)\n",
"    texttrans = translate_sentiment(text)\n",
"    translate_list_company3.append(texttrans)\n",
"\n",
"dftrans_company3 = pd.DataFrame(translate_list_company3)\n",
"dftrans_company3.rename(columns={0: \"text_company3_eng\"}, inplace=True)\n",
"dftrans_company3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"translate_list_company4 = []\n",
"\n",
"for i in range(len(df2)):\n",
"    text = str(\n",
"        df2[\"text_company4\"].loc[i]\n",
"    )  # Convert to string (this is very important)\n",
"    texttrans = translate_sentiment(text)\n",
"    translate_list_company4.append(texttrans)\n",
"\n",
"dftrans_company4 = pd.DataFrame(translate_list_company4)\n",
"dftrans_company4.rename(columns={0: \"text_company4_eng\"}, inplace=True)\n",
"dftrans_company4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"translate_list_company5 = []\n",
"\n",
"for i in range(len(df2)):\n",
"    text = str(\n",
"        df2[\"text_company5\"].loc[i]\n",
"    )  # Convert to string (this is very important)\n",
"    texttrans = translate_sentiment(text)\n",
"    translate_list_company5.append(texttrans)\n",
"\n",
"dftrans_company5 = pd.DataFrame(translate_list_company5)\n",
"dftrans_company5.rename(columns={0: \"text_company5_eng\"}, inplace=True)\n",
"dftrans_company5"
]
},
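{
"cell_type": "markdown",
"metadata": {},
"source": [
"The five translation loops above call the model once per row. A minimal sketch (assuming the tokenizer and model loaded earlier) of translating a whole column in padded batches instead, which is usually considerably faster:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: translate a list of texts in padded batches\n",
"def translate_batch(texts, batch_size=16):\n",
"    results = []\n",
"    for start in range(0, len(texts), batch_size):\n",
"        chunk = [str(t) for t in texts[start : start + batch_size]]\n",
"        tokens = translation_tokenizer(\n",
"            chunk, return_tensors=\"pt\", padding=True, truncation=True\n",
"        )\n",
"        generated = translation_model.generate(**tokens)\n",
"        results.extend(\n",
"            translation_tokenizer.batch_decode(generated, skip_special_tokens=True)\n",
"        )\n",
"    return results\n",
"\n",
"\n",
"# Usage sketch, equivalent to the loops above:\n",
"# dftrans_company1 = pd.DataFrame(\n",
"#     translate_batch(df2[\"text_company1\"].tolist()), columns=[\"text_company1_eng\"]\n",
"# )"
]
},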
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df3 = df2[\n",
"    [\n",
"        \"_id\",\n",
"        \"title\",\n",
"        \"text\",\n",
"        \"company1\",\n",
"        \"text_company1\",\n",
"        \"company2\",\n",
"        \"text_company2\",\n",
"        \"company3\",\n",
"        \"text_company3\",\n",
"        \"company4\",\n",
"        \"text_company4\",\n",
"        \"company5\",\n",
"        \"text_company5\",\n",
"    ]\n",
"]\n",
"df3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df3.insert(4, \"text_company1_eng\", dftrans_company1.iloc[:, 0])\n",
"df3.insert(7, \"text_company2_eng\", dftrans_company2.iloc[:, 0])\n",
"df3.insert(10, \"text_company3_eng\", dftrans_company3.iloc[:, 0])\n",
"df3.insert(13, \"text_company4_eng\", dftrans_company4.iloc[:, 0])\n",
"df3.insert(16, \"text_company5_eng\", dftrans_company5.iloc[:, 0])\n",
"df3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# df3.to_csv('df3_20231213.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df3 = df3.map(lambda x: None if pd.isna(x) else x)\n",
"df3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# test ABSA\n",
"from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline\n",
"\n",
"# Load the ABSA model and tokenizer\n",
"model_name = \"yangheng/deberta-v3-base-absa-v1.1\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModelForSequenceClassification.from_pretrained(model_name)\n",
"\n",
"classifier = pipeline(\"text-classification\", model=model, tokenizer=tokenizer)"
]
},
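{
"cell_type": "markdown",
"metadata": {},
"source": [
"The pipeline returns a list with one dict per input, e.g. `[{'label': 'Positive', 'score': 0.98}]`. A minimal sketch of reading label and score directly from that structure, instead of slicing and regex-parsing the stringified output as done further below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: read the pipeline output directly instead of via string slicing\n",
"example = classifier(\"Siemens is doing great\", text_pair=\"Siemens\")\n",
"label = example[0][\"label\"]  # e.g. 'Positive'\n",
"score = example[0][\"score\"]  # float between 0 and 1\n",
"print(label, score)"
]
},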
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test with an example sentence\n",
"for aspect in [\"Siemens\"]:\n",
"    print(\n",
"        aspect,\n",
"        classifier(\n",
"            \"Siemens is doing great\",\n",
"            text_pair=aspect,\n",
"        ),\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test with an example sentence, part 2\n",
"for aspect in [df2.loc[1, \"company1\"]]:\n",
"    print(\n",
"        aspect,\n",
"        classifier(\n",
"            \"Siemens is doing great\",\n",
"            text_pair=aspect,\n",
"        ),\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Template for ABSA on companyX\n",
"\n",
"result_list_company1 = []\n",
"\n",
"for i in range(len(df3)):\n",
"    aspect = df3[\"company1\"].loc[i]\n",
"    textfinal = aspect, classifier(df3[\"text_company1_eng\"].loc[i], text_pair=aspect)\n",
"    result_list_company1.append(textfinal)\n",
"\n",
"dfcompany1 = pd.DataFrame(result_list_company1)\n",
"dfcompany1.rename(columns={0: \"company1_new\"}, inplace=True)\n",
"dfcompany1.rename(columns={1: \"company1_ABSA_v1\"}, inplace=True)\n",
"\n",
"dfcompany1[\"company1_ABSA_v1\"] = dfcompany1[\"company1_ABSA_v1\"].astype(str)\n",
"dfcompany1[\"company1_ABSA\"] = dfcompany1[\"company1_ABSA_v1\"].str[12:19]\n",
"\n",
"# Define a regular expression pattern to find numeric values ('re' was imported above).\n",
"pattern = r\"(\\d+(\\.\\d+)?)\"\n",
"\n",
"# Extract the numeric values using the regular expression.\n",
"dfcompany1[\"company1_numABSA\"] = dfcompany1[\"company1_ABSA_v1\"].apply(\n",
"    lambda x: re.search(pattern, str(x)).group(1)\n",
"    if re.search(pattern, str(x))\n",
"    else None\n",
")\n",
"\n",
"# Convert the extracted values to floats where needed.\n",
"dfcompany1[\"company1_numABSA\"] = pd.to_numeric(\n",
"    dfcompany1[\"company1_numABSA\"], errors=\"coerce\"\n",
")\n",
"dfcompany1 = dfcompany1.drop(\"company1_ABSA_v1\", axis=1)\n",
"dfcompany1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def handle_none(row):\n",
"    if row[\"company1_new\"] is None:\n",
"        row[\"company1_ABSA\"] = None\n",
"        row[\"company1_numABSA\"] = None\n",
"    return row\n",
"\n",
"\n",
"# Apply the custom function to each row\n",
"dfcompany1new = dfcompany1.apply(handle_none, axis=1)\n",
"dfcompany1new"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test with an example sentence, part 2\n",
"for aspect in [df3.loc[9, \"company2\"]]:\n",
"    if df3[\"text_company2_eng\"].loc[9] != \"None\":\n",
"        print(\n",
"            aspect,\n",
"            classifier(\n",
"                df3[\"text_company2_eng\"].loc[9],\n",
"                text_pair=aspect,\n",
"            ),\n",
"        )\n",
"    else:\n",
"        print(None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_list_company2 = []\n",
"\n",
"for i in range(len(df3)):\n",
"    aspect = df3[\"company2\"].loc[i]\n",
"    textfinal = aspect, classifier(df3[\"text_company2_eng\"].loc[i], text_pair=aspect)\n",
"    result_list_company2.append(textfinal)\n",
"\n",
"dfcompany2 = pd.DataFrame(result_list_company2)\n",
"dfcompany2.rename(columns={0: \"company2_new\"}, inplace=True)\n",
"dfcompany2.rename(columns={1: \"company2_ABSA_v1\"}, inplace=True)\n",
"\n",
"dfcompany2[\"company2_ABSA_v1\"] = dfcompany2[\"company2_ABSA_v1\"].astype(str)\n",
"dfcompany2[\"company2_ABSA\"] = dfcompany2[\"company2_ABSA_v1\"].str[12:19]\n",
"\n",
"# Define a regular expression pattern to find numeric values.\n",
"pattern = r\"(\\d+(\\.\\d+)?)\"\n",
"\n",
"# Extract the numeric values using the regular expression.\n",
"dfcompany2[\"company2_numABSA\"] = dfcompany2[\"company2_ABSA_v1\"].apply(\n",
"    lambda x: re.search(pattern, str(x)).group(1)\n",
"    if re.search(pattern, str(x))\n",
"    else None\n",
")\n",
"\n",
"# Convert the extracted values to floats where needed.\n",
"dfcompany2[\"company2_numABSA\"] = pd.to_numeric(\n",
"    dfcompany2[\"company2_numABSA\"], errors=\"coerce\"\n",
")\n",
"dfcompany2 = dfcompany2.drop(\"company2_ABSA_v1\", axis=1)\n",
"dfcompany2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def handle_none(row):\n",
"    if row[\"company2_new\"] is None:\n",
"        row[\"company2_ABSA\"] = None\n",
"        row[\"company2_numABSA\"] = None\n",
"    return row\n",
"\n",
"\n",
"# Apply the custom function to each row\n",
"dfcompany2new = dfcompany2.apply(handle_none, axis=1)\n",
"dfcompany2new"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_list_company3 = []\n",
"\n",
"for i in range(len(df3)):\n",
"    aspect = df3[\"company3\"].loc[i]\n",
"    textfinal = aspect, classifier(df3[\"text_company3_eng\"].loc[i], text_pair=aspect)\n",
"    result_list_company3.append(textfinal)\n",
"\n",
"dfcompany3 = pd.DataFrame(result_list_company3)\n",
"dfcompany3.rename(columns={0: \"company3_new\"}, inplace=True)\n",
"dfcompany3.rename(columns={1: \"company3_ABSA_v1\"}, inplace=True)\n",
"\n",
"dfcompany3[\"company3_ABSA_v1\"] = dfcompany3[\"company3_ABSA_v1\"].astype(str)\n",
"dfcompany3[\"company3_ABSA\"] = dfcompany3[\"company3_ABSA_v1\"].str[12:19]\n",
"\n",
"# Define a regular expression pattern to find numeric values.\n",
"pattern = r\"(\\d+(\\.\\d+)?)\"\n",
"\n",
"# Extract the numeric values using the regular expression.\n",
"dfcompany3[\"company3_numABSA\"] = dfcompany3[\"company3_ABSA_v1\"].apply(\n",
"    lambda x: re.search(pattern, str(x)).group(1)\n",
"    if re.search(pattern, str(x))\n",
"    else None\n",
")\n",
"\n",
"# Convert the extracted values to floats where needed.\n",
"dfcompany3[\"company3_numABSA\"] = pd.to_numeric(\n",
"    dfcompany3[\"company3_numABSA\"], errors=\"coerce\"\n",
")\n",
"dfcompany3 = dfcompany3.drop(\"company3_ABSA_v1\", axis=1)\n",
"dfcompany3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def handle_none(row):\n",
"    if row[\"company3_new\"] is None:\n",
"        row[\"company3_ABSA\"] = None\n",
"        row[\"company3_numABSA\"] = None\n",
"    return row\n",
"\n",
"\n",
"# Apply the custom function to each row\n",
"dfcompany3new = dfcompany3.apply(handle_none, axis=1)\n",
"dfcompany3new"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_list_company4 = []\n",
"\n",
"for i in range(len(df3)):\n",
"    aspect = df3[\"company4\"].loc[i]\n",
"    textfinal = aspect, classifier(df3[\"text_company4_eng\"].loc[i], text_pair=aspect)\n",
"    result_list_company4.append(textfinal)\n",
"\n",
"dfcompany4 = pd.DataFrame(result_list_company4)\n",
"dfcompany4.rename(columns={0: \"company4_new\"}, inplace=True)\n",
"dfcompany4.rename(columns={1: \"company4_ABSA_v1\"}, inplace=True)\n",
"\n",
"dfcompany4[\"company4_ABSA_v1\"] = dfcompany4[\"company4_ABSA_v1\"].astype(str)\n",
"dfcompany4[\"company4_ABSA\"] = dfcompany4[\"company4_ABSA_v1\"].str[12:19]\n",
"\n",
"# Define a regular expression pattern to find numeric values.\n",
"pattern = r\"(\\d+(\\.\\d+)?)\"\n",
"\n",
"# Extract the numeric values using the regular expression.\n",
"dfcompany4[\"company4_numABSA\"] = dfcompany4[\"company4_ABSA_v1\"].apply(\n",
"    lambda x: re.search(pattern, str(x)).group(1)\n",
"    if re.search(pattern, str(x))\n",
"    else None\n",
")\n",
"\n",
"# Convert the extracted values to floats where needed.\n",
"dfcompany4[\"company4_numABSA\"] = pd.to_numeric(\n",
"    dfcompany4[\"company4_numABSA\"], errors=\"coerce\"\n",
")\n",
"dfcompany4 = dfcompany4.drop(\"company4_ABSA_v1\", axis=1)\n",
"dfcompany4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def handle_none(row):\n",
"    if row[\"company4_new\"] is None:\n",
"        row[\"company4_ABSA\"] = None\n",
"        row[\"company4_numABSA\"] = None\n",
"    return row\n",
"\n",
"\n",
"# Apply the custom function to each row\n",
"dfcompany4new = dfcompany4.apply(handle_none, axis=1)\n",
"dfcompany4new"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_list_company5 = []\n",
"\n",
"for i in range(len(df3)):\n",
"    aspect = df3[\"company5\"].loc[i]\n",
"    textfinal = aspect, classifier(df3[\"text_company5_eng\"].loc[i], text_pair=aspect)\n",
"    result_list_company5.append(textfinal)\n",
"\n",
"dfcompany5 = pd.DataFrame(result_list_company5)\n",
"dfcompany5.rename(columns={0: \"company5_new\"}, inplace=True)\n",
"dfcompany5.rename(columns={1: \"company5_ABSA_v1\"}, inplace=True)\n",
"\n",
"dfcompany5[\"company5_ABSA_v1\"] = dfcompany5[\"company5_ABSA_v1\"].astype(str)\n",
"dfcompany5[\"company5_ABSA\"] = dfcompany5[\"company5_ABSA_v1\"].str[12:19]\n",
"\n",
"# Define a regular expression pattern to find numeric values.\n",
"pattern = r\"(\\d+(\\.\\d+)?)\"\n",
"\n",
"# Extract the numeric values using the regular expression.\n",
"dfcompany5[\"company5_numABSA\"] = dfcompany5[\"company5_ABSA_v1\"].apply(\n",
"    lambda x: re.search(pattern, str(x)).group(1)\n",
"    if re.search(pattern, str(x))\n",
"    else None\n",
")\n",
"\n",
"# Convert the extracted values to floats where needed.\n",
"dfcompany5[\"company5_numABSA\"] = pd.to_numeric(\n",
"    dfcompany5[\"company5_numABSA\"], errors=\"coerce\"\n",
")\n",
"dfcompany5 = dfcompany5.drop(\"company5_ABSA_v1\", axis=1)\n",
"dfcompany5"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def handle_none(row):\n",
"    if row[\"company5_new\"] is None:\n",
"        row[\"company5_ABSA\"] = None\n",
"        row[\"company5_numABSA\"] = None\n",
"    return row\n",
"\n",
"\n",
"# Apply the custom function to each row\n",
"dfcompany5new = dfcompany5.apply(handle_none, axis=1)\n",
"dfcompany5new"
]
},
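{
"cell_type": "markdown",
"metadata": {},
"source": [
"The five ABSA blocks above repeat the same steps per company column. A minimal sketch (an optional refactor, assuming `df3` and `classifier` from above) of a single loop over the five columns that reads label and score directly from the pipeline output:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: run the ABSA scoring once per company column\n",
"def absa_for_column(n):\n",
"    rows = []\n",
"    for i in range(len(df3)):\n",
"        aspect = df3[f\"company{n}\"].loc[i]\n",
"        result = classifier(df3[f\"text_company{n}_eng\"].loc[i], text_pair=aspect)\n",
"        rows.append((aspect, result[0][\"label\"], result[0][\"score\"]))\n",
"    return pd.DataFrame(\n",
"        rows, columns=[f\"company{n}_new\", f\"company{n}_ABSA\", f\"company{n}_numABSA\"]\n",
"    )\n",
"\n",
"\n",
"# Usage sketch, equivalent to the five blocks above:\n",
"# frames = [absa_for_column(n) for n in range(1, 6)]"
]
},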
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dftotal1 = pd.concat(\n",
"    [df3, dfcompany1new, dfcompany2new, dfcompany3new, dfcompany4new, dfcompany5new],\n",
"    axis=1,\n",
"    join=\"outer\",\n",
")\n",
"columns_to_drop = [\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Custom function to combine the three company1 columns into a dict\n",
"def combine_to_list_company1(row):\n",
"    return {\n",
"        \"company\": row[\"company1_new\"],\n",
"        \"ABSA\": row[\"company1_ABSA\"],\n",
"        \"numABSA\": row[\"company1_numABSA\"],\n",
"    }\n",
"\n",
"\n",
"# Apply the custom function to each row and create a new column 'company1_Combined'\n",
"dftotal1[\"company1_Combined\"] = dftotal1.apply(combine_to_list_company1, axis=1)\n",
"columns_to_drop = [\"company1_new\", \"company1_ABSA\", \"company1_numABSA\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Custom function to combine the three company2 columns into a dict\n",
"def combine_to_list_company2(row):\n",
"    return {\n",
"        \"company\": row[\"company2_new\"],\n",
"        \"ABSA\": row[\"company2_ABSA\"],\n",
"        \"numABSA\": row[\"company2_numABSA\"],\n",
"    }\n",
"\n",
"\n",
"# Apply the custom function to each row and create a new column 'company2_Combined'\n",
"dftotal1[\"company2_Combined\"] = dftotal1.apply(combine_to_list_company2, axis=1)\n",
"columns_to_drop = [\"company2_new\", \"company2_ABSA\", \"company2_numABSA\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Custom function to combine the three company3 columns into a dict\n",
"def combine_to_list_company3(row):\n",
"    return {\n",
"        \"company\": row[\"company3_new\"],\n",
"        \"ABSA\": row[\"company3_ABSA\"],\n",
"        \"numABSA\": row[\"company3_numABSA\"],\n",
"    }\n",
"\n",
"\n",
"# Apply the custom function to each row and create a new column 'company3_Combined'\n",
"dftotal1[\"company3_Combined\"] = dftotal1.apply(combine_to_list_company3, axis=1)\n",
"columns_to_drop = [\"company3_new\", \"company3_ABSA\", \"company3_numABSA\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Custom function to combine the three company4 columns into a dict\n",
"def combine_to_list_company4(row):\n",
"    return {\n",
"        \"company\": row[\"company4_new\"],\n",
"        \"ABSA\": row[\"company4_ABSA\"],\n",
"        \"numABSA\": row[\"company4_numABSA\"],\n",
"    }\n",
"\n",
"\n",
"# Apply the custom function to each row and create a new column 'company4_Combined'\n",
"dftotal1[\"company4_Combined\"] = dftotal1.apply(combine_to_list_company4, axis=1)\n",
"columns_to_drop = [\"company4_new\", \"company4_ABSA\", \"company4_numABSA\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Custom function to combine the three company5 columns into a dict\n",
"def combine_to_list_company5(row):\n",
"    return {\n",
"        \"company\": row[\"company5_new\"],\n",
"        \"ABSA\": row[\"company5_ABSA\"],\n",
"        \"numABSA\": row[\"company5_numABSA\"],\n",
"    }\n",
"\n",
"\n",
"# Apply the custom function to each row and create a new column 'company5_Combined'\n",
"dftotal1[\"company5_Combined\"] = dftotal1.apply(combine_to_list_company5, axis=1)\n",
"columns_to_drop = [\"company5_new\", \"company5_ABSA\", \"company5_numABSA\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dftotal1.rename(\n",
"    columns={\n",
"        \"company1_Combined\": \"company1\",\n",
"        \"company2_Combined\": \"company2\",\n",
"        \"company3_Combined\": \"company3\",\n",
"        \"company4_Combined\": \"company4\",\n",
"        \"company5_Combined\": \"company5\",\n",
"    },\n",
"    inplace=True,\n",
")\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Custom function to combine the five company columns into a dictionary\n",
"def combine_to_dict(row):\n",
"    return {\n",
"        \"company1\": row[\"company1\"],\n",
"        \"company2\": row[\"company2\"],\n",
"        \"company3\": row[\"company3\"],\n",
"        \"company4\": row[\"company4\"],\n",
"        \"company5\": row[\"company5\"],\n",
"    }\n",
"\n",
"\n",
"# Apply the custom function to each row and create a new column 'Combined_ABSA'\n",
"dftotal1[\"Combined_ABSA\"] = dftotal1.apply(combine_to_dict, axis=1)\n",
"columns_to_drop = [\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"]\n",
"dftotal1.drop(columns=columns_to_drop, inplace=True)\n",
"dftotal1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# dftotal1.to_csv('dftotal1_20231214_v4.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Put this at the very end\n",
"# Create an instance of the news service on the current collection\n",
"newsObj = news.MongoNewsService(connector)\n",
"\n",
"if len(dftotal1) > 0:\n",
"    for i in range(len(dftotal1)):\n",
"        # ents=NERService.NERCompanyList(company_list,document)\n",
"        # add the new attribute 'Combined_ABSA' to the document\n",
"        newsObj.collection.update_one(\n",
"            {\"_id\": dftotal1[\"_id\"].iloc[i]},  # Filter for the matching document\n",
"            {\n",
"                \"$set\": {\"Combined_ABSA\": dftotal1[\"Combined_ABSA\"].iloc[i]}\n",
"            },  # Add the new attribute\n",
"        )\n",
"\n",
"else:\n",
"    print(\"No documents found.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aki-prj23-transparenzregister-z8SxnVl_-py3.11",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}