diff --git a/Jupyter/ABSA/ABSA_v5.ipynb b/Jupyter/ABSA/ABSA_v5.ipynb new file mode 100644 index 0000000..dc23ea9 --- /dev/null +++ b/Jupyter/ABSA/ABSA_v5.ipynb @@ -0,0 +1,1398 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd\n", + "import aki_prj23_transparenzregister.utils.mongo.connector as conn\n", + "from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider\n", + "import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news\n", + "import aki_prj23_transparenzregister.utils.mongo.company_mongo_service as comps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ABSA:\n", + "    def __init__(self):\n", + "        self.config_provider = JsonFileConfigProvider(\"./secrets.json\")\n", + "        self.connect_string = self.config_provider.get_mongo_connection_string()\n", + "        self.connect_string.database = \"transparenzregister_ner\"\n", + "        self.connector = conn.MongoConnector(self.connect_string)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mongo Connect: create connection string and connect\n", + "config_provider = JsonFileConfigProvider(\"../../secrets.json\")\n", + "engine = config_provider.get_mongo_connection_string()\n", + "engine.database = \"transparenzregister_ner\"\n", + "connector = conn.MongoConnector(engine)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process all company documents and check whether the attribute 'name' exists\n", + "# Read data from database\n", + "CompsObj = comps.CompanyMongoService(connector)\n", + "allComps = CompsObj.get_all()\n", + "\n", + "# Create a cursor over all company documents that have the attribute 'name'\n", + "CursorCompNames = CompsObj.collection.find({\"name\": {\"$exists\": True}})\n", + "documents = list(CursorCompNames)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a list with all company names\n", + "compList = []\n", + "\n", + "if len(documents) > 0:\n", + "    for document in documents:\n", + "        # ents=NERService.NERCompanyList(company_list,document)\n", + "        compList.append(document[\"name\"])\n", + "        # add a new attribute 'companies' to document\n", + "        # newsObj.collection.update_one(\n", + "        #     {\"_id\": document[\"_id\"]},  # filter for the matching document\n", + "        #     {\"$set\": {\"companies\": ents}}  # add the new attribute, initialized with an empty list\n", + "        # )\n", + "\n", + "else:\n", + "    print(\"No documents found.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process all documents in the news collection and check whether the attribute 'companies' exists\n", + "\n", + "# Read data from database\n", + "NERObj = news.MongoNewsService(connector)\n", + "allNER = NERObj.get_all()\n", + "\n", + "# Create a cursor over all articles that already have the attribute 'companies'\n", + "CursorNERNames = NERObj.collection.find({\"companies\": {\"$exists\": True}})\n", + "documentsNER = list(CursorNERNames)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install and import rapidfuzz\n", + "# pip install rapidfuzz\n", + "from 
rapidfuzz import process" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process all documents in news collection and check if attribute 'companies' is existing\n", + "\n", + "# Read data from database\n", + "NERObj = news.MongoNewsService(connector)\n", + "allNER = NERObj.get_all()\n", + "\n", + "# Create a cursor which has all unprogressed articles; articles without the attribute 'companies'\n", + "CursorNERNames = NERObj.collection.find({\"companies\": {\"$exists\": True}})\n", + "documentsNER = list(CursorNERNames)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if len(documentsNER) > 0:\n", + " for document in documentsNER:\n", + " resList = [] # result list with matched names\n", + " for entity_name, frequency in document[\"companies\"].items():\n", + " if len(entity_name) > 2:\n", + " result = process.extractOne(entity_name, compList)\n", + " if result is not None:\n", + " # Wenn ein ähnlicher Name gefunden wurde\n", + " if result[1] >= 95:\n", + " # Passen Sie die Ähnlichkeitsbewertungsschwelle nach Bedarf an\n", + " similar_name = result[0]\n", + " # print(f\"Ähnlicher Name gefunden: {entity_name} (Ähnlichkeit: {result[1]})\")\n", + " # print(f\"Ähnlichkeit mit: {similar_name}\")\n", + " # print(f\"Häufigkeit: {frequency}\")\n", + " print(\n", + " f\"NER Entität: {entity_name} passt zu:{similar_name} zu: {result[1]}% und {frequency} Matches \"\n", + " )\n", + "\n", + " # ents=NERService.NERCompanyList(company_list,document)\n", + " # compList.append(document['name'])\n", + " # add a new attribute 'companies' to document\n", + " # newsObj.collection.update_one(\n", + " # {\"_id\": document[\"_id\"]}, # Filter für das entsprechende Dokument\n", + " # {\"$set\": {\"companies\": ents}} # Neues Attribut hinzufügen, initialisiert mit einer leeren Liste\n", + " # )\n", + "\n", + "else:\n", + " print(\"No documents found.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "documentsNER[1][\"companies\"].items()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compList" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "\n", + "# Funktion zum Entfernen von Unternehmenstypen und rechtlichen Zusätzen\n", + "def remove_legal_additions(name):\n", + " # Verwenden Sie einen regulären Ausdruck, um gängige Zusätze wie \"GmbH\" und \"AG\" zu entfernen\n", + " cleaned_name = re.sub(\n", + " r\"\\b(GmbH|AG|KG|SE|& Co\\. KGaA|& Co\\.|e\\.K\\.|mbH|mbH & Co\\. 
KG)\\b\", \"\", name\n", + " )\n", + " # Entfernen Sie führende und nachfolgende Leerzeichen\n", + " cleaned_name = cleaned_name.strip()\n", + " return cleaned_name\n", + "\n", + "\n", + "# Bereinigen Sie die Liste von Unternehmensnamen\n", + "complist2 = [remove_legal_additions(name) for name in compList]\n", + "complist2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "complist3 = [word.split()[0] for word in complist2 if len(word.split()[0]) >= 3]\n", + "complist4 = list(set(complist3))\n", + "complist4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Bestimmte Begriffe, die keine Firmennamen sind, müssen aus complist4 entfernt werden\n", + "if (\"Deutsche\") in complist4:\n", + " complist4.remove(\"Deutsche\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Hamburg\") in complist4:\n", + " complist4.remove(\"Hamburg\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Hamburger\") in complist4:\n", + " complist4.remove(\"Hamburger\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Union\") in complist4:\n", + " complist4.remove(\"Union\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Energy\") in complist4:\n", + " complist4.remove(\"Energy\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Hugo\") in complist4:\n", + " complist4.remove(\"Hugo\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Pro\") in complist4:\n", + " complist4.remove(\"Pro\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"OTC\") in complist4:\n", + " complist4.remove(\"OTC\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"web\") in complist4:\n", + " complist4.remove(\"web\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Kabel\") in complist4:\n", + " complist4.remove(\"Kabel\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Club\") in complist4:\n", + " complist4.remove(\"Club\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"The\") in complist4:\n", + " complist4.remove(\"The\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"United\") in complist4:\n", + " complist4.remove(\"United\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Frankfurter\") in complist4:\n", + " complist4.remove(\"Frankfurter\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"CMC\") in complist4:\n", + " complist4.remove(\"CMC\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Bayern\") in complist4:\n", + " complist4.remove(\"Bayern\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Haus\") in complist4:\n", + " complist4.remove(\"Haus\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Gesellschaft\") in complist4:\n", + " complist4.remove(\"Gesellschaft\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Delivery\") in complist4:\n", + " complist4.remove(\"Delivery\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Aachener\") in complist4:\n", + " complist4.remove(\"Aachener\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Group\") in complist4:\n", + " complist4.remove(\"Group\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Retail\") in complist4:\n", + " complist4.remove(\"Retail\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Media\") in complist4:\n", + " complist4.remove(\"Media\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"European\") in complist4:\n", + " complist4.remove(\"European\")\n", + "else:\n", + " pass\n", + "\n", + "if (\"Fuels\") in complist4:\n", + " complist4.remove(\"Fuels\")\n", + "else:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "# Zusammenführung der beiden Listen complist2 und complist4\n", + "complist5 = complist2 + complist4\n", + "complist5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df1 = pd.DataFrame(documentsNER)\n", + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to extract company names\n", + "def extract_company_names(company_dict):\n", + " return list(company_dict.keys())\n", + "\n", + "\n", + "# Apply the function to the 'companies' column\n", + "df1[\"companies\"] = df1[\"companies\"].apply(lambda x: extract_company_names(x))\n", + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df1[\"companies_filtered\"] = df1[\"companies\"].apply(\n", + " lambda x: [company for company in x if company in complist5]\n", + ")\n", + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Nur Auswahl der ersten fünf Spalten von \"companies_filtered\", um später Rechenressourcen zu ersparen.\n", + "def split_list1(row):\n", + " return pd.Series(row[\"companies_filtered\"][:5])\n", + "\n", + "\n", + "# Apply the function and concatenate the result with the original DataFrame\n", + "df2 = df1.apply(split_list1, axis=1).merge(df1, left_index=True, right_index=True)\n", + "\n", + "# Drop the original column with lists\n", + "df2 = df2.drop(\"companies\", axis=1)\n", + "df2 = df2.drop(\"companies_filtered\", axis=1)\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2.rename(columns={0: \"company1\"}, inplace=True)\n", + "df2.rename(columns={1: \"company2\"}, inplace=True)\n", + "df2.rename(columns={2: \"company3\"}, inplace=True)\n", + "df2.rename(columns={3: \"company4\"}, inplace=True)\n", + "df2.rename(columns={4: \"company5\"}, inplace=True)\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# cell10 = df2.loc[3, 'company1']\n", + "# print(cell10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2.dropna(\n", + " subset=[\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"],\n", + " how=\"all\",\n", + " inplace=True,\n", + ")\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Die Indizes müssen resetted werden, da es ansonsten bei den nachfolgenden Schritte Fehlermeldungen gibt (Iterieren über i würde ansonsten nicht funktionieren).\n", + "df2 = df2.reset_index(drop=True)\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def filter_sentences_company1(row):\n", + " target_word = row[\"company1\"]\n", + "\n", + " # Check if target_word is NaN\n", + " if pd.isna(target_word):\n", + " return None\n", + "\n", + " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", + "\n", + " # Extract sentences containing target word\n", + " filtered_sentences = [\n", + " sentence.strip() for sentence in sentences if target_word in sentence\n", + " ]\n", + "\n", + " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", + " concatenated_sentences = \". 
\".join(filtered_sentences)[:200]\n", + "\n", + " # Return None if no sentences contain the target word\n", + " return concatenated_sentences if concatenated_sentences else None\n", + "\n", + "\n", + "df2[\"text_company1\"] = df2.apply(filter_sentences_company1, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def filter_sentences_company2(row):\n", + " target_word = row[\"company2\"]\n", + "\n", + " # Check if target_word is NaN\n", + " if pd.isna(target_word):\n", + " return None\n", + "\n", + " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", + "\n", + " # Extract sentences containing target word\n", + " filtered_sentences = [\n", + " sentence.strip() for sentence in sentences if target_word in sentence\n", + " ]\n", + "\n", + " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", + " concatenated_sentences = \". \".join(filtered_sentences)[:200]\n", + "\n", + " # Return None if no sentences contain the target word\n", + " return concatenated_sentences if concatenated_sentences else None\n", + "\n", + "\n", + "df2[\"text_company2\"] = df2.apply(filter_sentences_company2, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def filter_sentences_company3(row):\n", + " target_word = row[\"company3\"]\n", + "\n", + " # Check if target_word is NaN\n", + " if pd.isna(target_word):\n", + " return None\n", + "\n", + " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", + "\n", + " # Extract sentences containing target word\n", + " filtered_sentences = [\n", + " sentence.strip() for sentence in sentences if target_word in sentence\n", + " ]\n", + "\n", + " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", + " concatenated_sentences = \". \".join(filtered_sentences)[:200]\n", + "\n", + " # Return None if no sentences contain the target word\n", + " return concatenated_sentences if concatenated_sentences else None\n", + "\n", + "\n", + "df2[\"text_company3\"] = df2.apply(filter_sentences_company3, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def filter_sentences_company4(row):\n", + " target_word = row[\"company4\"]\n", + "\n", + " # Check if target_word is NaN\n", + " if pd.isna(target_word):\n", + " return None\n", + "\n", + " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", + "\n", + " # Extract sentences containing target word\n", + " filtered_sentences = [\n", + " sentence.strip() for sentence in sentences if target_word in sentence\n", + " ]\n", + "\n", + " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", + " concatenated_sentences = \". 
\".join(filtered_sentences)[:200]\n", + "\n", + " # Return None if no sentences contain the target word\n", + " return concatenated_sentences if concatenated_sentences else None\n", + "\n", + "\n", + "df2[\"text_company4\"] = df2.apply(filter_sentences_company4, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def filter_sentences_company5(row):\n", + " target_word = row[\"company5\"]\n", + "\n", + " # Check if target_word is NaN\n", + " if pd.isna(target_word):\n", + " return None\n", + "\n", + " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", + "\n", + " # Extract sentences containing target word\n", + " filtered_sentences = [\n", + " sentence.strip() for sentence in sentences if target_word in sentence\n", + " ]\n", + "\n", + " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", + " concatenated_sentences = \". \".join(filtered_sentences)[:200]\n", + "\n", + " # Return None if no sentences contain the target word\n", + " return concatenated_sentences if concatenated_sentences else None\n", + "\n", + "\n", + "df2[\"text_company5\"] = df2.apply(filter_sentences_company5, axis=1)\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel\n", + "\n", + "# Hinweis: Durch den zusätzlichen Code \"truncation=True\" kann die Beschränkung auf 512 Zeichen bei der maschinen Übersetzung ausgeschaltet werden.\n", + "translation_tokenizer = AutoTokenizer.from_pretrained(\n", + " \"Helsinki-NLP/opus-mt-de-en\", truncation=True\n", + ")\n", + "\n", + "translation_model = AutoModelForSeq2SeqLM.from_pretrained(\"Helsinki-NLP/opus-mt-de-en\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Testen, ob die maschinelle Übersetzung bei einem Beispielssatz funkioniert.\n", + "def translate_sentiment(text: str) -> str:\n", + " input_tokens = translation_tokenizer([text], return_tensors=\"pt\")\n", + " generated_ids = translation_model.generate(**input_tokens)\n", + " return translation_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[\n", + " 0\n", + " ]\n", + "\n", + "\n", + "headline = \"Pelabresib: Biotech-Unternehmen Morphosys hofft mit neuem Krebs-Medikament endlich auf den Durchbruch. 
\"\n", + "tf = translate_sentiment(headline)\n", + "tf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "translate_list_company1 = []\n", + "\n", + "for i in range(len(df2)):\n", + " text = str(\n", + " df2[\"text_company1\"].loc[i]\n", + " ) # Convert to string (this is very important)\n", + " texttrans = translate_sentiment(text)\n", + " translate_list_company1.append(texttrans)\n", + "\n", + "dftrans_company1 = pd.DataFrame(translate_list_company1)\n", + "dftrans_company1.rename(columns={0: \"text_company1_eng\"}, inplace=True)\n", + "dftrans_company1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "translate_list_company2 = []\n", + "\n", + "for i in range(len(df2)):\n", + " text = str(\n", + " df2[\"text_company2\"].loc[i]\n", + " ) # Convert to string (this is very important)\n", + " texttrans = translate_sentiment(text)\n", + " translate_list_company2.append(texttrans)\n", + "\n", + "dftrans_company2 = pd.DataFrame(translate_list_company2)\n", + "dftrans_company2.rename(columns={0: \"text_company2_eng\"}, inplace=True)\n", + "dftrans_company2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "translate_list_company3 = []\n", + "\n", + "for i in range(len(df2)):\n", + " text = str(\n", + " df2[\"text_company3\"].loc[i]\n", + " ) # Convert to string (this is very important)\n", + " texttrans = translate_sentiment(text)\n", + " translate_list_company3.append(texttrans)\n", + "\n", + "dftrans_company3 = pd.DataFrame(translate_list_company3)\n", + "dftrans_company3.rename(columns={0: \"text_company3_eng\"}, inplace=True)\n", + "dftrans_company3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "translate_list_company4 = []\n", + "\n", + "for i in range(len(df2)):\n", + " text = str(\n", + " df2[\"text_company4\"].loc[i]\n", + " ) # Convert to string (this is very important)\n", + " texttrans = translate_sentiment(text)\n", + " translate_list_company4.append(texttrans)\n", + "\n", + "dftrans_company4 = pd.DataFrame(translate_list_company4)\n", + "dftrans_company4.rename(columns={0: \"text_company4_eng\"}, inplace=True)\n", + "dftrans_company4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "translate_list_company5 = []\n", + "\n", + "for i in range(len(df2)):\n", + " text = str(\n", + " df2[\"text_company5\"].loc[i]\n", + " ) # Convert to string (this is very important)\n", + " texttrans = translate_sentiment(text)\n", + " translate_list_company5.append(texttrans)\n", + "\n", + "dftrans_company5 = pd.DataFrame(translate_list_company5)\n", + "dftrans_company5.rename(columns={0: \"text_company5_eng\"}, inplace=True)\n", + "dftrans_company5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df3 = df2[\n", + " [\n", + " \"_id\",\n", + " \"title\",\n", + " \"text\",\n", + " \"company1\",\n", + " \"text_company1\",\n", + " \"company2\",\n", + " \"text_company2\",\n", + " \"company3\",\n", + " \"text_company3\",\n", + " \"company4\",\n", + " \"text_company4\",\n", + " \"company5\",\n", + " \"text_company5\",\n", + " ]\n", + "]\n", + "df3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df3.insert(4, \"text_company1_eng\", 
dftrans_company1.iloc[:, 0])\n", + "df3.insert(7, \"text_company2_eng\", dftrans_company2.iloc[:, 0])\n", + "df3.insert(10, \"text_company3_eng\", dftrans_company3.iloc[:, 0])\n", + "df3.insert(13, \"text_company4_eng\", dftrans_company4.iloc[:, 0])\n", + "df3.insert(16, \"text_company5_eng\", dftrans_company5.iloc[:, 0])\n", + "df3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# df3.to_csv('df3_20231213.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace NaN with None (DataFrame.map is the pandas >= 2.1 successor of applymap)\n", + "df3 = df3.map(lambda x: None if pd.isna(x) else x)\n", + "df3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test ABSA\n", + "from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline\n", + "\n", + "# Load the ABSA model and tokenizer\n", + "model_name = \"yangheng/deberta-v3-base-absa-v1.1\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n", + "\n", + "classifier = pipeline(\"text-classification\", model=model, tokenizer=tokenizer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test on a sample sentence\n", + "for aspect in [\"Siemens\"]:\n", + "    print(\n", + "        aspect,\n", + "        classifier(\n", + "            \"Siemens is doing great\",\n", + "            text_pair=aspect,\n", + "        ),\n", + "    )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test on a sample sentence, part 2\n", + "for aspect in [df2.loc[1, \"company1\"]]:\n", + "    print(\n", + "        aspect,\n", + "        classifier(\n", + "            \"Siemens is doing great\",\n", + "            text_pair=aspect,\n", + "        ),\n", + "    )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Template for running ABSA on companyX\n", + "\n", + "result_list_company1 = []\n", + "\n", + "for i in range(len(df3)):\n", + "    aspect = df3[\"company1\"].loc[i]\n", + "    textfinal = aspect, classifier(df3[\"text_company1_eng\"].loc[i], text_pair=aspect)\n", + "    result_list_company1.append(textfinal)\n", + "\n", + "dfcompany1 = pd.DataFrame(result_list_company1)\n", + "dfcompany1.rename(columns={0: \"company1_new\"}, inplace=True)\n", + "dfcompany1.rename(columns={1: \"company1_ABSA_v1\"}, inplace=True)\n", + "\n", + "dfcompany1[\"company1_ABSA_v1\"] = dfcompany1[\"company1_ABSA_v1\"].astype(str)\n", + "# Slice the sentiment label out of the stringified classifier output\n", + "dfcompany1[\"company1_ABSA\"] = dfcompany1[\"company1_ABSA_v1\"].str[12:19]\n", + "\n", + "import re\n", + "\n", + "# Define a regular-expression pattern to find numeric values.\n", + "pattern = r\"(\\d+(\\.\\d+)?)\"\n", + "\n", + "# Extract the numeric values with the regular expression.\n", + "dfcompany1[\"company1_numABSA\"] = dfcompany1[\"company1_ABSA_v1\"].apply(\n", + "    lambda x: re.search(pattern, str(x)).group(1)\n", + "    if re.search(pattern, str(x))\n", + "    else None\n", + ")\n", + "\n", + "# Convert the extracted values to floats where needed.\n", + "dfcompany1[\"company1_numABSA\"] = pd.to_numeric(\n", + "    dfcompany1[\"company1_numABSA\"], errors=\"coerce\"\n", + ")\n", + "dfcompany1 = dfcompany1.drop(\"company1_ABSA_v1\", axis=1)\n", + "dfcompany1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "def handle_none(row):\n", + " if row[\"company1_new\"] is None:\n", + " row[\"company1_ABSA\"] = None\n", + " row[\"company1_numABSA\"] = None\n", + " return row\n", + "\n", + "\n", + "# Apply the custom function to each row\n", + "dfcompany1new = dfcompany1.apply(handle_none, axis=1)\n", + "dfcompany1new" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test anhand eines Beispielsatzes Teil 2\n", + "for aspect in [df3.loc[9, \"company2\"]]:\n", + " if df3[\"text_company2_eng\"].loc[9] != \"None\":\n", + " print(\n", + " aspect,\n", + " classifier(\n", + " df3[\"text_company2_eng\"].loc[9],\n", + " text_pair=aspect,\n", + " ),\n", + " )\n", + " else:\n", + " print(None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result_list_company2 = []\n", + "\n", + "for i in range(len(df3)):\n", + " aspect = df3[\"company2\"].loc[i]\n", + " textfinal = aspect, classifier(df3[\"text_company2_eng\"].loc[i], text_pair=aspect)\n", + " result_list_company2.append(textfinal)\n", + "\n", + "dfcompany2 = pd.DataFrame(result_list_company2)\n", + "dfcompany2.rename(columns={0: \"company2_new\"}, inplace=True)\n", + "dfcompany2.rename(columns={1: \"company2_ABSA_v1\"}, inplace=True)\n", + "\n", + "dfcompany2[\"company2_ABSA_v1\"] = dfcompany2[\"company2_ABSA_v1\"].astype(str)\n", + "dfcompany2[\"company2_ABSA\"] = dfcompany2[\"company2_ABSA_v1\"].str[12:19]\n", + "\n", + "import re\n", + "\n", + "# Es wird ein Schema für einen \"Regulären Ausdruck\" definiert, um numerische Werte zu finden.\n", + "pattern = r\"(\\d+(\\.\\d+)?)\"\n", + "\n", + "# Die numerischen Werte werden mittels des \"Regulären Ausdrucks\" extrahiert.\n", + "dfcompany2[\"company2_numABSA\"] = dfcompany2[\"company2_ABSA_v1\"].apply(\n", + " lambda x: re.search(pattern, str(x)).group(1)\n", + " if re.search(pattern, str(x))\n", + " else None\n", + ")\n", + "\n", + "# Die extrahierten Werte werden bei Bedarf in einen Float-Wert konvertiert.\n", + "dfcompany2[\"company2_numABSA\"] = pd.to_numeric(\n", + " dfcompany2[\"company2_numABSA\"], errors=\"coerce\"\n", + ")\n", + "dfcompany2 = dfcompany2.drop(\"company2_ABSA_v1\", axis=1)\n", + "dfcompany2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def handle_none(row):\n", + " if row[\"company2_new\"] is None:\n", + " row[\"company2_ABSA\"] = None\n", + " row[\"company2_numABSA\"] = None\n", + " return row\n", + "\n", + "\n", + "# Apply the custom function to each row\n", + "dfcompany2new = dfcompany2.apply(handle_none, axis=1)\n", + "dfcompany2new" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result_list_company3 = []\n", + "\n", + "for i in range(len(df3)):\n", + " aspect = df3[\"company3\"].loc[i]\n", + " textfinal = aspect, classifier(df3[\"text_company3_eng\"].loc[i], text_pair=aspect)\n", + " result_list_company3.append(textfinal)\n", + "\n", + "dfcompany3 = pd.DataFrame(result_list_company3)\n", + "dfcompany3.rename(columns={0: \"company3_new\"}, inplace=True)\n", + "dfcompany3.rename(columns={1: \"company3_ABSA_v1\"}, inplace=True)\n", + "\n", + "dfcompany3[\"company3_ABSA_v1\"] = dfcompany3[\"company3_ABSA_v1\"].astype(str)\n", + "dfcompany3[\"company3_ABSA\"] = dfcompany3[\"company3_ABSA_v1\"].str[12:19]\n", + "\n", + "import re\n", + "\n", + "# Es wird ein Schema für 
einen \"Regulären Ausdruck\" definiert, um numerische Werte zu finden.\n", + "pattern = r\"(\\d+(\\.\\d+)?)\"\n", + "\n", + "# Die numerischen Werte werden mittels des \"Regulären Ausdrucks\" extrahiert.\n", + "dfcompany3[\"company3_numABSA\"] = dfcompany3[\"company3_ABSA_v1\"].apply(\n", + " lambda x: re.search(pattern, str(x)).group(1)\n", + " if re.search(pattern, str(x))\n", + " else None\n", + ")\n", + "\n", + "# Die extrahierten Werte werden bei Bedarf in einen Float-Wert konvertiert.\n", + "dfcompany3[\"company3_numABSA\"] = pd.to_numeric(\n", + " dfcompany3[\"company3_numABSA\"], errors=\"coerce\"\n", + ")\n", + "dfcompany3 = dfcompany3.drop(\"company3_ABSA_v1\", axis=1)\n", + "dfcompany3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def handle_none(row):\n", + " if row[\"company3_new\"] is None:\n", + " row[\"company3_ABSA\"] = None\n", + " row[\"company3_numABSA\"] = None\n", + " return row\n", + "\n", + "\n", + "# Apply the custom function to each row\n", + "dfcompany3new = dfcompany3.apply(handle_none, axis=1)\n", + "dfcompany3new" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result_list_company4 = []\n", + "\n", + "for i in range(len(df3)):\n", + " aspect = df3[\"company4\"].loc[i]\n", + " textfinal = aspect, classifier(df3[\"text_company4_eng\"].loc[i], text_pair=aspect)\n", + " result_list_company4.append(textfinal)\n", + "\n", + "dfcompany4 = pd.DataFrame(result_list_company4)\n", + "dfcompany4.rename(columns={0: \"company4_new\"}, inplace=True)\n", + "dfcompany4.rename(columns={1: \"company4_ABSA_v1\"}, inplace=True)\n", + "\n", + "dfcompany4[\"company4_ABSA_v1\"] = dfcompany4[\"company4_ABSA_v1\"].astype(str)\n", + "dfcompany4[\"company4_ABSA\"] = dfcompany4[\"company4_ABSA_v1\"].str[12:19]\n", + "\n", + "import re\n", + "\n", + "# Es wird ein Schema für einen \"Regulären Ausdruck\" definiert, um numerische Werte zu finden.\n", + "pattern = r\"(\\d+(\\.\\d+)?)\"\n", + "\n", + "# Die numerischen Werte werden mittels des \"Regulären Ausdrucks\" extrahiert.\n", + "dfcompany4[\"company4_numABSA\"] = dfcompany4[\"company4_ABSA_v1\"].apply(\n", + " lambda x: re.search(pattern, str(x)).group(1)\n", + " if re.search(pattern, str(x))\n", + " else None\n", + ")\n", + "\n", + "# Die extrahierten Werte werden bei Bedarf in einen Float-Wert konvertiert.\n", + "dfcompany4[\"company4_numABSA\"] = pd.to_numeric(\n", + " dfcompany4[\"company4_numABSA\"], errors=\"coerce\"\n", + ")\n", + "dfcompany4 = dfcompany4.drop(\"company4_ABSA_v1\", axis=1)\n", + "dfcompany4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def handle_none(row):\n", + " if row[\"company4_new\"] is None:\n", + " row[\"company4_ABSA\"] = None\n", + " row[\"company4_numABSA\"] = None\n", + " return row\n", + "\n", + "\n", + "# Apply the custom function to each row\n", + "dfcompany4new = dfcompany4.apply(handle_none, axis=1)\n", + "dfcompany4new" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result_list_company5 = []\n", + "\n", + "for i in range(len(df3)):\n", + " aspect = df3[\"company5\"].loc[i]\n", + " textfinal = aspect, classifier(df3[\"text_company5_eng\"].loc[i], text_pair=aspect)\n", + " result_list_company5.append(textfinal)\n", + "\n", + "dfcompany5 = pd.DataFrame(result_list_company5)\n", + "dfcompany5.rename(columns={0: \"company5_new\"}, 
inplace=True)\n", + "dfcompany5.rename(columns={1: \"company5_ABSA_v1\"}, inplace=True)\n", + "\n", + "dfcompany5[\"company5_ABSA_v1\"] = dfcompany5[\"company5_ABSA_v1\"].astype(str)\n", + "dfcompany5[\"company5_ABSA\"] = dfcompany5[\"company5_ABSA_v1\"].str[12:19]\n", + "\n", + "import re\n", + "\n", + "# Define a regular-expression pattern to find numeric values.\n", + "pattern = r\"(\\d+(\\.\\d+)?)\"\n", + "\n", + "# Extract the numeric values with the regular expression.\n", + "dfcompany5[\"company5_numABSA\"] = dfcompany5[\"company5_ABSA_v1\"].apply(\n", + "    lambda x: re.search(pattern, str(x)).group(1)\n", + "    if re.search(pattern, str(x))\n", + "    else None\n", + ")\n", + "\n", + "# Convert the extracted values to floats where needed.\n", + "dfcompany5[\"company5_numABSA\"] = pd.to_numeric(\n", + "    dfcompany5[\"company5_numABSA\"], errors=\"coerce\"\n", + ")\n", + "dfcompany5 = dfcompany5.drop(\"company5_ABSA_v1\", axis=1)\n", + "dfcompany5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def handle_none(row):\n", + "    if row[\"company5_new\"] is None:\n", + "        row[\"company5_ABSA\"] = None\n", + "        row[\"company5_numABSA\"] = None\n", + "    return row\n", + "\n", + "\n", + "# Apply the custom function to each row\n", + "dfcompany5new = dfcompany5.apply(handle_none, axis=1)\n", + "dfcompany5new" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dftotal1 = pd.concat(\n", + "    [df3, dfcompany1new, dfcompany2new, dfcompany3new, dfcompany4new, dfcompany5new],\n", + "    axis=1,\n", + "    join=\"outer\",\n", + ")\n", + "columns_to_drop = [\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"]\n", + "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", + "dftotal1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Custom function to combine the name and ABSA columns into a dict\n", + "def combine_to_list_company1(row):\n", + "    return {\n", + "        \"company\": row[\"company1_new\"],\n", + "        \"ABSA\": row[\"company1_ABSA\"],\n", + "        \"numABSA\": row[\"company1_numABSA\"],\n", + "    }\n", + "\n", + "\n", + "# Apply the custom function to each row and create a new column 'Combined'\n", + "dftotal1[\"company1_Combined\"] = dftotal1.apply(combine_to_list_company1, axis=1)\n", + "columns_to_drop = [\"company1_new\", \"company1_ABSA\", \"company1_numABSA\"]\n", + "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", + "dftotal1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Custom function to combine the name and ABSA columns into a dict\n", + "def combine_to_list_company2(row):\n", + "    return {\n", + "        \"company\": row[\"company2_new\"],\n", + "        \"ABSA\": row[\"company2_ABSA\"],\n", + "        \"numABSA\": row[\"company2_numABSA\"],\n", + "    }\n", + "\n", + "\n", + "# Apply the custom function to each row and create a new column 'Combined'\n", + "dftotal1[\"company2_Combined\"] = dftotal1.apply(combine_to_list_company2, axis=1)\n", + "columns_to_drop = [\"company2_new\", \"company2_ABSA\", \"company2_numABSA\"]\n", + "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", + "dftotal1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Custom function to combine the name and ABSA columns into a dict\n", + "def 
combine_to_list_company3(row):\n", + "    return {\n", + "        \"company\": row[\"company3_new\"],\n", + "        \"ABSA\": row[\"company3_ABSA\"],\n", + "        \"numABSA\": row[\"company3_numABSA\"],\n", + "    }\n", + "\n", + "\n", + "# Apply the custom function to each row and create a new column 'Combined'\n", + "dftotal1[\"company3_Combined\"] = dftotal1.apply(combine_to_list_company3, axis=1)\n", + "columns_to_drop = [\"company3_new\", \"company3_ABSA\", \"company3_numABSA\"]\n", + "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", + "dftotal1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Custom function to combine the name and ABSA columns into a dict\n", + "def combine_to_list_company4(row):\n", + "    return {\n", + "        \"company\": row[\"company4_new\"],\n", + "        \"ABSA\": row[\"company4_ABSA\"],\n", + "        \"numABSA\": row[\"company4_numABSA\"],\n", + "    }\n", + "\n", + "\n", + "# Apply the custom function to each row and create a new column 'Combined'\n", + "dftotal1[\"company4_Combined\"] = dftotal1.apply(combine_to_list_company4, axis=1)\n", + "columns_to_drop = [\"company4_new\", \"company4_ABSA\", \"company4_numABSA\"]\n", + "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", + "dftotal1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Custom function to combine the name and ABSA columns into a dict\n", + "def combine_to_list_company5(row):\n", + "    return {\n", + "        \"company\": row[\"company5_new\"],\n", + "        \"ABSA\": row[\"company5_ABSA\"],\n", + "        \"numABSA\": row[\"company5_numABSA\"],\n", + "    }\n", + "\n", + "\n", + "# Apply the custom function to each row and create a new column 'Combined'\n", + "dftotal1[\"company5_Combined\"] = dftotal1.apply(combine_to_list_company5, axis=1)\n", + "columns_to_drop = [\"company5_new\", \"company5_ABSA\", \"company5_numABSA\"]\n", + "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", + "dftotal1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dftotal1.rename(columns={\"company1_Combined\": \"company1\"}, inplace=True)\n", + "dftotal1.rename(columns={\"company2_Combined\": \"company2\"}, inplace=True)\n", + "dftotal1.rename(columns={\"company3_Combined\": \"company3\"}, inplace=True)\n", + "dftotal1.rename(columns={\"company4_Combined\": \"company4\"}, inplace=True)\n", + "dftotal1.rename(columns={\"company5_Combined\": \"company5\"}, inplace=True)\n", + "dftotal1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Custom function to combine the five company columns into a dictionary\n", + "def combine_to_dict(row):\n", + "    return {\n", + "        \"company1\": row[\"company1\"],\n", + "        \"company2\": row[\"company2\"],\n", + "        \"company3\": row[\"company3\"],\n", + "        \"company4\": row[\"company4\"],\n", + "        \"company5\": row[\"company5\"],\n", + "    }\n", + "\n", + "\n", + "# Apply the custom function to each row and create a new column 'Combined'\n", + "dftotal1[\"Combined_ABSA\"] = dftotal1.apply(combine_to_dict, axis=1)\n", + "columns_to_drop = [\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"]\n", + "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", + "dftotal1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# dftotal1.to_csv('dftotal1_20231214_v4.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": 
{}, + "outputs": [], + "source": [ + "# Run this at the very end\n", + "# Create an instance of NewsObject (replace 'your_collection' with your actual collection)\n", + "newsObj = news.MongoNewsService(connector)\n", + "\n", + "if len(dftotal1) > 0:\n", + "    for i in range(len(dftotal1)):\n", + "        # ents=NERService.NERCompanyList(company_list,document)\n", + "        # add a new attribute 'Combined_ABSA' to document\n", + "        newsObj.collection.update_one(\n", + "            {\"_id\": dftotal1[\"_id\"].iloc[i]},  # filter for the matching document\n", + "            {\n", + "                \"$set\": {\"Combined_ABSA\": dftotal1[\"Combined_ABSA\"].iloc[i]}\n", + "            },  # add the new attribute to the document\n", + "        )\n", + "\n", + "else:\n", + "    print(\"No documents found.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aki-prj23-transparenzregister-z8SxnVl_-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Jupyter/ABSA/ABSA_v6.ipynb b/Jupyter/ABSA/ABSA_v6.ipynb new file mode 100644 index 0000000..5b2ec5d --- /dev/null +++ b/Jupyter/ABSA/ABSA_v6.ipynb @@ -0,0 +1,1477 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd\n", + "import aki_prj23_transparenzregister.utils.mongo.connector as conn\n", + "from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider\n", + "import aki_prj23_transparenzregister.utils.mongo.news_mongo_service as news\n", + "import aki_prj23_transparenzregister.utils.mongo.company_mongo_service as comps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ABSA:\n", + "    def __init__(self):\n", + "        self.config_provider = JsonFileConfigProvider(\"./secrets.json\")\n", + "        self.connect_string = self.config_provider.get_mongo_connection_string()\n", + "        self.connect_string.database = \"transparenzregister_ner\"\n", + "        self.connector = conn.MongoConnector(self.connect_string)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mongo Connect: create connection string and connect\n", + "config_provider = JsonFileConfigProvider(\"../../secrets.json\")\n", + "engine = config_provider.get_mongo_connection_string()\n", + "engine.database = \"transparenzregister_ner\"\n", + "connector = conn.MongoConnector(engine)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process all company documents and check whether the attribute 'name' exists\n", + "# Read data from database\n", + "CompsObj = comps.CompanyMongoService(connector)\n", + "allComps = CompsObj.get_all()\n", + "\n", + "# Create a cursor over all company documents that have the attribute 'name'\n", + "CursorCompNames = CompsObj.collection.find({\"name\": {\"$exists\": True}})\n", + "documents = list(CursorCompNames)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a list with all company names\n", + "compList = []\n", + "\n", + "if len(documents) > 0:\n", + "    for document in documents:\n",
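 + "        # compList serves below as the reference list against which NER\n", + "        # entities from the news articles are fuzzy-matched.\n",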
+ " # ents=NERService.NERCompanyList(company_list,document)\n", + " compList.append(document[\"name\"])\n", + " # add a new attribute 'companies' to document\n", + " # newsObj.collection.update_one(\n", + " # {\"_id\": document[\"_id\"]}, # Filter für das entsprechende Dokument\n", + " # {\"$set\": {\"companies\": ents}} # Neues Attribut hinzufügen, initialisiert mit einer leeren Liste\n", + " # )\n", + "\n", + "else:\n", + " print(\"No documents found.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process all documents in news collection and check if attribute 'companies' is existing\n", + "\n", + "# Read data from database\n", + "NERObj = news.MongoNewsService(connector)\n", + "allNER = NERObj.get_all()\n", + "\n", + "# Create a cursor which has all unprogressed articles; articles without the attribute 'companies'\n", + "CursorNERNames = NERObj.collection.find({\"companies\": {\"$exists\": True}})\n", + "documentsNER = list(CursorNERNames)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install and import rapidfuzz\n", + "# pip install rapidfuzz\n", + "from rapidfuzz import process" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Process all documents in news collection and check if attribute 'companies' is existing\n", + "\n", + "# Read data from database\n", + "NERObj = news.MongoNewsService(connector)\n", + "allNER = NERObj.get_all()\n", + "\n", + "# Create a cursor which has all unprogressed articles; articles without the attribute 'companies'\n", + "CursorNERNames = NERObj.collection.find({\"companies\": {\"$exists\": True}})\n", + "documentsNER = list(CursorNERNames)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if len(documentsNER) > 0:\n", + " for document in documentsNER:\n", + " resList = [] # result list with matched names\n", + " for entity_name, frequency in document[\"companies\"].items():\n", + " if len(entity_name) > 2:\n", + " result = process.extractOne(entity_name, compList)\n", + " if result is not None:\n", + " # Wenn ein ähnlicher Name gefunden wurde\n", + " if result[1] >= 95:\n", + " # Passen Sie die Ähnlichkeitsbewertungsschwelle nach Bedarf an\n", + " similar_name = result[0]\n", + " # print(f\"Ähnlicher Name gefunden: {entity_name} (Ähnlichkeit: {result[1]})\")\n", + " # print(f\"Ähnlichkeit mit: {similar_name}\")\n", + " # print(f\"Häufigkeit: {frequency}\")\n", + " print(\n", + " f\"NER Entität: {entity_name} passt zu:{similar_name} zu: {result[1]}% und {frequency} Matches \"\n", + " )\n", + "\n", + " # ents=NERService.NERCompanyList(company_list,document)\n", + " # compList.append(document['name'])\n", + " # add a new attribute 'companies' to document\n", + " # newsObj.collection.update_one(\n", + " # {\"_id\": document[\"_id\"]}, # Filter für das entsprechende Dokument\n", + " # {\"$set\": {\"companies\": ents}} # Neues Attribut hinzufügen, initialisiert mit einer leeren Liste\n", + " # )\n", + "\n", + "else:\n", + " print(\"No documents found.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "documentsNER[1][\"companies\"].items()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compList" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "\n", + "# Function to strip company types and legal suffixes\n", + "def remove_legal_additions(name):\n", + "    # Use a regular expression to remove common suffixes such as \"GmbH\" and \"AG\"\n", + "    cleaned_name = re.sub(\n", + "        r\"\\b(GmbH|AG|KG|SE|& Co\\. KGaA|& Co\\.|e\\.K\\.|mbH|mbH & Co\\. KG)\\b\", \"\", name\n", + "    )\n", + "    # Strip leading and trailing whitespace\n", + "    cleaned_name = cleaned_name.strip()\n", + "    return cleaned_name\n", + "\n", + "\n", + "# Clean up the list of company names\n", + "complist2 = [remove_legal_additions(name) for name in compList]\n", + "complist2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "complist3 = [word.split()[0] for word in complist2 if len(word.split()[0]) >= 3]\n", + "complist4 = list(set(complist3))\n", + "complist4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Certain terms that are not company names have to be removed from complist4\n", + "stop_words = [\n", + "    \"Deutsche\", \"Hamburg\", \"Hamburger\", \"Union\", \"Energy\", \"Hugo\", \"Pro\",\n", + "    \"OTC\", \"web\", \"Kabel\", \"Club\", \"The\", \"United\", \"Frankfurter\", \"CMC\",\n", + "    \"Bayern\", \"Haus\", \"Gesellschaft\", \"Delivery\", \"Aachener\", \"Group\",\n", + "    \"Retail\", \"Media\", \"European\", \"Fuels\",\n", + "]\n", + "complist4 = [word for word in complist4 if word not in stop_words]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Merge the two lists complist2 and complist4\n", + "complist5 = complist2 + complist4\n", + "complist5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df1 = pd.DataFrame(documentsNER)\n", + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to extract company names\n", + "def extract_company_names(company_dict):\n", + "    return list(company_dict.keys())\n", + "\n", + "\n", + "# Apply the function to the 'companies' column\n", + "df1[\"companies\"] = df1[\"companies\"].apply(lambda x: extract_company_names(x))\n", + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df1[\"companies_filtered\"] = df1[\"companies\"].apply(\n", + "    lambda x: [company for company in x if company in complist5]\n", + ")\n", + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Keep only the first five entries of \"companies_filtered\" to save compute later.\n", + "def split_list1(row):\n", + "    return pd.Series(row[\"companies_filtered\"][:5])\n", + "\n", + "\n", + "# Apply the function and concatenate the result with the original DataFrame\n", + "df2 = df1.apply(split_list1, axis=1).merge(df1, left_index=True, right_index=True)\n", + "\n", + "# Drop the original column with lists\n", + "df2 = df2.drop(\"companies\", axis=1)\n", + "df2 = df2.drop(\"companies_filtered\", axis=1)\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2.rename(columns={0: \"company1\"}, inplace=True)\n", + "df2.rename(columns={1: \"company2\"}, inplace=True)\n", + "df2.rename(columns={2: \"company3\"}, inplace=True)\n", + "df2.rename(columns={3: \"company4\"}, inplace=True)\n", + "df2.rename(columns={4: \"company5\"}, inplace=True)\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# cell10 = df2.loc[3, 'company1']\n", + "# print(cell10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2.dropna(\n", + "    subset=[\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"],\n", + "    how=\"all\",\n", + "    inplace=True,\n", + ")\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Reset the index; otherwise the following steps raise errors (iterating over i would not work).\n", + "df2 = df2.reset_index(drop=True)\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def filter_sentences_company1(row):\n", + "    target_word = row[\"company1\"]\n", + "\n", + "    # Check if target_word is NaN\n", + "    if pd.isna(target_word):\n", + "        return None\n", + "\n",
+ " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", + "\n", + " # Extract sentences containing target word\n", + " filtered_sentences = [\n", + " sentence.strip() for sentence in sentences if target_word in sentence\n", + " ]\n", + "\n", + " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", + " concatenated_sentences = \". \".join(filtered_sentences)[:200]\n", + "\n", + " # Return None if no sentences contain the target word\n", + " return concatenated_sentences if concatenated_sentences else None\n", + "\n", + "\n", + "df2[\"text_company1\"] = df2.apply(filter_sentences_company1, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def filter_sentences_company2(row):\n", + " target_word = row[\"company2\"]\n", + "\n", + " # Check if target_word is NaN\n", + " if pd.isna(target_word):\n", + " return None\n", + "\n", + " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", + "\n", + " # Extract sentences containing target word\n", + " filtered_sentences = [\n", + " sentence.strip() for sentence in sentences if target_word in sentence\n", + " ]\n", + "\n", + " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", + " concatenated_sentences = \". \".join(filtered_sentences)[:200]\n", + "\n", + " # Return None if no sentences contain the target word\n", + " return concatenated_sentences if concatenated_sentences else None\n", + "\n", + "\n", + "df2[\"text_company2\"] = df2.apply(filter_sentences_company2, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def filter_sentences_company3(row):\n", + " target_word = row[\"company3\"]\n", + "\n", + " # Check if target_word is NaN\n", + " if pd.isna(target_word):\n", + " return None\n", + "\n", + " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", + "\n", + " # Extract sentences containing target word\n", + " filtered_sentences = [\n", + " sentence.strip() for sentence in sentences if target_word in sentence\n", + " ]\n", + "\n", + " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", + " concatenated_sentences = \". \".join(filtered_sentences)[:200]\n", + "\n", + " # Return None if no sentences contain the target word\n", + " return concatenated_sentences if concatenated_sentences else None\n", + "\n", + "\n", + "df2[\"text_company3\"] = df2.apply(filter_sentences_company3, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def filter_sentences_company4(row):\n", + " target_word = row[\"company4\"]\n", + "\n", + " # Check if target_word is NaN\n", + " if pd.isna(target_word):\n", + " return None\n", + "\n", + " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", + "\n", + " # Extract sentences containing target word\n", + " filtered_sentences = [\n", + " sentence.strip() for sentence in sentences if target_word in sentence\n", + " ]\n", + "\n", + " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", + " concatenated_sentences = \". 
\".join(filtered_sentences)[:200]\n", + "\n", + " # Return None if no sentences contain the target word\n", + " return concatenated_sentences if concatenated_sentences else None\n", + "\n", + "\n", + "df2[\"text_company4\"] = df2.apply(filter_sentences_company4, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def filter_sentences_company5(row):\n", + " target_word = row[\"company5\"]\n", + "\n", + " # Check if target_word is NaN\n", + " if pd.isna(target_word):\n", + " return None\n", + "\n", + " sentences = re.split(r\"[.:>]\", row[\"text\"]) # Split by dot\n", + "\n", + " # Extract sentences containing target word\n", + " filtered_sentences = [\n", + " sentence.strip() for sentence in sentences if target_word in sentence\n", + " ]\n", + "\n", + " # Concatenate sentences, add a dot between the concatenated sentences and limit total characters to 200!!!\n", + " concatenated_sentences = \". \".join(filtered_sentences)[:200]\n", + "\n", + " # Return None if no sentences contain the target word\n", + " return concatenated_sentences if concatenated_sentences else None\n", + "\n", + "\n", + "df2[\"text_company5\"] = df2.apply(filter_sentences_company5, axis=1)\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel\n", + "\n", + "# Hinweis: Durch den zusätzlichen Code \"truncation=True\" kann die Beschränkung auf 512 Zeichen bei der maschinen Übersetzung ausgeschaltet werden.\n", + "translation_tokenizer = AutoTokenizer.from_pretrained(\n", + " \"Helsinki-NLP/opus-mt-de-en\", truncation=True\n", + ")\n", + "\n", + "translation_model = AutoModelForSeq2SeqLM.from_pretrained(\"Helsinki-NLP/opus-mt-de-en\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Testen, ob die maschinelle Übersetzung bei einem Beispielssatz funkioniert.\n", + "def translate_sentiment(text: str) -> str:\n", + " input_tokens = translation_tokenizer([text], return_tensors=\"pt\")\n", + " generated_ids = translation_model.generate(**input_tokens)\n", + " return translation_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[\n", + " 0\n", + " ]\n", + "\n", + "\n", + "headline = \"Pelabresib: Biotech-Unternehmen Morphosys hofft mit neuem Krebs-Medikament endlich auf den Durchbruch. 
\"\n", + "tf = translate_sentiment(headline)\n", + "tf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "translate_list_company1 = []\n", + "\n", + "for i in range(len(df2)):\n", + " text = str(\n", + " df2[\"text_company1\"].loc[i]\n", + " ) # Convert to string (this is very important)\n", + " texttrans = translate_sentiment(text)\n", + " translate_list_company1.append(texttrans)\n", + "\n", + "dftrans_company1 = pd.DataFrame(translate_list_company1)\n", + "dftrans_company1.rename(columns={0: \"text_company1_eng\"}, inplace=True)\n", + "dftrans_company1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "translate_list_company2 = []\n", + "\n", + "for i in range(len(df2)):\n", + " text = str(\n", + " df2[\"text_company2\"].loc[i]\n", + " ) # Convert to string (this is very important)\n", + " texttrans = translate_sentiment(text)\n", + " translate_list_company2.append(texttrans)\n", + "\n", + "dftrans_company2 = pd.DataFrame(translate_list_company2)\n", + "dftrans_company2.rename(columns={0: \"text_company2_eng\"}, inplace=True)\n", + "dftrans_company2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "translate_list_company3 = []\n", + "\n", + "for i in range(len(df2)):\n", + " text = str(\n", + " df2[\"text_company3\"].loc[i]\n", + " ) # Convert to string (this is very important)\n", + " texttrans = translate_sentiment(text)\n", + " translate_list_company3.append(texttrans)\n", + "\n", + "dftrans_company3 = pd.DataFrame(translate_list_company3)\n", + "dftrans_company3.rename(columns={0: \"text_company3_eng\"}, inplace=True)\n", + "dftrans_company3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "translate_list_company4 = []\n", + "\n", + "for i in range(len(df2)):\n", + " text = str(\n", + " df2[\"text_company4\"].loc[i]\n", + " ) # Convert to string (this is very important)\n", + " texttrans = translate_sentiment(text)\n", + " translate_list_company4.append(texttrans)\n", + "\n", + "dftrans_company4 = pd.DataFrame(translate_list_company4)\n", + "dftrans_company4.rename(columns={0: \"text_company4_eng\"}, inplace=True)\n", + "dftrans_company4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "translate_list_company5 = []\n", + "\n", + "for i in range(len(df2)):\n", + " text = str(\n", + " df2[\"text_company5\"].loc[i]\n", + " ) # Convert to string (this is very important)\n", + " texttrans = translate_sentiment(text)\n", + " translate_list_company5.append(texttrans)\n", + "\n", + "dftrans_company5 = pd.DataFrame(translate_list_company5)\n", + "dftrans_company5.rename(columns={0: \"text_company5_eng\"}, inplace=True)\n", + "dftrans_company5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df3 = df2[\n", + " [\n", + " \"_id\",\n", + " \"title\",\n", + " \"text\",\n", + " \"company1\",\n", + " \"text_company1\",\n", + " \"company2\",\n", + " \"text_company2\",\n", + " \"company3\",\n", + " \"text_company3\",\n", + " \"company4\",\n", + " \"text_company4\",\n", + " \"company5\",\n", + " \"text_company5\",\n", + " ]\n", + "]\n", + "df3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df3.insert(4, \"text_company1_eng\", 
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Insert each English snippet column right after its company column.\n", + "for n, pos in zip(range(1, 6), [4, 7, 10, 13, 16]):\n", + "    df3.insert(pos, f\"text_company{n}_eng\", dftrans[n].iloc[:, 0])\n", + "df3" ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# df3.to_csv('df3_20231213.csv')" ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Normalise missing values to None (DataFrame.map requires pandas >= 2.1;\n", + "# on older versions applymap does the same).\n", + "df3 = df3.map(lambda x: None if pd.isna(x) else x)\n", + "df3" ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test ABSA\n", + "from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline\n", + "\n", + "# Load the ABSA model and its tokenizer\n", + "model_name = \"yangheng/deberta-v3-base-absa-v1.1\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n", + "\n", + "classifier = pipeline(\"text-classification\", model=model, tokenizer=tokenizer)" ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test on an example sentence\n", + "for aspect in [\"Siemens\"]:\n", + "    print(\n", + "        aspect,\n", + "        classifier(\n", + "            \"Siemens is doing great\",\n", + "            text_pair=aspect,\n", + "        ),\n", + "    )" ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test on an example sentence, part 2\n", + "for aspect in [df2.loc[1, \"company1\"]]:\n", + "    print(\n", + "        aspect,\n", + "        classifier(\n", + "            \"Siemens is doing great\",\n", + "            text_pair=aspect,\n", + "        ),\n", + "    )" ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test on an example sentence, part 2 (the string \"None\" marks articles whose\n", + "# snippet was missing before translation)\n", + "for aspect in [df3.loc[9, \"company2\"]]:\n", + "    if df3[\"text_company2_eng\"].loc[9] != \"None\":\n", + "        print(\n", + "            aspect,\n", + "            classifier(\n", + "                df3[\"text_company2_eng\"].loc[9],\n", + "                text_pair=aspect,\n", + "            ),\n", + "        )\n", + "    else:\n", + "        print(None)" ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ABSA for company1..company5: classify each English snippet with the company as\n", + "# aspect, then extract the sentiment label and the numeric score from the\n", + "# stringified classifier output.\n", + "\n", + "# Regular-expression pattern used to find the numeric score values.\n", + "pattern = r\"(\\d+(\\.\\d+)?)\"\n", + "\n", + "dfcompanies = {}\n", + "for n in range(1, 6):\n", + "    results = []\n", + "    for i in range(len(df3)):\n", + "        aspect = df3[f\"company{n}\"].loc[i]\n", + "        results.append(\n", + "            (aspect, classifier(df3[f\"text_company{n}_eng\"].loc[i], text_pair=aspect))\n", + "        )\n", + "\n", + "    dfc = pd.DataFrame(results)\n", + "    dfc.rename(columns={0: f\"company{n}_new\", 1: f\"company{n}_ABSA_v1\"}, inplace=True)\n", + "\n", + "    dfc[f\"company{n}_ABSA_v1\"] = dfc[f\"company{n}_ABSA_v1\"].astype(str)\n", + "    # Slice the sentiment label out of the stringified result.\n", + "    dfc[f\"company{n}_ABSA\"] = dfc[f\"company{n}_ABSA_v1\"].str[12:19]\n", + "\n", + "    # Extract the numeric score with the regular expression ...\n", + "    dfc[f\"company{n}_numABSA\"] = dfc[f\"company{n}_ABSA_v1\"].apply(\n", + "        lambda x: re.search(pattern, str(x)).group(1)\n", + "        if re.search(pattern, str(x))\n", + "        else None\n", + "    )\n", + "    # ... and convert it to a float where possible.\n", + "    dfc[f\"company{n}_numABSA\"] = pd.to_numeric(dfc[f\"company{n}_numABSA\"], errors=\"coerce\")\n", + "    dfc = dfc.drop(f\"company{n}_ABSA_v1\", axis=1)\n", + "\n", + "    # Null out label and score for rows whose company slot was empty.\n", + "    empty = dfc[f\"company{n}_new\"].isna()\n", + "    dfc.loc[empty, [f\"company{n}_ABSA\", f\"company{n}_numABSA\"]] = None\n", + "\n", + "    dfcompanies[n] = dfc\n", + "dfcompanies[1]" ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dftotal1 = pd.concat(\n", + "    [df3] + [dfcompanies[n] for n in range(1, 6)],\n", + "    axis=1,\n", + "    join=\"outer\",\n", + ")\n", + "columns_to_drop = [\"company1\", \"company2\", \"company3\", \"company4\", \"company5\"]\n", + "dftotal1.drop(columns=columns_to_drop, inplace=True)\n", + "dftotal1" ] + },
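+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Side note (illustrative sketch): the pipeline returns structured output, so the\n", + "# label and score can also be read directly instead of slicing and regex-matching\n", + "# the stringified result as above; shown here only as a cross-check.\n", + "sample = classifier(\"Siemens is doing great\", text_pair=\"Siemens\")\n", + "sample[0][\"label\"], sample[0][\"score\"]" ] + },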
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Custom function to combine name, label and score into one dict per company slot\n", + "for n in range(1, 6):\n", + "    dftotal1[f\"company{n}_Combined\"] = dftotal1.apply(\n", + "        lambda row, n=n: {\n", + "            row[f\"company{n}_new\"]: {\n", + "                \"ABSA\": row[f\"company{n}_ABSA\"],\n", + "                \"numABSA\": row[f\"company{n}_numABSA\"],\n", + "            }\n", + "        },\n", + "        axis=1,\n", + "    )\n", + "dftotal1" ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Combine the ABSAs from company1/2/3/4/5 into one combined ABSA, eliminating the\n", + "# \"Nones\": empty company slots are skipped so that no None keys end up in the\n", + "# dict (MongoDB only accepts string keys).\n", + "\n", + "\n", + "def combine_to_dict(row):\n", + "    combined = {}\n", + "    for n in range(1, 6):\n", + "        name = row[f\"company{n}_new\"]\n", + "        if name is not None:\n", + "            combined[name] = {\n", + "                \"ABSA\": row[f\"company{n}_ABSA\"],\n", + "                \"numABSA\": row[f\"company{n}_numABSA\"],\n", + "            }\n", + "    return combined\n", + "\n", + "\n", + "# Apply the custom function to each row and create a new column 'Combined_ABSA'\n", + "dftotal1[\"Combined_ABSA\"] = dftotal1.apply(combine_to_dict, axis=1)\n", + "dftotal1" ] + },
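+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The combined column now holds one dict per article, shaped roughly like\n", + "# {\"<company name>\": {\"ABSA\": \"Positive\", \"numABSA\": 0.99}} (values illustrative).\n", + "dftotal1[\"Combined_ABSA\"].iloc[0]" ] + },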
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# dftotal1.to_csv('dftotal1_20231217_v6.csv')" ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Put this at the very end\n", + "# Create an instance of NewsObject (replace 'your_collection' with your actual collection)\n", + "newsObj = news.MongoNewsService(connector)\n", + "\n", + "if len(dftotal1) > 0:\n", + "    for i in range(len(dftotal1)):\n", + "        # Add the new attribute 'Combined_ABSA' to the matching document\n", + "        newsObj.collection.update_one(\n", + "            {\"_id\": dftotal1[\"_id\"].iloc[i]},  # Filter for the matching document\n", + "            {\"$set\": {\"Combined_ABSA\": dftotal1[\"Combined_ABSA\"].iloc[i]}},\n", + "        )\n", + "\n", + "else:\n", + "    print(\"No documents found.\")" ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aki-prj23-transparenzregister-z8SxnVl_-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/documentations/seminararbeiten/Seminarvortrag_Zhu_Text_Mining_Focus_Sentiment_Analyse_20230706.pptx 
b/documentations/seminararbeiten/Seminarvortrag_Zhu_Text_Mining_Focus_Sentiment_Analyse_20230706.pptx new file mode 100644 index 0000000..8d34069 Binary files /dev/null and b/documentations/seminararbeiten/Seminarvortrag_Zhu_Text_Mining_Focus_Sentiment_Analyse_20230706.pptx differ diff --git a/poetry.lock b/poetry.lock index 2c59be7..62647de 100644 --- a/poetry.lock +++ b/poetry.lock @@ -234,6 +234,16 @@ tests = ["attrs[tests-no-zope]", "zope-interface"] tests-mypy = ["mypy (>=1.6)", "pytest-mypy-plugins"] tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist[psutil]"] +[[package]] +name = "autocuda" +version = "0.16" +description = "This package provides the function to auto-choose the cuda device hase largest free memory in Pytorch" +optional = false +python-versions = ">=3.6" +files = [ + {file = "autocuda-0.16-py3-none-any.whl", hash = "sha256:c33398872f4c9336815dce158400438d616b8e1616d7ddfde5c9a203b71ec856"}, +] + [[package]] name = "babel" version = "2.14.0" @@ -398,6 +408,22 @@ files = [ {file = "boolean.py-4.0.tar.gz", hash = "sha256:17b9a181630e43dde1851d42bef546d616d5d9b4480357514597e78b203d06e4"}, ] +[[package]] +name = "boostaug" +version = "2.3.5" +description = "" +optional = false +python-versions = ">=3.6" +files = [ + {file = "boostaug-2.3.5-py3-none-any.whl", hash = "sha256:1a7a0448960fffa14358c1580bbdde9cd815b27ba88ed6416f4438825c6adcef"}, +] + +[package.dependencies] +pyabsa = ">=2.0.10" + +[package.extras] +full = ["nlpaug", "tensorflow-text", "textattack"] + [[package]] name = "brotli" version = "1.1.0" @@ -818,13 +844,13 @@ cron = ["capturer (>=2.4)"] [[package]] name = "comm" -version = "0.2.0" +version = "0.2.1" description = "Jupyter Python Comm implementation, for usage in ipykernel, xeus-python etc." 
optional = false python-versions = ">=3.8" files = [ - {file = "comm-0.2.0-py3-none-any.whl", hash = "sha256:2da8d9ebb8dd7bfc247adaff99f24dce705638a8042b85cb995066793e391001"}, - {file = "comm-0.2.0.tar.gz", hash = "sha256:a517ea2ca28931c7007a7a99c562a0fa5883cfb48963140cf642c41c948498be"}, + {file = "comm-0.2.1-py3-none-any.whl", hash = "sha256:87928485c0dfc0e7976fd89fc1e187023cf587e7c353e4a9b417555b44adf021"}, + {file = "comm-0.2.1.tar.gz", hash = "sha256:0bc91edae1344d39d3661dcbc36937181fdaddb304790458f8b044dbc064b89a"}, ] [package.dependencies] @@ -1695,6 +1721,19 @@ docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1 testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] typing = ["typing-extensions (>=4.8)"] +[[package]] +name = "findfile" +version = "2.0.1" +description = "This package provides the function to search target file(s)/dir(s) using keyword, alleviating the possibility of failure to find files by specifying relative/absolute path" +optional = false +python-versions = ">=3.6" +files = [ + {file = "findfile-2.0.1-py3-none-any.whl", hash = "sha256:29c426b9b958bcd8aaf4cf629685b5f67a78f22bbd3cd7c3be389f4fb4368b3d"}, +] + +[package.dependencies] +termcolor = "*" + [[package]] name = "flask" version = "3.0.0" @@ -2087,13 +2126,13 @@ lxml = ["lxml"] [[package]] name = "huggingface-hub" -version = "0.20.1" +version = "0.20.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" files = [ - {file = "huggingface_hub-0.20.1-py3-none-any.whl", hash = "sha256:ecfdea395a8bc68cd160106c5bd857f7e010768d95f9e1862a779010cc304831"}, - {file = "huggingface_hub-0.20.1.tar.gz", hash = "sha256:8c88c4c3c8853e22f2dfb4d84c3d493f4e1af52fb3856a90e1eeddcf191ddbb1"}, + {file = "huggingface_hub-0.20.2-py3-none-any.whl", hash = "sha256:53752eda2239d30a470c307a61cf9adcf136bc77b0a734338c7d04941af560d8"}, + {file = "huggingface_hub-0.20.2.tar.gz", hash = "sha256:215c5fceff631030c7a3d19ba7b588921c908b3f21eef31d160ebc245b200ff6"}, ] [package.dependencies] @@ -2347,6 +2386,17 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] +[[package]] +name = "joblib" +version = "1.3.2" +description = "Lightweight pipelining with Python functions" +optional = false +python-versions = ">=3.7" +files = [ + {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"}, + {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, +] + [[package]] name = "json5" version = "0.9.14" @@ -2483,13 +2533,13 @@ test = ["flaky", "pexpect", "pytest"] [[package]] name = "jupyter-core" -version = "5.6.1" +version = "5.7.0" description = "Jupyter core package. A base package on which Jupyter projects rely." 
optional = false python-versions = ">=3.8" files = [ - {file = "jupyter_core-5.6.1-py3-none-any.whl", hash = "sha256:3d16aec2e1ec84b69f7794e49c32830c1d950ad149526aec954c100047c5f3a7"}, - {file = "jupyter_core-5.6.1.tar.gz", hash = "sha256:5139be639404f7f80f3db6f687f47b8a8ec97286b4fa063c984024720e7224dc"}, + {file = "jupyter_core-5.7.0-py3-none-any.whl", hash = "sha256:16eea462f7dad23ba9f86542bdf17f830804e2028eb48d609b6134d91681e983"}, + {file = "jupyter_core-5.7.0.tar.gz", hash = "sha256:cb8d3ed92144d2463a3c5664fdd686a3f0c1442ea45df8babb1c1a9e6333fe03"}, ] [package.dependencies] @@ -2542,13 +2592,13 @@ jupyter-server = ">=1.1.2" [[package]] name = "jupyter-server" -version = "2.12.1" +version = "2.12.2" description = "The backend—i.e. core services, APIs, and REST endpoints—to Jupyter web applications." optional = false python-versions = ">=3.8" files = [ - {file = "jupyter_server-2.12.1-py3-none-any.whl", hash = "sha256:fd030dd7be1ca572e4598203f718df6630c12bd28a599d7f1791c4d7938e1010"}, - {file = "jupyter_server-2.12.1.tar.gz", hash = "sha256:dc77b7dcc5fc0547acba2b2844f01798008667201eea27c6319ff9257d700a6d"}, + {file = "jupyter_server-2.12.2-py3-none-any.whl", hash = "sha256:abcfa33f98a959f908c8733aa2d9fa0101d26941cbd49b148f4cef4d3046fc61"}, + {file = "jupyter_server-2.12.2.tar.gz", hash = "sha256:5eae86be15224b5375cdec0c3542ce72ff20f7a25297a2a8166a250bb455a519"}, ] [package.dependencies] @@ -3159,6 +3209,30 @@ files = [ {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] +[[package]] +name = "metric-visualizer" +version = "0.9.8" +description = "This is a tool for automated experimental metrics statistics and visualization" +optional = false +python-versions = ">=3.6" +files = [ + {file = "metric_visualizer-0.9.8-py3-none-any.whl", hash = "sha256:50795c940817b3c6495aff94df5a1ccb9109cf3f7719175d31e4d904d76b410d"}, +] + +[package.dependencies] +click = "*" +findfile = "*" +matplotlib = ">=3.6.3" +natsort = "*" +numpy = "*" +openpyxl = "*" +pandas = "*" +scipy = ">=1.10.0" +tabulate = "*" +tikzplotlib = "*" +update-checker = "*" +xlsxwriter = "*" + [[package]] name = "mistune" version = "3.0.2" @@ -3388,6 +3462,21 @@ rtd = ["ipython", "pydata-sphinx-theme (==v0.13.0rc4)", "sphinx-autodoc2 (>=0.4. testing = ["beautifulsoup4", "coverage[toml]", "pytest (>=7,<8)", "pytest-cov", "pytest-param-files (>=0.3.4,<0.4.0)", "pytest-regressions", "sphinx-pytest"] testing-docutils = ["pygments", "pytest (>=7,<8)", "pytest-param-files (>=0.3.4,<0.4.0)"] +[[package]] +name = "natsort" +version = "8.4.0" +description = "Simple yet flexible natural sorting in Python." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "natsort-8.4.0-py3-none-any.whl", hash = "sha256:4732914fb471f56b5cce04d7bae6f164a592c7712e1c85f9ef585e197299521c"}, + {file = "natsort-8.4.0.tar.gz", hash = "sha256:45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581"}, +] + +[package.extras] +fast = ["fastnumbers (>=2.0.0)"] +icu = ["PyICU (>=1.0.0)"] + [[package]] name = "nbclient" version = "0.9.0" @@ -3589,47 +3678,47 @@ test = ["pytest", "pytest-console-scripts", "pytest-jupyter", "pytest-tornasync" [[package]] name = "numpy" -version = "1.26.2" +version = "1.26.3" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.9" files = [ - {file = "numpy-1.26.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3703fc9258a4a122d17043e57b35e5ef1c5a5837c3db8be396c82e04c1cf9b0f"}, - {file = "numpy-1.26.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cc392fdcbd21d4be6ae1bb4475a03ce3b025cd49a9be5345d76d7585aea69440"}, - {file = "numpy-1.26.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36340109af8da8805d8851ef1d74761b3b88e81a9bd80b290bbfed61bd2b4f75"}, - {file = "numpy-1.26.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcc008217145b3d77abd3e4d5ef586e3bdfba8fe17940769f8aa09b99e856c00"}, - {file = "numpy-1.26.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3ced40d4e9e18242f70dd02d739e44698df3dcb010d31f495ff00a31ef6014fe"}, - {file = "numpy-1.26.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b272d4cecc32c9e19911891446b72e986157e6a1809b7b56518b4f3755267523"}, - {file = "numpy-1.26.2-cp310-cp310-win32.whl", hash = "sha256:22f8fc02fdbc829e7a8c578dd8d2e15a9074b630d4da29cda483337e300e3ee9"}, - {file = "numpy-1.26.2-cp310-cp310-win_amd64.whl", hash = "sha256:26c9d33f8e8b846d5a65dd068c14e04018d05533b348d9eaeef6c1bd787f9919"}, - {file = "numpy-1.26.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b96e7b9c624ef3ae2ae0e04fa9b460f6b9f17ad8b4bec6d7756510f1f6c0c841"}, - {file = "numpy-1.26.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:aa18428111fb9a591d7a9cc1b48150097ba6a7e8299fb56bdf574df650e7d1f1"}, - {file = "numpy-1.26.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06fa1ed84aa60ea6ef9f91ba57b5ed963c3729534e6e54055fc151fad0423f0a"}, - {file = "numpy-1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96ca5482c3dbdd051bcd1fce8034603d6ebfc125a7bd59f55b40d8f5d246832b"}, - {file = "numpy-1.26.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:854ab91a2906ef29dc3925a064fcd365c7b4da743f84b123002f6139bcb3f8a7"}, - {file = "numpy-1.26.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f43740ab089277d403aa07567be138fc2a89d4d9892d113b76153e0e412409f8"}, - {file = "numpy-1.26.2-cp311-cp311-win32.whl", hash = "sha256:a2bbc29fcb1771cd7b7425f98b05307776a6baf43035d3b80c4b0f29e9545186"}, - {file = "numpy-1.26.2-cp311-cp311-win_amd64.whl", hash = "sha256:2b3fca8a5b00184828d12b073af4d0fc5fdd94b1632c2477526f6bd7842d700d"}, - {file = "numpy-1.26.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a4cd6ed4a339c21f1d1b0fdf13426cb3b284555c27ac2f156dfdaaa7e16bfab0"}, - {file = "numpy-1.26.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5d5244aabd6ed7f312268b9247be47343a654ebea52a60f002dc70c769048e75"}, - {file = "numpy-1.26.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a3cdb4d9c70e6b8c0814239ead47da00934666f668426fc6e94cce869e13fd7"}, - 
{file = "numpy-1.26.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa317b2325f7aa0a9471663e6093c210cb2ae9c0ad824732b307d2c51983d5b6"}, - {file = "numpy-1.26.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:174a8880739c16c925799c018f3f55b8130c1f7c8e75ab0a6fa9d41cab092fd6"}, - {file = "numpy-1.26.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f79b231bf5c16b1f39c7f4875e1ded36abee1591e98742b05d8a0fb55d8a3eec"}, - {file = "numpy-1.26.2-cp312-cp312-win32.whl", hash = "sha256:4a06263321dfd3598cacb252f51e521a8cb4b6df471bb12a7ee5cbab20ea9167"}, - {file = "numpy-1.26.2-cp312-cp312-win_amd64.whl", hash = "sha256:b04f5dc6b3efdaab541f7857351aac359e6ae3c126e2edb376929bd3b7f92d7e"}, - {file = "numpy-1.26.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4eb8df4bf8d3d90d091e0146f6c28492b0be84da3e409ebef54349f71ed271ef"}, - {file = "numpy-1.26.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1a13860fdcd95de7cf58bd6f8bc5a5ef81c0b0625eb2c9a783948847abbef2c2"}, - {file = "numpy-1.26.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64308ebc366a8ed63fd0bf426b6a9468060962f1a4339ab1074c228fa6ade8e3"}, - {file = "numpy-1.26.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baf8aab04a2c0e859da118f0b38617e5ee65d75b83795055fb66c0d5e9e9b818"}, - {file = "numpy-1.26.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d73a3abcac238250091b11caef9ad12413dab01669511779bc9b29261dd50210"}, - {file = "numpy-1.26.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b361d369fc7e5e1714cf827b731ca32bff8d411212fccd29ad98ad622449cc36"}, - {file = "numpy-1.26.2-cp39-cp39-win32.whl", hash = "sha256:bd3f0091e845164a20bd5a326860c840fe2af79fa12e0469a12768a3ec578d80"}, - {file = "numpy-1.26.2-cp39-cp39-win_amd64.whl", hash = "sha256:2beef57fb031dcc0dc8fa4fe297a742027b954949cabb52a2a376c144e5e6060"}, - {file = "numpy-1.26.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1cc3d5029a30fb5f06704ad6b23b35e11309491c999838c31f124fee32107c79"}, - {file = "numpy-1.26.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94cc3c222bb9fb5a12e334d0479b97bb2df446fbe622b470928f5284ffca3f8d"}, - {file = "numpy-1.26.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fe6b44fb8fcdf7eda4ef4461b97b3f63c466b27ab151bec2366db8b197387841"}, - {file = "numpy-1.26.2.tar.gz", hash = "sha256:f65738447676ab5777f11e6bbbdb8ce11b785e105f690bc45966574816b6d3ea"}, + {file = "numpy-1.26.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:806dd64230dbbfaca8a27faa64e2f414bf1c6622ab78cc4264f7f5f028fee3bf"}, + {file = "numpy-1.26.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02f98011ba4ab17f46f80f7f8f1c291ee7d855fcef0a5a98db80767a468c85cd"}, + {file = "numpy-1.26.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d45b3ec2faed4baca41c76617fcdcfa4f684ff7a151ce6fc78ad3b6e85af0a6"}, + {file = "numpy-1.26.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdd2b45bf079d9ad90377048e2747a0c82351989a2165821f0c96831b4a2a54b"}, + {file = "numpy-1.26.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:211ddd1e94817ed2d175b60b6374120244a4dd2287f4ece45d49228b4d529178"}, + {file = "numpy-1.26.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1240f767f69d7c4c8a29adde2310b871153df9b26b5cb2b54a561ac85146485"}, + {file = "numpy-1.26.3-cp310-cp310-win32.whl", hash = "sha256:21a9484e75ad018974a2fdaa216524d64ed4212e418e0a551a2d83403b0531d3"}, + {file = 
"numpy-1.26.3-cp310-cp310-win_amd64.whl", hash = "sha256:9e1591f6ae98bcfac2a4bbf9221c0b92ab49762228f38287f6eeb5f3f55905ce"}, + {file = "numpy-1.26.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b831295e5472954104ecb46cd98c08b98b49c69fdb7040483aff799a755a7374"}, + {file = "numpy-1.26.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9e87562b91f68dd8b1c39149d0323b42e0082db7ddb8e934ab4c292094d575d6"}, + {file = "numpy-1.26.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c66d6fec467e8c0f975818c1796d25c53521124b7cfb760114be0abad53a0a2"}, + {file = "numpy-1.26.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f25e2811a9c932e43943a2615e65fc487a0b6b49218899e62e426e7f0a57eeda"}, + {file = "numpy-1.26.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:af36e0aa45e25c9f57bf684b1175e59ea05d9a7d3e8e87b7ae1a1da246f2767e"}, + {file = "numpy-1.26.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:51c7f1b344f302067b02e0f5b5d2daa9ed4a721cf49f070280ac202738ea7f00"}, + {file = "numpy-1.26.3-cp311-cp311-win32.whl", hash = "sha256:7ca4f24341df071877849eb2034948459ce3a07915c2734f1abb4018d9c49d7b"}, + {file = "numpy-1.26.3-cp311-cp311-win_amd64.whl", hash = "sha256:39763aee6dfdd4878032361b30b2b12593fb445ddb66bbac802e2113eb8a6ac4"}, + {file = "numpy-1.26.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a7081fd19a6d573e1a05e600c82a1c421011db7935ed0d5c483e9dd96b99cf13"}, + {file = "numpy-1.26.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12c70ac274b32bc00c7f61b515126c9205323703abb99cd41836e8125ea0043e"}, + {file = "numpy-1.26.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f784e13e598e9594750b2ef6729bcd5a47f6cfe4a12cca13def35e06d8163e3"}, + {file = "numpy-1.26.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f24750ef94d56ce6e33e4019a8a4d68cfdb1ef661a52cdaee628a56d2437419"}, + {file = "numpy-1.26.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:77810ef29e0fb1d289d225cabb9ee6cf4d11978a00bb99f7f8ec2132a84e0166"}, + {file = "numpy-1.26.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8ed07a90f5450d99dad60d3799f9c03c6566709bd53b497eb9ccad9a55867f36"}, + {file = "numpy-1.26.3-cp312-cp312-win32.whl", hash = "sha256:f73497e8c38295aaa4741bdfa4fda1a5aedda5473074369eca10626835445511"}, + {file = "numpy-1.26.3-cp312-cp312-win_amd64.whl", hash = "sha256:da4b0c6c699a0ad73c810736303f7fbae483bcb012e38d7eb06a5e3b432c981b"}, + {file = "numpy-1.26.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1666f634cb3c80ccbd77ec97bc17337718f56d6658acf5d3b906ca03e90ce87f"}, + {file = "numpy-1.26.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18c3319a7d39b2c6a9e3bb75aab2304ab79a811ac0168a671a62e6346c29b03f"}, + {file = "numpy-1.26.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b7e807d6888da0db6e7e75838444d62495e2b588b99e90dd80c3459594e857b"}, + {file = "numpy-1.26.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4d362e17bcb0011738c2d83e0a65ea8ce627057b2fdda37678f4374a382a137"}, + {file = "numpy-1.26.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b8c275f0ae90069496068c714387b4a0eba5d531aace269559ff2b43655edd58"}, + {file = "numpy-1.26.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cc0743f0302b94f397a4a65a660d4cd24267439eb16493fb3caad2e4389bccbb"}, + {file = "numpy-1.26.3-cp39-cp39-win32.whl", hash = "sha256:9bc6d1a7f8cedd519c4b7b1156d98e051b726bf160715b769106661d567b3f03"}, + {file = 
"numpy-1.26.3-cp39-cp39-win_amd64.whl", hash = "sha256:867e3644e208c8922a3be26fc6bbf112a035f50f0a86497f98f228c50c607bb2"}, + {file = "numpy-1.26.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:3c67423b3703f8fbd90f5adaa37f85b5794d3366948efe9a5190a5f3a83fc34e"}, + {file = "numpy-1.26.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46f47ee566d98849323f01b349d58f2557f02167ee301e5e28809a8c0e27a2d0"}, + {file = "numpy-1.26.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a8474703bffc65ca15853d5fd4d06b18138ae90c17c8d12169968e998e448bb5"}, + {file = "numpy-1.26.3.tar.gz", hash = "sha256:697df43e2b6310ecc9d95f05d5ef20eacc09c7c4ecc9da3f235d39e71b7da1e4"}, ] [[package]] @@ -3714,13 +3803,13 @@ files = [ [[package]] name = "packageurl-python" -version = "0.13.1" +version = "0.13.3" description = "A purl aka. Package URL parser and builder" optional = false python-versions = ">=3.7" files = [ - {file = "packageurl-python-0.13.1.tar.gz", hash = "sha256:84f8053f4b85294b98b3b78715475847fb48f4525ec302d06dc35b26a9b3078a"}, - {file = "packageurl_python-0.13.1-py3-none-any.whl", hash = "sha256:d35090c7ec1d7afc611679912151b2157f28f68e729b6fa56f8ea5422819eaed"}, + {file = "packageurl-python-0.13.3.tar.gz", hash = "sha256:63514d3b72f22a9c092913b3fa7b4670e479998dff3d93597863c413735e2ec7"}, + {file = "packageurl_python-0.13.3-py3-none-any.whl", hash = "sha256:ed8abe1a92bbc91f0db60e7fe5a586d71d3d93036f558d43502cc2451833a1df"}, ] [package.extras] @@ -4440,6 +4529,44 @@ files = [ [package.dependencies] defusedxml = ">=0.7.1,<0.8.0" +[[package]] +name = "pyabsa" +version = "2.4.0" +description = "This tool provides the state-of-the-art models for aspect term extraction (ATE), aspect polarity classification (APC), and text classification (TC)." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "pyabsa-2.4.0-py3-none-any.whl", hash = "sha256:81c7f9fe01bb28ec163771e5095fc4c28ef120583aec3b7f94203e43fc1a77f8"}, +] + +[package.dependencies] +autocuda = ">=0.16" +boostaug = ">=2.3.5" +findfile = ">=2.0.0" +gitpython = "*" +metric-visualizer = ">=0.9.6" +networkx = "*" +pandas = "*" +protobuf = "<4.0.0" +pytorch-warmup = "*" +sentencepiece = "*" +seqeval = "*" +spacy = "*" +termcolor = "*" +torch = ">=1.0.0" +tqdm = "*" +transformers = ">=4.18.0" +typing-extensions = "*" +update-checker = "*" + +[package.extras] +deploy = ["gradio", "setuptools", "twine", "wheel"] +dev = ["docformatter", "flake8", "gradio", "isort", "nbsphinx", "pytest", "pytest-xdist", "recommonmark", "sentence-transformers", "setuptools", "sphinx-autobuild", "sphinx-copybutton", "sphinx-markdown-tables", "sphinx-rtd-theme", "tensorboardX", "tensorflow", "tensorflow-estimator", "tensorflow-hub", "tensorflow-text", "twine", "wheel"] +docs = ["nbsphinx", "recommonmark", "sphinx-autobuild", "sphinx-copybutton", "sphinx-markdown-tables", "sphinx-rtd-theme"] +optional = ["sentence-transformers", "tensorflow", "tensorflow-hub"] +tensorflow = ["tensorboardX", "tensorflow", "tensorflow-estimator", "tensorflow-hub", "tensorflow-text"] +test = ["docformatter", "flake8", "isort", "pytest", "pytest-xdist"] + [[package]] name = "pyclipper" version = "1.3.0.post5" @@ -4918,6 +5045,20 @@ files = [ {file = "python_json_logger-2.0.7-py3-none-any.whl", hash = "sha256:f380b826a991ebbe3de4d897aeec42760035ac760345e57b812938dc8b35e2bd"}, ] +[[package]] +name = "pytorch-warmup" +version = "0.1.1" +description = "A PyTorch Extension for Learning Rate Warmup" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytorch-warmup-0.1.1.tar.gz", hash = "sha256:c594760b29657a127aa6a8c3424dd0b5068140b3b7d4988118f4a9f3e99b1457"}, + {file = "pytorch_warmup-0.1.1-py3-none-any.whl", hash = "sha256:eecc4af0975bb181198c0817be145bccb17c7ea09ce3fdf69140f65d8c32b746"}, +] + +[package.dependencies] +torch = ">=1.1" + [[package]] name = "pytz" version = "2023.3.post1" @@ -5277,13 +5418,13 @@ full = ["numpy"] [[package]] name = "referencing" -version = "0.32.0" +version = "0.32.1" description = "JSON Referencing + Python" optional = false python-versions = ">=3.8" files = [ - {file = "referencing-0.32.0-py3-none-any.whl", hash = "sha256:bdcd3efb936f82ff86f993093f6da7435c7de69a3b3a5a06678a6050184bee99"}, - {file = "referencing-0.32.0.tar.gz", hash = "sha256:689e64fe121843dcfd57b71933318ef1f91188ffb45367332700a86ac8fd6161"}, + {file = "referencing-0.32.1-py3-none-any.whl", hash = "sha256:7e4dc12271d8e15612bfe35792f5ea1c40970dadf8624602e33db2758f7ee554"}, + {file = "referencing-0.32.1.tar.gz", hash = "sha256:3c57da0513e9563eb7e203ebe9bb3a1b509b042016433bd1e45a2853466c3dd3"}, ] [package.dependencies] @@ -5758,6 +5899,53 @@ files = [ {file = "schedule-1.2.1.tar.gz", hash = "sha256:843bc0538b99c93f02b8b50e3e39886c06f2d003b24f48e1aa4cadfa3f341279"}, ] +[[package]] +name = "scikit-learn" +version = "1.3.2" +description = "A set of python modules for machine learning and data mining" +optional = false +python-versions = ">=3.8" +files = [ + {file = "scikit-learn-1.3.2.tar.gz", hash = "sha256:a2f54c76accc15a34bfb9066e6c7a56c1e7235dda5762b990792330b52ccfb05"}, + {file = "scikit_learn-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e326c0eb5cf4d6ba40f93776a20e9a7a69524c4db0757e7ce24ba222471ee8a1"}, + {file = "scikit_learn-1.3.2-cp310-cp310-macosx_12_0_arm64.whl", hash = 
"sha256:535805c2a01ccb40ca4ab7d081d771aea67e535153e35a1fd99418fcedd1648a"}, + {file = "scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1215e5e58e9880b554b01187b8c9390bf4dc4692eedeaf542d3273f4785e342c"}, + {file = "scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ee107923a623b9f517754ea2f69ea3b62fc898a3641766cb7deb2f2ce450161"}, + {file = "scikit_learn-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:35a22e8015048c628ad099da9df5ab3004cdbf81edc75b396fd0cff8699ac58c"}, + {file = "scikit_learn-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6fb6bc98f234fda43163ddbe36df8bcde1d13ee176c6dc9b92bb7d3fc842eb66"}, + {file = "scikit_learn-1.3.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:18424efee518a1cde7b0b53a422cde2f6625197de6af36da0b57ec502f126157"}, + {file = "scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3271552a5eb16f208a6f7f617b8cc6d1f137b52c8a1ef8edf547db0259b2c9fb"}, + {file = "scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4144a5004a676d5022b798d9e573b05139e77f271253a4703eed295bde0433"}, + {file = "scikit_learn-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:67f37d708f042a9b8d59551cf94d30431e01374e00dc2645fa186059c6c5d78b"}, + {file = "scikit_learn-1.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8db94cd8a2e038b37a80a04df8783e09caac77cbe052146432e67800e430c028"}, + {file = "scikit_learn-1.3.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:61a6efd384258789aa89415a410dcdb39a50e19d3d8410bd29be365bcdd512d5"}, + {file = "scikit_learn-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb06f8dce3f5ddc5dee1715a9b9f19f20d295bed8e3cd4fa51e1d050347de525"}, + {file = "scikit_learn-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b2de18d86f630d68fe1f87af690d451388bb186480afc719e5f770590c2ef6c"}, + {file = "scikit_learn-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:0402638c9a7c219ee52c94cbebc8fcb5eb9fe9c773717965c1f4185588ad3107"}, + {file = "scikit_learn-1.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a19f90f95ba93c1a7f7924906d0576a84da7f3b2282ac3bfb7a08a32801add93"}, + {file = "scikit_learn-1.3.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b8692e395a03a60cd927125eef3a8e3424d86dde9b2370d544f0ea35f78a8073"}, + {file = "scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e1e94cc23d04d39da797ee34236ce2375ddea158b10bee3c343647d615581d"}, + {file = "scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:785a2213086b7b1abf037aeadbbd6d67159feb3e30263434139c98425e3dcfcf"}, + {file = "scikit_learn-1.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:64381066f8aa63c2710e6b56edc9f0894cc7bf59bd71b8ce5613a4559b6145e0"}, + {file = "scikit_learn-1.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6c43290337f7a4b969d207e620658372ba3c1ffb611f8bc2b6f031dc5c6d1d03"}, + {file = "scikit_learn-1.3.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:dc9002fc200bed597d5d34e90c752b74df516d592db162f756cc52836b38fe0e"}, + {file = "scikit_learn-1.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d08ada33e955c54355d909b9c06a4789a729977f165b8bae6f225ff0a60ec4a"}, + {file = "scikit_learn-1.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:763f0ae4b79b0ff9cca0bf3716bcc9915bdacff3cebea15ec79652d1cc4fa5c9"}, + {file = "scikit_learn-1.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:ed932ea780517b00dae7431e031faae6b49b20eb6950918eb83bd043237950e0"}, +] + +[package.dependencies] +joblib = ">=1.1.1" +numpy = ">=1.17.3,<2.0" +scipy = ">=1.5.0" +threadpoolctl = ">=2.0.0" + +[package.extras] +benchmark = ["matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=6.0.0)", "sphinx-copybutton (>=0.5.2)", "sphinx-gallery (>=0.10.1)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] +examples = ["matplotlib (>=3.1.3)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"] +tests = ["black (>=23.3.0)", "matplotlib (>=3.1.3)", "mypy (>=1.3)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.0.272)", "scikit-image (>=0.16.2)"] + [[package]] name = "scipy" version = "1.11.4" @@ -5854,6 +6042,74 @@ nativelib = ["pyobjc-framework-Cocoa", "pywin32"] objc = ["pyobjc-framework-Cocoa"] win32 = ["pywin32"] +[[package]] +name = "sentencepiece" +version = "0.1.99" +description = "SentencePiece python wrapper" +optional = false +python-versions = "*" +files = [ + {file = "sentencepiece-0.1.99-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0eb528e70571b7c02723e5804322469b82fe7ea418c96051d0286c0fa028db73"}, + {file = "sentencepiece-0.1.99-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77d7fafb2c4e4659cbdf303929503f37a26eabc4ff31d3a79bf1c5a1b338caa7"}, + {file = "sentencepiece-0.1.99-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:be9cf5b9e404c245aeb3d3723c737ba7a8f5d4ba262ef233a431fa6c45f732a0"}, + {file = "sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baed1a26464998f9710d20e52607c29ffd4293e7c71c6a1f83f51ad0911ec12c"}, + {file = "sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9832f08bb372d4c8b567612f8eab9e36e268dff645f1c28f9f8e851be705f6d1"}, + {file = "sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:019e7535108e309dae2b253a75834fc3128240aa87c00eb80732078cdc182588"}, + {file = "sentencepiece-0.1.99-cp310-cp310-win32.whl", hash = "sha256:fa16a830416bb823fa2a52cbdd474d1f7f3bba527fd2304fb4b140dad31bb9bc"}, + {file = "sentencepiece-0.1.99-cp310-cp310-win_amd64.whl", hash = "sha256:14b0eccb7b641d4591c3e12ae44cab537d68352e4d3b6424944f0c447d2348d5"}, + {file = "sentencepiece-0.1.99-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6d3c56f24183a1e8bd61043ff2c58dfecdc68a5dd8955dc13bab83afd5f76b81"}, + {file = "sentencepiece-0.1.99-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ed6ea1819fd612c989999e44a51bf556d0ef6abfb553080b9be3d347e18bcfb7"}, + {file = "sentencepiece-0.1.99-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2a0260cd1fb7bd8b4d4f39dc2444a8d5fd4e0a0c4d5c899810ef1abf99b2d45"}, + {file = "sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a1abff4d1ff81c77cac3cc6fefa34fa4b8b371e5ee51cb7e8d1ebc996d05983"}, + {file = "sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:004e6a621d4bc88978eecb6ea7959264239a17b70f2cbc348033d8195c9808ec"}, + {file = "sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db361e03342c41680afae5807590bc88aa0e17cfd1a42696a160e4005fcda03b"}, + {file = "sentencepiece-0.1.99-cp311-cp311-win32.whl", hash = "sha256:2d95e19168875b70df62916eb55428a0cbcb834ac51d5a7e664eda74def9e1e0"}, + {file = "sentencepiece-0.1.99-cp311-cp311-win_amd64.whl", hash = "sha256:f90d73a6f81248a909f55d8e6ef56fec32d559e1e9af045f0b0322637cb8e5c7"}, + {file = "sentencepiece-0.1.99-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:62e24c81e74bd87a6e0d63c51beb6527e4c0add67e1a17bac18bcd2076afcfeb"}, + {file = "sentencepiece-0.1.99-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57efcc2d51caff20d9573567d9fd3f854d9efe613ed58a439c78c9f93101384a"}, + {file = "sentencepiece-0.1.99-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a904c46197993bd1e95b93a6e373dca2f170379d64441041e2e628ad4afb16f"}, + {file = "sentencepiece-0.1.99-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d89adf59854741c0d465f0e1525b388c0d174f611cc04af54153c5c4f36088c4"}, + {file = "sentencepiece-0.1.99-cp36-cp36m-win32.whl", hash = "sha256:47c378146928690d1bc106fdf0da768cebd03b65dd8405aa3dd88f9c81e35dba"}, + {file = "sentencepiece-0.1.99-cp36-cp36m-win_amd64.whl", hash = "sha256:9ba142e7a90dd6d823c44f9870abdad45e6c63958eb60fe44cca6828d3b69da2"}, + {file = "sentencepiece-0.1.99-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b7b1a9ae4d7c6f1f867e63370cca25cc17b6f4886729595b885ee07a58d3cec3"}, + {file = "sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0f644c9d4d35c096a538507b2163e6191512460035bf51358794a78515b74f7"}, + {file = "sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c8843d23a0f686d85e569bd6dcd0dd0e0cbc03731e63497ca6d5bacd18df8b85"}, + {file = "sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33e6f690a1caebb4867a2e367afa1918ad35be257ecdb3455d2bbd787936f155"}, + {file = "sentencepiece-0.1.99-cp37-cp37m-win32.whl", hash = "sha256:8a321866c2f85da7beac74a824b4ad6ddc2a4c9bccd9382529506d48f744a12c"}, + {file = "sentencepiece-0.1.99-cp37-cp37m-win_amd64.whl", hash = "sha256:c42f753bcfb7661c122a15b20be7f684b61fc8592c89c870adf52382ea72262d"}, + {file = "sentencepiece-0.1.99-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:85b476406da69c70586f0bb682fcca4c9b40e5059814f2db92303ea4585c650c"}, + {file = "sentencepiece-0.1.99-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cfbcfe13c69d3f87b7fcd5da168df7290a6d006329be71f90ba4f56bc77f8561"}, + {file = "sentencepiece-0.1.99-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:445b0ec381af1cd4eef95243e7180c63d9c384443c16c4c47a28196bd1cda937"}, + {file = "sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6890ea0f2b4703f62d0bf27932e35808b1f679bdb05c7eeb3812b935ba02001"}, + {file = "sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fb71af492b0eefbf9f2501bec97bcd043b6812ab000d119eaf4bd33f9e283d03"}, + {file = "sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27b866b5bd3ddd54166bbcbf5c8d7dd2e0b397fac8537991c7f544220b1f67bc"}, + {file = "sentencepiece-0.1.99-cp38-cp38-win32.whl", hash = "sha256:b133e8a499eac49c581c3c76e9bdd08c338cc1939e441fee6f92c0ccb5f1f8be"}, + 
{file = "sentencepiece-0.1.99-cp38-cp38-win_amd64.whl", hash = "sha256:0eaf3591dd0690a87f44f4df129cf8d05d8a4029b5b6709b489b8e27f9a9bcff"}, + {file = "sentencepiece-0.1.99-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38efeda9bbfb55052d482a009c6a37e52f42ebffcea9d3a98a61de7aee356a28"}, + {file = "sentencepiece-0.1.99-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6c030b081dc1e1bcc9fadc314b19b740715d3d566ad73a482da20d7d46fd444c"}, + {file = "sentencepiece-0.1.99-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:84dbe53e02e4f8a2e45d2ac3e430d5c83182142658e25edd76539b7648928727"}, + {file = "sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b0f55d0a0ee1719b4b04221fe0c9f0c3461dc3dabd77a035fa2f4788eb3ef9a"}, + {file = "sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18e800f206cd235dc27dc749299e05853a4e4332e8d3dfd81bf13d0e5b9007d9"}, + {file = "sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ae1c40cda8f9d5b0423cfa98542735c0235e7597d79caf318855cdf971b2280"}, + {file = "sentencepiece-0.1.99-cp39-cp39-win32.whl", hash = "sha256:c84ce33af12ca222d14a1cdd37bd76a69401e32bc68fe61c67ef6b59402f4ab8"}, + {file = "sentencepiece-0.1.99-cp39-cp39-win_amd64.whl", hash = "sha256:350e5c74d739973f1c9643edb80f7cc904dc948578bcb1d43c6f2b173e5d18dd"}, + {file = "sentencepiece-0.1.99.tar.gz", hash = "sha256:189c48f5cb2949288f97ccdb97f0473098d9c3dcf5a3d99d4eabe719ec27297f"}, +] + +[[package]] +name = "seqeval" +version = "1.2.2" +description = "Testing framework for sequence labeling" +optional = false +python-versions = "*" +files = [ + {file = "seqeval-1.2.2.tar.gz", hash = "sha256:f28e97c3ab96d6fcd32b648f6438ff2e09cfba87f05939da9b3970713ec56e6f"}, +] + +[package.dependencies] +numpy = ">=1.14.0" +scikit-learn = ">=0.21.3" + [[package]] name = "setuptools" version = "69.0.3" @@ -6426,36 +6682,36 @@ test = ["pytest"] [[package]] name = "sqlalchemy" -version = "1.4.50" +version = "1.4.51" description = "Database Abstraction Library" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d00665725063692c42badfd521d0c4392e83c6c826795d38eb88fb108e5660e5"}, - {file = "SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85292ff52ddf85a39367057c3d7968a12ee1fb84565331a36a8fead346f08796"}, - {file = "SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d0fed0f791d78e7767c2db28d34068649dfeea027b83ed18c45a423f741425cb"}, - {file = "SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db4db3c08ffbb18582f856545f058a7a5e4ab6f17f75795ca90b3c38ee0a8ba4"}, - {file = "SQLAlchemy-1.4.50-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14b0cacdc8a4759a1e1bd47dc3ee3f5db997129eb091330beda1da5a0e9e5bd7"}, - {file = "SQLAlchemy-1.4.50-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fb9cb60e0f33040e4f4681e6658a7eb03b5cb4643284172f91410d8c493dace"}, - {file = "SQLAlchemy-1.4.50-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:c4cb501d585aa74a0f86d0ea6263b9c5e1d1463f8f9071392477fd401bd3c7cc"}, - {file = "SQLAlchemy-1.4.50-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a7a66297e46f85a04d68981917c75723e377d2e0599d15fbe7a56abed5e2d75"}, - {file = "SQLAlchemy-1.4.50-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1db0221cb26d66294f4ca18c533e427211673ab86c1fbaca8d6d9ff78654293"}, - {file = "SQLAlchemy-1.4.50-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7dbe6369677a2bea68fe9812c6e4bbca06ebfa4b5cde257b2b0bf208709131"}, - {file = "SQLAlchemy-1.4.50-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a9bddb60566dc45c57fd0a5e14dd2d9e5f106d2241e0a2dc0c1da144f9444516"}, - {file = "SQLAlchemy-1.4.50-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82dd4131d88395df7c318eeeef367ec768c2a6fe5bd69423f7720c4edb79473c"}, - {file = "SQLAlchemy-1.4.50-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:273505fcad22e58cc67329cefab2e436006fc68e3c5423056ee0513e6523268a"}, - {file = "SQLAlchemy-1.4.50-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3257a6e09626d32b28a0c5b4f1a97bced585e319cfa90b417f9ab0f6145c33c"}, - {file = "SQLAlchemy-1.4.50-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d69738d582e3a24125f0c246ed8d712b03bd21e148268421e4a4d09c34f521a5"}, - {file = "SQLAlchemy-1.4.50-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:34e1c5d9cd3e6bf3d1ce56971c62a40c06bfc02861728f368dcfec8aeedb2814"}, - {file = "SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1fcee5a2c859eecb4ed179edac5ffbc7c84ab09a5420219078ccc6edda45436"}, - {file = "SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbaf6643a604aa17e7a7afd74f665f9db882df5c297bdd86c38368f2c471f37d"}, - {file = "SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2e70e0673d7d12fa6cd363453a0d22dac0d9978500aa6b46aa96e22690a55eab"}, - {file = "SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b881ac07d15fb3e4f68c5a67aa5cdaf9eb8f09eb5545aaf4b0a5f5f4659be18"}, - {file = "SQLAlchemy-1.4.50-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6997da81114daef9203d30aabfa6b218a577fc2bd797c795c9c88c9eb78d49"}, - {file = "SQLAlchemy-1.4.50-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdb77e1789e7596b77fd48d99ec1d2108c3349abd20227eea0d48d3f8cf398d9"}, - {file = "SQLAlchemy-1.4.50-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:128a948bd40780667114b0297e2cc6d657b71effa942e0a368d8cc24293febb3"}, - {file = 
"SQLAlchemy-1.4.50-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2d526aeea1bd6a442abc7c9b4b00386fd70253b80d54a0930c0a216230a35be"}, - {file = "SQLAlchemy-1.4.50.tar.gz", hash = "sha256:3b97ddf509fc21e10b09403b5219b06c5b558b27fc2453150274fa4e70707dbf"}, + {file = "SQLAlchemy-1.4.51-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2be4e6294c53f2ec8ea36486b56390e3bcaa052bf3a9a47005687ccf376745d1"}, + {file = "SQLAlchemy-1.4.51-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ca484ca11c65e05639ffe80f20d45e6be81fbec7683d6c9a15cd421e6e8b340"}, + {file = "SQLAlchemy-1.4.51-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0535d5b57d014d06ceeaeffd816bb3a6e2dddeb670222570b8c4953e2d2ea678"}, + {file = "SQLAlchemy-1.4.51-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af55cc207865d641a57f7044e98b08b09220da3d1b13a46f26487cc2f898a072"}, + {file = "SQLAlchemy-1.4.51-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7deeae5071930abb3669b5185abb6c33ddfd2398f87660fafdb9e6a5fb0f3f2f"}, + {file = "SQLAlchemy-1.4.51-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0892e7ac8bc76da499ad3ee8de8da4d7905a3110b952e2a35a940dab1ffa550e"}, + {file = "SQLAlchemy-1.4.51-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6cacc0b2dd7d22a918a9642fc89840a5d3cee18a0e1fe41080b1141b23b10916"}, + {file = "SQLAlchemy-1.4.51-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:245c67c88e63f1523e9216cad6ba3107dea2d3ee19adc359597a628afcabfbcb"}, + {file = "SQLAlchemy-1.4.51-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ec7a0ed9b32afdf337172678a4a0e6419775ba4e649b66f49415615fa47efbd"}, + {file = "SQLAlchemy-1.4.51-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:352df882088a55293f621328ec33b6ffca936ad7f23013b22520542e1ab6ad1b"}, + {file = "SQLAlchemy-1.4.51-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:86a22143a4001f53bf58027b044da1fb10d67b62a785fc1390b5c7f089d9838c"}, + {file = "SQLAlchemy-1.4.51-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c37bc677690fd33932182b85d37433845de612962ed080c3e4d92f758d1bd894"}, + {file = "SQLAlchemy-1.4.51-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c55040d8ea65414de7c47f1a23823cd9f3fad0dc93e6b6b728fee81230f817b"}, + {file = "SQLAlchemy-1.4.51-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38ef80328e3fee2be0a1abe3fe9445d3a2e52a1282ba342d0dab6edf1fef4707"}, + {file = "SQLAlchemy-1.4.51-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f8cafa6f885a0ff5e39efa9325195217bb47d5929ab0051636610d24aef45ade"}, + {file = 
"SQLAlchemy-1.4.51-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f2df79a46e130235bc5e1bbef4de0583fb19d481eaa0bffa76e8347ea45ec6"}, + {file = "SQLAlchemy-1.4.51-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb18549b770351b54e1ab5da37d22bc530b8bfe2ee31e22b9ebe650640d2ef12"}, + {file = "SQLAlchemy-1.4.51-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55e699466106d09f028ab78d3c2e1f621b5ef2c8694598242259e4515715da7c"}, + {file = "SQLAlchemy-1.4.51-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2ad16880ccd971ac8e570550fbdef1385e094b022d6fc85ef3ce7df400dddad3"}, + {file = "SQLAlchemy-1.4.51-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b97fd5bb6b7c1a64b7ac0632f7ce389b8ab362e7bd5f60654c2a418496be5d7f"}, + {file = "SQLAlchemy-1.4.51-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e646b19f47d655261b22df9976e572f588185279970efba3d45c377127d35349"}, + {file = "SQLAlchemy-1.4.51-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3cf56cc36d42908495760b223ca9c2c0f9f0002b4eddc994b24db5fcb86a9e4"}, + {file = "SQLAlchemy-1.4.51-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0d661cff58c91726c601cc0ee626bf167b20cc4d7941c93c5f3ac28dc34ddbea"}, + {file = "SQLAlchemy-1.4.51-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3823dda635988e6744d4417e13f2e2b5fe76c4bf29dd67e95f98717e1b094cad"}, + {file = "SQLAlchemy-1.4.51.tar.gz", hash = "sha256:e7908c2025eb18394e32d65dd02d2e37e17d733cdbe7d78231c2b6d7eb20cdb9"}, ] [package.dependencies] @@ -6465,7 +6721,7 @@ sqlalchemy2-stubs = {version = "*", optional = true, markers = "extra == \"mypy\ [package.extras] aiomysql = ["aiomysql (>=0.2.0)", "greenlet (!=0.4.17)"] -aiosqlite = ["aiosqlite", "greenlet (!=0.4.17)", "typing-extensions (!=3.10.0.1)"] +aiosqlite = ["aiosqlite", "greenlet (!=0.4.17)", "typing_extensions (!=3.10.0.1)"] asyncio = ["greenlet (!=0.4.17)"] asyncmy = ["asyncmy (>=0.2.3,!=0.2.4)", "greenlet (!=0.4.17)"] mariadb-connector = ["mariadb (>=1.0.1,!=1.1.2)"] @@ -6475,14 +6731,14 @@ mssql-pyodbc = ["pyodbc"] mypy = ["mypy (>=0.910)", "sqlalchemy2-stubs"] mysql = ["mysqlclient (>=1.4.0)", "mysqlclient (>=1.4.0,<2)"] mysql-connector = ["mysql-connector-python"] -oracle = ["cx-oracle (>=7)", "cx-oracle (>=7,<8)"] +oracle = ["cx_oracle (>=7)", "cx_oracle (>=7,<8)"] postgresql = ["psycopg2 (>=2.7)"] postgresql-asyncpg = ["asyncpg", "greenlet (!=0.4.17)"] postgresql-pg8000 = ["pg8000 (>=1.16.6,!=1.29.0)"] postgresql-psycopg2binary = ["psycopg2-binary"] postgresql-psycopg2cffi = ["psycopg2cffi"] pymysql = ["pymysql", "pymysql (<1)"] -sqlcipher = ["sqlcipher3-binary"] +sqlcipher = ["sqlcipher3_binary"] [[package]] name = "sqlalchemy2-stubs" @@ -6577,6 +6833,20 @@ files = [ [package.dependencies] mpmath = ">=0.19" +[[package]] +name = "tabulate" +version = "0.9.0" +description = "Pretty-print tabular data" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tabulate-0.9.0-py3-none-any.whl", hash = 
"sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, + {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, +] + +[package.extras] +widechars = ["wcwidth"] + [[package]] name = "tenacity" version = "8.2.3" @@ -6591,6 +6861,20 @@ files = [ [package.extras] doc = ["reno", "sphinx", "tornado (>=4.5)"] +[[package]] +name = "termcolor" +version = "2.4.0" +description = "ANSI color formatting for output in terminal" +optional = false +python-versions = ">=3.8" +files = [ + {file = "termcolor-2.4.0-py3-none-any.whl", hash = "sha256:9297c0df9c99445c2412e832e882a7884038a25617c60cea2ad69488d4040d63"}, + {file = "termcolor-2.4.0.tar.gz", hash = "sha256:aab9e56047c8ac41ed798fa36d892a37aca6b3e9159f3e0c24bc64a9b3ac7b7a"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + [[package]] name = "terminado" version = "0.18.0" @@ -6694,6 +6978,34 @@ mxnet = ["mxnet (>=1.5.1,<1.6.0)"] tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] torch = ["torch (>=1.6.0)"] +[[package]] +name = "threadpoolctl" +version = "3.2.0" +description = "threadpoolctl" +optional = false +python-versions = ">=3.8" +files = [ + {file = "threadpoolctl-3.2.0-py3-none-any.whl", hash = "sha256:2b7818516e423bdaebb97c723f86a7c6b0a83d3f3b0970328d66f4d9104dc032"}, + {file = "threadpoolctl-3.2.0.tar.gz", hash = "sha256:c96a0ba3bdddeaca37dc4cc7344aafad41cdb8c313f74fdfe387a867bba93355"}, +] + +[[package]] +name = "tikzplotlib" +version = "0.10.1" +description = "Convert matplotlib figures into TikZ/PGFPlots" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tikzplotlib-0.10.1-py3-none-any.whl", hash = "sha256:bf0451b86fe4db40aa742f7e5a180dfaaadf57c746ddb2ab7e58a5163d8be75f"}, + {file = "tikzplotlib-0.10.1.tar.gz", hash = "sha256:93d141342d143804fc1dfabe03e6d4e38e547cf72803bdf124615affdd56f59d"}, +] + +[package.dependencies] +matplotlib = ">=1.4.0" +numpy = "*" +Pillow = "*" +webcolors = "*" + [[package]] name = "tinycss2" version = "1.2.1" @@ -6925,13 +7237,13 @@ telegram = ["requests"] [[package]] name = "traitlets" -version = "5.14.0" +version = "5.14.1" description = "Traitlets Python configuration system" optional = false python-versions = ">=3.8" files = [ - {file = "traitlets-5.14.0-py3-none-any.whl", hash = "sha256:f14949d23829023013c47df20b4a76ccd1a85effb786dc060f34de7948361b33"}, - {file = "traitlets-5.14.0.tar.gz", hash = "sha256:fcdaa8ac49c04dfa0ed3ee3384ef6dfdb5d6f3741502be247279407679296772"}, + {file = "traitlets-5.14.1-py3-none-any.whl", hash = "sha256:2e5a030e6eff91737c643231bfcf04a65b0132078dad75e4936700b213652e74"}, + {file = "traitlets-5.14.1.tar.gz", hash = "sha256:8585105b371a04b8316a43d5ce29c098575c2e477850b62b848b964f1444527e"}, ] [package.extras] @@ -7076,24 +7388,24 @@ files = [ [[package]] name = "types-decorator" -version = "5.1.8.4" +version = "5.1.8.20240106" description = "Typing stubs for decorator" optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "types-decorator-5.1.8.4.tar.gz", hash = "sha256:a8c39024634e99834bef146cec2e36c585f47884addf4dc65d6d0b7b1f627517"}, - {file = "types_decorator-5.1.8.4-py3-none-any.whl", hash = "sha256:e411203e0ec0116964dcd491e162a951e4a68cd2d4a946172690bee43c4ebe6e"}, + {file = "types-decorator-5.1.8.20240106.tar.gz", hash = "sha256:32ff92b33615060d23b9d3760124bdb3506c4aa8d9eb50963cf1a3c20b9ecbbf"}, + {file = "types_decorator-5.1.8.20240106-py3-none-any.whl", hash = "sha256:14d21e6a0755dbb8f301f2f532b3eab5148f433c69dad2d98bf5bd2b3a2ef4e7"}, ] 
[[package]] name = "types-pyopenssl" -version = "23.3.0.0" +version = "23.3.0.20240106" description = "Typing stubs for pyOpenSSL" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "types-pyOpenSSL-23.3.0.0.tar.gz", hash = "sha256:5ffb077fe70b699c88d5caab999ae80e192fe28bf6cda7989b7e79b1e4e2dcd3"}, - {file = "types_pyOpenSSL-23.3.0.0-py3-none-any.whl", hash = "sha256:00171433653265843b7469ddb9f3c86d698668064cc33ef10537822156130ebf"}, + {file = "types-pyOpenSSL-23.3.0.20240106.tar.gz", hash = "sha256:3d6f3462bec0c260caadf93fbb377225c126661b779c7d9ab99b6dad5ca10db9"}, + {file = "types_pyOpenSSL-23.3.0.20240106-py3-none-any.whl", hash = "sha256:47a7eedbd18b7bcad17efebf1c53416148f5a173918a6d75027e75e32fe039ae"}, ] [package.dependencies] @@ -7101,13 +7413,13 @@ cryptography = ">=35.0.0" [[package]] name = "types-python-dateutil" -version = "2.8.19.14" +version = "2.8.19.20240106" description = "Typing stubs for python-dateutil" optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "types-python-dateutil-2.8.19.14.tar.gz", hash = "sha256:1f4f10ac98bb8b16ade9dbee3518d9ace017821d94b057a425b069f834737f4b"}, - {file = "types_python_dateutil-2.8.19.14-py3-none-any.whl", hash = "sha256:f977b8de27787639986b4e28963263fd0e5158942b3ecef91b9335c130cb1ce9"}, + {file = "types-python-dateutil-2.8.19.20240106.tar.gz", hash = "sha256:1f8db221c3b98e6ca02ea83a58371b22c374f42ae5bbdf186db9c9a76581459f"}, + {file = "types_python_dateutil-2.8.19.20240106-py3-none-any.whl", hash = "sha256:efbbdc54590d0f16152fa103c9879c7d4a00e82078f6e2cf01769042165acaa2"}, ] [[package]] @@ -7123,13 +7435,13 @@ files = [ [[package]] name = "types-requests" -version = "2.31.0.20231231" +version = "2.31.0.20240106" description = "Typing stubs for requests" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "types-requests-2.31.0.20231231.tar.gz", hash = "sha256:0f8c0c9764773384122813548d9eea92a5c4e1f33ed54556b508968ec5065cee"}, - {file = "types_requests-2.31.0.20231231-py3-none-any.whl", hash = "sha256:2e2230c7bc8dd63fa3153c1c0ae335f8a368447f0582fc332f17d54f88e69027"}, + {file = "types-requests-2.31.0.20240106.tar.gz", hash = "sha256:0e1c731c17f33618ec58e022b614a1a2ecc25f7dc86800b36ef341380402c612"}, + {file = "types_requests-2.31.0.20240106-py3-none-any.whl", hash = "sha256:da997b3b6a72cc08d09f4dba9802fdbabc89104b35fe24ee588e674037689354"}, ] [package.dependencies] @@ -7137,46 +7449,46 @@ urllib3 = ">=2" [[package]] name = "types-setuptools" -version = "69.0.0.0" +version = "69.0.0.20240106" description = "Typing stubs for setuptools" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "types-setuptools-69.0.0.0.tar.gz", hash = "sha256:b0a06219f628c6527b2f8ce770a4f47550e00d3e8c3ad83e2dc31bc6e6eda95d"}, - {file = "types_setuptools-69.0.0.0-py3-none-any.whl", hash = "sha256:8c86195bae2ad81e6dea900a570fe9d64a59dbce2b11cc63c046b03246ea77bf"}, + {file = "types-setuptools-69.0.0.20240106.tar.gz", hash = "sha256:e077f9089578df3c9938f6e4aa1633f182ba6740a6fdb1333f162bae5dfcbadc"}, + {file = "types_setuptools-69.0.0.20240106-py3-none-any.whl", hash = "sha256:b1da8981425723a674fd459c43dfa4402abeaee3f9cf682723ee9cf226125cc3"}, ] [[package]] name = "types-six" -version = "1.16.21.9" +version = "1.16.21.20240106" description = "Typing stubs for six" optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "types-six-1.16.21.9.tar.gz", hash = 
"sha256:746e6c25b8c48b3c8ab9efe7f68022839111de423d35ba4b206b88b12d75f233"}, - {file = "types_six-1.16.21.9-py3-none-any.whl", hash = "sha256:1591a09430a3035326da5fdb71692d0b3cc36b25a440cc5929ca6241f3984705"}, + {file = "types-six-1.16.21.20240106.tar.gz", hash = "sha256:c83908b4925583e973eb9971ef2bd60dbab647611e10e9cd588d2bef415bfe68"}, + {file = "types_six-1.16.21.20240106-py3-none-any.whl", hash = "sha256:3658c9e36e9cb003e522655b01b9ca39bd0db61b6383b3e7d0d10d14f873b338"}, ] [[package]] name = "types-tabulate" -version = "0.9.0.3" +version = "0.9.0.20240106" description = "Typing stubs for tabulate" optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "types-tabulate-0.9.0.3.tar.gz", hash = "sha256:197651f9d6467193cd166d8500116a6d3a26f2a4eb2db093bc9535ee1c0be55e"}, - {file = "types_tabulate-0.9.0.3-py3-none-any.whl", hash = "sha256:462d1b62e01728416e8277614d6a3eb172d53a8efaf04a04a973ff2dd45238f6"}, + {file = "types-tabulate-0.9.0.20240106.tar.gz", hash = "sha256:c9b6db10dd7fcf55bd1712dd3537f86ddce72a08fd62bb1af4338c7096ce947e"}, + {file = "types_tabulate-0.9.0.20240106-py3-none-any.whl", hash = "sha256:0378b7b6fe0ccb4986299496d027a6d4c218298ecad67199bbd0e2d7e9d335a1"}, ] [[package]] name = "types-tqdm" -version = "4.66.0.5" +version = "4.66.0.20240106" description = "Typing stubs for tqdm" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "types-tqdm-4.66.0.5.tar.gz", hash = "sha256:74bd7e469238c28816300f72a9b713d02036f6b557734616430adb7b7e74112c"}, - {file = "types_tqdm-4.66.0.5-py3-none-any.whl", hash = "sha256:d2c38085bec440e8ad1e94e8619f7cb3d1dd0a7ee06a863ccd0610a5945046ef"}, + {file = "types-tqdm-4.66.0.20240106.tar.gz", hash = "sha256:7acf4aade5bad3ded76eb829783f9961b1c2187948eaa6dd1ae8644dff95a938"}, + {file = "types_tqdm-4.66.0.20240106-py3-none-any.whl", hash = "sha256:7459b0f441b969735685645a5d8480f7912b10d05ab45f99a2db8a8e45cb550b"}, ] [[package]] @@ -7229,6 +7541,25 @@ tzdata = {version = "*", markers = "platform_system == \"Windows\""} [package.extras] devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"] +[[package]] +name = "update-checker" +version = "0.18.0" +description = "A python module that will check for package updates." 
+optional = false +python-versions = "*" +files = [ + {file = "update_checker-0.18.0-py3-none-any.whl", hash = "sha256:cbba64760a36fe2640d80d85306e8fe82b6816659190993b7bdabadee4d4bbfd"}, + {file = "update_checker-0.18.0.tar.gz", hash = "sha256:6a2d45bb4ac585884a6b03f9eade9161cedd9e8111545141e9aa9058932acb13"}, +] + +[package.dependencies] +requests = ">=2.3.0" + +[package.extras] +dev = ["black", "flake8", "pytest (>=2.7.3)"] +lint = ["black", "flake8"] +test = ["pytest (>=2.7.3)"] + [[package]] name = "uri-template" version = "1.3.0" @@ -7298,13 +7629,13 @@ colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\" and python [[package]] name = "wcwidth" -version = "0.2.12" +version = "0.2.13" description = "Measures the displayed width of unicode strings in a terminal" optional = false python-versions = "*" files = [ - {file = "wcwidth-0.2.12-py2.py3-none-any.whl", hash = "sha256:f26ec43d96c8cbfed76a5075dac87680124fa84e0855195a6184da9c187f133c"}, - {file = "wcwidth-0.2.12.tar.gz", hash = "sha256:f01c104efdf57971bcb756f054dd58ddec5204dd15fa31d6503ea57947d97c02"}, + {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, + {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, ] [[package]] @@ -7427,6 +7758,17 @@ files = [ [package.dependencies] h11 = ">=0.9.0,<1" +[[package]] +name = "xlsxwriter" +version = "3.1.9" +description = "A Python module for creating Excel XLSX files." +optional = false +python-versions = ">=3.6" +files = [ + {file = "XlsxWriter-3.1.9-py3-none-any.whl", hash = "sha256:b61c1a0c786f82644936c0936ec96ee96cd3afb9440094232f7faef9b38689f0"}, + {file = "XlsxWriter-3.1.9.tar.gz", hash = "sha256:de810bf328c6a4550f4ffd6b0b34972aeb7ffcf40f3d285a0413734f9b63a929"}, +] + [[package]] name = "xmltodict" version = "0.13.0" @@ -7461,4 +7803,4 @@ web-server = ["dash", "dash-auth", "dash-bootstrap-components", "dash-daq", "net [metadata] lock-version = "2.0" python-versions = ">=3.11,<3.13" -content-hash = "1c754a05ecaa96c8fad0d26f3c154aaea085408cdb530bee1cb77c79b515aa25" +content-hash = "6b3aae3513a6c625f94a861bd33100c5f5df168c631fb9535054338ff4c6461d" diff --git a/pyproject.toml b/pyproject.toml index 64e2c8c..e46b3c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ torch = {version = "*", source = "torch-cpu"} tqdm = "^4.66.1" transformers = {version = "*", extras = ["torch"]} xmltodict = "^0.13.0" +pyabsa = "^2.4.0" [tool.poetry.extras] ingest = ["selenium", "deutschland", "xmltodict", "html5lib"]
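
The change that drives this lockfile regeneration is the addition of pyabsa ^2.4.0 to pyproject.toml, which accounts for the new transitive packages above (scikit-learn, sentencepiece, seqeval, tabulate, termcolor, threadpoolctl, tikzplotlib, update-checker, xlsxwriter). As orientation for what the new dependency is for, here is a minimal aspect-based sentiment analysis sketch with pyabsa. It assumes pyabsa's documented v2 checkpoint API (AspectTermExtraction, AspectExtractor, predict) and the "multilingual" pretrained checkpoint name; treat it as an illustrative sketch, not the notebook's exact pipeline.

# Minimal pyabsa sketch (assumed v2 API; the "multilingual" checkpoint
# is a pretrained model that pyabsa downloads on first use).
from pyabsa import AspectTermExtraction as ATEPC

# Joint aspect-term extraction + polarity classification model.
aspect_extractor = ATEPC.AspectExtractor(
    "multilingual",    # assumed pretrained checkpoint name
    auto_device=True,  # use GPU if available, otherwise fall back to CPU
)

# predict() accepts a list of sentences and returns one result per input,
# pairing each extracted aspect term with a sentiment polarity.
results = aspect_extractor.predict(
    ["The quarterly numbers were strong, but the outlook disappointed investors."],
    print_result=True,  # also pretty-print aspects and polarities to stdout
    ignore_error=True,
)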