Added a first trail with FinBert.

2026-02-13 22:17:38 +01:00 · 2023-04-30 23:23:02 +02:00
parent 983c06eead
commit 5a52f61c1c
4 changed files with 398 additions and 3 deletions
--- a/.flake8
+++ b/.flake8
@@ -0,0 +1,3 @@
 [flake8]
 max-line-length = 88
 extend-ignore = E203
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -67,8 +67,8 @@ repos:
 - repo: https://gitlab.com/smop/pre-commit-hooks
  rev: v1.0.0
-  hooks:
+  hooks: []
-  - id: check-poetry
+  # - id: check-poetry
 - repo: https://github.com/Lucas-C/pre-commit-hooks-java
  rev: 1.3.10
--- a/Jupyter/AI-models/Sentiment
+++ b/Jupyter/AI-models/Sentiment
@@ -0,0 +1,387 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# FinBert\n",
    "\n",
    "## Sources\n",
    "\n",
    "[HugginFace](https://huggingface.co/ProsusAI/finbert)\n",
    "[Tutorial](https://medium.com/codex/stocks-news-sentiment-analysis-with-deep-learning-transformers-and-machine-learning-cdcdb827fc06)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-04-30T21:54:44.056694Z",
     "start_time": "2023-04-30T21:53:45.027971Z"
    },
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR: To modify pip, please run the following command:\n",
      "C:\\Users\\phhor\\PycharmProjects\\aki_prj23_transparenzregister\\venv\\Scripts\\python.exe -m pip install transformers tqdm pandas numpy torch torchvision torchaudio pip -Uq\n",
      "\n",
      "[notice] A new release of pip is available: 23.0.1 -> 23.1.2\n",
      "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
     ]
    }
   ],
   "source": [
    "!pip install transformers tqdm pandas numpy torch torchvision torchaudio pip -Uq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
    "\n",
    "# create a tokenizer object\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"ProsusAI/finbert\")\n",
    "\n",
    "# fetch the pretrained model\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\"ProsusAI/finbert\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[0.0535, 0.0279, 0.9185]], grad_fn=<SoftmaxBackward0>)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A headline to be used as input\n",
    "import torch\n",
    "\n",
    "headline = \"Microsoft fails to hit profit expectations\"\n",
    "headline2 = (\n",
    "    \"Am Aktienmarkt überwieg weiter die Zuversicht, wie der Kursverlauf des DAX zeigt.\"\n",
    ")\n",
    "\n",
    "# Pre-process input phrase\n",
    "input_tokens = tokenizer(headline2, padding=True, truncation=True, return_tensors=\"pt\")\n",
    "# Run inference on the tokenized phrase\n",
    "output = model(**input_tokens)\n",
    "\n",
    "# Pass model output logits through a softmax layer.\n",
    "sentim_scores = torch.nn.functional.softmax(output.logits, dim=-1)\n",
    "sentim_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Microsoft fails to hit profit expectations\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "+    0.034084\n",
       "0    0.932933\n",
       "-    0.032982\n",
       "dtype: float32"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def analyze_sentiment(text: str):\n",
    "    print(text)\n",
    "    input_tokens = tokenizer(text, padding=True, truncation=True, return_tensors=\"pt\")\n",
    "    output = model(**input_tokens)\n",
    "    return pd.Series(\n",
    "        torch.nn.functional.softmax(output.logits, dim=-1)[0].data,\n",
    "        index=[\"+\", \"0\", \"-\"],\n",
    "    )\n",
    "\n",
    "\n",
    "tf = analyze_sentiment(headline)\n",
    "tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>lan</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Microsoft fails to hit profit expectations</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Am Aktienmarkt überwieg weiter die Zuversicht,...</td>\n",
       "      <td>de</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Stocks rallied and the British pound gained.</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Meyer Burger bedient ab sofort australischen M...</td>\n",
       "      <td>de</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text lan\n",
       "0         Microsoft fails to hit profit expectations  en\n",
       "1  Am Aktienmarkt überwieg weiter die Zuversicht,...  de\n",
       "2       Stocks rallied and the British pound gained.  en\n",
       "3  Meyer Burger bedient ab sofort australischen M...  de"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text_df = pd.DataFrame(\n",
    "    [\n",
    "        {\"text\": \"Microsoft fails to hit profit expectations\", \"lan\": \"en\"},\n",
    "        {\n",
    "            \"text\": \"Am Aktienmarkt überwieg weiter die Zuversicht, wie der Kursverlauf des DAX zeigt.\",\n",
    "            \"lan\": \"de\",\n",
    "        },\n",
    "        {\"text\": \"Stocks rallied and the British pound gained.\", \"lan\": \"en\"},\n",
    "        {\n",
    "            \"text\": \"Meyer Burger bedient ab sofort australischen Markt und präsentiert sich auf Smart Energy Expo in Sydney.\",\n",
    "            \"lan\": \"de\",\n",
    "        },\n",
    "    ]\n",
    ")\n",
    "text_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Microsoft fails to hit profit expectations\n",
      "Am Aktienmarkt überwieg weiter die Zuversicht, wie der Kursverlauf des DAX zeigt.\n",
      "Stocks rallied and the British pound gained.\n",
      "Meyer Burger bedient ab sofort australischen Markt und präsentiert sich auf Smart Energy Expo in Sydney.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>lan</th>\n",
       "      <th>+</th>\n",
       "      <th>0</th>\n",
       "      <th>-</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Microsoft fails to hit profit expectations</td>\n",
       "      <td>en</td>\n",
       "      <td>0.034084</td>\n",
       "      <td>0.932933</td>\n",
       "      <td>0.032982</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Am Aktienmarkt überwieg weiter die Zuversicht,...</td>\n",
       "      <td>de</td>\n",
       "      <td>0.053528</td>\n",
       "      <td>0.027950</td>\n",
       "      <td>0.918522</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Stocks rallied and the British pound gained.</td>\n",
       "      <td>en</td>\n",
       "      <td>0.898361</td>\n",
       "      <td>0.034474</td>\n",
       "      <td>0.067165</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Meyer Burger bedient ab sofort australischen M...</td>\n",
       "      <td>de</td>\n",
       "      <td>0.116597</td>\n",
       "      <td>0.012790</td>\n",
       "      <td>0.870613</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text lan         +         0   \n",
       "0         Microsoft fails to hit profit expectations  en  0.034084  0.932933  \\\n",
       "1  Am Aktienmarkt überwieg weiter die Zuversicht,...  de  0.053528  0.027950   \n",
       "2       Stocks rallied and the British pound gained.  en  0.898361  0.034474   \n",
       "3  Meyer Burger bedient ab sofort australischen M...  de  0.116597  0.012790   \n",
       "\n",
       "          -  \n",
       "0  0.032982  \n",
       "1  0.918522  \n",
       "2  0.067165  \n",
       "3  0.870613  "
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def analyse_sentiments(texts: pd.Series) -> pd.DataFrame:\n",
    "    values = texts[\"text\"].apply(analyze_sentiment)\n",
    "    # print(values)\n",
    "    texts[[\"+\", \"0\", \"-\"]] = values\n",
    "    return texts\n",
    "\n",
    "\n",
    "analyse_sentiments(text_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,5 @@
 [tool.isort]
 profile = "black"
 [tool.pylint.format]
 max-line-length = "88"