diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..8dd399a --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0c545d6..f14652c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: - id: pretty-format-yaml args: [--autofix] - id: pretty-format-toml - args: [ --autofix ] + args: [--autofix] exclude: (^poetry.lock$) @@ -67,8 +67,8 @@ repos: - repo: https://gitlab.com/smop/pre-commit-hooks rev: v1.0.0 - hooks: - - id: check-poetry + hooks: [] + # - id: check-poetry - repo: https://github.com/Lucas-C/pre-commit-hooks-java rev: 1.3.10 diff --git a/Jupyter/AI-models/Sentiment Analysis/FinBert.ipynb b/Jupyter/AI-models/Sentiment Analysis/FinBert.ipynb new file mode 100644 index 0000000..b12fd5d --- /dev/null +++ b/Jupyter/AI-models/Sentiment Analysis/FinBert.ipynb @@ -0,0 +1,387 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FinBert\n", + "\n", + "## Sources\n", + "\n", + "[HuggingFace](https://huggingface.co/ProsusAI/finbert)\n", + "[Tutorial](https://medium.com/codex/stocks-news-sentiment-analysis-with-deep-learning-transformers-and-machine-learning-cdcdb827fc06)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2023-04-30T21:54:44.056694Z", + "start_time": "2023-04-30T21:53:45.027971Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR: To modify pip, please run the following command:\n", + "C:\\Users\\phhor\\PycharmProjects\\aki_prj23_transparenzregister\\venv\\Scripts\\python.exe -m pip install transformers tqdm pandas numpy torch torchvision torchaudio pip -Uq\n", + "\n", + "[notice] A new release of pip is available: 23.0.1 -> 23.1.2\n", + "[notice] To update, run: 
python.exe -m pip install --upgrade pip\n" + ] + } + ], + "source": [ + "!pip install transformers tqdm pandas numpy torch torchvision torchaudio pip -Uq" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", + "\n", + "# create a tokenizer object\n", + "tokenizer = AutoTokenizer.from_pretrained(\"ProsusAI/finbert\")\n", + "\n", + "# fetch the pretrained model\n", + "model = AutoModelForSequenceClassification.from_pretrained(\"ProsusAI/finbert\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[0.0535, 0.0279, 0.9185]], grad_fn=)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# A headline to be used as input\n", + "import torch\n", + "\n", + "headline = \"Microsoft fails to hit profit expectations\"\n", + "headline2 = (\n", + " \"Am Aktienmarkt überwieg weiter die Zuversicht, wie der Kursverlauf des DAX zeigt.\"\n", + ")\n", + "\n", + "# Pre-process input phrase\n", + "input_tokens = tokenizer(headline2, padding=True, truncation=True, return_tensors=\"pt\")\n", + "# Run inference on the tokenized phrase\n", + "output = model(**input_tokens)\n", + "\n", + "# Pass model output logits through a softmax layer.\n", + "sentim_scores = torch.nn.functional.softmax(output.logits, dim=-1)\n", + "sentim_scores" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Microsoft fails to hit profit expectations\n" + ] + }, + { 
+ "data": { + "text/plain": [ + "+ 0.034084\n", + "0 0.932933\n", + "- 0.032982\n", + "dtype: float32" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def analyze_sentiment(text: str):\n", + " print(text)\n", + " input_tokens = tokenizer(text, padding=True, truncation=True, return_tensors=\"pt\")\n", + " output = model(**input_tokens)\n", + " return pd.Series(\n", + " torch.nn.functional.softmax(output.logits, dim=-1)[0].data,\n", + " index=[\"+\", \"0\", \"-\"],\n", + " )\n", + "\n", + "\n", + "tf = analyze_sentiment(headline)\n", + "tf" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textlan
0Microsoft fails to hit profit expectationsen
1Am Aktienmarkt überwieg weiter die Zuversicht,...de
2Stocks rallied and the British pound gained.en
3Meyer Burger bedient ab sofort australischen M...de
\n", + "
" + ], + "text/plain": [ + " text lan\n", + "0 Microsoft fails to hit profit expectations en\n", + "1 Am Aktienmarkt überwieg weiter die Zuversicht,... de\n", + "2 Stocks rallied and the British pound gained. en\n", + "3 Meyer Burger bedient ab sofort australischen M... de" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_df = pd.DataFrame(\n", + " [\n", + " {\"text\": \"Microsoft fails to hit profit expectations\", \"lan\": \"en\"},\n", + " {\n", + " \"text\": \"Am Aktienmarkt überwieg weiter die Zuversicht, wie der Kursverlauf des DAX zeigt.\",\n", + " \"lan\": \"de\",\n", + " },\n", + " {\"text\": \"Stocks rallied and the British pound gained.\", \"lan\": \"en\"},\n", + " {\n", + " \"text\": \"Meyer Burger bedient ab sofort australischen Markt und präsentiert sich auf Smart Energy Expo in Sydney.\",\n", + " \"lan\": \"de\",\n", + " },\n", + " ]\n", + ")\n", + "text_df" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Microsoft fails to hit profit expectations\n", + "Am Aktienmarkt überwieg weiter die Zuversicht, wie der Kursverlauf des DAX zeigt.\n", + "Stocks rallied and the British pound gained.\n", + "Meyer Burger bedient ab sofort australischen Markt und präsentiert sich auf Smart Energy Expo in Sydney.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textlan+0-
0Microsoft fails to hit profit expectationsen0.0340840.9329330.032982
1Am Aktienmarkt überwieg weiter die Zuversicht,...de0.0535280.0279500.918522
2Stocks rallied and the British pound gained.en0.8983610.0344740.067165
3Meyer Burger bedient ab sofort australischen M...de0.1165970.0127900.870613
\n", + "
" + ], + "text/plain": [ + " text lan + 0 \n", + "0 Microsoft fails to hit profit expectations en 0.034084 0.932933 \\\n", + "1 Am Aktienmarkt überwieg weiter die Zuversicht,... de 0.053528 0.027950 \n", + "2 Stocks rallied and the British pound gained. en 0.898361 0.034474 \n", + "3 Meyer Burger bedient ab sofort australischen M... de 0.116597 0.012790 \n", + "\n", + " - \n", + "0 0.032982 \n", + "1 0.918522 \n", + "2 0.067165 \n", + "3 0.870613 " + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def analyse_sentiments(texts: pd.Series) -> pd.DataFrame:\n", + " values = texts[\"text\"].apply(analyze_sentiment)\n", + " # print(values)\n", + " texts[[\"+\", \"0\", \"-\"]] = values\n", + " return texts\n", + "\n", + "\n", + "analyse_sentiments(text_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6378954 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,5 @@ +[tool.isort] +profile = "black" + +[tool.pylint.format] +max-line-length = "88"