mirror of
				https://github.com/fhswf/aki_prj23_transparenzregister.git
				synced 2025-11-04 00:35:19 +01:00 
			
		
		
		
	Merge pull request #39 from fhswf/feature/data-extraktion
Feature/data extraktion
This commit is contained in:
		@@ -12,7 +12,7 @@ repos:
 | 
			
		||||
  - id: check-xml
 | 
			
		||||
  - id: check-ast
 | 
			
		||||
  - id: check-added-large-files
 | 
			
		||||
    args: [--enforce-all]
 | 
			
		||||
    args: [--enforce-all --maxkb=50000]
 | 
			
		||||
  - id: name-tests-test
 | 
			
		||||
  - id: detect-private-key
 | 
			
		||||
  - id: check-case-conflict
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										3
									
								
								.vscode/settings.json
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								.vscode/settings.json
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1,3 @@
 | 
			
		||||
{
 | 
			
		||||
  "files.eol": "\n"
 | 
			
		||||
}
 | 
			
		||||
@@ -8,14 +8,6 @@
 | 
			
		||||
    "# Daten Extraktion aus dem Bundesanzeiger"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "attachments": {},
 | 
			
		||||
   "cell_type": "markdown",
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "source": [
 | 
			
		||||
    "In order to run this notebook, download the `deutschland` library source code from [TrisNol/deutschland](https://github.com/TrisNol/deutschland/tree/feat/bundesanzeiger-raw-report) and place it in the `Jupyter/API-tests/Bundesanzeiger/deutschland` directory. Since the PR adding the required features to the main repo has not been completed yet (see: [PR](https://github.com/bundesAPI/deutschland/pull/88)), we have to include it in another way..."
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "attachments": {},
 | 
			
		||||
   "cell_type": "markdown",
 | 
			
		||||
@@ -26,18 +18,9 @@
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 5,
 | 
			
		||||
   "execution_count": 32,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "name": "stderr",
 | 
			
		||||
     "output_type": "stream",
 | 
			
		||||
     "text": [
 | 
			
		||||
      "C:\\Users\\trist\\AppData\\Roaming\\Python\\Python310\\site-packages\\requests\\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (5.1.0)/charset_normalizer (2.0.12) doesn't match a supported version!\n",
 | 
			
		||||
      "  warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n"
 | 
			
		||||
     ]
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "import pandas as pd\n",
 | 
			
		||||
    "from deutschland.bundesanzeiger import Bundesanzeiger"
 | 
			
		||||
@@ -45,26 +28,28 @@
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 6,
 | 
			
		||||
   "execution_count": 33,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "name": "stdout",
 | 
			
		||||
     "output_type": "stream",
 | 
			
		||||
     "text": [
 | 
			
		||||
      "dict_keys(['040860c00ef9020cfb6db2f58a163256', '9eb401c5af2f0289b8207233bf852b81', 'b14d979ea9f42367d413589050bd04e5', 'ecb8c1011456ea0d40f87e850fc216bf', '3a7c6c1f1d1b89bf5ceb165f0ee88053', '03b2e6aac2f2da2c0c5e8f23de9caec4', 'a5f8dc87fa797e7d2f8fb88c49a23c36', '9a3a8a3e84290ee650cbccf32323b3d7', '6c0fcc20a58aaa18a9d13f35a51e3996', 'bf276d441c339e787e22385d2b69b277', '90a79d28f3c11a2122d2827d2bf6acda', '88c785ce3b3c580dcc285661c7790cca', 'd3064baa8246c3ed02e30b5038200edc', '5bf92eed2808b484c005409764b825b7', 'fece6303c991a280850be1900ff78f8f', '26b0624c60cdbf647f3d45f4917ec6ea', '9f98bee55f598908cca60b6a47e5d49d', '99267bb7474e6d1d5d9e091ba5ef3ee8', '102738ef4b91408ed043d84fe785b50b', '94711f3e509518d073e1760d97550347'])\n"
 | 
			
		||||
      "dict_keys(['c1051233030a8e0232523052fd4a2310', '57d129e6fd7505d567fa13919e5e6bdd'])\n"
 | 
			
		||||
     ]
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "ba = Bundesanzeiger()\n",
 | 
			
		||||
    "reports = ba.get_reports(\"Atos IT-Dienstleistung und Beratung GmbH\")\n",
 | 
			
		||||
    "reports = ba.get_reports(\n",
 | 
			
		||||
    "    \"Volkswagen Economy Service Erdle Bernhard Erdle GmbH\"\n",
 | 
			
		||||
    ")  # \"Atos IT-Dienstleistung und Beratung GmbH\")\n",
 | 
			
		||||
    "print(reports.keys())"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 7,
 | 
			
		||||
   "execution_count": 34,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": [
 | 
			
		||||
@@ -75,7 +60,7 @@
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 8,
 | 
			
		||||
   "execution_count": 35,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
@@ -109,42 +94,18 @@
 | 
			
		||||
       "  <tbody>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>0</th>\n",
 | 
			
		||||
       "      <td>2023-03-17</td>\n",
 | 
			
		||||
       "      <td>Aufsichtsrat</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
 | 
			
		||||
       "      <td>2023-05-25</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
 | 
			
		||||
       "      <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>1</th>\n",
 | 
			
		||||
       "      <td>2022-03-25</td>\n",
 | 
			
		||||
       "      <td>2023-05-24</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>2</th>\n",
 | 
			
		||||
       "      <td>2021-03-11</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>3</th>\n",
 | 
			
		||||
       "      <td>2020-03-24</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>4</th>\n",
 | 
			
		||||
       "      <td>2018-12-11</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
 | 
			
		||||
       "      <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "  </tbody>\n",
 | 
			
		||||
@@ -153,35 +114,23 @@
 | 
			
		||||
      ],
 | 
			
		||||
      "text/plain": [
 | 
			
		||||
       "        date                                               name  \\\n",
 | 
			
		||||
       "0 2023-03-17                                       Aufsichtsrat   \n",
 | 
			
		||||
       "1 2022-03-25  Jahresabschluss zum Geschäftsjahr vom 01.01.20...   \n",
 | 
			
		||||
       "2 2021-03-11  Jahresabschluss zum Geschäftsjahr vom 01.01.20...   \n",
 | 
			
		||||
       "3 2020-03-24  Jahresabschluss zum Geschäftsjahr vom 01.01.20...   \n",
 | 
			
		||||
       "4 2018-12-11  Jahresabschluss zum Geschäftsjahr vom 01.01.20...   \n",
 | 
			
		||||
       "0 2023-05-25  Jahresabschluss zum Geschäftsjahr vom 01.01.20...   \n",
 | 
			
		||||
       "1 2023-05-24  Jahresabschluss zum Geschäftsjahr vom 01.01.20...   \n",
 | 
			
		||||
       "\n",
 | 
			
		||||
       "                                    company  \\\n",
 | 
			
		||||
       "0  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "1  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "2  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "3  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "4  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "                                             company  \\\n",
 | 
			
		||||
       "0  Volkswagen Economy Service Erdle Bernhard Erdl...   \n",
 | 
			
		||||
       "1  Volkswagen Economy Service Erdle Bernhard Erdl...   \n",
 | 
			
		||||
       "\n",
 | 
			
		||||
       "                                              report  \\\n",
 | 
			
		||||
       "0  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...   \n",
 | 
			
		||||
       "1  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...   \n",
 | 
			
		||||
       "2  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...   \n",
 | 
			
		||||
       "3  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...   \n",
 | 
			
		||||
       "4  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...   \n",
 | 
			
		||||
       "0  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...   \n",
 | 
			
		||||
       "1  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...   \n",
 | 
			
		||||
       "\n",
 | 
			
		||||
       "                                          raw_report  \n",
 | 
			
		||||
       "0  <div class=\"publication_container\">\\n <div cla...  \n",
 | 
			
		||||
       "1  <div class=\"publication_container\">\\n <div cla...  \n",
 | 
			
		||||
       "2  <div class=\"publication_container\">\\n <div cla...  \n",
 | 
			
		||||
       "3  <div class=\"publication_container\">\\n <div cla...  \n",
 | 
			
		||||
       "4  <div class=\"publication_container\">\\n <div cla...  "
 | 
			
		||||
       "1  <div class=\"publication_container\">\\n <div cla...  "
 | 
			
		||||
      ]
 | 
			
		||||
     },
 | 
			
		||||
     "execution_count": 8,
 | 
			
		||||
     "execution_count": 35,
 | 
			
		||||
     "metadata": {},
 | 
			
		||||
     "output_type": "execute_result"
 | 
			
		||||
    }
 | 
			
		||||
@@ -193,7 +142,7 @@
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 9,
 | 
			
		||||
   "execution_count": 36,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
@@ -228,46 +177,19 @@
 | 
			
		||||
       "  <tbody>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>0</th>\n",
 | 
			
		||||
       "      <td>2023-03-17</td>\n",
 | 
			
		||||
       "      <td>Aufsichtsrat</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...</td>\n",
 | 
			
		||||
       "      <td>2023-05-25</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
 | 
			
		||||
       "      <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "      <td>Aufsichtsrat</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>1</th>\n",
 | 
			
		||||
       "      <td>2022-03-25</td>\n",
 | 
			
		||||
       "      <td>2023-05-24</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>2</th>\n",
 | 
			
		||||
       "      <td>2021-03-11</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>3</th>\n",
 | 
			
		||||
       "      <td>2020-03-24</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>4</th>\n",
 | 
			
		||||
       "      <td>2018-12-11</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss zum Geschäftsjahr vom 01.01.20...</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...</td>\n",
 | 
			
		||||
       "      <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
 | 
			
		||||
       "      <td>\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "      <td>Jahresabschluss</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
@@ -277,35 +199,23 @@
 | 
			
		||||
      ],
 | 
			
		||||
      "text/plain": [
 | 
			
		||||
       "        date                                               name  \\\n",
 | 
			
		||||
       "0 2023-03-17                                       Aufsichtsrat   \n",
 | 
			
		||||
       "1 2022-03-25  Jahresabschluss zum Geschäftsjahr vom 01.01.20...   \n",
 | 
			
		||||
       "2 2021-03-11  Jahresabschluss zum Geschäftsjahr vom 01.01.20...   \n",
 | 
			
		||||
       "3 2020-03-24  Jahresabschluss zum Geschäftsjahr vom 01.01.20...   \n",
 | 
			
		||||
       "4 2018-12-11  Jahresabschluss zum Geschäftsjahr vom 01.01.20...   \n",
 | 
			
		||||
       "0 2023-05-25  Jahresabschluss zum Geschäftsjahr vom 01.01.20...   \n",
 | 
			
		||||
       "1 2023-05-24  Jahresabschluss zum Geschäftsjahr vom 01.01.20...   \n",
 | 
			
		||||
       "\n",
 | 
			
		||||
       "                                    company  \\\n",
 | 
			
		||||
       "0  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "1  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "2  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "3  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "4  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "                                             company  \\\n",
 | 
			
		||||
       "0  Volkswagen Economy Service Erdle Bernhard Erdl...   \n",
 | 
			
		||||
       "1  Volkswagen Economy Service Erdle Bernhard Erdl...   \n",
 | 
			
		||||
       "\n",
 | 
			
		||||
       "                                              report  \\\n",
 | 
			
		||||
       "0  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstl...   \n",
 | 
			
		||||
       "1  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...   \n",
 | 
			
		||||
       "2  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...   \n",
 | 
			
		||||
       "3  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtosIT-Dienstleistung...   \n",
 | 
			
		||||
       "4  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nAtos IT-Dienstleistun...   \n",
 | 
			
		||||
       "0  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...   \n",
 | 
			
		||||
       "1  \\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\nVolkswagen Economy Se...   \n",
 | 
			
		||||
       "\n",
 | 
			
		||||
       "                                          raw_report             type  \n",
 | 
			
		||||
       "0  <div class=\"publication_container\">\\n <div cla...     Aufsichtsrat  \n",
 | 
			
		||||
       "1  <div class=\"publication_container\">\\n <div cla...  Jahresabschluss  \n",
 | 
			
		||||
       "2  <div class=\"publication_container\">\\n <div cla...  Jahresabschluss  \n",
 | 
			
		||||
       "3  <div class=\"publication_container\">\\n <div cla...  Jahresabschluss  \n",
 | 
			
		||||
       "4  <div class=\"publication_container\">\\n <div cla...  Jahresabschluss  "
 | 
			
		||||
       "0  <div class=\"publication_container\">\\n <div cla...  Jahresabschluss  \n",
 | 
			
		||||
       "1  <div class=\"publication_container\">\\n <div cla...  Jahresabschluss  "
 | 
			
		||||
      ]
 | 
			
		||||
     },
 | 
			
		||||
     "execution_count": 9,
 | 
			
		||||
     "execution_count": 36,
 | 
			
		||||
     "metadata": {},
 | 
			
		||||
     "output_type": "execute_result"
 | 
			
		||||
    }
 | 
			
		||||
@@ -317,21 +227,9 @@
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 10,
 | 
			
		||||
   "execution_count": 37,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "name": "stderr",
 | 
			
		||||
     "output_type": "stream",
 | 
			
		||||
     "text": [
 | 
			
		||||
      "C:\\Users\\trist\\AppData\\Local\\Temp\\ipykernel_6460\\963182859.py:2: SettingWithCopyWarning: \n",
 | 
			
		||||
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
 | 
			
		||||
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
 | 
			
		||||
      "\n",
 | 
			
		||||
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
 | 
			
		||||
      "  df_jahresabschluss['jahr'] = df_jahresabschluss.name.apply(\n"
 | 
			
		||||
     ]
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
     "data": {
 | 
			
		||||
      "text/html": [
 | 
			
		||||
@@ -361,61 +259,34 @@
 | 
			
		||||
       "  </thead>\n",
 | 
			
		||||
       "  <tbody>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>1</th>\n",
 | 
			
		||||
       "      <td>2022-03-25</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <th>0</th>\n",
 | 
			
		||||
       "      <td>2023-05-25</td>\n",
 | 
			
		||||
       "      <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "      <td>2020</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>2</th>\n",
 | 
			
		||||
       "      <td>2021-03-11</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <th>1</th>\n",
 | 
			
		||||
       "      <td>2023-05-24</td>\n",
 | 
			
		||||
       "      <td>Volkswagen Economy Service Erdle Bernhard Erdl...</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "      <td>2019</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>3</th>\n",
 | 
			
		||||
       "      <td>2020-03-24</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "      <td>2018</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>4</th>\n",
 | 
			
		||||
       "      <td>2018-12-11</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "      <td>2017</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>6</th>\n",
 | 
			
		||||
       "      <td>2018-01-03</td>\n",
 | 
			
		||||
       "      <td>Atos IT-Dienstleistung und Beratung GmbH</td>\n",
 | 
			
		||||
       "      <td><div class=\"publication_container\">\\n <div cla...</td>\n",
 | 
			
		||||
       "      <td>2016</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "  </tbody>\n",
 | 
			
		||||
       "</table>\n",
 | 
			
		||||
       "</div>"
 | 
			
		||||
      ],
 | 
			
		||||
      "text/plain": [
 | 
			
		||||
       "        date                                   company  \\\n",
 | 
			
		||||
       "1 2022-03-25  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "2 2021-03-11  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "3 2020-03-24  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "4 2018-12-11  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "6 2018-01-03  Atos IT-Dienstleistung und Beratung GmbH   \n",
 | 
			
		||||
       "        date                                            company  \\\n",
 | 
			
		||||
       "0 2023-05-25  Volkswagen Economy Service Erdle Bernhard Erdl...   \n",
 | 
			
		||||
       "1 2023-05-24  Volkswagen Economy Service Erdle Bernhard Erdl...   \n",
 | 
			
		||||
       "\n",
 | 
			
		||||
       "                                          raw_report  jahr  \n",
 | 
			
		||||
       "1  <div class=\"publication_container\">\\n <div cla...  2020  \n",
 | 
			
		||||
       "2  <div class=\"publication_container\">\\n <div cla...  2019  \n",
 | 
			
		||||
       "3  <div class=\"publication_container\">\\n <div cla...  2018  \n",
 | 
			
		||||
       "4  <div class=\"publication_container\">\\n <div cla...  2017  \n",
 | 
			
		||||
       "6  <div class=\"publication_container\">\\n <div cla...  2016  "
 | 
			
		||||
       "0  <div class=\"publication_container\">\\n <div cla...  2020  \n",
 | 
			
		||||
       "1  <div class=\"publication_container\">\\n <div cla...  2019  "
 | 
			
		||||
      ]
 | 
			
		||||
     },
 | 
			
		||||
     "execution_count": 10,
 | 
			
		||||
     "execution_count": 37,
 | 
			
		||||
     "metadata": {},
 | 
			
		||||
     "output_type": "execute_result"
 | 
			
		||||
    }
 | 
			
		||||
@@ -439,7 +310,7 @@
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 11,
 | 
			
		||||
   "execution_count": 38,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": [
 | 
			
		||||
@@ -449,11 +320,12 @@
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 12,
 | 
			
		||||
   "execution_count": 39,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "sample_report = df_jahresabschluss.iloc[0].raw_report"
 | 
			
		||||
    "sample_report = df_jahresabschluss.iloc[0].raw_report\n",
 | 
			
		||||
    "sample_report_content = df_jahresabschluss.iloc[0].raw_report"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
@@ -466,45 +338,20 @@
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 14,
 | 
			
		||||
   "execution_count": 40,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "import re\n",
 | 
			
		||||
    "from dataclasses import dataclass\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "@dataclass\n",
 | 
			
		||||
    "class Auditor:\n",
 | 
			
		||||
    "    name: str\n",
 | 
			
		||||
    "    company: str\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "def extract_auditors(report: str) -> list:\n",
 | 
			
		||||
    "    auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
 | 
			
		||||
    "    hits = re.findall(auditor_regex, report)\n",
 | 
			
		||||
    "    return [hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip() for hit in hits]"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 15,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "data": {
 | 
			
		||||
      "text/plain": [
 | 
			
		||||
       "['Eckhard Lewe', 'Renate Hermsdorf']"
 | 
			
		||||
      ]
 | 
			
		||||
     },
 | 
			
		||||
     "execution_count": 15,
 | 
			
		||||
     "metadata": {},
 | 
			
		||||
     "output_type": "execute_result"
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "extract_auditors(sample_report)"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 16,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "def extract_auditor_company(report: str) -> str:\n",
 | 
			
		||||
    "    soup = BeautifulSoup(report, features=\"html.parser\")\n",
 | 
			
		||||
    "    temp = soup.find_all(\"b\")\n",
 | 
			
		||||
@@ -512,27 +359,37 @@
 | 
			
		||||
    "        br = elem.findChildren(\"br\")\n",
 | 
			
		||||
    "        if len(br) > 0:\n",
 | 
			
		||||
    "            return elem.text.split(\"\\n\")[1].strip()\n",
 | 
			
		||||
    "    return None"
 | 
			
		||||
    "    return None\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "def extract_auditors(report: str) -> list:\n",
 | 
			
		||||
    "    auditor_company = extract_auditor_company(report)\n",
 | 
			
		||||
    "    auditor_regex = r\"[a-z A-Z,.'-]+, Wirtschaftsprüfer\"\n",
 | 
			
		||||
    "    hits = re.findall(auditor_regex, report)\n",
 | 
			
		||||
    "    return [\n",
 | 
			
		||||
    "        Auditor(hit.replace(\", Wirtschaftsprüfer\", \"\").lstrip(), auditor_company)\n",
 | 
			
		||||
    "        for hit in hits\n",
 | 
			
		||||
    "    ]"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 17,
 | 
			
		||||
   "execution_count": 41,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "data": {
 | 
			
		||||
      "text/plain": [
 | 
			
		||||
       "'Warth & Klein Grant Thornton AG'"
 | 
			
		||||
       "[]"
 | 
			
		||||
      ]
 | 
			
		||||
     },
 | 
			
		||||
     "execution_count": 17,
 | 
			
		||||
     "execution_count": 41,
 | 
			
		||||
     "metadata": {},
 | 
			
		||||
     "output_type": "execute_result"
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "extract_auditor_company(sample_report)"
 | 
			
		||||
    "extract_auditors(sample_report)"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
@@ -561,97 +418,177 @@
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 18,
 | 
			
		||||
   "execution_count": 42,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "data": {
 | 
			
		||||
      "text/html": [
 | 
			
		||||
       "<div>\n",
 | 
			
		||||
       "<style scoped>\n",
 | 
			
		||||
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
			
		||||
       "        vertical-align: middle;\n",
 | 
			
		||||
       "    }\n",
 | 
			
		||||
       "\n",
 | 
			
		||||
       "    .dataframe tbody tr th {\n",
 | 
			
		||||
       "        vertical-align: top;\n",
 | 
			
		||||
       "    }\n",
 | 
			
		||||
       "\n",
 | 
			
		||||
       "    .dataframe thead th {\n",
 | 
			
		||||
       "        text-align: right;\n",
 | 
			
		||||
       "    }\n",
 | 
			
		||||
       "</style>\n",
 | 
			
		||||
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
			
		||||
       "  <thead>\n",
 | 
			
		||||
       "    <tr style=\"text-align: right;\">\n",
 | 
			
		||||
       "      <th></th>\n",
 | 
			
		||||
       "      <th>Unnamed: 0</th>\n",
 | 
			
		||||
       "      <th>Anhang</th>\n",
 | 
			
		||||
       "      <th>2020  TEUR</th>\n",
 | 
			
		||||
       "      <th>Vorjahr  TEUR</th>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "  </thead>\n",
 | 
			
		||||
       "  <tbody>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>0</th>\n",
 | 
			
		||||
       "      <td>1. Umsatzerlöse</td>\n",
 | 
			
		||||
       "      <td>(1)</td>\n",
 | 
			
		||||
       "      <td>69.819</td>\n",
 | 
			
		||||
       "      <td>77.429</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>1</th>\n",
 | 
			
		||||
       "      <td>2. Veränderung des Bestandes an unfertigen Lei...</td>\n",
 | 
			
		||||
       "      <td>NaN</td>\n",
 | 
			
		||||
       "      <td>-41.000</td>\n",
 | 
			
		||||
       "      <td>-66.000</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>2</th>\n",
 | 
			
		||||
       "      <td>3. Sonstige betriebliche Erträge</td>\n",
 | 
			
		||||
       "      <td>(2)</td>\n",
 | 
			
		||||
       "      <td>489.000</td>\n",
 | 
			
		||||
       "      <td>1.816</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>3</th>\n",
 | 
			
		||||
       "      <td>4. Materialaufwand</td>\n",
 | 
			
		||||
       "      <td>NaN</td>\n",
 | 
			
		||||
       "      <td>NaN</td>\n",
 | 
			
		||||
       "      <td>NaN</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "    <tr>\n",
 | 
			
		||||
       "      <th>4</th>\n",
 | 
			
		||||
       "      <td>a) Aufwendungen für bezogene Waren</td>\n",
 | 
			
		||||
       "      <td>NaN</td>\n",
 | 
			
		||||
       "      <td>-1.220</td>\n",
 | 
			
		||||
       "      <td>-3.003</td>\n",
 | 
			
		||||
       "    </tr>\n",
 | 
			
		||||
       "  </tbody>\n",
 | 
			
		||||
       "</table>\n",
 | 
			
		||||
       "</div>"
 | 
			
		||||
      ],
 | 
			
		||||
      "text/plain": [
 | 
			
		||||
       "                                          Unnamed: 0 Anhang  2020  TEUR  \\\n",
 | 
			
		||||
       "0                                    1. Umsatzerlöse    (1)      69.819   \n",
 | 
			
		||||
       "1  2. Veränderung des Bestandes an unfertigen Lei...    NaN     -41.000   \n",
 | 
			
		||||
       "2                   3. Sonstige betriebliche Erträge    (2)     489.000   \n",
 | 
			
		||||
       "3                                 4. Materialaufwand    NaN         NaN   \n",
 | 
			
		||||
       "4                 a) Aufwendungen für bezogene Waren    NaN      -1.220   \n",
 | 
			
		||||
       "\n",
 | 
			
		||||
       "   Vorjahr  TEUR  \n",
 | 
			
		||||
       "0         77.429  \n",
 | 
			
		||||
       "1        -66.000  \n",
 | 
			
		||||
       "2          1.816  \n",
 | 
			
		||||
       "3            NaN  \n",
 | 
			
		||||
       "4         -3.003  "
 | 
			
		||||
       "{'net_income': 23484.67, 'equity': 65083.84, 'current_assets': 357613.61}"
 | 
			
		||||
      ]
 | 
			
		||||
     },
 | 
			
		||||
     "execution_count": 18,
 | 
			
		||||
     "execution_count": 42,
 | 
			
		||||
     "metadata": {},
 | 
			
		||||
     "output_type": "execute_result"
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "def extract_kpis(report_content) -> dict:\n",
 | 
			
		||||
    "    \"\"\"\n",
 | 
			
		||||
    "    Source: https://github.com/bundesAPI/deutschland/pull/87/files#diff-f5b9db5384cf523fcc677056065041e7793bfc4da9cf74c4eebd6fab732739bd\n",
 | 
			
		||||
    "    Extracts Key Performance Indicators (KPIs) from the financial reports.\n",
 | 
			
		||||
    "    Args:\n",
 | 
			
		||||
    "        reports (dict): A dictionary containing the financial reports with their hash as keys and report details as values.\n",
 | 
			
		||||
    "    Returns:\n",
 | 
			
		||||
    "        dict: A dictionary containing the extracted KPIs with their report hash as keys and KPIs as values.\n",
 | 
			
		||||
    "    \"\"\"\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "    kpis = {}\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "    # Define KPI patterns to search for\n",
 | 
			
		||||
    "    kpi_patterns = {\n",
 | 
			
		||||
    "        \"revenue\": r\"(?:revenue|umsatz|erlöse)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"net_income\": r\"(?:net income|jahresüberschuss|nettoeinkommen|Ergebnis nach Steuern)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"ebit\": r\"(?:ebit|operating income)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"ebitda\": r\"(?:ebitda)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"gross_profit\": r\"(?:gross profit|bruttogewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"operating_profit\": r\"(?:operating profit|betriebsgewinn)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"assets\": r\"(?:total assets|bilanzsumme)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"liabilities\": r\"(?:total liabilities|gesamtverbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"equity\": r\"(?:shareholders'? equity|eigenkapital)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"current_assets\": r\"(?:current assets|umlaufvermögen)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"current_liabilities\": r\"(?:current liabilities|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"long_term_debt\": r\"(?:long[-\\s]?term debt|langfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"short_term_debt\": r\"(?:short[-\\s]?term debt|kurzfristige verbindlichkeiten)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"cash_and_cash_equivalents\": r\"(?:cash (?:and cash equivalents)?|barmittel)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"dividends\": r\"(?:dividends?|dividende)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "        \"cash_flow\": r\"(?:cash flow|cashflow|cash flow from operating activities)[:\\s]*([\\d,.]+[mmb]?)\",\n",
 | 
			
		||||
    "    }\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "    report_kpis = {}\n",
 | 
			
		||||
    "    for kpi, pattern in kpi_patterns.items():\n",
 | 
			
		||||
    "        match = re.search(pattern, report_content, flags=re.IGNORECASE | re.UNICODE)\n",
 | 
			
		||||
    "        if match:\n",
 | 
			
		||||
    "            value = match.group(1)\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "            # Clean and validate the extracted number\n",
 | 
			
		||||
    "            try:\n",
 | 
			
		||||
    "                if not value:  # Check if value is empty\n",
 | 
			
		||||
    "                    cleaned_value = None\n",
 | 
			
		||||
    "                else:\n",
 | 
			
		||||
    "                    multiplier = 1\n",
 | 
			
		||||
    "                    if value[-1].lower() == \"m\":\n",
 | 
			
		||||
    "                        value = value[:-1]\n",
 | 
			
		||||
    "                        multiplier = 1_000_000\n",
 | 
			
		||||
    "                    elif value[-1].lower() == \"b\":\n",
 | 
			
		||||
    "                        value = value[:-1]\n",
 | 
			
		||||
    "                        multiplier = 1_000_000_000\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "                    # Remove commas after checking for multipliers\n",
 | 
			
		||||
    "                    value = value.replace(\".\", \"\").replace(\",\", \".\").strip()\n",
 | 
			
		||||
    "                    cleaned_value = float(value) * multiplier\n",
 | 
			
		||||
    "            except ValueError:\n",
 | 
			
		||||
    "                cleaned_value = None\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "            if cleaned_value is not None:\n",
 | 
			
		||||
    "                report_kpis[kpi] = cleaned_value\n",
 | 
			
		||||
    "    return report_kpis\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "extract_kpis(\n",
 | 
			
		||||
    "    BeautifulSoup(sample_report, features=\"html.parser\").get_text().replace(\"\\n\", \" \")\n",
 | 
			
		||||
    ")"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 43,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "import os\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "with open(\"./temp.txt\", \"w\") as file:\n",
 | 
			
		||||
    "    file.write(\n",
 | 
			
		||||
    "        BeautifulSoup(sample_report, features=\"html.parser\")\n",
 | 
			
		||||
    "        .get_text()\n",
 | 
			
		||||
    "        .replace(\"\\n\", \" \")\n",
 | 
			
		||||
    "    )"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 46,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "name": "stdout",
 | 
			
		||||
     "output_type": "stream",
 | 
			
		||||
     "text": [
 | 
			
		||||
      "MultiIndex([('Aktiva', 'Unnamed: 0_level_1'),\n",
 | 
			
		||||
      "            ('Aktiva',    '31.12.2020  EUR'),\n",
 | 
			
		||||
      "            ('Aktiva',    '31.12.2019  EUR')],\n",
 | 
			
		||||
      "           )\n",
 | 
			
		||||
      "Aktiva  Unnamed: 0_level_1    object\n",
 | 
			
		||||
      "        31.12.2020  EUR       object\n",
 | 
			
		||||
      "        31.12.2019  EUR       object\n",
 | 
			
		||||
      "dtype: object\n",
 | 
			
		||||
      "MultiIndex([('Passiva', 'Unnamed: 0_level_1'),\n",
 | 
			
		||||
      "            ('Passiva',    '31.12.2020  EUR'),\n",
 | 
			
		||||
      "            ('Passiva',    '31.12.2019  EUR')],\n",
 | 
			
		||||
      "           )\n",
 | 
			
		||||
      "Passiva  Unnamed: 0_level_1    object\n",
 | 
			
		||||
      "         31.12.2020  EUR       object\n",
 | 
			
		||||
      "         31.12.2019  EUR       object\n",
 | 
			
		||||
      "dtype: object\n",
 | 
			
		||||
      "Index(['Angaben zur Identifikation der Gesellschaft laut Registergericht', 'Angaben zur Identifikation der Gesellschaft laut Registergericht.1'], dtype='object')\n",
 | 
			
		||||
      "Angaben zur Identifikation der Gesellschaft laut Registergericht      object\n",
 | 
			
		||||
      "Angaben zur Identifikation der Gesellschaft laut Registergericht.1    object\n",
 | 
			
		||||
      "dtype: object\n"
 | 
			
		||||
     ]
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
     "data": {
 | 
			
		||||
      "text/plain": [
 | 
			
		||||
       "{}"
 | 
			
		||||
      ]
 | 
			
		||||
     },
 | 
			
		||||
     "execution_count": 46,
 | 
			
		||||
     "metadata": {},
 | 
			
		||||
     "output_type": "execute_result"
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "def parse_tables(report: str) -> list:\n",
 | 
			
		||||
    "    result = {}\n",
 | 
			
		||||
    "    soup = BeautifulSoup(report, features=\"html.parser\")\n",
 | 
			
		||||
    "    for table in soup.find_all(\"table\", {\"class\": \"std_table\"}):\n",
 | 
			
		||||
    "        df = pd.read_html(StringIO(str(table)))[0]\n",
 | 
			
		||||
    "        print(df.columns)\n",
 | 
			
		||||
    "        print(df.dtypes)\n",
 | 
			
		||||
    "    return result\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "\n",
 | 
			
		||||
    "parse_tables(sample_report)"
 | 
			
		||||
   ]
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 45,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "ename": "KeyError",
 | 
			
		||||
     "evalue": "'Passiva'",
 | 
			
		||||
     "output_type": "error",
 | 
			
		||||
     "traceback": [
 | 
			
		||||
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
 | 
			
		||||
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
 | 
			
		||||
      "\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Bundesanzeiger\\notebook.ipynb Cell 21\u001b[0m in \u001b[0;36m1\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m     \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m bilanz \u001b[39m=\u001b[39m get_bilanz(sample_report)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Bundesanzeiger/notebook.ipynb#X26sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m bilanz[\u001b[39m\"\u001b[39;49m\u001b[39mPassiva\u001b[39;49m\u001b[39m\"\u001b[39;49m]\u001b[39m.\u001b[39mhead()\n",
 | 
			
		||||
      "\u001b[1;31mKeyError\u001b[0m: 'Passiva'"
 | 
			
		||||
     ]
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
   "source": [
 | 
			
		||||
    "def get_bilanz(report: str) -> any:\n",
 | 
			
		||||
    "    result = {}\n",
 | 
			
		||||
@@ -672,30 +609,30 @@
 | 
			
		||||
  },
 | 
			
		||||
  {
 | 
			
		||||
   "cell_type": "code",
 | 
			
		||||
   "execution_count": 19,
 | 
			
		||||
   "execution_count": null,
 | 
			
		||||
   "metadata": {},
 | 
			
		||||
   "outputs": [
 | 
			
		||||
    {
 | 
			
		||||
     "name": "stdout",
 | 
			
		||||
     "output_type": "stream",
 | 
			
		||||
     "text": [
 | 
			
		||||
      "Index(['Gesellschafterbeschluss', 'Shareholder Resolution'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', 'Anhang', '31.12.2020  TEUR', 'Vorjahr  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', 'Anhang', '2020  TEUR', 'Vorjahr  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Aufgliederung nach Tätigkeitsbereichen', '2020  TEUR',\n",
 | 
			
		||||
      "Int64Index([0, 1], dtype='int64')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', 'Anhang', '31.12.2021  TEUR', 'Vorjahr  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', 'Anhang', '2021  TEUR', 'Vorjahr  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Aufgliederung nach Tätigkeitsbereichen', '2021  TEUR',\n",
 | 
			
		||||
      "       'Vorjahr  TEUR'],\n",
 | 
			
		||||
      "      dtype='object')\n",
 | 
			
		||||
      "Index(['Aufgliederung nach Inland und Ausland', '2020  TEUR', 'Vorjahr  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', '31.12.2020  TEUR', 'Vorjahr  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', '31.12.2020  TEUR', 'Vorjahr  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', '31.12.2020'], dtype='object')\n",
 | 
			
		||||
      "Index(['Aufgliederung nach Inland und Ausland', '2021  TEUR', 'Vorjahr  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', '31.12.2021  TEUR', 'Vorjahr  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', '31.12.2021  TEUR', 'Vorjahr  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', '31.12.2021'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
 | 
			
		||||
      "Int64Index([0, 1, 2], dtype='int64')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', 'TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', '31.12.2020  TEUR', 'Vorjahr  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', '2020 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', '31.12.2021  TEUR', 'Vorjahr  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Unnamed: 0', '2021 Anzahl MA', 'Vorjahr Anzahl MA'], dtype='object')\n",
 | 
			
		||||
      "MultiIndex([('Art des Geschäfts',           'Unnamed: 0_level_1'),\n",
 | 
			
		||||
      "            ('Art der Beziehung',       'Gesellschafterin  TEUR'),\n",
 | 
			
		||||
      "            ('Art der Beziehung', 'Verbundene Unternehmen  TEUR')],\n",
 | 
			
		||||
@@ -707,24 +644,23 @@
 | 
			
		||||
      "            ('Anschaffungs- oder Herstellungskosten', ...),\n",
 | 
			
		||||
      "            ('Anschaffungs- oder Herstellungskosten', ...)],\n",
 | 
			
		||||
      "           )\n",
 | 
			
		||||
      "MultiIndex([('Unnamed: 0_level_0',                      'Unnamed: 0_level_1'),\n",
 | 
			
		||||
      "            (    'Abschreibungen',                   'Stand 01.01.2020  EUR'),\n",
 | 
			
		||||
      "            (    'Abschreibungen', 'Abschreibungen des Geschäftsjahres  EUR'),\n",
 | 
			
		||||
      "            (    'Abschreibungen',                  'Abgänge Umbuchung  EUR'),\n",
 | 
			
		||||
      "            (    'Abschreibungen',                   'Stand 31.12.2020  EUR')],\n",
 | 
			
		||||
      "MultiIndex([('Unnamed: 0_level_0', ...),\n",
 | 
			
		||||
      "            (    'Abschreibungen', ...),\n",
 | 
			
		||||
      "            (    'Abschreibungen', ...),\n",
 | 
			
		||||
      "            (    'Abschreibungen', ...),\n",
 | 
			
		||||
      "            (    'Abschreibungen', ...)],\n",
 | 
			
		||||
      "           )\n",
 | 
			
		||||
      "MultiIndex([('Unnamed: 0_level_0',    'Unnamed: 0_level_1'),\n",
 | 
			
		||||
      "            (         'Buchwerte', 'Stand 31.12.2020  EUR'),\n",
 | 
			
		||||
      "            (         'Buchwerte', 'Stand 31.12.2019  EUR')],\n",
 | 
			
		||||
      "            (         'Buchwerte', 'Stand 31.12.2021  EUR'),\n",
 | 
			
		||||
      "            (         'Buchwerte', 'Stand 31.12.2020  EUR')],\n",
 | 
			
		||||
      "           )\n",
 | 
			
		||||
      "Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2020', '2019',\n",
 | 
			
		||||
      "       '2018'],\n",
 | 
			
		||||
      "Index(['Nichtfinanzieller Leistungsindikator', 'Unnamed: 1', '2021', '2020',\n",
 | 
			
		||||
      "       '2019'],\n",
 | 
			
		||||
      "      dtype='object')\n",
 | 
			
		||||
      "Index(['Gewinn- und Verlustrechnung', '2020  TEUR', 'Vorjahr  TEUR',\n",
 | 
			
		||||
      "Index(['Gewinn- und Verlustrechnung', '2021  TEUR', 'Vorjahr  TEUR',\n",
 | 
			
		||||
      "       'Veränderung  TEUR'],\n",
 | 
			
		||||
      "      dtype='object')\n",
 | 
			
		||||
      "Index(['Bilanz', '31.12.2020  TEUR', 'Vorjahr  TEUR', 'Veränderung  TEUR'], dtype='object')\n",
 | 
			
		||||
      "Index(['Bericht des Aufsichtsrats', 'Report of the Supervisory Board'], dtype='object')\n"
 | 
			
		||||
      "Index(['Bilanz', '31.12.2021  TEUR', 'Vorjahr  TEUR', 'Veränderung  TEUR'], dtype='object')\n"
 | 
			
		||||
     ]
 | 
			
		||||
    }
 | 
			
		||||
   ],
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										1
									
								
								Jupyter/API-tests/News/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								Jupyter/API-tests/News/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1 @@
 | 
			
		||||
data/
 | 
			
		||||
							
								
								
									
										879
									
								
								Jupyter/API-tests/News/notebook.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										879
									
								
								Jupyter/API-tests/News/notebook.ipynb
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										1
									
								
								Jupyter/API-tests/News/requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								Jupyter/API-tests/News/requirements.txt
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1 @@
 | 
			
		||||
pymongo
 | 
			
		||||
							
								
								
									
										1
									
								
								Jupyter/API-tests/Unternehmensregister/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								Jupyter/API-tests/Unternehmensregister/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1 @@
 | 
			
		||||
data/*
 | 
			
		||||
							
								
								
									
										192
									
								
								Jupyter/API-tests/Unternehmensregister/main.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										192
									
								
								Jupyter/API-tests/Unternehmensregister/main.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,192 @@
 | 
			
		||||
"""Unternehmensregister Scraping."""
 | 
			
		||||
import glob
 | 
			
		||||
import logging
 | 
			
		||||
import multiprocessing
 | 
			
		||||
import os
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
 | 
			
		||||
from selenium import webdriver
 | 
			
		||||
from selenium.webdriver.common.by import By
 | 
			
		||||
from selenium.webdriver.support import expected_conditions as ec
 | 
			
		||||
from selenium.webdriver.support.ui import WebDriverWait
 | 
			
		||||
from tqdm import tqdm
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def scrape(query: str, download_dir: list[str]) -> None:
    """Fetch results from Unternehmensregister for given query.

    Opens a headless Chrome session, submits the query through the search
    form and then, for every result page, follows each
    "Registerinformationen des Registergerichts" link, requests the "SI"
    document via the document basket and downloads it. A downloaded file is
    renamed to the alphanumeric characters of the company name plus ".xml".

    Failures while waiting for or renaming a single download are logged and
    skipped so the remaining companies are still processed.

    Args:
        query (str): Search Query (RegEx supported)
        download_dir (list[str]): Path components, relative to the current
            working directory, of the directory to place output files in
    """
    download_path = os.path.join(str(Path.cwd()), *download_dir)
    start_url = "https://www.unternehmensregister.de/ureg/"

    options = webdriver.ChromeOptions()
    preferences = {
        "profile.default_content_settings.popups": 0,
        "safebrowsing.enabled": True,
        "download": {
            "directory_upgrade": True,
            "prompt_for_download": False,
            "extensions_to_open": "",
            "default_directory": download_path,
        },
    }
    options.add_argument("--headless=new")
    options.add_experimental_option("prefs", preferences)

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(start_url)
        # Accept Cookies
        driver.find_elements(
            By.XPATH, '//button[text()="Nur technisch notwendige Cookies akzeptieren"]'
        )[0].click()
        # Enter search query
        driver.find_elements(By.ID, "globalSearchForm:extendedResearchCompanyName")[
            0
        ].send_keys(query)
        # Trigger search
        driver.find_elements(By.ID, "globalSearchForm:btnExecuteSearchOld")[0].click()
        # Wait for results: the search is done once the browser left the start page
        wait = WebDriverWait(driver, 15)
        wait.until(lambda driver: driver.current_url != start_url)

        # The page counter text starts with the number of result pages
        num_pages = int(
            driver.find_element(By.XPATH, '//*[@class="page_count"]').text.split(" ")[0]
        )

        # Names already downloaded; a set keeps the membership test cheap
        processed_companies = set()

        for _ in tqdm(range(num_pages)):
            # Find all "Registerinformationen"
            companies_tab = driver.find_elements(
                By.LINK_TEXT, "Registerinformationen des Registergerichts"
            )
            company_names = [
                elem.text
                for elem in driver.find_elements(
                    By.XPATH, '//div[@class="company_result"]/span/b'
                )
            ]
            # NOTE(review): the links in companies_tab were located before any
            # navigation; after driver.back() they may be stale — confirm.
            for index, company_link in enumerate(companies_tab):
                company_name = company_names[index]
                if company_name in processed_companies:
                    continue
                # Go to intermediary page
                company_link.click()
                # Trigger next redirect
                driver.find_element(
                    By.LINK_TEXT, "Registerinformationen anzeigen"
                ).click()
                # Trigger SI download
                driver.find_element(By.LINK_TEXT, "SI").click()
                # Show shopping cart
                wait.until(
                    ec.visibility_of_element_located(
                        (By.LINK_TEXT, "Dokumentenkorb ansehen")
                    )
                )
                driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
                # Get document (second to last <input> on the basket page)
                elems = driver.find_elements(By.TAG_NAME, "input")
                elems[-2].click()

                wait.until(
                    ec.visibility_of_element_located(
                        (By.ID, "paymentFormOverview:btnNext")
                    )
                )
                driver.find_element(By.ID, "paymentFormOverview:btnNext").click()

                wait.until(
                    ec.visibility_of_element_located(
                        (By.LINK_TEXT, "Zum Dokumentenkorb")
                    )
                )
                driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()

                # Remember the file count so the finished download can be detected
                num_files = get_num_files(download_path)
                driver.find_element(By.CLASS_NAME, "download-wrapper").click()

                try:
                    # WebDriverWait.until expects a callable that is polled;
                    # wrap the check so it is re-evaluated on every poll
                    # instead of handing over an already computed bool.
                    wait.until(
                        lambda _driver: wait_for_download_condition(
                            download_path, num_files
                        )
                    )
                    file_name = "".join(e for e in company_name if e.isalnum()) + ".xml"
                    rename_latest_file(
                        download_path,
                        file_name,
                    )
                    processed_companies.add(company_name)
                except Exception:
                    # Best effort: keep going with the next company, but keep
                    # the traceback so the failure can be diagnosed.
                    logger.warning("Exception caught in Scraping", exc_info=True)
                finally:
                    # Step back through the six pages visited for this company
                    for _ in range(6):
                        driver.back()
            # Go to the next result page
            driver.find_element(By.XPATH, '//*[@class="fas fa-angle-right"]').click()
    finally:
        # quit() ends the whole browser session, also when scraping failed
        driver.quit()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def wait_for_download_condition(
    path: str, num_files: int, pattern: str = "*.xml"
) -> bool:
    """Check whether a directory holds more matching files than before.

    The check is evaluated once, at call time. To use it as a Selenium wait
    condition it has to be wrapped in a callable, so that it is re-evaluated
    on every poll.

    Args:
        path (str): Directory path
        num_files (int): Number of matching files counted earlier
        pattern (str, optional): File pattern. Defaults to "*.xml".

    Returns:
        bool: True if the directory now contains more than ``num_files``
            files matching ``pattern``
    """
    # glob.escape keeps special characters in the directory name literal;
    # only ``pattern`` is interpreted as a wildcard expression.
    matches = glob.glob(os.path.join(glob.escape(path), pattern))
    return len(matches) > num_files
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_num_files(path: str, pattern: str = "*.xml") -> int:
    """Get number of files in directory.

    Args:
        path (str): Directory to scan
        pattern (str, optional): File pattern. Defaults to "*.xml".

    Returns:
        int: Number of files matching pattern (0 if the directory does not
            exist)
    """
    # glob.escape keeps special characters in the directory name literal;
    # only ``pattern`` is interpreted as a wildcard expression.
    return len(glob.glob(os.path.join(glob.escape(path), pattern)))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def rename_latest_file(path: str, filename: str, pattern: str = "*.xml"):
 | 
			
		||||
    """Rename file in dir with latest change date.
 | 
			
		||||
 | 
			
		||||
    Args:
 | 
			
		||||
        path (str): Dir to check
 | 
			
		||||
        filename (str): Name of file
 | 
			
		||||
        pattern (str, optional): File pattern. Defaults to "*.xml".
 | 
			
		||||
    """
 | 
			
		||||
    list_of_files = [os.path.join(path, file) for file in glob.glob1(path, pattern)]
 | 
			
		||||
    latest_download = max(list_of_files, key=os.path.getctime)
 | 
			
		||||
    os.rename(latest_download, os.path.join(path, filename))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
    # Main procedure: read the company names from the "Toplist" sheet and run
    # `scrape` once per name, spread over a pool of worker processes.
    import pandas as pd

    df_relevant_companies = pd.read_excel(
        "./data/study_id42887_top-100-unternehmen-deutschland.xlsx",
        sheet_name="Toplist",
        skiprows=1,
    )
    # Drop rows without a company name.
    df_relevant_companies = df_relevant_companies[df_relevant_companies["Name"].notna()]

    batch_size = 5
    # One argument tuple per company: the name itself plus a list of path parts
    # ending in the stripped name.
    params = [
        (query, ["data", "Unternehmensregister", "scraping", query.strip()])
        for query in df_relevant_companies.Name
    ]

    pool = multiprocessing.Pool(processes=batch_size)
    try:
        # Blocks until every call has returned (or one of them raised).
        pool.starmap(scrape, params)
    finally:
        # Always stop accepting work and wait for the workers, even if a task
        # failed, so no worker processes are left behind.
        pool.close()
        pool.join()
 | 
			
		||||
							
								
								
									
										4322
									
								
								Jupyter/API-tests/Unternehmensregister/notebook.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4322
									
								
								Jupyter/API-tests/Unternehmensregister/notebook.ipynb
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										10
									
								
								Jupyter/API-tests/Unternehmensregister/requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								Jupyter/API-tests/Unternehmensregister/requirements.txt
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,10 @@
 | 
			
		||||
ocrmypdf 
 | 
			
		||||
pytesseract 
 | 
			
		||||
opencv-python 
 | 
			
		||||
pdf2image
 | 
			
		||||
bs4
 | 
			
		||||
selenium
 | 
			
		||||
xmltodict
 | 
			
		||||
tqdm
 | 
			
		||||
openpyxl
 | 
			
		||||
pandas
 | 
			
		||||
							
								
								
									
										28
									
								
								Jupyter/API-tests/docker-compose.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								Jupyter/API-tests/docker-compose.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,28 @@
 | 
			
		||||
version: '3.8'
 | 
			
		||||
services:
 | 
			
		||||
  mongodb:
 | 
			
		||||
    image: mongo:6.0.6
 | 
			
		||||
    container_name: mongodb
 | 
			
		||||
    restart: unless-stopped
 | 
			
		||||
    environment:
 | 
			
		||||
      MONGO_INITDB_ROOT_USERNAME: root
 | 
			
		||||
      MONGO_INITDB_ROOT_PASSWORD: pR0R0v2e2
 | 
			
		||||
      MONGO_INITDB_DATABASE: transparenzregister
 | 
			
		||||
    ports:
 | 
			
		||||
    - 27017:27017
 | 
			
		||||
    volumes:
 | 
			
		||||
    - mongodb_data:/data/db
 | 
			
		||||
 | 
			
		||||
  mongo-express:
 | 
			
		||||
    image: mongo-express:1.0.0-alpha
 | 
			
		||||
    container_name: mongo-express
 | 
			
		||||
    restart: unless-stopped
 | 
			
		||||
    ports:
 | 
			
		||||
    - 8081:8081
 | 
			
		||||
    environment:
 | 
			
		||||
      ME_CONFIG_MONGODB_SERVER: mongodb
 | 
			
		||||
      ME_CONFIG_MONGODB_ADMINUSERNAME: root
 | 
			
		||||
      ME_CONFIG_MONGODB_ADMINPASSWORD: pR0R0v2e2
 | 
			
		||||
 | 
			
		||||
volumes:
 | 
			
		||||
  mongodb_data:
 | 
			
		||||
@@ -1,35 +1,35 @@
 | 
			
		||||
---
 | 
			
		||||
title: "Automatisierte Daten Extraktion aus Internetquellen als Grundlage für die Analyse von Kapitalgesellschaften"
 | 
			
		||||
author: "Nolde, Tristan Norbert"
 | 
			
		||||
date: "2023-05-06"
 | 
			
		||||
---
 | 
			
		||||
 | 
			
		||||
# Abstract: Automatisierte Daten Extraktion aus Internetquellen als Grundlage für die Analyse von Kapitalgesellschaften
 | 
			
		||||
 | 
			
		||||
## Gliederung
 | 
			
		||||
1. Einleitung (Zielsetzung/Problemstellung, Vorgehen)
 | 
			
		||||
2. Web Scraping/Crawling
 | 
			
		||||
   1. Definition und Theorie
 | 
			
		||||
   2. Technologien
 | 
			
		||||
   3. Umsetzung
 | 
			
		||||
3. RSS Feeds
 | 
			
		||||
   1. Definition und Theorie
 | 
			
		||||
   2. Technologien
 | 
			
		||||
   3. Umsetzung
 | 
			
		||||
4. APIs
 | 
			
		||||
   1. Definition und Theorie
 | 
			
		||||
   2. Technologien
 | 
			
		||||
   3. Umsetzung
 | 
			
		||||
5. Rechtliche Rahmenbedingungen
 | 
			
		||||
6. Vergleich der Lösungsansätze
 | 
			
		||||
7. Zusammenfassung
 | 
			
		||||
 | 
			
		||||
## Inhalt
 | 
			
		||||
 | 
			
		||||
In Zeiten von Big Data und AI stellen Daten und ihre Verfügbarkeit zunehmend eines der wichtigsten Wirtschaftsgüter dar. Als solches können sie auch eingesetzt werden, um Kapitalgesellschaften (eine Subklasse von Unternehmen) anhand verschiedener Kennzahlen wie der Mitarbeiterzahl oder dem Jahresgewinn zu analysieren. Obwohl solche Daten zu Genüge in Zeitungsartikeln, Newslettern oder dedizierten Aktienanalysen zu finden sind, so gestaltet sich eine automatisierte Extraktion dieser Daten aufgrund verschiedener Formate sowie weiterer Restriktionen schwierig.
 | 
			
		||||
 | 
			
		||||
Daher sollen im Rahmen dieser Seminararbeit verschiedene Wege betrachtet werden, die eben diese Daten erheben und zur Verfügung stellen können. Zu den nennenswerten Quellen gehören: Der Bundesanzeiger, RSS Feeds, Nachrichten APIs. Ziel ist es, aus diesen Quellen wertvolle Informationen bezogen auf den wirtschaftlichen Erfolg einer Kapitalgesellschaft sowie aktueller Nachrichten zu extrahieren und in ein einheitliches Format zu überführen.
 | 
			
		||||
 | 
			
		||||
Neben dem technischen Einsatz von Web Scraping/Crawling, um Informationen aus Webseiten zu gewinnen, sowie dem Abfragen verfügbarer APIs soll auch der rechtliche Aspekt dieses Vorgehens Berücksichtigung finden, um die Rechtmäßigkeit zu bewerten.
 | 
			
		||||
 | 
			
		||||
Abschließend wird der Einsatz der verschiedenen Technologien anhand der Faktoren Flexibilität, Simplizität, Verfügbarkeit und Rechtmäßigkeit verglichen, ein Fazit gezogen sowie ein Ausblick auf den weiteren Einsatz gegeben.
 | 
			
		||||
---
 | 
			
		||||
title: "Automatisierte Daten Extraktion aus Internetquellen als Grundlage für die Analyse von Kapitalgesellschaften"
 | 
			
		||||
author: "Nolde, Tristan Norbert"
 | 
			
		||||
date: "2023-05-06"
 | 
			
		||||
---
 | 
			
		||||
 | 
			
		||||
# Abstract: Automatisierte Daten Extraktion aus Internetquellen als Grundlage für die Analyse von Kapitalgesellschaften
 | 
			
		||||
 | 
			
		||||
## Gliederung
 | 
			
		||||
1. Einleitung (Zielsetzung/Problemstellung, Vorgehen)
 | 
			
		||||
2. Web Scraping/Crawling
 | 
			
		||||
   1. Definition und Theorie
 | 
			
		||||
   2. Technologien
 | 
			
		||||
   3. Umsetzung
 | 
			
		||||
3. RSS Feeds
 | 
			
		||||
   1. Definition und Theorie
 | 
			
		||||
   2. Technologien
 | 
			
		||||
   3. Umsetzung
 | 
			
		||||
4. APIs
 | 
			
		||||
   1. Definition und Theorie
 | 
			
		||||
   2. Technologien
 | 
			
		||||
   3. Umsetzung
 | 
			
		||||
5. Rechtliche Rahmenbedingungen
 | 
			
		||||
6. Vergleich der Lösungsansätze
 | 
			
		||||
7. Zusammenfassung
 | 
			
		||||
 | 
			
		||||
## Inhalt
 | 
			
		||||
 | 
			
		||||
In Zeiten von Big Data und AI stellen Daten und ihre Verfügbarkeit zunehmend eines der wichtigsten Wirtschaftsgüter dar. Als solches können sie auch eingesetzt werden, um Kapitalgesellschaften (eine Subklasse von Unternehmen) anhand verschiedener Kennzahlen wie der Mitarbeiterzahl oder dem Jahresgewinn zu analysieren. Obwohl solche Daten zu Genüge in Zeitungsartikeln, Newslettern oder dedizierten Aktienanalysen zu finden sind, so gestaltet sich eine automatisierte Extraktion dieser Daten aufgrund verschiedener Formate sowie weiterer Restriktionen schwierig.
 | 
			
		||||
 | 
			
		||||
Daher sollen im Rahmen dieser Seminararbeit verschiedene Wege betrachtet werden, die eben diese Daten erheben und zur Verfügung stellen können. Zu den nennenswerten Quellen gehören: Der Bundesanzeiger, RSS Feeds, Nachrichten APIs. Ziel ist es, aus diesen Quellen wertvolle Informationen bezogen auf den wirtschaftlichen Erfolg einer Kapitalgesellschaft sowie aktueller Nachrichten zu extrahieren und in ein einheitliches Format zu überführen.
 | 
			
		||||
 | 
			
		||||
Neben dem technischen Einsatz von Web Scraping/Crawling, um Informationen aus Webseiten zu gewinnen, sowie dem Abfragen verfügbarer APIs soll auch der rechtliche Aspekt dieses Vorgehens Berücksichtigung finden, um die Rechtmäßigkeit zu bewerten.
 | 
			
		||||
 | 
			
		||||
Abschließend wird der Einsatz der verschiedenen Technologien anhand der Faktoren Flexibilität, Simplizität, Verfügbarkeit und Rechtmäßigkeit verglichen, ein Fazit gezogen sowie ein Ausblick auf den weiteren Einsatz gegeben.
 | 
			
		||||
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										7616
									
								
								poetry.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										7616
									
								
								poetry.lock
									
									
									
										generated
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										220
									
								
								pyproject.toml
									
									
									
									
									
								
							
							
						
						
									
										220
									
								
								pyproject.toml
									
									
									
									
									
								
							@@ -1,107 +1,113 @@
 | 
			
		||||
[build-system]
 | 
			
		||||
build-backend = "poetry.core.masonry.api"
 | 
			
		||||
requires = ["poetry-core"]
 | 
			
		||||
 | 
			
		||||
[tool.mypy]
 | 
			
		||||
disallow_untyped_defs = true
 | 
			
		||||
follow_imports = "silent"
 | 
			
		||||
python_version = "3.11"
 | 
			
		||||
warn_redundant_casts = true
 | 
			
		||||
warn_unused_ignores = true
 | 
			
		||||
 | 
			
		||||
[tool.black]
 | 
			
		||||
target-version = ["py311"]
 | 
			
		||||
 | 
			
		||||
[tool.coverage.run]
 | 
			
		||||
branch = true
 | 
			
		||||
dynamic_context = "test_function"
 | 
			
		||||
relative_files = true
 | 
			
		||||
source = ["src"]
 | 
			
		||||
 | 
			
		||||
[tool.poetry]
 | 
			
		||||
authors = ["AKI Projektgruppe 23"]
 | 
			
		||||
description = "A project analysing the german transparenzregister and other data sources to find shared business interests and shared personal and other links for lots of companies."
 | 
			
		||||
name = "aki-prj23-transparenzregister"
 | 
			
		||||
packages = [{include = "aki_prj23_transparenzregister", from = "src"}]
 | 
			
		||||
readme = "README.md"
 | 
			
		||||
version = "0.1.0"
 | 
			
		||||
 | 
			
		||||
[tool.poetry.dependencies]
 | 
			
		||||
loguru = "^0.7.0"
 | 
			
		||||
matplotlib = "^3.7.1"
 | 
			
		||||
plotly = "^5.14.1"
 | 
			
		||||
python = "^3.11"
 | 
			
		||||
seaborn = "^0.12.2"
 | 
			
		||||
tqdm = "^4.65.0"
 | 
			
		||||
 | 
			
		||||
[tool.poetry.group.develop.dependencies]
 | 
			
		||||
black = {extras = ["jupyter"], version = "^23.3.0"}
 | 
			
		||||
jupyterlab = "^4.0.0"
 | 
			
		||||
nbconvert = "^7.4.0"
 | 
			
		||||
pre-commit = "^3.3.2"
 | 
			
		||||
rise = "^5.7.1"
 | 
			
		||||
 | 
			
		||||
[tool.poetry.group.doc.dependencies]
 | 
			
		||||
jupyter = "^1.0.0"
 | 
			
		||||
myst-parser = "^1.0.0"
 | 
			
		||||
nbsphinx = "^0.9.2"
 | 
			
		||||
sphinx = "^6.0.0"
 | 
			
		||||
sphinx-copybutton = "^0.5.2"
 | 
			
		||||
sphinx-rtd-theme = "^1.2.1"
 | 
			
		||||
sphinx_autodoc_typehints = "*"
 | 
			
		||||
sphinxcontrib-mermaid = "^0.9.2"
 | 
			
		||||
sphinxcontrib-napoleon = "^0.7"
 | 
			
		||||
 | 
			
		||||
[tool.poetry.group.lint.dependencies]
 | 
			
		||||
black = "^23.3.0"
 | 
			
		||||
mypy = "^1.3.0"
 | 
			
		||||
pandas-stubs = "^2.0.1.230501"
 | 
			
		||||
ruff = "^0.0.270"
 | 
			
		||||
types-requests = "^2.31.0.1"
 | 
			
		||||
 | 
			
		||||
[tool.poetry.group.test.dependencies]
 | 
			
		||||
pytest = "^7.3.1"
 | 
			
		||||
pytest-clarity = "^1.0.1"
 | 
			
		||||
pytest-cov = "^4.1.0"
 | 
			
		||||
pytest-mock = "^3.10.0"
 | 
			
		||||
pytest-repeat = "^0.9.1"
 | 
			
		||||
 | 
			
		||||
[tool.ruff]
 | 
			
		||||
exclude = [
 | 
			
		||||
  ".bzr",
 | 
			
		||||
  ".direnv",
 | 
			
		||||
  ".eggs",
 | 
			
		||||
  ".git",
 | 
			
		||||
  ".git-rewrite",
 | 
			
		||||
  ".hg",
 | 
			
		||||
  ".mypy_cache",
 | 
			
		||||
  ".nox",
 | 
			
		||||
  ".pants.d",
 | 
			
		||||
  ".pytype",
 | 
			
		||||
  ".ruff_cache",
 | 
			
		||||
  ".svn",
 | 
			
		||||
  ".tox",
 | 
			
		||||
  ".venv",
 | 
			
		||||
  "__pypackages__",
 | 
			
		||||
  "_build",
 | 
			
		||||
  "buck-out",
 | 
			
		||||
  "build",
 | 
			
		||||
  "dist",
 | 
			
		||||
  "node_modules",
 | 
			
		||||
  "venv"
 | 
			
		||||
]
 | 
			
		||||
# Never enforce `E501` (line length violations).
 | 
			
		||||
ignore = ["E501"]
 | 
			
		||||
line-length = 88
 | 
			
		||||
# Enable flake8-bugbear (`B`) rules.
 | 
			
		||||
select = ["E", "F", "B", "I", "S", "RSE", "RET", "SLF", "SIM", "TID", "PD", "PL", "PLE", "PLR", "PLW", "NPY", "UP", "D", "N", "A", "C4", "T20", "PT"]
 | 
			
		||||
src = ["src"]
 | 
			
		||||
target-version = "py311"
 | 
			
		||||
# Avoid trying to fix flake8-bugbear (`B`) violations.
 | 
			
		||||
unfixable = ["B"]
 | 
			
		||||
 | 
			
		||||
[tool.ruff.per-file-ignores]
 | 
			
		||||
"tests/*.py" = ["S101"]
 | 
			
		||||
 | 
			
		||||
[tool.ruff.pydocstyle]
 | 
			
		||||
convention = "google"
 | 
			
		||||
[build-system]
 | 
			
		||||
build-backend = "poetry.core.masonry.api"
 | 
			
		||||
requires = ["poetry-core"]
 | 
			
		||||
 | 
			
		||||
[tool.mypy]
 | 
			
		||||
disallow_untyped_defs = true
 | 
			
		||||
follow_imports = "silent"
 | 
			
		||||
python_version = "3.11"
 | 
			
		||||
warn_redundant_casts = true
 | 
			
		||||
warn_unused_ignores = true
 | 
			
		||||
 | 
			
		||||
[tool.black]
 | 
			
		||||
target-version = ["py311"]
 | 
			
		||||
 | 
			
		||||
[tool.coverage.run]
 | 
			
		||||
branch = true
 | 
			
		||||
dynamic_context = "test_function"
 | 
			
		||||
relative_files = true
 | 
			
		||||
source = ["src"]
 | 
			
		||||
 | 
			
		||||
[tool.poetry]
 | 
			
		||||
authors = ["AKI Projektgruppe 23"]
 | 
			
		||||
description = "A project analysing the german transparenzregister and other data sources to find shared business interests and shared personal and other links for lots of companies."
 | 
			
		||||
name = "aki-prj23-transparenzregister"
 | 
			
		||||
packages = [{include = "aki_prj23_transparenzregister", from = "src"}]
 | 
			
		||||
readme = "README.md"
 | 
			
		||||
version = "0.1.0"
 | 
			
		||||
 | 
			
		||||
[tool.poetry.dependencies]
 | 
			
		||||
loguru = "^0.7.0"
 | 
			
		||||
matplotlib = "^3.7.1"
 | 
			
		||||
plotly = "^5.14.1"
 | 
			
		||||
pymongo = "^4.4.1"
 | 
			
		||||
python = "^3.11"
 | 
			
		||||
seaborn = "^0.12.2"
 | 
			
		||||
selenium = "^4.10.0"
 | 
			
		||||
tqdm = "^4.65.0"
 | 
			
		||||
types-tqdm = "^4.65.0"
 | 
			
		||||
 | 
			
		||||
[tool.poetry.group.develop.dependencies]
 | 
			
		||||
black = {extras = ["jupyter"], version = "^23.3.0"}
 | 
			
		||||
jupyterlab = "^4.0.0"
 | 
			
		||||
nbconvert = "^7.4.0"
 | 
			
		||||
pre-commit = "^3.3.2"
 | 
			
		||||
rise = "^5.7.1"
 | 
			
		||||
 | 
			
		||||
[tool.poetry.group.doc.dependencies]
 | 
			
		||||
jupyter = "^1.0.0"
 | 
			
		||||
myst-parser = "^1.0.0"
 | 
			
		||||
nbsphinx = "^0.9.2"
 | 
			
		||||
sphinx = "^6.0.0"
 | 
			
		||||
sphinx-copybutton = "^0.5.2"
 | 
			
		||||
sphinx-rtd-theme = "^1.2.1"
 | 
			
		||||
sphinx_autodoc_typehints = "*"
 | 
			
		||||
sphinxcontrib-mermaid = "^0.9.2"
 | 
			
		||||
sphinxcontrib-napoleon = "^0.7"
 | 
			
		||||
 | 
			
		||||
[tool.poetry.group.lint.dependencies]
 | 
			
		||||
black = "^23.3.0"
 | 
			
		||||
mypy = "^1.3.0"
 | 
			
		||||
pandas-stubs = "^2.0.1.230501"
 | 
			
		||||
ruff = "^0.0.270"
 | 
			
		||||
types-requests = "^2.31.0.1"
 | 
			
		||||
 | 
			
		||||
[tool.poetry.group.test.dependencies]
 | 
			
		||||
pytest = "^7.3.1"
 | 
			
		||||
pytest-clarity = "^1.0.1"
 | 
			
		||||
pytest-cov = "^4.1.0"
 | 
			
		||||
pytest-mock = "^3.10.0"
 | 
			
		||||
pytest-repeat = "^0.9.1"
 | 
			
		||||
 | 
			
		||||
[tool.ruff]
 | 
			
		||||
exclude = [
 | 
			
		||||
  ".bzr",
 | 
			
		||||
  ".direnv",
 | 
			
		||||
  ".eggs",
 | 
			
		||||
  ".git",
 | 
			
		||||
  ".git-rewrite",
 | 
			
		||||
  ".hg",
 | 
			
		||||
  ".mypy_cache",
 | 
			
		||||
  ".nox",
 | 
			
		||||
  ".pants.d",
 | 
			
		||||
  ".pytype",
 | 
			
		||||
  ".ruff_cache",
 | 
			
		||||
  ".svn",
 | 
			
		||||
  ".tox",
 | 
			
		||||
  ".venv",
 | 
			
		||||
  "__pypackages__",
 | 
			
		||||
  "_build",
 | 
			
		||||
  "buck-out",
 | 
			
		||||
  "build",
 | 
			
		||||
  "dist",
 | 
			
		||||
  "node_modules",
 | 
			
		||||
  "venv"
 | 
			
		||||
]
 | 
			
		||||
# Never enforce `E501` (line length violations).
 | 
			
		||||
ignore = ["E501"]
 | 
			
		||||
line-length = 88
 | 
			
		||||
# Enable flake8-bugbear (`B`) rules.
 | 
			
		||||
select = ["E", "F", "B", "I", "S", "RSE", "RET", "SLF", "SIM", "TID", "PD", "PL", "PLE", "PLR", "PLW", "NPY", "UP", "D", "N", "A", "C4", "T20", "PT"]
 | 
			
		||||
src = ["src"]
 | 
			
		||||
target-version = "py311"
 | 
			
		||||
# Avoid trying to fix flake8-bugbear (`B`) violations.
 | 
			
		||||
unfixable = ["B"]
 | 
			
		||||
 | 
			
		||||
[tool.ruff.flake8-builtins]
 | 
			
		||||
builtins-ignorelist = ["id"]
 | 
			
		||||
 | 
			
		||||
[tool.ruff.per-file-ignores]
 | 
			
		||||
"tests/*.py" = ["S101", "D100", "D101", "D107", "D103"]
 | 
			
		||||
 | 
			
		||||
[tool.ruff.pydocstyle]
 | 
			
		||||
convention = "google"
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										1
									
								
								src/aki_prj23_transparenzregister/models/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								src/aki_prj23_transparenzregister/models/__init__.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1 @@
 | 
			
		||||
"""Model classes."""
 | 
			
		||||
							
								
								
									
										68
									
								
								src/aki_prj23_transparenzregister/models/company.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										68
									
								
								src/aki_prj23_transparenzregister/models/company.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,68 @@
 | 
			
		||||
"""Company model."""
 | 
			
		||||
from abc import ABC
 | 
			
		||||
from dataclasses import asdict, dataclass
 | 
			
		||||
from enum import Enum
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class RelationshipRoleEnum(Enum):
    """Roles that can be assigned to a company relationship.

    NOTE(review): ``STAKEHOLDER`` carries an empty string as its value while
    ``ORGANISATION`` carries its own name - this looks like a placeholder;
    confirm the intended value before relying on it.
    """

    STAKEHOLDER = ""
    ORGANISATION = "ORGANISATION"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
class CompanyID:
    """Two-part identifier of a company.

    Attributes:
        district_court (str): District court part of the identifier.
        hr_number (str): Number part of the identifier (presumably the
            commercial-register number - confirm).
    """

    district_court: str
    hr_number: str
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
class Location:
 | 
			
		||||
    """_summary_."""
 | 
			
		||||
 | 
			
		||||
    city: str
 | 
			
		||||
    street: str | None = None
 | 
			
		||||
    house_number: str | None = None
 | 
			
		||||
    zip_code: str | None = None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
class CompanyRelationship(ABC):
    """Base record describing a relationship of a company.

    The class derives from ``ABC`` but declares no abstract methods itself.

    Attributes:
        role (RelationshipRoleEnum): Role of the related party.
        location (Location): Location of the related party.
    """

    role: RelationshipRoleEnum
    location: Location
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
class Company:
    """Company record with identifier, location and relationships.

    Attributes:
        id (CompanyID): Identifier of the company.
        location (Location): Location of the company.
        name (str): Company name.
        last_update (str): Time of the last update, kept as a string.
        relationships (list[CompanyRelationship]): Relationships of the company.
    """

    id: CompanyID
    location: Location
    name: str
    last_update: str
    relationships: list[CompanyRelationship]

    def to_dict(self) -> dict:
        """Convert the company, including nested dataclasses, to a dictionary.

        Returns:
            dict: Result of ``dataclasses.asdict`` for this instance
        """
        return asdict(self)
 | 
			
		||||
							
								
								
									
										25
									
								
								src/aki_prj23_transparenzregister/models/news.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								src/aki_prj23_transparenzregister/models/news.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,25 @@
 | 
			
		||||
"""News mnodel."""
 | 
			
		||||
from dataclasses import asdict, dataclass
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
class News:
    """Single news entry.

    Attributes:
        id (str): Identifier of the entry.
        title (str): Title of the entry.
        date (str): Date of the entry, kept as a string.
        text (str): Text of the entry.
        source_url (str): URL the entry originates from.
    """

    id: str
    title: str
    date: str
    text: str
    source_url: str

    def to_dict(self) -> dict:
        """Convert the entry to a dictionary keyed by field name.

        Returns:
            dict: Result of ``dataclasses.asdict`` for this instance
        """
        return asdict(self)
 | 
			
		||||
							
								
								
									
										1
									
								
								src/aki_prj23_transparenzregister/utils/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								src/aki_prj23_transparenzregister/utils/__init__.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1 @@
 | 
			
		||||
"""Util classes and services."""
 | 
			
		||||
@@ -0,0 +1,49 @@
 | 
			
		||||
"""CompanyMongoService."""
 | 
			
		||||
from aki_prj23_transparenzregister.models.company import Company, CompanyID
 | 
			
		||||
from aki_prj23_transparenzregister.utils.mongo import MongoConnector
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class CompanyMongoService:
    """Access layer for the ``companies`` collection of a MongoDB database."""

    def __init__(self, connector: MongoConnector):
        """Bind the service to the ``companies`` collection.

        Args:
            connector (MongoConnector): Connector whose ``database`` holds the collection
        """
        self.collection = connector.database["companies"]

    def get_all(self) -> list[Company]:
        """Fetch every document of the collection.

        NOTE(review): the documents are returned as yielded by ``find()``; no
        conversion into ``Company`` instances happens here - confirm callers
        expect that.

        Returns:
            list[Company]: All entries of the collection
        """
        result = self.collection.find()
        return list(result)

    def get_by_id(self, id: CompanyID) -> Company | None:
        """Look up a single company by its identifier.

        Args:
            id (CompanyID): Identifier of the company

        Returns:
            Company | None: The entry if exactly one document matches, otherwise None
        """
        from dataclasses import asdict, is_dataclass

        # insert() stores the id in the nested-dict form produced by
        # Company.to_dict(), so the filter has to use the same form; a dataclass
        # instance itself cannot be encoded into a query document.
        # Anything that is not a dataclass instance is passed through unchanged.
        if is_dataclass(id) and not isinstance(id, type):
            id_filter = asdict(id)
        else:
            id_filter = id
        result = list(self.collection.find({"id": id_filter}))
        if len(result) == 1:
            return result[0]
        return None

    def insert(self, company: Company):
        """Insert a company as a new document.

        Args:
            company (Company): Company to store

        Returns:
            The value returned by the collection's ``insert_one`` call
        """
        return self.collection.insert_one(company.to_dict())
 | 
			
		||||
							
								
								
									
										47
									
								
								src/aki_prj23_transparenzregister/utils/mongo.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										47
									
								
								src/aki_prj23_transparenzregister/utils/mongo.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,47 @@
 | 
			
		||||
"""Mongo Wrapper."""
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
 | 
			
		||||
import pymongo
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
class MongoConnection:
 | 
			
		||||
    """_summary_."""
 | 
			
		||||
 | 
			
		||||
    hostname: str
 | 
			
		||||
    database: str
 | 
			
		||||
    port: int | None
 | 
			
		||||
    username: str | None
 | 
			
		||||
    password: str | None
 | 
			
		||||
 | 
			
		||||
    def get_conn_string(self) -> str:
 | 
			
		||||
        """Transforms the information of the object to a MongoDB connection string.
 | 
			
		||||
 | 
			
		||||
        Returns:
 | 
			
		||||
            str: Connection string
 | 
			
		||||
        """
 | 
			
		||||
        if self.username is not None and self.password is not None:
 | 
			
		||||
            connection_string = (
 | 
			
		||||
                f"mongodb+srv://{self.username}:{self.password}@{self.hostname}"
 | 
			
		||||
            )
 | 
			
		||||
        else:
 | 
			
		||||
            connection_string = f"mongodb+srv://{self.hostname}"
 | 
			
		||||
        if self.port is not None:
 | 
			
		||||
            connection_string += f":{self.port}"
 | 
			
		||||
            connection_string = connection_string.replace("mongodb+srv", "mongodb")
 | 
			
		||||
        return connection_string
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MongoConnector:
    """Holds a MongoDB client together with the selected database handle."""

    def __init__(self, connection: MongoConnection):
        """Create the client and select the configured database.

        Args:
            connection (MongoConnection): Wrapper for connection string
        """
        conn_string = connection.get_conn_string()
        self.client: pymongo.MongoClient = pymongo.MongoClient(conn_string)
        # Database handle looked up by the name stored on the connection object.
        self.database = self.client[connection.database]
 | 
			
		||||
@@ -0,0 +1,94 @@
 | 
			
		||||
"""MongoNewsService."""
 | 
			
		||||
from aki_prj23_transparenzregister.models.news import News
 | 
			
		||||
from aki_prj23_transparenzregister.utils.mongo import MongoConnector
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MongoNewsService:
    """Access layer for the ``news`` collection of a MongoDB database."""

    def __init__(self, connector: MongoConnector):
        """Bind the service to the ``news`` collection.

        Args:
            connector (MongoConnector): Connector whose ``database`` holds the collection
        """
        self.collection = connector.database["news"]

    def get_all(self) -> list[News]:
        """Fetch every document of the collection as a ``News`` object.

        Returns:
            list[News]: All entries, converted via ``MongoEntryTransformer``
        """
        documents = self.collection.find()
        return list(map(MongoEntryTransformer.transform_outgoing, documents))

    def get_by_id(self, id: str) -> News | None:
        """Look up a single news entry by its ``_id``.

        Args:
            id (str): Identifier of the entry

        Returns:
            News | None: The entry if exactly one document matches, otherwise None
        """
        matches = list(self.collection.find({"_id": id}))
        if len(matches) != 1:
            return None
        return MongoEntryTransformer.transform_outgoing(matches[0])

    def insert(self, news: News):
        """Insert a news entry as a new document.

        Args:
            news (News): Entry to store

        Returns:
            The value returned by the collection's ``insert_one`` call
        """
        document = MongoEntryTransformer.transform_ingoing(news)
        return self.collection.insert_one(document)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MongoEntryTransformer:
    """Converts between ``News`` objects and MongoDB documents."""

    @staticmethod
    def transform_ingoing(news: News) -> dict:
        """Convert a News object to a dictionary compatible with a MongoDB entry.

        Args:
            news (News): News object to be transformed

        Returns:
            dict: Transformed data where the ``id`` key is replaced by ``_id``
        """
        document = news.to_dict()
        # The identifier is stored under "_id" instead of "id".
        del document["id"]
        document["_id"] = news.id
        return document

    @staticmethod
    def transform_outgoing(data: dict) -> News:
        """Reverse the transform_ingoing method.

        Args:
            data (dict): dict from the MongoDB to be transformed

        Returns:
            News: News entry based on MongoDB document
        """
        field_values = {
            "id": data["_id"],
            "title": data["title"],
            "date": data["date"],
            "text": data["text"],
            "source_url": data["source_url"],
        }
        return News(**field_values)
 | 
			
		||||
							
								
								
									
										35
									
								
								tests/models/company_test.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								tests/models/company_test.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,35 @@
 | 
			
		||||
"""Test Models.company."""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
from aki_prj23_transparenzregister.models.company import Company, CompanyID, Location
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_to_dict() -> None:
    """Check that Company.to_dict returns the nested dictionary representation."""
    court_id = CompanyID("The Shire", "420")
    address = Location(
        city="Insmouth",
        house_number="19",
        street="Harbor",
        zip_code="1890",
    )
    company = Company(
        id=court_id,
        last_update="Tomorrow",
        location=address,
        name="BLANK GmbH",
        relationships=[],
    )

    actual = company.to_dict()
    expected_id = {
        "district_court": court_id.district_court,
        "hr_number": court_id.hr_number,
    }
    expected_location = {
        "city": address.city,
        "house_number": address.house_number,
        "street": address.street,
        "zip_code": address.zip_code,
    }
    assert actual == {
        "id": expected_id,
        "last_update": company.last_update,
        "location": expected_location,
        "name": "BLANK GmbH",
        "relationships": [],
    }
 | 
			
		||||
							
								
								
									
										23
									
								
								tests/models/news_test.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								tests/models/news_test.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,23 @@
 | 
			
		||||
"""Test Models.nesws."""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
from aki_prj23_transparenzregister.models.news import News
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_to_dict() -> None:
    """Check that News.to_dict exposes every field under its own name."""
    article = News(
        "4711",
        "Economy collapses",
        "2042",
        "Toilet paper prices rising",
        "https://www.google.com",
    )

    actual = article.to_dict()
    expected = {
        "id": article.id,
        "title": article.title,
        "date": article.date,
        "text": article.text,
        "source_url": article.source_url,
    }
    assert actual == expected
 | 
			
		||||
							
								
								
									
										103
									
								
								tests/utils/company_mongo_service_test.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										103
									
								
								tests/utils/company_mongo_service_test.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,103 @@
 | 
			
		||||
"""Test utils.company_mongo_service."""
 | 
			
		||||
from unittest.mock import Mock
 | 
			
		||||
 | 
			
		||||
import pytest
 | 
			
		||||
 | 
			
		||||
from aki_prj23_transparenzregister.models.company import Company
 | 
			
		||||
from aki_prj23_transparenzregister.utils.company_mongo_service import (
 | 
			
		||||
    CompanyMongoService,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.fixture()
 | 
			
		||||
def mock_mongo_connector(mocker) -> Mock:
    """Provide a Mock standing in for the MongoConnector class.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Object configured as return value of the patched MongoConnector
    """
    connector = Mock()
    target = "aki_prj23_transparenzregister.utils.mongo.MongoConnector"
    mocker.patch(target, return_value=connector)
    return connector
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.fixture()
 | 
			
		||||
def mock_collection() -> Mock:
    """Provide a fresh Mock to be used as a mongo collection.

    Returns:
        Mock: Mock object
    """
    collection = Mock()
    return collection
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_init(mock_mongo_connector, mock_collection):
    """Check that the constructor picks up the "companies" collection.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    database = {"companies": mock_collection}
    mock_mongo_connector.database = database
    company_service = CompanyMongoService(mock_mongo_connector)
    assert company_service.collection == mock_collection
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_all(mock_mongo_connector, mock_collection):
    """Check that get_all hands back what the collection's find returns.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    stored_entries = [{"id": "42"}]
    mock_collection.find.return_value = stored_entries
    fetched = company_service.get_all()
    assert fetched == stored_entries
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_by_id_no_result(mock_mongo_connector, mock_collection):
    """Check that get_by_id yields None when find returns an empty list.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    mock_collection.find.return_value = []
    found = company_service.get_by_id("Does not exist")
    assert found is None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_by_id_result(mock_mongo_connector, mock_collection):
    """Check that get_by_id returns the single entry delivered by find.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    stored_entry = {"id": "Does exist", "vaue": 42}
    mock_collection.find.return_value = [stored_entry]
    found = company_service.get_by_id("Does exist")
    assert found == stored_entry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_insert(mock_mongo_connector, mock_collection):
    """Check that insert passes on the return value of the collection's insert_one.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"companies": mock_collection}
    company_service = CompanyMongoService(mock_mongo_connector)
    insert_result = 42
    mock_collection.insert_one.return_value = insert_result
    company = Company(None, None, "", "", [])
    assert company_service.insert(company) == insert_result
 | 
			
		||||
							
								
								
									
										26
									
								
								tests/utils/mongo_test.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								tests/utils/mongo_test.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,26 @@
 | 
			
		||||
from unittest.mock import patch
 | 
			
		||||
 | 
			
		||||
from aki_prj23_transparenzregister.utils.mongo import MongoConnection, MongoConnector
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_conn_string_no_credentials():
    """Check the connection string for host and port without credentials."""
    connection = MongoConnection("localhost", "", 27017, None, None)
    conn_string = connection.get_conn_string()
    assert conn_string == "mongodb://localhost:27017"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_conn_string_no_port_but_credentials():
    """Check the connection string for credentials given without a port."""
    connection = MongoConnection("localhost", "", None, "admin", "password")
    conn_string = connection.get_conn_string()
    assert conn_string == "mongodb+srv://admin:password@localhost"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_conn_simple():
    """Check the connection string when neither port nor credentials are set."""
    connection = MongoConnection("localhost", "", None, None, None)
    conn_string = connection.get_conn_string()
    assert conn_string == "mongodb+srv://localhost"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_mongo_connector():
    """Check that MongoConnector exposes the database taken from the client.

    ``pymongo.MongoClient`` is patched so that creating a client returns a
    plain dict mapping the database name to a sentinel value.
    """
    with patch("pymongo.MongoClient") as client_factory:
        expected_database = 42
        client_factory.return_value = {"db": expected_database}
        connection = MongoConnection("localhost", "db", None, None, None)
        connector = MongoConnector(connection)
        assert connector.database == expected_database
 | 
			
		||||
							
								
								
									
										115
									
								
								tests/utils/news_mongo_service_test.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										115
									
								
								tests/utils/news_mongo_service_test.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,115 @@
 | 
			
		||||
from unittest.mock import Mock, patch
 | 
			
		||||
 | 
			
		||||
import pytest
 | 
			
		||||
 | 
			
		||||
from aki_prj23_transparenzregister.models.news import News
 | 
			
		||||
from aki_prj23_transparenzregister.utils.news_mongo_service import (
 | 
			
		||||
    MongoEntryTransformer,
 | 
			
		||||
    MongoNewsService,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.fixture()
 | 
			
		||||
def mock_mongo_connector(mocker) -> Mock:
    """Provide a Mock standing in for the MongoConnector class.

    Args:
        mocker (any): Library mocker

    Returns:
        Mock: Object configured as return value of the patched MongoConnector
    """
    connector = Mock()
    target = "aki_prj23_transparenzregister.utils.mongo.MongoConnector"
    mocker.patch(target, return_value=connector)
    return connector
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.fixture()
 | 
			
		||||
def mock_collection() -> Mock:
    """Provide a fresh Mock to be used as a mongo collection.

    Returns:
        Mock: Mock object
    """
    collection = Mock()
    return collection
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_init(mock_mongo_connector, mock_collection):
    """Check that the MongoNewsService constructor picks up the "news" collection.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    database = {"news": mock_collection}
    mock_mongo_connector.database = database
    news_service = MongoNewsService(mock_mongo_connector)
    assert news_service.collection == mock_collection
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_all(mock_mongo_connector, mock_collection):
    """Check that get_all returns an empty list when find yields no documents.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)

    mock_collection.find.return_value = []
    fetched = news_service.get_all()
    assert fetched == []
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_by_id_with_result(mock_mongo_connector, mock_collection):
    """Check that get_by_id returns the transformed entry when find has a hit.

    MongoEntryTransformer.transform_outgoing is patched, so only the service
    logic is exercised.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)

    transformer_path = "aki_prj23_transparenzregister.utils.news_mongo_service.MongoEntryTransformer.transform_outgoing"
    with patch(transformer_path) as transform_stub:
        mock_collection.find.return_value = [{}]
        transform_stub.return_value = {}
        found = news_service.get_by_id("foadh")
        assert found == {}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_get_by_id_no_result(mock_mongo_connector, mock_collection):
    """Check that get_by_id yields None when find returns an empty list.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)

    mock_collection.find.return_value = []
    found = news_service.get_by_id("foadh")
    assert found is None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_insert(mock_mongo_connector, mock_collection):
    """Check that insert passes on the return value of the collection's insert_one.

    MongoEntryTransformer.transform_ingoing is patched, so only the service
    logic is exercised.

    Args:
        mock_mongo_connector (Mock): Mocked MongoConnector library
        mock_collection (Mock): Mocked pymongo collection
    """
    mock_mongo_connector.database = {"news": mock_collection}
    news_service = MongoNewsService(mock_mongo_connector)

    transformer_path = "aki_prj23_transparenzregister.utils.news_mongo_service.MongoEntryTransformer.transform_ingoing"
    with patch(transformer_path) as transform_stub:
        mock_collection.insert_one.return_value = {}
        transform_stub.return_value = {}
        inserted = news_service.insert({})
        assert inserted == {}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_transform_ingoing():
    """Check that transform_ingoing moves the news id from "id" to "_id"."""
    article = News("42", None, None, None, None)
    document = MongoEntryTransformer.transform_ingoing(article)
    assert document["_id"] == "42"
    assert "id" not in document
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_transform_outgoing():
    """Check that transform_outgoing rebuilds a News object from a document."""
    document = {
        "_id": "4711",
        "title": "Hello",
        "date": "Today",
        "text": "World",
        "source_url": "chat.openai.com",
    }
    expected_news = News(
        id="4711",
        title="Hello",
        date="Today",
        text="World",
        source_url="chat.openai.com",
    )
    actual_news = MongoEntryTransformer.transform_outgoing(document)
    assert actual_news == expected_news
 | 
			
		||||
		Reference in New Issue
	
	Block a user