mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-22 11:52:54 +02:00
317 lines
25 KiB
Plaintext
317 lines
25 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Unternehmensregister"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Fetch Auszug"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 98,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n"
|
|
]
|
|
},
|
|
{
|
|
"ename": "KeyboardInterrupt",
|
|
"evalue": "",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
|
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\notebook.ipynb Cell 3\u001b[0m in \u001b[0;36m5\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=49'>50</a>\u001b[0m companies_tab \u001b[39m=\u001b[39m driver\u001b[39m.\u001b[39mfind_elements(\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=50'>51</a>\u001b[0m By\u001b[39m.\u001b[39mLINK_TEXT, \u001b[39m\"\u001b[39m\u001b[39mRegisterinformationen des Registergerichts\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=51'>52</a>\u001b[0m )\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=52'>53</a>\u001b[0m \u001b[39mfor\u001b[39;00m company_link \u001b[39min\u001b[39;00m companies_tab:\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=53'>54</a>\u001b[0m \u001b[39m# Go to intermediary page\u001b[39;00m\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=54'>55</a>\u001b[0m company_link\u001b[39m.\u001b[39;49mclick()\n\u001b[0;32m <a 
href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=55'>56</a>\u001b[0m \u001b[39m# Trigger next redirect\u001b[39;00m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=56'>57</a>\u001b[0m driver\u001b[39m.\u001b[39mfind_element(By\u001b[39m.\u001b[39mLINK_TEXT, \u001b[39m\"\u001b[39m\u001b[39mRegisterinformationen anzeigen\u001b[39m\u001b[39m\"\u001b[39m)\u001b[39m.\u001b[39mclick()\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:94\u001b[0m, in \u001b[0;36mWebElement.click\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 92\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclick\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 93\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Clicks the element.\"\"\"\u001b[39;00m\n\u001b[1;32m---> 94\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_execute(Command\u001b[39m.\u001b[39;49mCLICK_ELEMENT)\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:395\u001b[0m, in \u001b[0;36mWebElement._execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 393\u001b[0m params \u001b[39m=\u001b[39m {}\n\u001b[0;32m 394\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39mid\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_id\n\u001b[1;32m--> 395\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_parent\u001b[39m.\u001b[39;49mexecute(command, params)\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:344\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m params:\n\u001b[0;32m 342\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msession_id\n\u001b[1;32m--> 344\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcommand_executor\u001b[39m.\u001b[39;49mexecute(driver_command, params)\n\u001b[0;32m 345\u001b[0m \u001b[39mif\u001b[39;00m response:\n\u001b[0;32m 346\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror_handler\u001b[39m.\u001b[39mcheck_response(response)\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:290\u001b[0m, in \u001b[0;36mRemoteConnection.execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 288\u001b[0m data \u001b[39m=\u001b[39m utils\u001b[39m.\u001b[39mdump_json(params)\n\u001b[0;32m 289\u001b[0m url \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_url\u001b[39m}\u001b[39;00m\u001b[39m{\u001b[39;00mpath\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m--> 290\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_request(command_info[\u001b[39m0\u001b[39;49m], url, body\u001b[39m=\u001b[39;49mdata)\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:311\u001b[0m, in \u001b[0;36mRemoteConnection._request\u001b[1;34m(self, method, url, body)\u001b[0m\n\u001b[0;32m 308\u001b[0m body \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 310\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mkeep_alive:\n\u001b[1;32m--> 311\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_conn\u001b[39m.\u001b[39;49mrequest(method, url, body\u001b[39m=\u001b[39;49mbody, headers\u001b[39m=\u001b[39;49mheaders)\n\u001b[0;32m 312\u001b[0m statuscode \u001b[39m=\u001b[39m response\u001b[39m.\u001b[39mstatus\n\u001b[0;32m 313\u001b[0m \u001b[39melse\u001b[39;00m:\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:78\u001b[0m, in \u001b[0;36mRequestMethods.request\u001b[1;34m(self, method, url, fields, headers, **urlopen_kw)\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_url(\n\u001b[0;32m 75\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 76\u001b[0m )\n\u001b[0;32m 77\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m---> 78\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_body(\n\u001b[0;32m 79\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 80\u001b[0m )\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:170\u001b[0m, in \u001b[0;36mRequestMethods.request_encode_body\u001b[1;34m(self, method, url, fields, headers, encode_multipart, multipart_boundary, **urlopen_kw)\u001b[0m\n\u001b[0;32m 167\u001b[0m extra_kw[\u001b[39m\"\u001b[39m\u001b[39mheaders\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mupdate(headers)\n\u001b[0;32m 168\u001b[0m extra_kw\u001b[39m.\u001b[39mupdate(urlopen_kw)\n\u001b[1;32m--> 170\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mextra_kw)\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\poolmanager.py:376\u001b[0m, in \u001b[0;36mPoolManager.urlopen\u001b[1;34m(self, method, url, redirect, **kw)\u001b[0m\n\u001b[0;32m 374\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 375\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 376\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, u\u001b[39m.\u001b[39mrequest_uri, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 378\u001b[0m redirect_location \u001b[39m=\u001b[39m redirect \u001b[39mand\u001b[39;00m response\u001b[39m.\u001b[39mget_redirect_location()\n\u001b[0;32m 379\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m redirect_location:\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:703\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m 700\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_prepare_proxy(conn)\n\u001b[0;32m 702\u001b[0m \u001b[39m# Make the request on the httplib connection object.\u001b[39;00m\n\u001b[1;32m--> 703\u001b[0m httplib_response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_request(\n\u001b[0;32m 704\u001b[0m conn,\n\u001b[0;32m 705\u001b[0m method,\n\u001b[0;32m 706\u001b[0m url,\n\u001b[0;32m 707\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout_obj,\n\u001b[0;32m 708\u001b[0m body\u001b[39m=\u001b[39;49mbody,\n\u001b[0;32m 709\u001b[0m headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[0;32m 710\u001b[0m chunked\u001b[39m=\u001b[39;49mchunked,\n\u001b[0;32m 711\u001b[0m )\n\u001b[0;32m 713\u001b[0m \u001b[39m# If we're going to release the connection in ``finally:``, then\u001b[39;00m\n\u001b[0;32m 714\u001b[0m \u001b[39m# the response doesn't need to know about the connection. Otherwise\u001b[39;00m\n\u001b[0;32m 715\u001b[0m \u001b[39m# it will also try to release it and we'll have a double-release\u001b[39;00m\n\u001b[0;32m 716\u001b[0m \u001b[39m# mess.\u001b[39;00m\n\u001b[0;32m 717\u001b[0m response_conn \u001b[39m=\u001b[39m conn \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m release_conn \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:449\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[1;32m--> 449\u001b[0m six\u001b[39m.\u001b[39;49mraise_from(e, \u001b[39mNone\u001b[39;49;00m)\n\u001b[0;32m 450\u001b[0m \u001b[39mexcept\u001b[39;00m (SocketTimeout, BaseSSLError, SocketError) \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 451\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_raise_timeout(err\u001b[39m=\u001b[39me, url\u001b[39m=\u001b[39murl, timeout_value\u001b[39m=\u001b[39mread_timeout)\n",
|
|
"File \u001b[1;32m<string>:3\u001b[0m, in \u001b[0;36mraise_from\u001b[1;34m(value, from_value)\u001b[0m\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:444\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 441\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mTypeError\u001b[39;00m:\n\u001b[0;32m 442\u001b[0m \u001b[39m# Python 3\u001b[39;00m\n\u001b[0;32m 443\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39;49mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[0;32m 449\u001b[0m six\u001b[39m.\u001b[39mraise_from(e, \u001b[39mNone\u001b[39;00m)\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:1374\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1372\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 1373\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m-> 1374\u001b[0m response\u001b[39m.\u001b[39;49mbegin()\n\u001b[0;32m 1375\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m:\n\u001b[0;32m 1376\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:318\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 316\u001b[0m \u001b[39m# read until we get a non-100 response\u001b[39;00m\n\u001b[0;32m 317\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m--> 318\u001b[0m version, status, reason \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_status()\n\u001b[0;32m 319\u001b[0m \u001b[39mif\u001b[39;00m status \u001b[39m!=\u001b[39m CONTINUE:\n\u001b[0;32m 320\u001b[0m \u001b[39mbreak\u001b[39;00m\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:279\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 278\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_read_status\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m--> 279\u001b[0m line \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfp\u001b[39m.\u001b[39;49mreadline(_MAXLINE \u001b[39m+\u001b[39;49m \u001b[39m1\u001b[39;49m), \u001b[39m\"\u001b[39m\u001b[39miso-8859-1\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 280\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(line) \u001b[39m>\u001b[39m _MAXLINE:\n\u001b[0;32m 281\u001b[0m \u001b[39mraise\u001b[39;00m LineTooLong(\u001b[39m\"\u001b[39m\u001b[39mstatus line\u001b[39m\u001b[39m\"\u001b[39m)\n",
|
|
"File \u001b[1;32mc:\\Python310\\lib\\socket.py:705\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 703\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[0;32m 704\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 705\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv_into(b)\n\u001b[0;32m 706\u001b[0m \u001b[39mexcept\u001b[39;00m timeout:\n\u001b[0;32m 707\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_timeout_occurred \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n",
|
|
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import os\n",
|
|
"from pathlib import Path\n",
|
|
"\n",
|
|
"from selenium import webdriver\n",
|
|
"from selenium.webdriver.common.by import By\n",
|
|
"from selenium.webdriver.support.ui import WebDriverWait\n",
|
|
"from selenium.webdriver.support import expected_conditions as EC\n",
|
|
"\n",
|
"search_query = \"A*\"\n",
"START_URL = \"https://www.unternehmensregister.de/ureg/\"\n",
"RESULT_LINK_TEXT = \"Registerinformationen des Registergerichts\"\n",
"# Number of history steps taken after each download to return to the result list.\n",
"BACK_STEPS = 6\n",
"\n",
"# Downloads are stored below the current working directory; the folder is\n",
"# created up front so it exists even if no document gets downloaded.\n",
"download_dir = Path.cwd() / \"data\" / \"Unternehmensregister\"\n",
"download_dir.mkdir(parents=True, exist_ok=True)\n",
"download_path = str(download_dir)\n",
"print(download_path)\n",
"\n",
"options = webdriver.ChromeOptions()\n",
"preferences = {\n",
"    \"profile.default_content_settings.popups\": 0,\n",
"    \"safebrowsing.enabled\": True,\n",
"    \"download\": {\n",
"        \"directory_upgrade\": True,\n",
"        \"prompt_for_download\": False,\n",
"        \"extensions_to_open\": \"\",\n",
"        \"default_directory\": download_path,\n",
"    },\n",
"}\n",
"options.add_experimental_option(\"prefs\", preferences)\n",
"\n",
"driver = webdriver.Chrome(options=options)\n",
"try:\n",
"    driver.get(START_URL)\n",
"    # Accept Cookies\n",
"    driver.find_element(\n",
"        By.XPATH, '//button[text()=\"Nur technisch notwendige Cookies akzeptieren\"]'\n",
"    ).click()\n",
"    # Enter search query\n",
"    driver.find_element(\n",
"        By.ID, \"globalSearchForm:extendedResearchCompanyName\"\n",
"    ).send_keys(search_query)\n",
"    # Trigger search\n",
"    driver.find_element(By.ID, \"globalSearchForm:btnExecuteSearchOld\").click()\n",
"    # Wait for results: the search has been submitted once the URL changes\n",
"    wait = WebDriverWait(driver, 5)\n",
"    wait.until(lambda d: d.current_url != START_URL)\n",
"    # TODO Iterate over tabs\n",
"    num_pages = int(\n",
"        driver.find_element(By.XPATH, '//*[@class=\"page_count\"]').text.split(\" \")[0]\n",
"    )\n",
"    for page_index in range(num_pages):\n",
"        # Count the \"Registerinformationen\" links of this result page once\n",
"        num_links = len(driver.find_elements(By.LINK_TEXT, RESULT_LINK_TEXT))\n",
"        for link_index in range(num_links):\n",
"            # Look the links up again on every pass: element handles obtained\n",
"            # before navigating away are stale once we are back on the list.\n",
"            company_links = driver.find_elements(By.LINK_TEXT, RESULT_LINK_TEXT)\n",
"            if link_index >= len(company_links):\n",
"                break\n",
"            # Go to intermediary page\n",
"            company_links[link_index].click()\n",
"            # Trigger next redirect\n",
"            driver.find_element(By.LINK_TEXT, \"Registerinformationen anzeigen\").click()\n",
"            # Trigger SI download\n",
"            driver.find_element(By.LINK_TEXT, \"SI\").click()\n",
"            # Show shopping cart - TODO evaluate restructuring behaviour by filling cart first and then bulk downloading\n",
"            wait.until(\n",
"                EC.visibility_of_element_located((By.LINK_TEXT, \"Dokumentenkorb ansehen\"))\n",
"            )\n",
"            driver.find_element(By.LINK_TEXT, \"Dokumentenkorb ansehen\").click()\n",
"            # Get document\n",
"            # NOTE(review): relies on the second-to-last <input> on the page being\n",
"            # the control that continues the checkout - confirm against the site.\n",
"            elems = driver.find_elements(By.TAG_NAME, \"input\")\n",
"            elems[-2].click()\n",
"\n",
"            driver.find_element(By.ID, \"paymentFormOverview:btnNext\").click()\n",
"            driver.find_element(By.LINK_TEXT, \"Zum Dokumentenkorb\").click()\n",
"            driver.find_element(By.CLASS_NAME, \"download-wrapper\").click()\n",
"\n",
"            # Step back through the browser history to the result list\n",
"            for _ in range(BACK_STEPS):\n",
"                driver.back()\n",
"        # Advance only while another result page follows\n",
"        if page_index < num_pages - 1:\n",
"            driver.find_element(By.XPATH, '//*[@class=\"fas fa-angle-right\"]').click()\n",
"finally:\n",
"    # quit() ends the WebDriver session even when the run fails or is interrupted\n",
"    driver.quit()"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Analyze Auszug"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 97,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['registerdocument-2023-06-09-13-48-58.xml',\n",
|
|
" 'registerdocument-2023-06-09-13-49-02.xml',\n",
|
|
" 'registerdocument-2023-06-09-13-49-05.xml']"
|
|
]
|
|
},
|
|
"execution_count": 97,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Names of the downloaded register documents. os.listdir yields entries in\n",
"# arbitrary order, so sort them to keep the displayed list reproducible.\n",
"files = sorted(os.listdir(\"./data/Unternehmensregister\"))\n",
"files"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from pdf2image import convert_from_path\n",
|
|
"\n",
|
"# Source PDF, relative to the notebook's working directory.\n",
"pdfs = r\"./data/test.pdf\"\n",
"# Convert the PDF into a sequence of page images. 350 is handed over as the\n",
"# second positional argument (presumably the DPI - confirm in pdf2image).\n",
"pages = convert_from_path(pdfs, 350)\n",
"\n",
"# Store every page as ./data/Page_<n>.jpg, counting from 1.\n",
"for i, page in enumerate(pages):\n",
"    page.save(f\"./data/Page_{i+1}.jpg\", \"JPEG\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Handelsregister B des Abteilung B Nummer der Firma:\n",
|
|
"Amtsgerichts Hamm Wiedergabe des aktuellen HRB 5363\n",
|
|
"Registerinhalts\n",
|
|
"Abruf vom 07.06.2023 19:37\n",
|
|
"1. Anzahl der bisherigen Eintragungen:\n",
|
|
"51\n",
|
|
"2. a) Firma:\n",
|
|
"GEA Farm Technologies GmbH\n",
|
|
"b) Sitz, Niederlassung, inländische Geschäftsanschrift, empfangsberechtigte Person, Zweigniederlassungen:\n",
|
|
"Bönen\n",
|
|
"Geschäftsanschrift: Siemensstraße 25-27, 59199 Bönen\n",
|
|
"c) Gegenstand des Unternehmens:\n",
|
|
"Entwicklung, Herstellung und der Vertrieb von Landtechnik, insbesondere von Komponenten und Anlagen\n",
|
|
"(a) zur Gewinnung, Kühlung, Behandlung und Lagerung von Milch;\n",
|
|
"(b) für das Milchvieh-Herdenmanagement;\n",
|
|
"(c) zur Tierhygiene und Sicherung der Milchqualität und\n",
|
|
"(d) zur Aufstallung von Tieren;\n",
|
|
"sowie die Herstellung und der Vertrieb von Anlagen und Fahrzeugen zur Aufbereitung und zum Transport von Gülle.\n",
|
|
"3. Grund- oder Stammkapital:\n",
|
|
"5.115.000,00 EUR\n",
|
|
"4. a) Allgemeine Vertretungsregelung:\n",
|
|
"Ist nur ein Geschäftsführer bestellt, so vertritt er die Gesellschaft allein. Sind mehrere Geschäftsführer bestellt, so wird die\n",
|
|
"Gesellschaft durch zwei Geschäftsführer oder durch einen Geschäftsführer gemeinsam mit einem Prokuristen vertreten.\n",
|
|
"b) Vorstand, Leitungsorgan, geschäftsführende Direktoren, persönlich haftender Gesellschafter, Geschäftsführer,\n",
|
|
"Vertretungsberechtigte und besondere Vertretungsbefugnis:\n",
|
|
"Geschäftsführer: Basaran, Erkul, Erkrath, *06.05.1977\n",
|
|
"Geschäftsführer: Cheng, Liang, Göppingen, *29.12.1980\n",
|
|
"Geschäftsführer: Lauwers, Peter, Düsseldorf, *26.03.1970\n",
|
|
"5. Prokura:\n",
|
|
"Gesamtprokura gemeinsam mit einem Geschäftsführer oder einem anderen Prokuristen:\n",
|
|
"Barkmeyer, Ralf, Dortmund, *28.02.1974\n",
|
|
"Böttner, Henrik, Bochum, *07.11.1982\n",
|
|
"Dörner-Rodeheger, Astrid, Beckum, *24.12.1968\n",
|
|
"Frombach, Ralf, Werne, *25.01.1977\n",
|
|
"Gebing, Reinhard, Oelde, *26.04.1964\n",
|
|
"Hommel, Sven, Berlin, *22.04.1979\n",
|
|
"Kramps, Mark, Witten, *04.09.1967\n",
|
|
"Kreft, Markus, Wetter, *03.04.1966\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import cv2\n",
|
|
"import pytesseract\n",
|
|
"\n",
|
"image_path = \"./data/Page_1.jpg\"\n",
"image = cv2.imread(image_path)\n",
"# cv2.imread reports a missing or unreadable file by returning None instead\n",
"# of raising, so stop here with a clear message rather than inside the OCR call.\n",
"if image is None:\n",
"    raise FileNotFoundError(f\"Could not read image: {image_path}\")\n",
"\n",
"# Run OCR with the German language data; \"--psm 6\" is forwarded to Tesseract as-is.\n",
"text = str(pytesseract.image_to_string(image, config=\"--psm 6\", lang=\"deu\"))\n",
"print(text)\n",
"# Keep a UTF-8 copy of the recognised text next to the page image.\n",
"with open(\"./data/Page_1.txt\", \"w\", encoding=\"utf-8\") as output_file:\n",
"    output_file.write(text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"['Geschäftsführer: Basaran, Erkul', 'Geschäftsführer: Cheng, Liang', 'Geschäftsführer: Lauwers, Peter']\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['Erkul Basaran', 'Liang Cheng', 'Peter Lauwers']"
|
|
]
|
|
},
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import re\n",
|
|
"\n",
|
|
"\n",
|
"def get_managing_directors(text: str) -> list[str]:\n",
"    \"\"\"Extract managing directors from the text of a register excerpt.\n",
"\n",
"    Looks for entries of the form ``Geschäftsführer: <last name>, <first name>``\n",
"    and prints the raw matches before returning the names.\n",
"\n",
"    Args:\n",
"        text: Text of the excerpt, e.g. the OCR result of a page.\n",
"\n",
"    Returns:\n",
"        One ``\"<first name> <last name>\"`` string per match, in order of\n",
"        appearance; an empty list when nothing matches.\n",
"    \"\"\"\n",
"    # Each name part runs up to the next comma or line break. This keeps\n",
"    # hyphenated, accented and multi-word names (e.g. \"Dörner-Rodeheger\")\n",
"    # that a plain letter class would silently drop, and \"+\" rejects empty names.\n",
"    managing_directors_regex = r\"Geschäftsführer: ([^,\\n]+), ([^,\\n]+)\"\n",
"    matches = list(re.finditer(managing_directors_regex, text))\n",
"    print([match.group(0) for match in matches])\n",
"    return [\n",
"        f\"{match.group(2).strip()} {match.group(1).strip()}\" for match in matches\n",
"    ]\n",
"\n",
"\n",
"get_managing_directors(text)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.7"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|