Extract first stakeholder informationen from Unternehmensregister export

This commit is contained in:
TrisNol 2023-06-09 14:23:56 +02:00
parent e2ad2d475a
commit 1010b43a5f
2 changed files with 106 additions and 35 deletions

View File

@ -18,7 +18,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 98, "execution_count": 118,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -27,33 +27,6 @@
"text": [ "text": [
"c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n" "c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n"
] ]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\notebook.ipynb Cell 3\u001b[0m in \u001b[0;36m5\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=49'>50</a>\u001b[0m companies_tab \u001b[39m=\u001b[39m driver\u001b[39m.\u001b[39mfind_elements(\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=50'>51</a>\u001b[0m By\u001b[39m.\u001b[39mLINK_TEXT, \u001b[39m\"\u001b[39m\u001b[39mRegisterinformationen des Registergerichts\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=51'>52</a>\u001b[0m )\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=52'>53</a>\u001b[0m \u001b[39mfor\u001b[39;00m company_link \u001b[39min\u001b[39;00m companies_tab:\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=53'>54</a>\u001b[0m \u001b[39m# Go to intermediary page\u001b[39;00m\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=54'>55</a>\u001b[0m company_link\u001b[39m.\u001b[39;49mclick()\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=55'>56</a>\u001b[0m \u001b[39m# Trigger next redirect\u001b[39;00m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#X24sZmlsZQ%3D%3D?line=56'>57</a>\u001b[0m driver\u001b[39m.\u001b[39mfind_element(By\u001b[39m.\u001b[39mLINK_TEXT, \u001b[39m\"\u001b[39m\u001b[39mRegisterinformationen anzeigen\u001b[39m\u001b[39m\"\u001b[39m)\u001b[39m.\u001b[39mclick()\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:94\u001b[0m, in \u001b[0;36mWebElement.click\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 92\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclick\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 93\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Clicks the element.\"\"\"\u001b[39;00m\n\u001b[1;32m---> 94\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_execute(Command\u001b[39m.\u001b[39;49mCLICK_ELEMENT)\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:395\u001b[0m, in \u001b[0;36mWebElement._execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 393\u001b[0m params \u001b[39m=\u001b[39m {}\n\u001b[0;32m 394\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39mid\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_id\n\u001b[1;32m--> 395\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_parent\u001b[39m.\u001b[39;49mexecute(command, params)\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:344\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m params:\n\u001b[0;32m 342\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msession_id\n\u001b[1;32m--> 344\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcommand_executor\u001b[39m.\u001b[39;49mexecute(driver_command, params)\n\u001b[0;32m 345\u001b[0m \u001b[39mif\u001b[39;00m response:\n\u001b[0;32m 346\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror_handler\u001b[39m.\u001b[39mcheck_response(response)\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:290\u001b[0m, in \u001b[0;36mRemoteConnection.execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 288\u001b[0m data \u001b[39m=\u001b[39m utils\u001b[39m.\u001b[39mdump_json(params)\n\u001b[0;32m 289\u001b[0m url \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_url\u001b[39m}\u001b[39;00m\u001b[39m{\u001b[39;00mpath\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m--> 290\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_request(command_info[\u001b[39m0\u001b[39;49m], url, body\u001b[39m=\u001b[39;49mdata)\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:311\u001b[0m, in \u001b[0;36mRemoteConnection._request\u001b[1;34m(self, method, url, body)\u001b[0m\n\u001b[0;32m 308\u001b[0m body \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 310\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mkeep_alive:\n\u001b[1;32m--> 311\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_conn\u001b[39m.\u001b[39;49mrequest(method, url, body\u001b[39m=\u001b[39;49mbody, headers\u001b[39m=\u001b[39;49mheaders)\n\u001b[0;32m 312\u001b[0m statuscode \u001b[39m=\u001b[39m response\u001b[39m.\u001b[39mstatus\n\u001b[0;32m 313\u001b[0m \u001b[39melse\u001b[39;00m:\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:78\u001b[0m, in \u001b[0;36mRequestMethods.request\u001b[1;34m(self, method, url, fields, headers, **urlopen_kw)\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_url(\n\u001b[0;32m 75\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 76\u001b[0m )\n\u001b[0;32m 77\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m---> 78\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_body(\n\u001b[0;32m 79\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 80\u001b[0m )\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:170\u001b[0m, in \u001b[0;36mRequestMethods.request_encode_body\u001b[1;34m(self, method, url, fields, headers, encode_multipart, multipart_boundary, **urlopen_kw)\u001b[0m\n\u001b[0;32m 167\u001b[0m extra_kw[\u001b[39m\"\u001b[39m\u001b[39mheaders\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mupdate(headers)\n\u001b[0;32m 168\u001b[0m extra_kw\u001b[39m.\u001b[39mupdate(urlopen_kw)\n\u001b[1;32m--> 170\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mextra_kw)\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\poolmanager.py:376\u001b[0m, in \u001b[0;36mPoolManager.urlopen\u001b[1;34m(self, method, url, redirect, **kw)\u001b[0m\n\u001b[0;32m 374\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 375\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 376\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, u\u001b[39m.\u001b[39mrequest_uri, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 378\u001b[0m redirect_location \u001b[39m=\u001b[39m redirect \u001b[39mand\u001b[39;00m response\u001b[39m.\u001b[39mget_redirect_location()\n\u001b[0;32m 379\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m redirect_location:\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:703\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m 700\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_prepare_proxy(conn)\n\u001b[0;32m 702\u001b[0m \u001b[39m# Make the request on the httplib connection object.\u001b[39;00m\n\u001b[1;32m--> 703\u001b[0m httplib_response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_request(\n\u001b[0;32m 704\u001b[0m conn,\n\u001b[0;32m 705\u001b[0m method,\n\u001b[0;32m 706\u001b[0m url,\n\u001b[0;32m 707\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout_obj,\n\u001b[0;32m 708\u001b[0m body\u001b[39m=\u001b[39;49mbody,\n\u001b[0;32m 709\u001b[0m headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[0;32m 710\u001b[0m chunked\u001b[39m=\u001b[39;49mchunked,\n\u001b[0;32m 711\u001b[0m )\n\u001b[0;32m 713\u001b[0m \u001b[39m# If we're going to release the connection in ``finally:``, then\u001b[39;00m\n\u001b[0;32m 714\u001b[0m \u001b[39m# the response doesn't need to know about the connection. Otherwise\u001b[39;00m\n\u001b[0;32m 715\u001b[0m \u001b[39m# it will also try to release it and we'll have a double-release\u001b[39;00m\n\u001b[0;32m 716\u001b[0m \u001b[39m# mess.\u001b[39;00m\n\u001b[0;32m 717\u001b[0m response_conn \u001b[39m=\u001b[39m conn \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m release_conn \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:449\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[1;32m--> 449\u001b[0m six\u001b[39m.\u001b[39;49mraise_from(e, \u001b[39mNone\u001b[39;49;00m)\n\u001b[0;32m 450\u001b[0m \u001b[39mexcept\u001b[39;00m (SocketTimeout, BaseSSLError, SocketError) \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 451\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_raise_timeout(err\u001b[39m=\u001b[39me, url\u001b[39m=\u001b[39murl, timeout_value\u001b[39m=\u001b[39mread_timeout)\n",
"File \u001b[1;32m<string>:3\u001b[0m, in \u001b[0;36mraise_from\u001b[1;34m(value, from_value)\u001b[0m\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:444\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 441\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mTypeError\u001b[39;00m:\n\u001b[0;32m 442\u001b[0m \u001b[39m# Python 3\u001b[39;00m\n\u001b[0;32m 443\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39;49mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[0;32m 449\u001b[0m six\u001b[39m.\u001b[39mraise_from(e, \u001b[39mNone\u001b[39;00m)\n",
"File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:1374\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1372\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 1373\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m-> 1374\u001b[0m response\u001b[39m.\u001b[39;49mbegin()\n\u001b[0;32m 1375\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m:\n\u001b[0;32m 1376\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n",
"File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:318\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 316\u001b[0m \u001b[39m# read until we get a non-100 response\u001b[39;00m\n\u001b[0;32m 317\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m--> 318\u001b[0m version, status, reason \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_status()\n\u001b[0;32m 319\u001b[0m \u001b[39mif\u001b[39;00m status \u001b[39m!=\u001b[39m CONTINUE:\n\u001b[0;32m 320\u001b[0m \u001b[39mbreak\u001b[39;00m\n",
"File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:279\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 278\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_read_status\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m--> 279\u001b[0m line \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfp\u001b[39m.\u001b[39;49mreadline(_MAXLINE \u001b[39m+\u001b[39;49m \u001b[39m1\u001b[39;49m), \u001b[39m\"\u001b[39m\u001b[39miso-8859-1\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 280\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(line) \u001b[39m>\u001b[39m _MAXLINE:\n\u001b[0;32m 281\u001b[0m \u001b[39mraise\u001b[39;00m LineTooLong(\u001b[39m\"\u001b[39m\u001b[39mstatus line\u001b[39m\u001b[39m\"\u001b[39m)\n",
"File \u001b[1;32mc:\\Python310\\lib\\socket.py:705\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 703\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[0;32m 704\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 705\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv_into(b)\n\u001b[0;32m 706\u001b[0m \u001b[39mexcept\u001b[39;00m timeout:\n\u001b[0;32m 707\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_timeout_occurred \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
} }
], ],
"source": [ "source": [
@ -65,7 +38,7 @@
"from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n", "from selenium.webdriver.support import expected_conditions as EC\n",
"\n", "\n",
"search_query = \"A*\"\n", "search_query = \"GEA Farm Technologies\"\n",
"\n", "\n",
"options = webdriver.ChromeOptions()\n", "options = webdriver.ChromeOptions()\n",
"\n", "\n",
@ -148,18 +121,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 97, "execution_count": 119,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"['registerdocument-2023-06-09-13-48-58.xml',\n", "['registerdocument-2023-06-09-14-05-01.xml',\n",
" 'registerdocument-2023-06-09-13-49-02.xml',\n", " 'registerdocument-2023-06-09-14-05-03.xml']"
" 'registerdocument-2023-06-09-13-49-05.xml']"
] ]
}, },
"execution_count": 97, "execution_count": 119,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -169,6 +141,104 @@
"files" "files"
] ]
}, },
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{}\n",
"{}\n",
"{'name': {'firstname': 'Reinhard', 'lastname': 'Gebing'}, 'date_of_birth': '1964-04-26', 'location': {'city': 'Oelde'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Markus', 'lastname': 'Kreft'}, 'date_of_birth': '1966-04-03', 'location': {'city': 'Wetter'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Kai', 'lastname': 'Luntz'}, 'date_of_birth': '1970-12-04', 'location': {'city': 'Holzminden'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Thomas', 'lastname': 'Mader'}, 'date_of_birth': '1972-05-24', 'location': {'city': 'Rheda-Wiedenbrück'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Peter', 'lastname': 'Lauwers'}, 'date_of_birth': '1970-03-26', 'location': {'city': 'Düsseldorf'}, 'role': 'Geschäftsführer(in)'}\n",
"{'name': {'firstname': 'Erkul', 'lastname': 'Basaran'}, 'date_of_birth': '1977-05-06', 'location': {'city': 'Erkrath'}, 'role': 'Geschäftsführer(in)'}\n",
"{'name': {'firstname': 'Katja', 'lastname': 'Voß'}, 'date_of_birth': '1978-02-24', 'location': {'city': 'Rheda-Wiedenbrück'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Henrik', 'lastname': 'Böttner'}, 'date_of_birth': '1982-11-07', 'location': {'city': 'Bochum'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Ulrich', 'lastname': 'Raßenhövel'}, 'date_of_birth': '1969-04-16', 'location': {'city': 'Oelde'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Andreas', 'lastname': 'Naroska'}, 'date_of_birth': '1967-03-23', 'location': {'city': 'Herdecke'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Mark', 'lastname': 'Kramps'}, 'date_of_birth': '1967-09-04', 'location': {'city': 'Witten'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Ralf', 'lastname': 'Barkmeyer'}, 'date_of_birth': '1974-02-28', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Holger', 'lastname': 'Siegwarth'}, 'date_of_birth': '1967-05-13', 'location': {'city': 'Tönnisvorst'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Oliver', 'lastname': 'Liß'}, 'date_of_birth': '1981-04-13', 'location': {'city': 'Herne'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Liang', 'lastname': 'Cheng'}, 'date_of_birth': '1980-12-29', 'location': {'city': 'Göppingen'}, 'role': 'Geschäftsführer(in)'}\n",
"{'name': {'firstname': 'Astrid', 'lastname': 'Dörner-Rodeheger'}, 'date_of_birth': '1968-12-24', 'location': {'city': 'Beckum'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Jon', 'lastname': 'Lange'}, 'date_of_birth': '1978-04-25', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Matthias', 'lastname': 'Peters'}, 'date_of_birth': '1973-08-28', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Ralf', 'lastname': 'Frombach'}, 'date_of_birth': '1977-01-25', 'location': {'city': 'Werne'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Sven', 'lastname': 'Hommel'}, 'date_of_birth': '1979-04-22', 'location': {'city': 'Berlin'}, 'role': 'Prokurist(in)'}\n"
]
}
],
"source": [
"import json\n",
"import xmltodict\n",
"\n",
"for file in files:\n",
" with open(\"./data/Unternehmensregister/\" + file, \"r\", encoding=\"utf-8\") as xml_file:\n",
" data = xmltodict.parse(xml_file.read())\n",
" with open(\"./data/temp.json\", \"w\", encoding=\"utf-8\") as json_file:\n",
" json_file.write(json.dumps(data))\n",
"\n",
" keys = dict.keys(data[\"XJustiz_Daten\"][\"Grunddaten\"])\n",
" base_info = {\n",
" \"company_name\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
" \"Basisdaten_Register\"\n",
" ][\"Rechtstraeger\"][\"Bezeichnung\"][\"Bezeichnung_Aktuell\"],\n",
" \"location\": {\n",
" \"city\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
" \"Basisdaten_Register\"\n",
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Ort\"],\n",
" \"zip_code\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
" \"Basisdaten_Register\"\n",
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Postleitzahl\"],\n",
" \"street\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
" \"Basisdaten_Register\"\n",
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Strasse\"],\n",
" \"house_number\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
" \"Basisdaten_Register\"\n",
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Hausnummer\"],\n",
" },\n",
" }\n",
"\n",
" def parse_stakeholder(data: dict) -> list:\n",
" if \"Natuerliche_Person\" in data[\"Beteiligter\"]:\n",
" return {\n",
" \"name\": {\n",
" \"firstname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n",
" \"Voller_Name\"\n",
" ][\"Vorname\"],\n",
" \"lastname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n",
" \"Voller_Name\"\n",
" ][\"Nachname\"],\n",
" },\n",
" \"date_of_birth\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n",
" \"Geburt\"\n",
" ][\"Geburtsdatum\"],\n",
" \"location\": {\n",
" \"city\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Anschrift\"][\n",
" \"Ort\"\n",
" ]\n",
" },\n",
" \"role\": data[\"Rolle\"][\"Rollenbezeichnung\"][\"content\"],\n",
" }\n",
" return {}\n",
"\n",
" for i in range(\n",
" len(data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"])\n",
" ):\n",
" people = parse_stakeholder(\n",
" data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"][i]\n",
" )\n",
" print(people)\n",
" break"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 27,

View File

@ -4,3 +4,4 @@ opencv-python
pdf2image pdf2image
bs4 bs4
selenium selenium
xmltodict