diff --git a/Jupyter/API-tests/Unternehmensregister/notebook.ipynb b/Jupyter/API-tests/Unternehmensregister/notebook.ipynb
index e8b75a7..f7ff970 100644
--- a/Jupyter/API-tests/Unternehmensregister/notebook.ipynb
+++ b/Jupyter/API-tests/Unternehmensregister/notebook.ipynb
@@ -18,7 +18,7 @@
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 98,
"metadata": {},
"outputs": [
{
@@ -27,6 +27,33 @@
"text": [
"c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n"
]
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\notebook.ipynb Cell 3\u001b[0m in \u001b[0;36m5\n\u001b[0;32m 50\u001b[0m companies_tab \u001b[39m=\u001b[39m driver\u001b[39m.\u001b[39mfind_elements(\n\u001b[0;32m 51\u001b[0m By\u001b[39m.\u001b[39mLINK_TEXT, \u001b[39m\"\u001b[39m\u001b[39mRegisterinformationen des Registergerichts\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 52\u001b[0m )\n\u001b[0;32m 53\u001b[0m \u001b[39mfor\u001b[39;00m company_link \u001b[39min\u001b[39;00m companies_tab:\n\u001b[0;32m 54\u001b[0m \u001b[39m# Go to intermediary page\u001b[39;00m\n\u001b[1;32m---> 55\u001b[0m company_link\u001b[39m.\u001b[39;49mclick()\n\u001b[0;32m 56\u001b[0m \u001b[39m# Trigger next redirect\u001b[39;00m\n\u001b[0;32m 57\u001b[0m driver\u001b[39m.\u001b[39mfind_element(By\u001b[39m.\u001b[39mLINK_TEXT, \u001b[39m\"\u001b[39m\u001b[39mRegisterinformationen anzeigen\u001b[39m\u001b[39m\"\u001b[39m)\u001b[39m.\u001b[39mclick()\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:94\u001b[0m, in \u001b[0;36mWebElement.click\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 92\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclick\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 93\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Clicks the element.\"\"\"\u001b[39;00m\n\u001b[1;32m---> 94\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_execute(Command\u001b[39m.\u001b[39;49mCLICK_ELEMENT)\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:395\u001b[0m, in \u001b[0;36mWebElement._execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 393\u001b[0m params \u001b[39m=\u001b[39m {}\n\u001b[0;32m 394\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39mid\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_id\n\u001b[1;32m--> 395\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_parent\u001b[39m.\u001b[39;49mexecute(command, params)\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:344\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m params:\n\u001b[0;32m 342\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msession_id\n\u001b[1;32m--> 344\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcommand_executor\u001b[39m.\u001b[39;49mexecute(driver_command, params)\n\u001b[0;32m 345\u001b[0m \u001b[39mif\u001b[39;00m response:\n\u001b[0;32m 346\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror_handler\u001b[39m.\u001b[39mcheck_response(response)\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:290\u001b[0m, in \u001b[0;36mRemoteConnection.execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 288\u001b[0m data \u001b[39m=\u001b[39m utils\u001b[39m.\u001b[39mdump_json(params)\n\u001b[0;32m 289\u001b[0m url \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_url\u001b[39m}\u001b[39;00m\u001b[39m{\u001b[39;00mpath\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m--> 290\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_request(command_info[\u001b[39m0\u001b[39;49m], url, body\u001b[39m=\u001b[39;49mdata)\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:311\u001b[0m, in \u001b[0;36mRemoteConnection._request\u001b[1;34m(self, method, url, body)\u001b[0m\n\u001b[0;32m 308\u001b[0m body \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 310\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mkeep_alive:\n\u001b[1;32m--> 311\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_conn\u001b[39m.\u001b[39;49mrequest(method, url, body\u001b[39m=\u001b[39;49mbody, headers\u001b[39m=\u001b[39;49mheaders)\n\u001b[0;32m 312\u001b[0m statuscode \u001b[39m=\u001b[39m response\u001b[39m.\u001b[39mstatus\n\u001b[0;32m 313\u001b[0m \u001b[39melse\u001b[39;00m:\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:78\u001b[0m, in \u001b[0;36mRequestMethods.request\u001b[1;34m(self, method, url, fields, headers, **urlopen_kw)\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_url(\n\u001b[0;32m 75\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 76\u001b[0m )\n\u001b[0;32m 77\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m---> 78\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_body(\n\u001b[0;32m 79\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 80\u001b[0m )\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:170\u001b[0m, in \u001b[0;36mRequestMethods.request_encode_body\u001b[1;34m(self, method, url, fields, headers, encode_multipart, multipart_boundary, **urlopen_kw)\u001b[0m\n\u001b[0;32m 167\u001b[0m extra_kw[\u001b[39m\"\u001b[39m\u001b[39mheaders\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mupdate(headers)\n\u001b[0;32m 168\u001b[0m extra_kw\u001b[39m.\u001b[39mupdate(urlopen_kw)\n\u001b[1;32m--> 170\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mextra_kw)\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\poolmanager.py:376\u001b[0m, in \u001b[0;36mPoolManager.urlopen\u001b[1;34m(self, method, url, redirect, **kw)\u001b[0m\n\u001b[0;32m 374\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 375\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 376\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, u\u001b[39m.\u001b[39mrequest_uri, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 378\u001b[0m redirect_location \u001b[39m=\u001b[39m redirect \u001b[39mand\u001b[39;00m response\u001b[39m.\u001b[39mget_redirect_location()\n\u001b[0;32m 379\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m redirect_location:\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:703\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m 700\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_prepare_proxy(conn)\n\u001b[0;32m 702\u001b[0m \u001b[39m# Make the request on the httplib connection object.\u001b[39;00m\n\u001b[1;32m--> 703\u001b[0m httplib_response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_request(\n\u001b[0;32m 704\u001b[0m conn,\n\u001b[0;32m 705\u001b[0m method,\n\u001b[0;32m 706\u001b[0m url,\n\u001b[0;32m 707\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout_obj,\n\u001b[0;32m 708\u001b[0m body\u001b[39m=\u001b[39;49mbody,\n\u001b[0;32m 709\u001b[0m headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[0;32m 710\u001b[0m chunked\u001b[39m=\u001b[39;49mchunked,\n\u001b[0;32m 711\u001b[0m )\n\u001b[0;32m 713\u001b[0m \u001b[39m# If we're going to release the connection in ``finally:``, then\u001b[39;00m\n\u001b[0;32m 714\u001b[0m \u001b[39m# the response doesn't need to know about the connection. Otherwise\u001b[39;00m\n\u001b[0;32m 715\u001b[0m \u001b[39m# it will also try to release it and we'll have a double-release\u001b[39;00m\n\u001b[0;32m 716\u001b[0m \u001b[39m# mess.\u001b[39;00m\n\u001b[0;32m 717\u001b[0m response_conn \u001b[39m=\u001b[39m conn \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m release_conn \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:449\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[1;32m--> 449\u001b[0m six\u001b[39m.\u001b[39;49mraise_from(e, \u001b[39mNone\u001b[39;49;00m)\n\u001b[0;32m 450\u001b[0m \u001b[39mexcept\u001b[39;00m (SocketTimeout, BaseSSLError, SocketError) \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 451\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_raise_timeout(err\u001b[39m=\u001b[39me, url\u001b[39m=\u001b[39murl, timeout_value\u001b[39m=\u001b[39mread_timeout)\n",
+ "File \u001b[1;32m:3\u001b[0m, in \u001b[0;36mraise_from\u001b[1;34m(value, from_value)\u001b[0m\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:444\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 441\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mTypeError\u001b[39;00m:\n\u001b[0;32m 442\u001b[0m \u001b[39m# Python 3\u001b[39;00m\n\u001b[0;32m 443\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39;49mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[0;32m 449\u001b[0m six\u001b[39m.\u001b[39mraise_from(e, \u001b[39mNone\u001b[39;00m)\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:1374\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1372\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 1373\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m-> 1374\u001b[0m response\u001b[39m.\u001b[39;49mbegin()\n\u001b[0;32m 1375\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m:\n\u001b[0;32m 1376\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:318\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 316\u001b[0m \u001b[39m# read until we get a non-100 response\u001b[39;00m\n\u001b[0;32m 317\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m--> 318\u001b[0m version, status, reason \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_status()\n\u001b[0;32m 319\u001b[0m \u001b[39mif\u001b[39;00m status \u001b[39m!=\u001b[39m CONTINUE:\n\u001b[0;32m 320\u001b[0m \u001b[39mbreak\u001b[39;00m\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:279\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 278\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_read_status\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m--> 279\u001b[0m line \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfp\u001b[39m.\u001b[39;49mreadline(_MAXLINE \u001b[39m+\u001b[39;49m \u001b[39m1\u001b[39;49m), \u001b[39m\"\u001b[39m\u001b[39miso-8859-1\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 280\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(line) \u001b[39m>\u001b[39m _MAXLINE:\n\u001b[0;32m 281\u001b[0m \u001b[39mraise\u001b[39;00m LineTooLong(\u001b[39m\"\u001b[39m\u001b[39mstatus line\u001b[39m\u001b[39m\"\u001b[39m)\n",
+ "File \u001b[1;32mc:\\Python310\\lib\\socket.py:705\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 703\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[0;32m 704\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 705\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv_into(b)\n\u001b[0;32m 706\u001b[0m \u001b[39mexcept\u001b[39;00m timeout:\n\u001b[0;32m 707\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_timeout_occurred \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n",
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
+ ]
}
],
"source": [
@@ -76,112 +103,41 @@
" lambda driver: driver.current_url != \"https://www.unternehmensregister.de/ureg/\"\n",
")\n",
"## TODO Iterate over tabs\n",
- "# Find all \"Registerinformationen\"\n",
- "companies_tab = driver.find_elements(\n",
- " By.LINK_TEXT, \"Registerinformationen des Registergerichts\"\n",
+ "num_pages = int(\n",
+ " driver.find_element(By.XPATH, '//*[@class=\"page_count\"]').text.split(\" \")[0]\n",
")\n",
- "for company_link in companies_tab:\n",
- " # Go to intermediary page\n",
- " company_link.click()\n",
- " # Trigger next redirect\n",
- " driver.find_element(By.LINK_TEXT, \"Registerinformationen anzeigen\").click()\n",
- " # Trigger SI download\n",
- " driver.find_element(By.LINK_TEXT, \"SI\").click()\n",
- " # Show shopping cart - TODO evaluate restructuring behaviour by filling cart first and then bulk downloading\n",
- " wait.until(\n",
- " EC.visibility_of_element_located((By.LINK_TEXT, \"Dokumentenkorb ansehen\"))\n",
+ "for page_index in range(num_pages):\n",
+ " # Find all \"Registerinformationen\"\n",
+ " companies_tab = driver.find_elements(\n",
+ " By.LINK_TEXT, \"Registerinformationen des Registergerichts\"\n",
" )\n",
- " driver.find_element(By.LINK_TEXT, \"Dokumentenkorb ansehen\").click()\n",
- " # Get document\n",
- " xpath = \"//input[@type='submit']\"\n",
- " elems = driver.find_elements(By.TAG_NAME, \"input\")\n",
- " elems[-2].click()\n",
+ " for company_link in companies_tab:\n",
+ " # Go to intermediary page\n",
+ " company_link.click()\n",
+ " # Trigger next redirect\n",
+ " driver.find_element(By.LINK_TEXT, \"Registerinformationen anzeigen\").click()\n",
+ " # Trigger SI download\n",
+ " driver.find_element(By.LINK_TEXT, \"SI\").click()\n",
+ " # Show shopping cart - TODO evaluate restructuring behaviour by filling cart first and then bulk downloading\n",
+ " wait.until(\n",
+ " EC.visibility_of_element_located((By.LINK_TEXT, \"Dokumentenkorb ansehen\"))\n",
+ " )\n",
+ " driver.find_element(By.LINK_TEXT, \"Dokumentenkorb ansehen\").click()\n",
+ " # Get document\n",
+ " xpath = \"//input[@type='submit']\"\n",
+ " elems = driver.find_elements(By.TAG_NAME, \"input\")\n",
+ " elems[-2].click()\n",
"\n",
- " driver.find_element(By.ID, \"paymentFormOverview:btnNext\").click()\n",
- " driver.find_element(By.LINK_TEXT, \"Zum Dokumentenkorb\").click()\n",
- " driver.find_element(By.CLASS_NAME, \"download-wrapper\").click()\n",
- "\n",
- " for i in range(6):\n",
- " driver.back()\n",
+ " driver.find_element(By.ID, \"paymentFormOverview:btnNext\").click()\n",
+ " driver.find_element(By.LINK_TEXT, \"Zum Dokumentenkorb\").click()\n",
+ " driver.find_element(By.CLASS_NAME, \"download-wrapper\").click()\n",
"\n",
+ " for i in range(6):\n",
+ " driver.back()\n",
+ " driver.find_element(By.XPATH, '//*[@class=\"fas fa-angle-right\"]').click()\n",
"driver.close()"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import re\n",
- "import requests\n",
- "\n",
- "session = requests.Session()\n",
- "session.cookies[\"cc\"] = \"1686301974-69f11760d466bcea-10\"\n",
- "session.headers.update(\n",
- " {\n",
- " \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9\",\n",
- " \"Accept-Encoding\": \"gzip, deflate, br\",\n",
- " \"Accept-Language\": \"de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7,et;q=0.6,pl;q=0.5\",\n",
- " \"Cache-Control\": \"max-age=0\",\n",
- " \"Connection\": \"keep-alive\",\n",
- " \"DNT\": \"1\",\n",
- " \"Host\": \"www.unternehmensregister.de\",\n",
- " \"Pragma\": \"no-cache\",\n",
- " \"Referer\": \"https://www.unternehmensregister.de/ureg/\",\n",
- " \"sec-ch-ua-mobile\": \"?0\",\n",
- " \"Sec-Fetch-Dest\": \"document\",\n",
- " \"Sec-Fetch-Mode\": \"navigate\",\n",
- " \"Sec-Fetch-Site\": \"same-origin\",\n",
- " \"Sec-Fetch-User\": \"?1\",\n",
- " \"Upgrade-Insecure-Requests\": \"1\",\n",
- " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36\",\n",
- " \"Content-Type\": \"application/x-www-form-urlencoded\",\n",
- " }\n",
- ")\n",
- "\n",
- "\n",
- "# Get session cookie\n",
- "response = session.get(\"https://www.unternehmensregister.de\")\n",
- "jsessionid = re.findall(r\";jsessionid=[A-Z0-9]*.web[0-9]{2}.[0-9]\", response.text)[\n",
- " 0\n",
- "].replace(\";jsessionid=\", \"\")\n",
- "print(jsessionid)\n",
- "with open(\"temp.html\", \"w\", encoding=\"utf-8\") as file:\n",
- " file.write(response.text)\n",
- "print(session.cookies)\n",
- "# Go to search page\n",
- "response = session.get(\"https://www.unternehmensregister.de/ureg/\")\n",
- "# Start search\n",
- "response = session.post(\n",
- " f\"https://www.unternehmensregister.de/ureg/index.html;jsessionid={jsessionid}\",\n",
- " data={\n",
- " \"globalSearchForm\": \"globalSearchForm\",\n",
- " \"globalSearchForm:extendedResearchCompanyName\": \"A*\",\n",
- " },\n",
- ")\n",
- "print(response.status_code)\n",
- "# Get results\n",
- "response = session.get(\n",
- " f\"https://www.unternehmensregister.de/ureg/result.html;jsessionid={jsessionid}\"\n",
- ")\n",
- "with open(\"temp.html\", \"w\", encoding=\"utf-8\") as file:\n",
- " file.write(response.text)\n",
- "\n",
- "print(session.cookies)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "class Unternehmensregister:\n",
- " def __init__(self):\n",
- " self.session = requests.Session()"
- ]
- },
{
"attachments": {},
"cell_type": "markdown",
@@ -190,6 +146,29 @@
"## Analyze Auszug"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['registerdocument-2023-06-09-13-48-58.xml',\n",
+ " 'registerdocument-2023-06-09-13-49-02.xml',\n",
+ " 'registerdocument-2023-06-09-13-49-05.xml']"
+ ]
+ },
+ "execution_count": 97,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "files = os.listdir(\"./data/Unternehmensregister\")\n",
+ "files"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 27,