mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-22 05:02:53 +02:00
Bulk process Unternehmensregister .xmls
This commit is contained in:
parent
1010b43a5f
commit
058c16b3ff
@ -18,7 +18,39 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 118,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import glob"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def wait_for_download_condition(\n",
|
||||
" path: str, num_files: int, pattern: str = \"*.xml\"\n",
|
||||
") -> bool:\n",
|
||||
" return len(glob.glob1(path, pattern)) > num_files\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_num_files(path: str, pattern: str = \"*.xml\") -> int:\n",
|
||||
" return len(glob.glob1(path, pattern))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def rename_latest_file(path: str, filename: str, pattern: str = \"*.xml\"):\n",
|
||||
" list_of_files = [os.path.join(path, file) for file in glob.glob1(path, pattern)]\n",
|
||||
" latest_download = max(list_of_files, key=os.path.getctime)\n",
|
||||
" os.rename(latest_download, os.path.join(path, filename))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -27,10 +59,36 @@
|
||||
"text": [
|
||||
"c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\notebook.ipynb Cell 5\u001b[0m in \u001b[0;36m3\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=26'>27</a>\u001b[0m options\u001b[39m.\u001b[39madd_experimental_option(\u001b[39m\"\u001b[39m\u001b[39mprefs\u001b[39m\u001b[39m\"\u001b[39m, preferences)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=28'>29</a>\u001b[0m driver \u001b[39m=\u001b[39m webdriver\u001b[39m.\u001b[39mChrome(options\u001b[39m=\u001b[39moptions)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=30'>31</a>\u001b[0m driver\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mhttps://www.unternehmensregister.de/ureg/\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=31'>32</a>\u001b[0m \u001b[39m# Accept Cookies\u001b[39;00m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=32'>33</a>\u001b[0m driver\u001b[39m.\u001b[39mfind_elements(\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=33'>34</a>\u001b[0m By\u001b[39m.\u001b[39mXPATH, \u001b[39m'\u001b[39m\u001b[39m//button[text()=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mNur technisch notwendige Cookies akzeptieren\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m]\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=34'>35</a>\u001b[0m )[\u001b[39m0\u001b[39m]\u001b[39m.\u001b[39mclick()\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:355\u001b[0m, in \u001b[0;36mWebDriver.get\u001b[1;34m(self, url)\u001b[0m\n\u001b[0;32m 353\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget\u001b[39m(\u001b[39mself\u001b[39m, url: \u001b[39mstr\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 354\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Loads a web page in the current browser session.\"\"\"\u001b[39;00m\n\u001b[1;32m--> 355\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mexecute(Command\u001b[39m.\u001b[39;49mGET, {\u001b[39m\"\u001b[39;49m\u001b[39murl\u001b[39;49m\u001b[39m\"\u001b[39;49m: url})\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:344\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m params:\n\u001b[0;32m 342\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msession_id\n\u001b[1;32m--> 344\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcommand_executor\u001b[39m.\u001b[39;49mexecute(driver_command, params)\n\u001b[0;32m 345\u001b[0m \u001b[39mif\u001b[39;00m response:\n\u001b[0;32m 346\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror_handler\u001b[39m.\u001b[39mcheck_response(response)\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:290\u001b[0m, in \u001b[0;36mRemoteConnection.execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 288\u001b[0m data \u001b[39m=\u001b[39m utils\u001b[39m.\u001b[39mdump_json(params)\n\u001b[0;32m 289\u001b[0m url \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_url\u001b[39m}\u001b[39;00m\u001b[39m{\u001b[39;00mpath\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m--> 290\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_request(command_info[\u001b[39m0\u001b[39;49m], url, body\u001b[39m=\u001b[39;49mdata)\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:311\u001b[0m, in \u001b[0;36mRemoteConnection._request\u001b[1;34m(self, method, url, body)\u001b[0m\n\u001b[0;32m 308\u001b[0m body \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 310\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mkeep_alive:\n\u001b[1;32m--> 311\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_conn\u001b[39m.\u001b[39;49mrequest(method, url, body\u001b[39m=\u001b[39;49mbody, headers\u001b[39m=\u001b[39;49mheaders)\n\u001b[0;32m 312\u001b[0m statuscode \u001b[39m=\u001b[39m response\u001b[39m.\u001b[39mstatus\n\u001b[0;32m 313\u001b[0m \u001b[39melse\u001b[39;00m:\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:78\u001b[0m, in \u001b[0;36mRequestMethods.request\u001b[1;34m(self, method, url, fields, headers, **urlopen_kw)\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_url(\n\u001b[0;32m 75\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 76\u001b[0m )\n\u001b[0;32m 77\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m---> 78\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_body(\n\u001b[0;32m 79\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 80\u001b[0m )\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:170\u001b[0m, in \u001b[0;36mRequestMethods.request_encode_body\u001b[1;34m(self, method, url, fields, headers, encode_multipart, multipart_boundary, **urlopen_kw)\u001b[0m\n\u001b[0;32m 167\u001b[0m extra_kw[\u001b[39m\"\u001b[39m\u001b[39mheaders\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mupdate(headers)\n\u001b[0;32m 168\u001b[0m extra_kw\u001b[39m.\u001b[39mupdate(urlopen_kw)\n\u001b[1;32m--> 170\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mextra_kw)\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\poolmanager.py:376\u001b[0m, in \u001b[0;36mPoolManager.urlopen\u001b[1;34m(self, method, url, redirect, **kw)\u001b[0m\n\u001b[0;32m 374\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 375\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 376\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, u\u001b[39m.\u001b[39mrequest_uri, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 378\u001b[0m redirect_location \u001b[39m=\u001b[39m redirect \u001b[39mand\u001b[39;00m response\u001b[39m.\u001b[39mget_redirect_location()\n\u001b[0;32m 379\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m redirect_location:\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:703\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m 700\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_prepare_proxy(conn)\n\u001b[0;32m 702\u001b[0m \u001b[39m# Make the request on the httplib connection object.\u001b[39;00m\n\u001b[1;32m--> 703\u001b[0m httplib_response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_request(\n\u001b[0;32m 704\u001b[0m conn,\n\u001b[0;32m 705\u001b[0m method,\n\u001b[0;32m 706\u001b[0m url,\n\u001b[0;32m 707\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout_obj,\n\u001b[0;32m 708\u001b[0m body\u001b[39m=\u001b[39;49mbody,\n\u001b[0;32m 709\u001b[0m headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[0;32m 710\u001b[0m chunked\u001b[39m=\u001b[39;49mchunked,\n\u001b[0;32m 711\u001b[0m )\n\u001b[0;32m 713\u001b[0m \u001b[39m# If we're going to release the connection in ``finally:``, then\u001b[39;00m\n\u001b[0;32m 714\u001b[0m \u001b[39m# the response doesn't need to know about the connection. Otherwise\u001b[39;00m\n\u001b[0;32m 715\u001b[0m \u001b[39m# it will also try to release it and we'll have a double-release\u001b[39;00m\n\u001b[0;32m 716\u001b[0m \u001b[39m# mess.\u001b[39;00m\n\u001b[0;32m 717\u001b[0m response_conn \u001b[39m=\u001b[39m conn \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m release_conn \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:449\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[1;32m--> 449\u001b[0m six\u001b[39m.\u001b[39;49mraise_from(e, \u001b[39mNone\u001b[39;49;00m)\n\u001b[0;32m 450\u001b[0m \u001b[39mexcept\u001b[39;00m (SocketTimeout, BaseSSLError, SocketError) \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 451\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_raise_timeout(err\u001b[39m=\u001b[39me, url\u001b[39m=\u001b[39murl, timeout_value\u001b[39m=\u001b[39mread_timeout)\n",
|
||||
"File \u001b[1;32m<string>:3\u001b[0m, in \u001b[0;36mraise_from\u001b[1;34m(value, from_value)\u001b[0m\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:444\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 441\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mTypeError\u001b[39;00m:\n\u001b[0;32m 442\u001b[0m \u001b[39m# Python 3\u001b[39;00m\n\u001b[0;32m 443\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39;49mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[0;32m 449\u001b[0m six\u001b[39m.\u001b[39mraise_from(e, \u001b[39mNone\u001b[39;00m)\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:1374\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1372\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 1373\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m-> 1374\u001b[0m response\u001b[39m.\u001b[39;49mbegin()\n\u001b[0;32m 1375\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m:\n\u001b[0;32m 1376\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:318\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 316\u001b[0m \u001b[39m# read until we get a non-100 response\u001b[39;00m\n\u001b[0;32m 317\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m--> 318\u001b[0m version, status, reason \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_status()\n\u001b[0;32m 319\u001b[0m \u001b[39mif\u001b[39;00m status \u001b[39m!=\u001b[39m CONTINUE:\n\u001b[0;32m 320\u001b[0m \u001b[39mbreak\u001b[39;00m\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:279\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 278\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_read_status\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m--> 279\u001b[0m line \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfp\u001b[39m.\u001b[39;49mreadline(_MAXLINE \u001b[39m+\u001b[39;49m \u001b[39m1\u001b[39;49m), \u001b[39m\"\u001b[39m\u001b[39miso-8859-1\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 280\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(line) \u001b[39m>\u001b[39m _MAXLINE:\n\u001b[0;32m 281\u001b[0m \u001b[39mraise\u001b[39;00m LineTooLong(\u001b[39m\"\u001b[39m\u001b[39mstatus line\u001b[39m\u001b[39m\"\u001b[39m)\n",
|
||||
"File \u001b[1;32mc:\\Python310\\lib\\socket.py:705\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 703\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[0;32m 704\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 705\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv_into(b)\n\u001b[0;32m 706\u001b[0m \u001b[39mexcept\u001b[39;00m timeout:\n\u001b[0;32m 707\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_timeout_occurred \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n",
|
||||
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"from selenium import webdriver\n",
|
||||
@ -38,7 +96,7 @@
|
||||
"from selenium.webdriver.support.ui import WebDriverWait\n",
|
||||
"from selenium.webdriver.support import expected_conditions as EC\n",
|
||||
"\n",
|
||||
"search_query = \"GEA Farm Technologies\"\n",
|
||||
"search_query = \"A*\"\n",
|
||||
"\n",
|
||||
"options = webdriver.ChromeOptions()\n",
|
||||
"\n",
|
||||
@ -55,6 +113,7 @@
|
||||
" \"default_directory\": download_path,\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"options.add_argument(\"--headless=new\")\n",
|
||||
"options.add_experimental_option(\"prefs\", preferences)\n",
|
||||
"\n",
|
||||
"driver = webdriver.Chrome(options=options)\n",
|
||||
@ -75,16 +134,28 @@
|
||||
"wait.until(\n",
|
||||
" lambda driver: driver.current_url != \"https://www.unternehmensregister.de/ureg/\"\n",
|
||||
")\n",
|
||||
"## TODO Iterate over tabs\n",
|
||||
"\n",
|
||||
"num_pages = int(\n",
|
||||
" driver.find_element(By.XPATH, '//*[@class=\"page_count\"]').text.split(\" \")[0]\n",
|
||||
")\n",
|
||||
"for page_index in range(num_pages):\n",
|
||||
"\n",
|
||||
"processed_companies = []\n",
|
||||
"\n",
|
||||
"for page_index in tqdm(range(num_pages)):\n",
|
||||
" # Find all \"Registerinformationen\"\n",
|
||||
" companies_tab = driver.find_elements(\n",
|
||||
" By.LINK_TEXT, \"Registerinformationen des Registergerichts\"\n",
|
||||
" )\n",
|
||||
" for company_link in companies_tab:\n",
|
||||
" company_names = [\n",
|
||||
" elem.text\n",
|
||||
" for elem in driver.find_elements(\n",
|
||||
" By.XPATH, '//div[@class=\"company_result\"]/span/b'\n",
|
||||
" )\n",
|
||||
" ]\n",
|
||||
" for index, company_link in enumerate(companies_tab):\n",
|
||||
" company_name = company_names[index]\n",
|
||||
" if company_name in processed_companies:\n",
|
||||
" continue\n",
|
||||
" # Go to intermediary page\n",
|
||||
" company_link.click()\n",
|
||||
" # Trigger next redirect\n",
|
||||
@ -101,14 +172,37 @@
|
||||
" elems = driver.find_elements(By.TAG_NAME, \"input\")\n",
|
||||
" elems[-2].click()\n",
|
||||
"\n",
|
||||
" wait.until(\n",
|
||||
" EC.visibility_of_element_located((By.ID, \"paymentFormOverview:btnNext\"))\n",
|
||||
" )\n",
|
||||
" driver.find_element(By.ID, \"paymentFormOverview:btnNext\").click()\n",
|
||||
"\n",
|
||||
" wait.until(\n",
|
||||
" EC.visibility_of_element_located((By.LINK_TEXT, \"Zum Dokumentenkorb\"))\n",
|
||||
" )\n",
|
||||
" driver.find_element(By.LINK_TEXT, \"Zum Dokumentenkorb\").click()\n",
|
||||
"\n",
|
||||
" num_files = get_num_files(\"./data/Unternehmensregister/\")\n",
|
||||
" driver.find_element(By.CLASS_NAME, \"download-wrapper\").click()\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" wait.until(\n",
|
||||
" lambda x: wait_for_download_condition(\n",
|
||||
" \"./data/Unternehmensregister/\", num_files\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" rename_latest_file(\n",
|
||||
" \"./data/Unternehmensregister/\",\n",
|
||||
" f\"{company_name.replace(' ', '_').replace('/','_')}.xml\",\n",
|
||||
" )\n",
|
||||
" processed_companies.append(company_name)\n",
|
||||
" except:\n",
|
||||
" print(f\"Could not process {company_name}\")\n",
|
||||
" for i in range(6):\n",
|
||||
" driver.back()\n",
|
||||
" driver.find_element(By.XPATH, '//*[@class=\"fas fa-angle-right\"]').click()\n",
|
||||
"driver.close()"
|
||||
"driver.close()\n",
|
||||
"print(processed_companies)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -121,243 +215,429 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 119,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['registerdocument-2023-06-09-14-05-01.xml',\n",
|
||||
" 'registerdocument-2023-06-09-14-05-03.xml']"
|
||||
"['A&A_Amini_Art_GmbH.xml',\n",
|
||||
" 'A&A_Immo_GmbH.xml',\n",
|
||||
" 'A&P_AUDITING_GmbH_Wirtschaftsprüfungsgesellschaft.xml',\n",
|
||||
" 'A&QUA_gemeinnützige_Gesellschaft_für_Arbeit_u._Qualifizierung_mbH.xml',\n",
|
||||
" 'a+b_Asphalt-_und_Betonmischwerke_GmbH_&_Co._KG.xml',\n",
|
||||
" 'a+b_Verwaltungsgesellschaft_mbH.xml',\n",
|
||||
" 'A+E_Beteiligungs-_und_Handels-GmbH.xml',\n",
|
||||
" 'A+W_Systemhaus_GmbH.xml',\n",
|
||||
" 'A-S-D_Kfz-Teile-Handel_GmbH.xml',\n",
|
||||
" 'A-TEAM_Industrielles_Roboterschweißen_GmbH.xml',\n",
|
||||
" 'A.C.C._Funk_Taxi_&_Minicar_e.K..xml',\n",
|
||||
" 'a.c.k._aqua_concept_GmbH_Karlsruhe.xml',\n",
|
||||
" 'A.C._Weiss_GmbH_&_Co._KG.xml',\n",
|
||||
" 'A.D.S._OHG.xml',\n",
|
||||
" 'A.D._Glas-_und_Gebäudereinigung_e.K..xml',\n",
|
||||
" 'A.E._Z-Line_Taxi_-_und_Shuttle-Service_e.K..xml',\n",
|
||||
" 'A.F.Z._Automatisierung,_Fördern,_Zuführen_GmbH.xml',\n",
|
||||
" 'A.G._Zentral_Michael_Greising_e.K..xml',\n",
|
||||
" 'A.H._Steuerberatungsgesellschaft_mbH.xml',\n",
|
||||
" 'A.I.V._SERVICES_GmbH.xml',\n",
|
||||
" 'A.I._Kommanditist-Gesellschaft_mbH.xml',\n",
|
||||
" 'A.KIein_Immobilien_KG.xml',\n",
|
||||
" 'A.L.G._Christian_Schmelzer.xml',\n",
|
||||
" 'A.L.S._Architektonische_Licht-Systeme_GmbH.xml',\n",
|
||||
" 'A.M.G._Motorenbau_Hans_Werner_Aufrecht.xml',\n",
|
||||
" 'A.M.P._Athos_GmbH.xml',\n",
|
||||
" 'A.N._Gartenbau_GmbH.xml',\n",
|
||||
" 'A.Q.U.A_Services_KG.xml',\n",
|
||||
" 'A.R.S._GmbH_Süd,_Alt_und_Reststoffverwertung.xml',\n",
|
||||
" 'A.S.G._Industrielackierungen_GmbH.xml',\n",
|
||||
" 'A.S.S._bikes_and_parts_GmbH.xml',\n",
|
||||
" 'A.S._Baubedarfvermittlung_Gesellschaft_mit_beschränkter_Haftung.xml',\n",
|
||||
" 'A.T.C._Automotive_GmbH.xml',\n",
|
||||
" 'A._&_S._Aigner_und_Schulz_GmbH.xml',\n",
|
||||
" 'A._+_H._Weier_GmbH.xml',\n",
|
||||
" 'A._+_K._Hertkorn_OHG_Möbel_-_Innenausbau.xml',\n",
|
||||
" 'A._Abele_GmbH.xml',\n",
|
||||
" 'A._Baur_Mineralöl-Abfertigungsspedition_GmbH.xml',\n",
|
||||
" 'A._Blum_GmbH.xml',\n",
|
||||
" 'A._Both_GmbH.xml',\n",
|
||||
" 'A._Both_GmbH_&_Co._KG_Werkzeugtechnik_CNC_Maschinenausrüstung.xml',\n",
|
||||
" 'A._DINKIC_GMBH.xml',\n",
|
||||
" 'A._Elsbecker_GmbH.xml',\n",
|
||||
" 'A._Erglis_GmbH.xml',\n",
|
||||
" 'A._Frauenrath_Landschaftsbau_GmbH_&_Co._KG..xml',\n",
|
||||
" 'A._Gradmann_GmbH_&_Co._KG.xml',\n",
|
||||
" 'A._Hanhart_GmbH_&_Co._KG.xml',\n",
|
||||
" 'A._Hüglin_-_Putz_und_Stuck_-_Gesellschaft_mit_beschränkter_Haftung.xml',\n",
|
||||
" 'A._Illmann_Zahntechnik_GmbH.xml',\n",
|
||||
" 'A._Junghanns_Automatisierungs_GmbH.xml',\n",
|
||||
" 'A._Jung_GmbH_&_Co._KG.xml',\n",
|
||||
" 'A._Kolbinger_GmbH_Versicherungs-Makler.xml',\n",
|
||||
" 'A._Kolckmann,_Weberei_und_Kunststoffbeschichtungen_GmbH.xml',\n",
|
||||
" 'A._Kolckmann_GmbH_&_Co._KG.xml',\n",
|
||||
" 'A._Kuhner_GmbH.xml',\n",
|
||||
" 'A._Lipp_GmbH.xml',\n",
|
||||
" 'A._Müller_Geschäftsführungs-_GmbH.xml',\n",
|
||||
" 'A._Müller_GmbH_&_Co._KG.xml',\n",
|
||||
" 'A._Nassal_GmbH.xml',\n",
|
||||
" 'A._Oster_e.K..xml',\n",
|
||||
" 'A._Pfeiffer_Zimmerei_GmbH.xml',\n",
|
||||
" 'A._Pfingsten_KG.xml',\n",
|
||||
" 'A._Pullmann_GmbH.xml',\n",
|
||||
" 'A._Randecker_Wirtschafts-_und_Steuerberatungsgesellschaft_mbH.xml',\n",
|
||||
" 'A._Reinhard_GmbH.xml',\n",
|
||||
" 'A._Ritter_GmbH.xml',\n",
|
||||
" 'A._Sabadinowitsch_Verwaltung_GmbH.xml',\n",
|
||||
" 'A._Sluka-Verwaltungsgesellschaft_mit_beschränkter_Haftung.xml',\n",
|
||||
" 'A._Sommer_Finanzdienstleistungsvermittlung_e.K..xml',\n",
|
||||
" 'A._Sorg_GmbH_&_Co._KG.xml',\n",
|
||||
" 'A._u._G_Sexton_GmbH.xml',\n",
|
||||
" 'A._Umminger_LUM-Air,_Elektro-_und_Filtertechnik_GmbH.xml',\n",
|
||||
" 'A._Wankmüller_GmbH_&_Co._KG.xml',\n",
|
||||
" 'A._Ziemann_Gesellschaft_mit_beschränkter_Haftung.xml',\n",
|
||||
" 'A._Zwisler_e.K..xml',\n",
|
||||
" 'A_&_A_Consulting_GmbH.xml',\n",
|
||||
" 'A_&_A_Gipserbetrieb_GmbH.xml',\n",
|
||||
" 'a_&_b_Beteiligungs-GmbH.xml',\n",
|
||||
" 'A_&_B_Gastronomie-Betriebe_GmbH.xml',\n",
|
||||
" 'A_&_C_Aqua_&_Care_Limited.xml',\n",
|
||||
" 'A_&_F_Lori_GmbH.xml',\n",
|
||||
" 'A_&_L_Engineering_Service_GmbH.xml',\n",
|
||||
" 'A_&_M_Stanzformzubehör_Olaf_Abendroth_GmbH.xml',\n",
|
||||
" 'A_&_O_Grundstücksverwaltungs_GmbH_&_Co._KG.xml',\n",
|
||||
" 'A_&_R_Textilproduktion_GmbH.xml',\n",
|
||||
" 'A_&_S_Bäder_GmbH_&_Co..xml',\n",
|
||||
" 'A_&_S_Vermögensverwaltungs_GmbH.xml',\n",
|
||||
" 'A_&_T_Roth_GmbH.xml',\n",
|
||||
" 'A_+_A_Aalsmeer_Blumen_-_Bräutigam_E._Kfr.,_Inh._Manuela_Bräutigam.xml',\n",
|
||||
" 'a_+_b_Wohnbau_GmbH.xml',\n",
|
||||
" 'A_+_H_Bauträger-_und_Verwaltungsgesellschaft_mit_beschränkter_Haftung.xml',\n",
|
||||
" 'A_+_M_Verwaltungs-GmbH.xml',\n",
|
||||
" 'A_+_P_Baumaschinen_GmbH_&_Co._KG.xml',\n",
|
||||
" 'A_+_R_Baumaschinen_-_Mietpark_+_-Vertriebs-GmbH.xml',\n",
|
||||
" 'A_+_S_Tierbedarf_GmbH.xml',\n",
|
||||
" 'A_+_Te_Stabil-Bau_GmbH.xml',\n",
|
||||
" 'A_+_W._Sahm_Bedachungs-GmbH.xml',\n",
|
||||
" 'a_-_Vermögensverwaltungs-GmbH_&_Co._KG.xml',\n",
|
||||
" 'A_-_Z_Kreditvermittlungs-Gesellschaft_mit_beschränkter_Haftung.xml',\n",
|
||||
" 'A_2000_Industrie-Elektronik_GmbH.xml',\n",
|
||||
" 'A_bis_Z_Verwaltungs_GmbH.xml',\n",
|
||||
" 'A_B_A_S_A_GmbH_Organisations_-_Planungsbüro_für_den_Innenausbau.xml',\n",
|
||||
" 'A_B_U_-_GmbH_Altlasten_Bauökologie_Umweltmanagement.xml',\n",
|
||||
" 'A_F_Fussbodentechnik_GmbH.xml',\n",
|
||||
" 'A_L_T_E_C_GmbH.xml',\n",
|
||||
" 'A_l_u_f_o_r_m_Alucobondverarbeitungs-GmbH.xml',\n",
|
||||
" 'A_L_Z_Auto_Licht_und_Zündung_Service_Gesellschaft_mit_beschränkter_Haftung.xml',\n",
|
||||
" 'a_m_friseure_GmbH_Karlsruhe.xml',\n",
|
||||
" 'a_m_friseure_GmbH_Koblenz.xml',\n",
|
||||
" 'a_priori_GmbH.xml',\n",
|
||||
" 'a_s_k_-_Kunststoffe_GmbH.xml',\n",
|
||||
" 'A_S_TRUCKS_e.K..xml',\n",
|
||||
" 'A_S_Y_S_Automatic_Systems_Beteiligungs-GmbH.xml',\n",
|
||||
" 'A_u_c_h_Gesellschaft_mit_beschränkter_Haftung.xml',\n",
|
||||
" 'export',\n",
|
||||
" 'registerdocument-2023-06-11-12-41-30 (1).xml',\n",
|
||||
" 'registerdocument-2023-06-11-12-41-30.xml',\n",
|
||||
" 'registerdocument-2023-06-11-12-52-33.xml',\n",
|
||||
" 'registerdocument-2023-06-11-12-52-41.xml']"
|
||||
]
|
||||
},
|
||||
"execution_count": 119,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"files = os.listdir(\"./data/Unternehmensregister\")\n",
|
||||
"files"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 135,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import xmltodict\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def transform_xml_to_json(source_dir: str, target_dir: str):\n",
|
||||
" for file in glob.glob1(source_dir, \"*.xml\"):\n",
|
||||
" source_path = os.path.join(source_dir, file)\n",
|
||||
" target_path = os.path.join(target_dir, file.replace(\".xml\", \".json\"))\n",
|
||||
"\n",
|
||||
" with open(source_path, \"r\", encoding=\"utf-8\") as source_file:\n",
|
||||
" data = xmltodict.parse(source_file.read().encode())\n",
|
||||
" with open(target_path, \"w\", encoding=\"utf-8\") as json_file:\n",
|
||||
" json_file.write(json.dumps(data))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"transform_xml_to_json(\"./data/Unternehmensregister/\", \"./data/Unternehmensregister/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for file in glob.glob1(\"./data/Unternehmensregister/\", \"*.json\"):\n",
|
||||
" path = os.path.join(\"./data/Unternehmensregister/\", file)\n",
|
||||
" with open(path, \"r\", encoding=\"utf-8\") as file_object:\n",
|
||||
" data = json.loads(file_object.read())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def parse_stakeholder(data: dict) -> list:\n",
|
||||
" if \"Natuerliche_Person\" in data[\"Beteiligter\"]:\n",
|
||||
" return {\n",
|
||||
" \"name\": {\n",
|
||||
" \"firstname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Voller_Name\"][\n",
|
||||
" \"Vorname\"\n",
|
||||
" ],\n",
|
||||
" \"lastname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Voller_Name\"][\n",
|
||||
" \"Nachname\"\n",
|
||||
" ],\n",
|
||||
" },\n",
|
||||
" \"date_of_birth\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Geburt\"][\n",
|
||||
" \"Geburtsdatum\"\n",
|
||||
" ]\n",
|
||||
" if \"Geburt\" in data[\"Beteiligter\"][\"Natuerliche_Person\"]\n",
|
||||
" else None,\n",
|
||||
" \"location\": {\n",
|
||||
" \"city\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Anschrift\"][\"Ort\"]\n",
|
||||
" },\n",
|
||||
" \"role\": data[\"Rolle\"][\"Rollenbezeichnung\"][\"content\"],\n",
|
||||
" }\n",
|
||||
" if \"Organisation\" in data[\"Beteiligter\"]:\n",
|
||||
" return {\n",
|
||||
" \"role\": \"Organisation\",\n",
|
||||
" \"description\": data[\"Beteiligter\"][\"Organisation\"][\"Bezeichnung\"][\n",
|
||||
" \"Bezeichnung_Aktuell\"\n",
|
||||
" ],\n",
|
||||
" \"location\": {\n",
|
||||
" \"city\": data[\"Beteiligter\"][\"Organisation\"][\"Anschrift\"][\"Ort\"],\n",
|
||||
" \"street\": data[\"Beteiligter\"][\"Organisation\"][\"Anschrift\"][\"Strasse\"]\n",
|
||||
" if \"Strasse\" in data[\"Beteiligter\"][\"Organisation\"][\"Anschrift\"]\n",
|
||||
" else None,\n",
|
||||
" \"house_number\": data[\"Beteiligter\"][\"Organisation\"][\"Anschrift\"][\n",
|
||||
" \"Hausnummer\"\n",
|
||||
" ]\n",
|
||||
" if \"Hausnummer\" in data[\"Beteiligter\"][\"Organisation\"][\"Anschrift\"]\n",
|
||||
" else None,\n",
|
||||
" \"zip_code\": data[\"Beteiligter\"][\"Organisation\"][\"Anschrift\"][\n",
|
||||
" \"Postleitzahl\"\n",
|
||||
" ],\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def map_unternehmensregister_json(data: dict) -> dict:\n",
|
||||
" result = {\"base_info\": None, \"relationships\": []}\n",
|
||||
"\n",
|
||||
" base_info = {\n",
|
||||
" \"company_name\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
||||
" \"Basisdaten_Register\"\n",
|
||||
" ][\"Rechtstraeger\"][\"Bezeichnung\"][\"Bezeichnung_Aktuell\"],\n",
|
||||
" \"location\": {\n",
|
||||
" \"city\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\"Basisdaten_Register\"][\n",
|
||||
" \"Rechtstraeger\"\n",
|
||||
" ][\"Anschrift\"][\"Ort\"],\n",
|
||||
" \"zip_code\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
||||
" \"Basisdaten_Register\"\n",
|
||||
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Postleitzahl\"],\n",
|
||||
" \"street\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
||||
" \"Basisdaten_Register\"\n",
|
||||
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Strasse\"]\n",
|
||||
" if \"Strasse\"\n",
|
||||
" in data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\"Basisdaten_Register\"][\n",
|
||||
" \"Rechtstraeger\"\n",
|
||||
" ][\"Anschrift\"]\n",
|
||||
" else None,\n",
|
||||
" \"house_number\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
||||
" \"Basisdaten_Register\"\n",
|
||||
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Hausnummer\"]\n",
|
||||
" if \"Hausnummer\"\n",
|
||||
" in data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\"Basisdaten_Register\"][\n",
|
||||
" \"Rechtstraeger\"\n",
|
||||
" ][\"Anschrift\"]\n",
|
||||
" else None,\n",
|
||||
" },\n",
|
||||
" \"last_update\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\"Auszug\"][\n",
|
||||
" \"letzte_Eintragung\"\n",
|
||||
" ],\n",
|
||||
" }\n",
|
||||
" result[\"base_info\"] = base_info\n",
|
||||
" for i in range(\n",
|
||||
" len(data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"])\n",
|
||||
" ):\n",
|
||||
" people = parse_stakeholder(\n",
|
||||
" data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"][i]\n",
|
||||
" )\n",
|
||||
" result[\"relationships\"].append(people)\n",
|
||||
" return result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{}\n",
|
||||
"{}\n",
|
||||
"{'name': {'firstname': 'Reinhard', 'lastname': 'Gebing'}, 'date_of_birth': '1964-04-26', 'location': {'city': 'Oelde'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Markus', 'lastname': 'Kreft'}, 'date_of_birth': '1966-04-03', 'location': {'city': 'Wetter'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Kai', 'lastname': 'Luntz'}, 'date_of_birth': '1970-12-04', 'location': {'city': 'Holzminden'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Thomas', 'lastname': 'Mader'}, 'date_of_birth': '1972-05-24', 'location': {'city': 'Rheda-Wiedenbrück'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Peter', 'lastname': 'Lauwers'}, 'date_of_birth': '1970-03-26', 'location': {'city': 'Düsseldorf'}, 'role': 'Geschäftsführer(in)'}\n",
|
||||
"{'name': {'firstname': 'Erkul', 'lastname': 'Basaran'}, 'date_of_birth': '1977-05-06', 'location': {'city': 'Erkrath'}, 'role': 'Geschäftsführer(in)'}\n",
|
||||
"{'name': {'firstname': 'Katja', 'lastname': 'Voß'}, 'date_of_birth': '1978-02-24', 'location': {'city': 'Rheda-Wiedenbrück'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Henrik', 'lastname': 'Böttner'}, 'date_of_birth': '1982-11-07', 'location': {'city': 'Bochum'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Ulrich', 'lastname': 'Raßenhövel'}, 'date_of_birth': '1969-04-16', 'location': {'city': 'Oelde'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Andreas', 'lastname': 'Naroska'}, 'date_of_birth': '1967-03-23', 'location': {'city': 'Herdecke'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Mark', 'lastname': 'Kramps'}, 'date_of_birth': '1967-09-04', 'location': {'city': 'Witten'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Ralf', 'lastname': 'Barkmeyer'}, 'date_of_birth': '1974-02-28', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Holger', 'lastname': 'Siegwarth'}, 'date_of_birth': '1967-05-13', 'location': {'city': 'Tönnisvorst'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Oliver', 'lastname': 'Liß'}, 'date_of_birth': '1981-04-13', 'location': {'city': 'Herne'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Liang', 'lastname': 'Cheng'}, 'date_of_birth': '1980-12-29', 'location': {'city': 'Göppingen'}, 'role': 'Geschäftsführer(in)'}\n",
|
||||
"{'name': {'firstname': 'Astrid', 'lastname': 'Dörner-Rodeheger'}, 'date_of_birth': '1968-12-24', 'location': {'city': 'Beckum'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Jon', 'lastname': 'Lange'}, 'date_of_birth': '1978-04-25', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Matthias', 'lastname': 'Peters'}, 'date_of_birth': '1973-08-28', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Ralf', 'lastname': 'Frombach'}, 'date_of_birth': '1977-01-25', 'location': {'city': 'Werne'}, 'role': 'Prokurist(in)'}\n",
|
||||
"{'name': {'firstname': 'Sven', 'lastname': 'Hommel'}, 'date_of_birth': '1979-04-22', 'location': {'city': 'Berlin'}, 'role': 'Prokurist(in)'}\n"
|
||||
"A&A Amini Art GmbH\n",
|
||||
"A&A Immo GmbH\n",
|
||||
"A&P AUDITING GmbH Wirtschaftsprüfungsgesellschaft\n",
|
||||
"A&QUA gemeinnützige Gesellschaft für Arbeit u. Qualifizierung mbH\n",
|
||||
"a+b Asphalt- und Betonmischwerke GmbH & Co. KG\n",
|
||||
"a+b Verwaltungsgesellschaft mbH\n",
|
||||
"A+E Beteiligungs- und Handels-GmbH\n",
|
||||
"A+W Systemhaus GmbH\n",
|
||||
"A-S-D Kfz-Teile-Handel GmbH\n",
|
||||
"A-TEAM Industrielles Roboterschweißen GmbH\n",
|
||||
"A.C.C. Funk Taxi & Minicar e.K.\n",
|
||||
"a.c.k. aqua concept GmbH Karlsruhe\n",
|
||||
"A.C. Weiss GmbH & Co. KG\n",
|
||||
"A.D.S. OHG\n",
|
||||
"A.D. Glas- und Gebäudereinigung e.K.\n",
|
||||
"A.E. Z-Line Taxi - und Shuttle-Service e.K.\n",
|
||||
"A.F.Z. Automatisierung, Fördern, Zuführen GmbH\n",
|
||||
"A.G. Zentral Michael Greising e.K.\n",
|
||||
"A.H. Steuerberatungsgesellschaft mbH\n",
|
||||
"A.I.V. SERVICES GmbH\n",
|
||||
"A.I. Kommanditist-Gesellschaft mbH\n",
|
||||
"A.KIein Immobilien KG\n",
|
||||
"A.L.G. Christian Schmelzer\n",
|
||||
"A.L.S. Architektonische Licht-Systeme GmbH\n",
|
||||
"A.M.G. Motorenbau Hans Werner Aufrecht\n",
|
||||
"A.M.P. Athos GmbH\n",
|
||||
"A.N. Gartenbau GmbH\n",
|
||||
"A.Q.U.A Services KG\n",
|
||||
"A.R.S. GmbH Süd, Alt und Reststoffverwertung\n",
|
||||
"A.S.G. Industrielackierungen GmbH\n",
|
||||
"A.S.S. bikes and parts GmbH\n",
|
||||
"A.S. Baubedarfvermittlung Gesellschaft mit beschränkter Haftung\n",
|
||||
"A.T.C. Automotive GmbH\n",
|
||||
"A. & S. Aigner und Schulz GmbH\n",
|
||||
"A. + H. Weier GmbH\n",
|
||||
"A. + K. Hertkorn OHG Möbel - Innenausbau\n",
|
||||
"A. Abele GmbH\n",
|
||||
"A. Baur Mineralöl-Abfertigungsspedition GmbH\n",
|
||||
"A. Blum GmbH\n",
|
||||
"A. Both GmbH\n",
|
||||
"A. Both GmbH & Co. KG Werkzeugtechnik CNC Maschinenausrüstung\n",
|
||||
"A. DINKIC GMBH\n",
|
||||
"A. Elsbecker GmbH\n",
|
||||
"A. Erglis GmbH\n",
|
||||
"A. Frauenrath Landschaftsbau GmbH & Co. KG.\n",
|
||||
"A. Gradmann GmbH & Co. KG\n",
|
||||
"A. Hanhart GmbH & Co. KG\n",
|
||||
"A. Hüglin - Putz und Stuck - Gesellschaft mit beschränkter Haftung\n",
|
||||
"A. Illmann Zahntechnik GmbH\n",
|
||||
"A. Junghanns Automatisierungs GmbH\n",
|
||||
"A. Jung GmbH & Co.KG\n",
|
||||
"A. Kolbinger GmbH Versicherungs-Makler\n",
|
||||
"A. Kolckmann, Weberei und Kunststoffbeschichtungen GmbH\n",
|
||||
"A. Kolckmann GmbH & Co. KG\n",
|
||||
"A. Kuhner GmbH\n",
|
||||
"A. Lipp GmbH\n",
|
||||
"A. Müller Geschäftsführungs- GmbH\n",
|
||||
"A. Müller GmbH & Co. KG\n",
|
||||
"A. Nassal GmbH\n",
|
||||
"A. Oster e.K.\n",
|
||||
"A. Pfeiffer Zimmerei GmbH\n",
|
||||
"A. Pfingsten KG\n",
|
||||
"A. Pullmann GmbH\n",
|
||||
"A. Randecker Wirtschafts- und Steuerberatungsgesellschaft mbH\n",
|
||||
"A. Reinhard GmbH\n",
|
||||
"A. Ritter GmbH\n",
|
||||
"A. Sabadinowitsch Verwaltung GmbH\n",
|
||||
"A. Sluka-Verwaltungsgesellschaft mit beschränkter Haftung\n",
|
||||
"A. Sommer Finanzdienstleistungsvermittlung e.K.\n",
|
||||
"A. Sorg GmbH & Co. KG\n",
|
||||
"A. u. G Sexton GmbH\n",
|
||||
"A. Umminger LUM-Air, Elektro- und Filtertechnik GmbH\n",
|
||||
"A. Wankmüller GmbH & Co. KG\n",
|
||||
"A. Ziemann Gesellschaft mit beschränkter Haftung\n",
|
||||
"A. Zwisler e.K.\n",
|
||||
"A & A Consulting GmbH\n",
|
||||
"A & A Gipserbetrieb GmbH\n",
|
||||
"a & b Beteiligungs-GmbH\n",
|
||||
"A & B Gastronomie-Betriebe GmbH\n",
|
||||
"A & C Aqua & Care Limited\n",
|
||||
"A & F Lori GmbH\n",
|
||||
"A & L Engineering Service GmbH\n",
|
||||
"A & M Stanzformzubehör Olaf Abendroth GmbH\n",
|
||||
"A & O Grundstücksverwaltungs GmbH & Co. KG\n",
|
||||
"A & R Textilproduktion GmbH\n",
|
||||
"A & S Bäder GmbH & Co.\n",
|
||||
"A & S Vermögensverwaltungs GmbH\n",
|
||||
"A & T Roth GmbH\n",
|
||||
"A + A Aalsmeer Blumen - Bräutigam E. Kfr., Inh. Manuela Bräutigam\n",
|
||||
"a + b Wohnbau GmbH\n",
|
||||
"A + H Bauträger- und Verwaltungsgesellschaft mit beschränkter Haftung\n",
|
||||
"A + M Verwaltungs-GmbH\n",
|
||||
"A + P Baumaschinen GmbH & Co. KG\n",
|
||||
"A + R Baumaschinen - Mietpark + -Vertriebs-GmbH\n",
|
||||
"A + S Tierbedarf GmbH\n",
|
||||
"A + Te Stabil-Bau GmbH\n",
|
||||
"A + W. Sahm Bedachungs-GmbH\n",
|
||||
"a - Vermögensverwaltungs-GmbH & Co. KG\n",
|
||||
"A - Z Kreditvermittlungs-Gesellschaft mit beschränkter Haftung\n",
|
||||
"A 2000 Industrie-Elektronik GmbH\n",
|
||||
"A bis Z Verwaltungs GmbH\n",
|
||||
"A B A S A GmbH Organisations - Planungsbüro für den Innenausbau\n",
|
||||
"A B U - GmbH Altlasten Bauökologie Umweltmanagement\n",
|
||||
"A F Fussbodentechnik GmbH\n",
|
||||
"A L T E C GmbH\n",
|
||||
"A l u f o r m Alucobondverarbeitungs-GmbH\n",
|
||||
"A L Z Auto Licht und Zündung Service Gesellschaft mit beschränkter Haftung\n",
|
||||
"a/m friseure GmbH Karlsruhe\n",
|
||||
"a/m friseure GmbH Koblenz\n",
|
||||
"a priori GmbH\n",
|
||||
"a s k - Kunststoffe GmbH\n",
|
||||
"A/S TRUCKS e.K.\n",
|
||||
"A S Y S Automatic Systems Beteiligungs-GmbH\n",
|
||||
"A u c h Gesellschaft mit beschränkter Haftung\n",
|
||||
"a s k - Kunststoffe GmbH\n",
|
||||
"A. Maier GmbH & Co. KG\n",
|
||||
"\"A/D/L/E/R Steuerberatungsgesellschaft mbH\"\n",
|
||||
"a | m | | medienservice e. k.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import xmltodict\n",
|
||||
"\n",
|
||||
"for file in files:\n",
|
||||
" with open(\"./data/Unternehmensregister/\" + file, \"r\", encoding=\"utf-8\") as xml_file:\n",
|
||||
" data = xmltodict.parse(xml_file.read())\n",
|
||||
" with open(\"./data/temp.json\", \"w\", encoding=\"utf-8\") as json_file:\n",
|
||||
" json_file.write(json.dumps(data))\n",
|
||||
"for file in glob.glob1(\"./data/Unternehmensregister/\", \"*.json\"):\n",
|
||||
" path = os.path.join(\"./data/Unternehmensregister/\", file)\n",
|
||||
" with open(path, \"r\", encoding=\"utf-8\") as file_object:\n",
|
||||
" data = json.loads(file_object.read())\n",
|
||||
"\n",
|
||||
" keys = dict.keys(data[\"XJustiz_Daten\"][\"Grunddaten\"])\n",
|
||||
" base_info = {\n",
|
||||
" \"company_name\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
||||
" \"Basisdaten_Register\"\n",
|
||||
" ][\"Rechtstraeger\"][\"Bezeichnung\"][\"Bezeichnung_Aktuell\"],\n",
|
||||
" \"location\": {\n",
|
||||
" \"city\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
||||
" \"Basisdaten_Register\"\n",
|
||||
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Ort\"],\n",
|
||||
" \"zip_code\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
||||
" \"Basisdaten_Register\"\n",
|
||||
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Postleitzahl\"],\n",
|
||||
" \"street\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
||||
" \"Basisdaten_Register\"\n",
|
||||
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Strasse\"],\n",
|
||||
" \"house_number\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
||||
" \"Basisdaten_Register\"\n",
|
||||
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Hausnummer\"],\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
" result = map_unternehmensregister_json(data)\n",
|
||||
" print(result[\"base_info\"][\"company_name\"])\n",
|
||||
"\n",
|
||||
" def parse_stakeholder(data: dict) -> list:\n",
|
||||
" if \"Natuerliche_Person\" in data[\"Beteiligter\"]:\n",
|
||||
" return {\n",
|
||||
" \"name\": {\n",
|
||||
" \"firstname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n",
|
||||
" \"Voller_Name\"\n",
|
||||
" ][\"Vorname\"],\n",
|
||||
" \"lastname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n",
|
||||
" \"Voller_Name\"\n",
|
||||
" ][\"Nachname\"],\n",
|
||||
" },\n",
|
||||
" \"date_of_birth\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n",
|
||||
" \"Geburt\"\n",
|
||||
" ][\"Geburtsdatum\"],\n",
|
||||
" \"location\": {\n",
|
||||
" \"city\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Anschrift\"][\n",
|
||||
" \"Ort\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" \"role\": data[\"Rolle\"][\"Rollenbezeichnung\"][\"content\"],\n",
|
||||
" }\n",
|
||||
" return {}\n",
|
||||
"\n",
|
||||
" for i in range(\n",
|
||||
" len(data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"])\n",
|
||||
" ):\n",
|
||||
" people = parse_stakeholder(\n",
|
||||
" data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"][i]\n",
|
||||
" )\n",
|
||||
" print(people)\n",
|
||||
" break"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pdf2image import convert_from_path\n",
|
||||
"\n",
|
||||
"pdfs = r\"./data/test.pdf\"\n",
|
||||
"pages = convert_from_path(pdfs, 350)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"for i, page in enumerate(pages):\n",
|
||||
" image_name = f\"./data/Page_{i+1}.jpg\"\n",
|
||||
" page.save(image_name, \"JPEG\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Handelsregister B des Abteilung B Nummer der Firma:\n",
|
||||
"Amtsgerichts Hamm Wiedergabe des aktuellen HRB 5363\n",
|
||||
"Registerinhalts\n",
|
||||
"Abruf vom 07.06.2023 19:37\n",
|
||||
"1. Anzahl der bisherigen Eintragungen:\n",
|
||||
"51\n",
|
||||
"2. a) Firma:\n",
|
||||
"GEA Farm Technologies GmbH\n",
|
||||
"b) Sitz, Niederlassung, inländische Geschäftsanschrift, empfangsberechtigte Person, Zweigniederlassungen:\n",
|
||||
"Bönen\n",
|
||||
"Geschäftsanschrift: Siemensstraße 25-27, 59199 Bönen\n",
|
||||
"c) Gegenstand des Unternehmens:\n",
|
||||
"Entwicklung, Herstellung und der Vertrieb von Landtechnik, insbesondere von Komponenten und Anlagen\n",
|
||||
"(a) zur Gewinnung, Kühlung, Behandlung und Lagerung von Milch;\n",
|
||||
"(b) für das Milchvieh-Herdenmanagement;\n",
|
||||
"(c) zur Tierhygiene und Sicherung der Milchqualität und\n",
|
||||
"(d) zur Aufstallung von Tieren;\n",
|
||||
"sowie die Herstellung und der Vertrieb von Anlagen und Fahrzeugen zur Aufbereitung und zum Transport von Gülle.\n",
|
||||
"3. Grund- oder Stammkapital:\n",
|
||||
"5.115.000,00 EUR\n",
|
||||
"4. a) Allgemeine Vertretungsregelung:\n",
|
||||
"Ist nur ein Geschäftsführer bestellt, so vertritt er die Gesellschaft allein. Sind mehrere Geschäftsführer bestellt, so wird die\n",
|
||||
"Gesellschaft durch zwei Geschäftsführer oder durch einen Geschäftsführer gemeinsam mit einem Prokuristen vertreten.\n",
|
||||
"b) Vorstand, Leitungsorgan, geschäftsführende Direktoren, persönlich haftender Gesellschafter, Geschäftsführer,\n",
|
||||
"Vertretungsberechtigte und besondere Vertretungsbefugnis:\n",
|
||||
"Geschäftsführer: Basaran, Erkul, Erkrath, *06.05.1977\n",
|
||||
"Geschäftsführer: Cheng, Liang, Göppingen, *29.12.1980\n",
|
||||
"Geschäftsführer: Lauwers, Peter, Düsseldorf, *26.03.1970\n",
|
||||
"5. Prokura:\n",
|
||||
"Gesamtprokura gemeinsam mit einem Geschäftsführer oder einem anderen Prokuristen:\n",
|
||||
"Barkmeyer, Ralf, Dortmund, *28.02.1974\n",
|
||||
"Böttner, Henrik, Bochum, *07.11.1982\n",
|
||||
"Dörner-Rodeheger, Astrid, Beckum, *24.12.1968\n",
|
||||
"Frombach, Ralf, Werne, *25.01.1977\n",
|
||||
"Gebing, Reinhard, Oelde, *26.04.1964\n",
|
||||
"Hommel, Sven, Berlin, *22.04.1979\n",
|
||||
"Kramps, Mark, Witten, *04.09.1967\n",
|
||||
"Kreft, Markus, Wetter, *03.04.1966\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import cv2\n",
|
||||
"import pytesseract\n",
|
||||
"\n",
|
||||
"image_path = \"./data/Page_1.jpg\"\n",
|
||||
"image = cv2.imread(image_path)\n",
|
||||
"\n",
|
||||
"text = str(pytesseract.image_to_string(image, config=\"--psm 6\", lang=\"deu\"))\n",
|
||||
"print(text)\n",
|
||||
"with open(\"./data/Page_1.txt\", \"w\", encoding=\"utf-8\") as output_file:\n",
|
||||
" output_file.write(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['Geschäftsführer: Basaran, Erkul', 'Geschäftsführer: Cheng, Liang', 'Geschäftsführer: Lauwers, Peter']\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Erkul Basaran', 'Liang Cheng', 'Peter Lauwers']"
|
||||
]
|
||||
},
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_managing_directors(text: str) -> list:\n",
|
||||
" managing_directors_regex = r\"Geschäftsführer: [a-zA-ZäöüÄÖÜ]*, [a-zA-ZäöüÄÖÜ]*\"\n",
|
||||
" hits = re.findall(managing_directors_regex, text)\n",
|
||||
" print(hits)\n",
|
||||
" return [\n",
|
||||
" \" \".join(hit.replace(\"Geschäftsführer: \", \"\").replace(\",\", \"\").split(\" \")[::-1])\n",
|
||||
" for hit in hits\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"get_managing_directors(text)"
|
||||
" name = (\n",
|
||||
" result[\"base_info\"][\"company_name\"]\n",
|
||||
" .replace(\" \", \"_\")\n",
|
||||
" .replace(\"/\", \"_\")\n",
|
||||
" .replace('\"', \"\")\n",
|
||||
" .replace(\"|\", \"_\")\n",
|
||||
" )\n",
|
||||
" with open(\n",
|
||||
" f\"./data/Unternehmensregister/export/{name}.json\", \"w+\", encoding=\"utf-8\"\n",
|
||||
" ) as export_file:\n",
|
||||
" json.dump(result, export_file, ensure_ascii=False)"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -5,3 +5,4 @@ pdf2image
|
||||
bs4
|
||||
selenium
|
||||
xmltodict
|
||||
tqdm
|
Loading…
x
Reference in New Issue
Block a user