Bulk process Unternehmensregister .xmls

This commit is contained in:
TrisNol 2023-06-11 13:11:44 +02:00
parent 1010b43a5f
commit 058c16b3ff
2 changed files with 494 additions and 213 deletions

View File

@ -18,7 +18,39 @@
},
{
"cell_type": "code",
"execution_count": 118,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import glob"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def wait_for_download_condition(\n",
"    path: str, num_files: int, pattern: str = \"*.xml\"\n",
") -> bool:\n",
"    \"\"\"Report whether an additional file matching ``pattern`` exists in ``path``.\n",
"\n",
"    Args:\n",
"        path: Directory that is searched (non-recursively).\n",
"        num_files: Number of matching files counted before the download started.\n",
"        pattern: Glob pattern the file names have to match.\n",
"\n",
"    Returns:\n",
"        True once more than ``num_files`` entries in ``path`` match ``pattern``.\n",
"    \"\"\"\n",
"    # glob.glob1 is an undocumented helper that is deprecated since Python 3.13;\n",
"    # glob.glob(..., root_dir=...) is the documented way to get bare file names.\n",
"    return len(glob.glob(pattern, root_dir=path)) > num_files\n",
"\n",
"\n",
"def get_num_files(path: str, pattern: str = \"*.xml\") -> int:\n",
"    \"\"\"Count the entries directly inside ``path`` whose name matches ``pattern``.\n",
"\n",
"    Args:\n",
"        path: Directory that is searched (non-recursively).\n",
"        pattern: Glob pattern the file names have to match.\n",
"\n",
"    Returns:\n",
"        Number of matching entries; 0 if nothing matches.\n",
"    \"\"\"\n",
"    # glob.glob1 is an undocumented helper that is deprecated since Python 3.13;\n",
"    # glob.glob(..., root_dir=...) is the documented way to get bare file names.\n",
"    return len(glob.glob(pattern, root_dir=path))\n",
"\n",
"\n",
"def rename_latest_file(path: str, filename: str, pattern: str = \"*.xml\"):\n",
"    \"\"\"Rename the newest file in ``path`` that matches ``pattern`` to ``filename``.\n",
"\n",
"    \"Newest\" is the file with the highest ``os.path.getctime`` value.\n",
"\n",
"    Args:\n",
"        path: Directory that holds the downloaded files.\n",
"        filename: New file name (without directory) for the newest file.\n",
"        pattern: Glob pattern used to select the candidate files.\n",
"\n",
"    Raises:\n",
"        FileNotFoundError: If no file in ``path`` matches ``pattern``.\n",
"    \"\"\"\n",
"    # glob.glob1 is an undocumented helper that is deprecated since Python 3.13;\n",
"    # glob.glob(..., root_dir=...) is the documented way to get bare file names.\n",
"    candidates = [\n",
"        os.path.join(path, name) for name in glob.glob(pattern, root_dir=path)\n",
"    ]\n",
"    if not candidates:\n",
"        # max() on an empty list would only raise an opaque ValueError\n",
"        raise FileNotFoundError(f\"No file matching {pattern!r} found in {path!r}\")\n",
"    latest_download = max(candidates, key=os.path.getctime)\n",
"    # os.replace overwrites an existing target on every platform, whereas\n",
"    # os.rename raises FileExistsError on Windows and strands the download.\n",
"    os.replace(latest_download, os.path.join(path, filename))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -27,10 +59,36 @@
"text": [
"c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\notebook.ipynb Cell 5\u001b[0m in \u001b[0;36m3\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=26'>27</a>\u001b[0m options\u001b[39m.\u001b[39madd_experimental_option(\u001b[39m\"\u001b[39m\u001b[39mprefs\u001b[39m\u001b[39m\"\u001b[39m, preferences)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=28'>29</a>\u001b[0m driver \u001b[39m=\u001b[39m webdriver\u001b[39m.\u001b[39mChrome(options\u001b[39m=\u001b[39moptions)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=30'>31</a>\u001b[0m driver\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mhttps://www.unternehmensregister.de/ureg/\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=31'>32</a>\u001b[0m \u001b[39m# Accept Cookies\u001b[39;00m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=32'>33</a>\u001b[0m driver\u001b[39m.\u001b[39mfind_elements(\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=33'>34</a>\u001b[0m By\u001b[39m.\u001b[39mXPATH, 
\u001b[39m'\u001b[39m\u001b[39m//button[text()=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mNur technisch notwendige Cookies akzeptieren\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m]\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/trist/Documents/Code/M.Sc/aki_prj23_transparenzregister/Jupyter/API-tests/Unternehmensregister/notebook.ipynb#W2sZmlsZQ%3D%3D?line=34'>35</a>\u001b[0m )[\u001b[39m0\u001b[39m]\u001b[39m.\u001b[39mclick()\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:355\u001b[0m, in \u001b[0;36mWebDriver.get\u001b[1;34m(self, url)\u001b[0m\n\u001b[0;32m 353\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget\u001b[39m(\u001b[39mself\u001b[39m, url: \u001b[39mstr\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 354\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Loads a web page in the current browser session.\"\"\"\u001b[39;00m\n\u001b[1;32m--> 355\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mexecute(Command\u001b[39m.\u001b[39;49mGET, {\u001b[39m\"\u001b[39;49m\u001b[39murl\u001b[39;49m\u001b[39m\"\u001b[39;49m: url})\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:344\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m params:\n\u001b[0;32m 342\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msession_id\n\u001b[1;32m--> 344\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcommand_executor\u001b[39m.\u001b[39;49mexecute(driver_command, params)\n\u001b[0;32m 345\u001b[0m \u001b[39mif\u001b[39;00m response:\n\u001b[0;32m 346\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror_handler\u001b[39m.\u001b[39mcheck_response(response)\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:290\u001b[0m, in \u001b[0;36mRemoteConnection.execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 288\u001b[0m data \u001b[39m=\u001b[39m utils\u001b[39m.\u001b[39mdump_json(params)\n\u001b[0;32m 289\u001b[0m url \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_url\u001b[39m}\u001b[39;00m\u001b[39m{\u001b[39;00mpath\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m--> 290\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_request(command_info[\u001b[39m0\u001b[39;49m], url, body\u001b[39m=\u001b[39;49mdata)\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:311\u001b[0m, in \u001b[0;36mRemoteConnection._request\u001b[1;34m(self, method, url, body)\u001b[0m\n\u001b[0;32m 308\u001b[0m body \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 310\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mkeep_alive:\n\u001b[1;32m--> 311\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_conn\u001b[39m.\u001b[39;49mrequest(method, url, body\u001b[39m=\u001b[39;49mbody, headers\u001b[39m=\u001b[39;49mheaders)\n\u001b[0;32m 312\u001b[0m statuscode \u001b[39m=\u001b[39m response\u001b[39m.\u001b[39mstatus\n\u001b[0;32m 313\u001b[0m \u001b[39melse\u001b[39;00m:\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:78\u001b[0m, in \u001b[0;36mRequestMethods.request\u001b[1;34m(self, method, url, fields, headers, **urlopen_kw)\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_url(\n\u001b[0;32m 75\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 76\u001b[0m )\n\u001b[0;32m 77\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m---> 78\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_body(\n\u001b[0;32m 79\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 80\u001b[0m )\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:170\u001b[0m, in \u001b[0;36mRequestMethods.request_encode_body\u001b[1;34m(self, method, url, fields, headers, encode_multipart, multipart_boundary, **urlopen_kw)\u001b[0m\n\u001b[0;32m 167\u001b[0m extra_kw[\u001b[39m\"\u001b[39m\u001b[39mheaders\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mupdate(headers)\n\u001b[0;32m 168\u001b[0m extra_kw\u001b[39m.\u001b[39mupdate(urlopen_kw)\n\u001b[1;32m--> 170\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mextra_kw)\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\poolmanager.py:376\u001b[0m, in \u001b[0;36mPoolManager.urlopen\u001b[1;34m(self, method, url, redirect, **kw)\u001b[0m\n\u001b[0;32m 374\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 375\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 376\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, u\u001b[39m.\u001b[39mrequest_uri, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 378\u001b[0m redirect_location \u001b[39m=\u001b[39m redirect \u001b[39mand\u001b[39;00m response\u001b[39m.\u001b[39mget_redirect_location()\n\u001b[0;32m 379\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m redirect_location:\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:703\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m 700\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_prepare_proxy(conn)\n\u001b[0;32m 702\u001b[0m \u001b[39m# Make the request on the httplib connection object.\u001b[39;00m\n\u001b[1;32m--> 703\u001b[0m httplib_response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_request(\n\u001b[0;32m 704\u001b[0m conn,\n\u001b[0;32m 705\u001b[0m method,\n\u001b[0;32m 706\u001b[0m url,\n\u001b[0;32m 707\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout_obj,\n\u001b[0;32m 708\u001b[0m body\u001b[39m=\u001b[39;49mbody,\n\u001b[0;32m 709\u001b[0m headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[0;32m 710\u001b[0m chunked\u001b[39m=\u001b[39;49mchunked,\n\u001b[0;32m 711\u001b[0m )\n\u001b[0;32m 713\u001b[0m \u001b[39m# If we're going to release the connection in ``finally:``, then\u001b[39;00m\n\u001b[0;32m 714\u001b[0m \u001b[39m# the response doesn't need to know about the connection. Otherwise\u001b[39;00m\n\u001b[0;32m 715\u001b[0m \u001b[39m# it will also try to release it and we'll have a double-release\u001b[39;00m\n\u001b[0;32m 716\u001b[0m \u001b[39m# mess.\u001b[39;00m\n\u001b[0;32m 717\u001b[0m response_conn \u001b[39m=\u001b[39m conn \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m release_conn \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:449\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[1;32m--> 449\u001b[0m six\u001b[39m.\u001b[39;49mraise_from(e, \u001b[39mNone\u001b[39;49;00m)\n\u001b[0;32m 450\u001b[0m \u001b[39mexcept\u001b[39;00m (SocketTimeout, BaseSSLError, SocketError) \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 451\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_raise_timeout(err\u001b[39m=\u001b[39me, url\u001b[39m=\u001b[39murl, timeout_value\u001b[39m=\u001b[39mread_timeout)\n",
"File \u001b[1;32m<string>:3\u001b[0m, in \u001b[0;36mraise_from\u001b[1;34m(value, from_value)\u001b[0m\n",
"File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:444\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 441\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mTypeError\u001b[39;00m:\n\u001b[0;32m 442\u001b[0m \u001b[39m# Python 3\u001b[39;00m\n\u001b[0;32m 443\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39;49mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[0;32m 449\u001b[0m six\u001b[39m.\u001b[39mraise_from(e, \u001b[39mNone\u001b[39;00m)\n",
"File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:1374\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1372\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 1373\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m-> 1374\u001b[0m response\u001b[39m.\u001b[39;49mbegin()\n\u001b[0;32m 1375\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m:\n\u001b[0;32m 1376\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n",
"File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:318\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 316\u001b[0m \u001b[39m# read until we get a non-100 response\u001b[39;00m\n\u001b[0;32m 317\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m--> 318\u001b[0m version, status, reason \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_status()\n\u001b[0;32m 319\u001b[0m \u001b[39mif\u001b[39;00m status \u001b[39m!=\u001b[39m CONTINUE:\n\u001b[0;32m 320\u001b[0m \u001b[39mbreak\u001b[39;00m\n",
"File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:279\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 278\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_read_status\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m--> 279\u001b[0m line \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfp\u001b[39m.\u001b[39;49mreadline(_MAXLINE \u001b[39m+\u001b[39;49m \u001b[39m1\u001b[39;49m), \u001b[39m\"\u001b[39m\u001b[39miso-8859-1\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 280\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(line) \u001b[39m>\u001b[39m _MAXLINE:\n\u001b[0;32m 281\u001b[0m \u001b[39mraise\u001b[39;00m LineTooLong(\u001b[39m\"\u001b[39m\u001b[39mstatus line\u001b[39m\u001b[39m\"\u001b[39m)\n",
"File \u001b[1;32mc:\\Python310\\lib\\socket.py:705\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 703\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[0;32m 704\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 705\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv_into(b)\n\u001b[0;32m 706\u001b[0m \u001b[39mexcept\u001b[39;00m timeout:\n\u001b[0;32m 707\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_timeout_occurred \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"import os\n",
"from tqdm import tqdm\n",
"from pathlib import Path\n",
"\n",
"from selenium import webdriver\n",
@ -38,7 +96,7 @@
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"\n",
"search_query = \"GEA Farm Technologies\"\n",
"search_query = \"A*\"\n",
"\n",
"options = webdriver.ChromeOptions()\n",
"\n",
@ -55,6 +113,7 @@
" \"default_directory\": download_path,\n",
" },\n",
"}\n",
"options.add_argument(\"--headless=new\")\n",
"options.add_experimental_option(\"prefs\", preferences)\n",
"\n",
"driver = webdriver.Chrome(options=options)\n",
@ -75,16 +134,28 @@
"wait.until(\n",
" lambda driver: driver.current_url != \"https://www.unternehmensregister.de/ureg/\"\n",
")\n",
"## TODO Iterate over tabs\n",
"\n",
"num_pages = int(\n",
" driver.find_element(By.XPATH, '//*[@class=\"page_count\"]').text.split(\" \")[0]\n",
")\n",
"for page_index in range(num_pages):\n",
"\n",
"processed_companies = []\n",
"\n",
"for page_index in tqdm(range(num_pages)):\n",
" # Find all \"Registerinformationen\"\n",
" companies_tab = driver.find_elements(\n",
" By.LINK_TEXT, \"Registerinformationen des Registergerichts\"\n",
" )\n",
" for company_link in companies_tab:\n",
" company_names = [\n",
" elem.text\n",
" for elem in driver.find_elements(\n",
" By.XPATH, '//div[@class=\"company_result\"]/span/b'\n",
" )\n",
" ]\n",
" for index, company_link in enumerate(companies_tab):\n",
" company_name = company_names[index]\n",
" if company_name in processed_companies:\n",
" continue\n",
" # Go to intermediary page\n",
" company_link.click()\n",
" # Trigger next redirect\n",
@ -101,14 +172,37 @@
" elems = driver.find_elements(By.TAG_NAME, \"input\")\n",
" elems[-2].click()\n",
"\n",
" wait.until(\n",
" EC.visibility_of_element_located((By.ID, \"paymentFormOverview:btnNext\"))\n",
" )\n",
" driver.find_element(By.ID, \"paymentFormOverview:btnNext\").click()\n",
"\n",
" wait.until(\n",
" EC.visibility_of_element_located((By.LINK_TEXT, \"Zum Dokumentenkorb\"))\n",
" )\n",
" driver.find_element(By.LINK_TEXT, \"Zum Dokumentenkorb\").click()\n",
"\n",
" num_files = get_num_files(\"./data/Unternehmensregister/\")\n",
" driver.find_element(By.CLASS_NAME, \"download-wrapper\").click()\n",
"\n",
" try:\n",
" wait.until(\n",
" lambda x: wait_for_download_condition(\n",
" \"./data/Unternehmensregister/\", num_files\n",
" )\n",
" )\n",
" rename_latest_file(\n",
" \"./data/Unternehmensregister/\",\n",
" f\"{company_name.replace(' ', '_').replace('/','_')}.xml\",\n",
" )\n",
" processed_companies.append(company_name)\n",
" except:\n",
" print(f\"Could not process {company_name}\")\n",
" for i in range(6):\n",
" driver.back()\n",
" driver.find_element(By.XPATH, '//*[@class=\"fas fa-angle-right\"]').click()\n",
"driver.close()"
"driver.close()\n",
"print(processed_companies)"
]
},
{
@ -121,243 +215,429 @@
},
{
"cell_type": "code",
"execution_count": 119,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['registerdocument-2023-06-09-14-05-01.xml',\n",
" 'registerdocument-2023-06-09-14-05-03.xml']"
"['A&A_Amini_Art_GmbH.xml',\n",
" 'A&A_Immo_GmbH.xml',\n",
" 'A&P_AUDITING_GmbH_Wirtschaftsprüfungsgesellschaft.xml',\n",
" 'A&QUA_gemeinnützige_Gesellschaft_für_Arbeit_u._Qualifizierung_mbH.xml',\n",
" 'a+b_Asphalt-_und_Betonmischwerke_GmbH_&_Co._KG.xml',\n",
" 'a+b_Verwaltungsgesellschaft_mbH.xml',\n",
" 'A+E_Beteiligungs-_und_Handels-GmbH.xml',\n",
" 'A+W_Systemhaus_GmbH.xml',\n",
" 'A-S-D_Kfz-Teile-Handel_GmbH.xml',\n",
" 'A-TEAM_Industrielles_Roboterschweißen_GmbH.xml',\n",
" 'A.C.C._Funk_Taxi_&_Minicar_e.K..xml',\n",
" 'a.c.k._aqua_concept_GmbH_Karlsruhe.xml',\n",
" 'A.C._Weiss_GmbH_&_Co._KG.xml',\n",
" 'A.D.S._OHG.xml',\n",
" 'A.D._Glas-_und_Gebäudereinigung_e.K..xml',\n",
" 'A.E._Z-Line_Taxi_-_und_Shuttle-Service_e.K..xml',\n",
" 'A.F.Z._Automatisierung,_Fördern,_Zuführen_GmbH.xml',\n",
" 'A.G._Zentral_Michael_Greising_e.K..xml',\n",
" 'A.H._Steuerberatungsgesellschaft_mbH.xml',\n",
" 'A.I.V._SERVICES_GmbH.xml',\n",
" 'A.I._Kommanditist-Gesellschaft_mbH.xml',\n",
" 'A.KIein_Immobilien_KG.xml',\n",
" 'A.L.G._Christian_Schmelzer.xml',\n",
" 'A.L.S._Architektonische_Licht-Systeme_GmbH.xml',\n",
" 'A.M.G._Motorenbau_Hans_Werner_Aufrecht.xml',\n",
" 'A.M.P._Athos_GmbH.xml',\n",
" 'A.N._Gartenbau_GmbH.xml',\n",
" 'A.Q.U.A_Services_KG.xml',\n",
" 'A.R.S._GmbH_Süd,_Alt_und_Reststoffverwertung.xml',\n",
" 'A.S.G._Industrielackierungen_GmbH.xml',\n",
" 'A.S.S._bikes_and_parts_GmbH.xml',\n",
" 'A.S._Baubedarfvermittlung_Gesellschaft_mit_beschränkter_Haftung.xml',\n",
" 'A.T.C._Automotive_GmbH.xml',\n",
" 'A._&_S._Aigner_und_Schulz_GmbH.xml',\n",
" 'A._+_H._Weier_GmbH.xml',\n",
" 'A._+_K._Hertkorn_OHG_Möbel_-_Innenausbau.xml',\n",
" 'A._Abele_GmbH.xml',\n",
" 'A._Baur_Mineralöl-Abfertigungsspedition_GmbH.xml',\n",
" 'A._Blum_GmbH.xml',\n",
" 'A._Both_GmbH.xml',\n",
" 'A._Both_GmbH_&_Co._KG_Werkzeugtechnik_CNC_Maschinenausrüstung.xml',\n",
" 'A._DINKIC_GMBH.xml',\n",
" 'A._Elsbecker_GmbH.xml',\n",
" 'A._Erglis_GmbH.xml',\n",
" 'A._Frauenrath_Landschaftsbau_GmbH_&_Co._KG..xml',\n",
" 'A._Gradmann_GmbH_&_Co._KG.xml',\n",
" 'A._Hanhart_GmbH_&_Co._KG.xml',\n",
" 'A._Hüglin_-_Putz_und_Stuck_-_Gesellschaft_mit_beschränkter_Haftung.xml',\n",
" 'A._Illmann_Zahntechnik_GmbH.xml',\n",
" 'A._Junghanns_Automatisierungs_GmbH.xml',\n",
" 'A._Jung_GmbH_&_Co._KG.xml',\n",
" 'A._Kolbinger_GmbH_Versicherungs-Makler.xml',\n",
" 'A._Kolckmann,_Weberei_und_Kunststoffbeschichtungen_GmbH.xml',\n",
" 'A._Kolckmann_GmbH_&_Co._KG.xml',\n",
" 'A._Kuhner_GmbH.xml',\n",
" 'A._Lipp_GmbH.xml',\n",
" 'A._Müller_Geschäftsführungs-_GmbH.xml',\n",
" 'A._Müller_GmbH_&_Co._KG.xml',\n",
" 'A._Nassal_GmbH.xml',\n",
" 'A._Oster_e.K..xml',\n",
" 'A._Pfeiffer_Zimmerei_GmbH.xml',\n",
" 'A._Pfingsten_KG.xml',\n",
" 'A._Pullmann_GmbH.xml',\n",
" 'A._Randecker_Wirtschafts-_und_Steuerberatungsgesellschaft_mbH.xml',\n",
" 'A._Reinhard_GmbH.xml',\n",
" 'A._Ritter_GmbH.xml',\n",
" 'A._Sabadinowitsch_Verwaltung_GmbH.xml',\n",
" 'A._Sluka-Verwaltungsgesellschaft_mit_beschränkter_Haftung.xml',\n",
" 'A._Sommer_Finanzdienstleistungsvermittlung_e.K..xml',\n",
" 'A._Sorg_GmbH_&_Co._KG.xml',\n",
" 'A._u._G_Sexton_GmbH.xml',\n",
" 'A._Umminger_LUM-Air,_Elektro-_und_Filtertechnik_GmbH.xml',\n",
" 'A._Wankmüller_GmbH_&_Co._KG.xml',\n",
" 'A._Ziemann_Gesellschaft_mit_beschränkter_Haftung.xml',\n",
" 'A._Zwisler_e.K..xml',\n",
" 'A_&_A_Consulting_GmbH.xml',\n",
" 'A_&_A_Gipserbetrieb_GmbH.xml',\n",
" 'a_&_b_Beteiligungs-GmbH.xml',\n",
" 'A_&_B_Gastronomie-Betriebe_GmbH.xml',\n",
" 'A_&_C_Aqua_&_Care_Limited.xml',\n",
" 'A_&_F_Lori_GmbH.xml',\n",
" 'A_&_L_Engineering_Service_GmbH.xml',\n",
" 'A_&_M_Stanzformzubehör_Olaf_Abendroth_GmbH.xml',\n",
" 'A_&_O_Grundstücksverwaltungs_GmbH_&_Co._KG.xml',\n",
" 'A_&_R_Textilproduktion_GmbH.xml',\n",
" 'A_&_S_Bäder_GmbH_&_Co..xml',\n",
" 'A_&_S_Vermögensverwaltungs_GmbH.xml',\n",
" 'A_&_T_Roth_GmbH.xml',\n",
" 'A_+_A_Aalsmeer_Blumen_-_Bräutigam_E._Kfr.,_Inh._Manuela_Bräutigam.xml',\n",
" 'a_+_b_Wohnbau_GmbH.xml',\n",
" 'A_+_H_Bauträger-_und_Verwaltungsgesellschaft_mit_beschränkter_Haftung.xml',\n",
" 'A_+_M_Verwaltungs-GmbH.xml',\n",
" 'A_+_P_Baumaschinen_GmbH_&_Co._KG.xml',\n",
" 'A_+_R_Baumaschinen_-_Mietpark_+_-Vertriebs-GmbH.xml',\n",
" 'A_+_S_Tierbedarf_GmbH.xml',\n",
" 'A_+_Te_Stabil-Bau_GmbH.xml',\n",
" 'A_+_W._Sahm_Bedachungs-GmbH.xml',\n",
" 'a_-_Vermögensverwaltungs-GmbH_&_Co._KG.xml',\n",
" 'A_-_Z_Kreditvermittlungs-Gesellschaft_mit_beschränkter_Haftung.xml',\n",
" 'A_2000_Industrie-Elektronik_GmbH.xml',\n",
" 'A_bis_Z_Verwaltungs_GmbH.xml',\n",
" 'A_B_A_S_A_GmbH_Organisations_-_Planungsbüro_für_den_Innenausbau.xml',\n",
" 'A_B_U_-_GmbH_Altlasten_Bauökologie_Umweltmanagement.xml',\n",
" 'A_F_Fussbodentechnik_GmbH.xml',\n",
" 'A_L_T_E_C_GmbH.xml',\n",
" 'A_l_u_f_o_r_m_Alucobondverarbeitungs-GmbH.xml',\n",
" 'A_L_Z_Auto_Licht_und_Zündung_Service_Gesellschaft_mit_beschränkter_Haftung.xml',\n",
" 'a_m_friseure_GmbH_Karlsruhe.xml',\n",
" 'a_m_friseure_GmbH_Koblenz.xml',\n",
" 'a_priori_GmbH.xml',\n",
" 'a_s_k_-_Kunststoffe_GmbH.xml',\n",
" 'A_S_TRUCKS_e.K..xml',\n",
" 'A_S_Y_S_Automatic_Systems_Beteiligungs-GmbH.xml',\n",
" 'A_u_c_h_Gesellschaft_mit_beschränkter_Haftung.xml',\n",
" 'export',\n",
" 'registerdocument-2023-06-11-12-41-30 (1).xml',\n",
" 'registerdocument-2023-06-11-12-41-30.xml',\n",
" 'registerdocument-2023-06-11-12-52-33.xml',\n",
" 'registerdocument-2023-06-11-12-52-41.xml']"
]
},
"execution_count": 119,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"\n",
"files = os.listdir(\"./data/Unternehmensregister\")\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 135,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import xmltodict\n",
"\n",
"\n",
"def transform_xml_to_json(source_dir: str, target_dir: str):\n",
"    \"\"\"Convert every ``*.xml`` file in ``source_dir`` to a JSON file in ``target_dir``.\n",
"\n",
"    Each document is parsed with ``xmltodict`` and written under the same base\n",
"    name with a ``.json`` extension. Existing JSON files are overwritten.\n",
"\n",
"    Args:\n",
"        source_dir: Directory that is searched (non-recursively) for XML files.\n",
"        target_dir: Existing directory the JSON files are written to.\n",
"    \"\"\"\n",
"    # glob.glob1 is an undocumented helper that is deprecated since Python 3.13;\n",
"    # glob.glob(..., root_dir=...) is the documented way to get bare file names.\n",
"    for file in glob.glob(\"*.xml\", root_dir=source_dir):\n",
"        source_path = os.path.join(source_dir, file)\n",
"        # Swap only the extension; str.replace would also rewrite a \".xml\" that\n",
"        # occurs in the middle of a file name.\n",
"        target_path = os.path.join(target_dir, os.path.splitext(file)[0] + \".json\")\n",
"\n",
"        # Hand the raw bytes to the parser so the encoding declared in the XML\n",
"        # prolog is honoured instead of decoding as UTF-8 and re-encoding.\n",
"        with open(source_path, \"rb\") as source_file:\n",
"            data = xmltodict.parse(source_file.read())\n",
"        with open(target_path, \"w\", encoding=\"utf-8\") as json_file:\n",
"            json.dump(data, json_file)\n",
"\n",
"\n",
"# Write the JSON files next to the XML files they are generated from.\n",
"transform_xml_to_json(\n",
"    source_dir=\"./data/Unternehmensregister/\",\n",
"    target_dir=\"./data/Unternehmensregister/\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: ``data`` is overwritten on every iteration, so after this cell it only\n",
"# holds the content of the last JSON file returned by the glob.\n",
"# glob.glob1 is an undocumented helper that is deprecated since Python 3.13;\n",
"# glob.glob(..., root_dir=...) is the documented way to get bare file names.\n",
"for file in glob.glob(\"*.json\", root_dir=\"./data/Unternehmensregister/\"):\n",
"    path = os.path.join(\"./data/Unternehmensregister/\", file)\n",
"    with open(path, \"r\", encoding=\"utf-8\") as file_object:\n",
"        data = json.load(file_object)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def parse_stakeholder(data: dict) -> dict | None:\n",
"    \"\"\"Map one ``Beteiligung`` entry to a flat stakeholder record.\n",
"\n",
"    Args:\n",
"        data: A single ``Beteiligung`` dict. It must contain ``Beteiligter`` and,\n",
"            for natural persons, ``Rolle``.\n",
"\n",
"    Returns:\n",
"        For a ``Natuerliche_Person``: a dict with ``name``, ``date_of_birth``\n",
"        (None if ``Geburt`` is absent), ``location`` and ``role``.\n",
"        For an ``Organisation``: a dict with ``role``, ``description`` and\n",
"        ``location`` (``street``/``house_number`` are None if absent).\n",
"        None if the entry holds neither of the two.\n",
"\n",
"    Raises:\n",
"        KeyError: If any other accessed field (e.g. ``Anschrift``) is missing.\n",
"    \"\"\"\n",
"    stakeholder = data[\"Beteiligter\"]\n",
"\n",
"    if \"Natuerliche_Person\" in stakeholder:\n",
"        person = stakeholder[\"Natuerliche_Person\"]\n",
"        return {\n",
"            \"name\": {\n",
"                \"firstname\": person[\"Voller_Name\"][\"Vorname\"],\n",
"                \"lastname\": person[\"Voller_Name\"][\"Nachname\"],\n",
"            },\n",
"            \"date_of_birth\": person[\"Geburt\"][\"Geburtsdatum\"]\n",
"            if \"Geburt\" in person\n",
"            else None,\n",
"            \"location\": {\"city\": person[\"Anschrift\"][\"Ort\"]},\n",
"            \"role\": data[\"Rolle\"][\"Rollenbezeichnung\"][\"content\"],\n",
"        }\n",
"\n",
"    if \"Organisation\" in stakeholder:\n",
"        organisation = stakeholder[\"Organisation\"]\n",
"        description = organisation[\"Bezeichnung\"][\"Bezeichnung_Aktuell\"]\n",
"        address = organisation[\"Anschrift\"]\n",
"        return {\n",
"            \"role\": \"Organisation\",\n",
"            \"description\": description,\n",
"            \"location\": {\n",
"                \"city\": address[\"Ort\"],\n",
"                \"street\": address.get(\"Strasse\"),\n",
"                \"house_number\": address.get(\"Hausnummer\"),\n",
"                \"zip_code\": address[\"Postleitzahl\"],\n",
"            },\n",
"        }\n",
"\n",
"    # Neither a natural person nor an organisation\n",
"    return None"
"\n",
"\n",
"def map_unternehmensregister_json(data: dict) -> dict:\n",
"    \"\"\"Reduce a parsed register document to base info and relationships.\n",
"\n",
"    Args:\n",
"        data: Parsed document with ``XJustiz_Daten`` as its root key.\n",
"\n",
"    Returns:\n",
"        A dict with\n",
"        ``base_info``: company name, location (``street``/``house_number`` are\n",
"        None if absent) and date of the last entry, and\n",
"        ``relationships``: one ``parse_stakeholder`` result per ``Beteiligung``\n",
"        (None for entries ``parse_stakeholder`` does not recognise).\n",
"\n",
"    Raises:\n",
"        KeyError: If any other accessed field is missing from the document.\n",
"    \"\"\"\n",
"    register_data = data[\"XJustiz_Daten\"][\"Fachdaten_Register\"]\n",
"    legal_entity = register_data[\"Basisdaten_Register\"][\"Rechtstraeger\"]\n",
"    company_name = legal_entity[\"Bezeichnung\"][\"Bezeichnung_Aktuell\"]\n",
"    address = legal_entity[\"Anschrift\"]\n",
"\n",
"    base_info = {\n",
"        \"company_name\": company_name,\n",
"        \"location\": {\n",
"            \"city\": address[\"Ort\"],\n",
"            \"zip_code\": address[\"Postleitzahl\"],\n",
"            \"street\": address.get(\"Strasse\"),\n",
"            \"house_number\": address.get(\"Hausnummer\"),\n",
"        },\n",
"        \"last_update\": register_data[\"Auszug\"][\"letzte_Eintragung\"],\n",
"    }\n",
"\n",
"    procedure_data = data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"]\n",
"    stakeholders = procedure_data[\"Beteiligung\"]\n",
"    # xmltodict yields a plain dict instead of a list when the document contains\n",
"    # only one <Beteiligung> element; wrap it so it is handled as one entry\n",
"    # instead of failing with KeyError: 0 on index access.\n",
"    if isinstance(stakeholders, dict):\n",
"        stakeholders = [stakeholders]\n",
"\n",
"    return {\n",
"        \"base_info\": base_info,\n",
"        \"relationships\": [parse_stakeholder(entry) for entry in stakeholders],\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{}\n",
"{}\n",
"{'name': {'firstname': 'Reinhard', 'lastname': 'Gebing'}, 'date_of_birth': '1964-04-26', 'location': {'city': 'Oelde'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Markus', 'lastname': 'Kreft'}, 'date_of_birth': '1966-04-03', 'location': {'city': 'Wetter'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Kai', 'lastname': 'Luntz'}, 'date_of_birth': '1970-12-04', 'location': {'city': 'Holzminden'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Thomas', 'lastname': 'Mader'}, 'date_of_birth': '1972-05-24', 'location': {'city': 'Rheda-Wiedenbrück'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Peter', 'lastname': 'Lauwers'}, 'date_of_birth': '1970-03-26', 'location': {'city': 'Düsseldorf'}, 'role': 'Geschäftsführer(in)'}\n",
"{'name': {'firstname': 'Erkul', 'lastname': 'Basaran'}, 'date_of_birth': '1977-05-06', 'location': {'city': 'Erkrath'}, 'role': 'Geschäftsführer(in)'}\n",
"{'name': {'firstname': 'Katja', 'lastname': 'Voß'}, 'date_of_birth': '1978-02-24', 'location': {'city': 'Rheda-Wiedenbrück'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Henrik', 'lastname': 'Böttner'}, 'date_of_birth': '1982-11-07', 'location': {'city': 'Bochum'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Ulrich', 'lastname': 'Raßenhövel'}, 'date_of_birth': '1969-04-16', 'location': {'city': 'Oelde'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Andreas', 'lastname': 'Naroska'}, 'date_of_birth': '1967-03-23', 'location': {'city': 'Herdecke'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Mark', 'lastname': 'Kramps'}, 'date_of_birth': '1967-09-04', 'location': {'city': 'Witten'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Ralf', 'lastname': 'Barkmeyer'}, 'date_of_birth': '1974-02-28', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Holger', 'lastname': 'Siegwarth'}, 'date_of_birth': '1967-05-13', 'location': {'city': 'Tönnisvorst'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Oliver', 'lastname': 'Liß'}, 'date_of_birth': '1981-04-13', 'location': {'city': 'Herne'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Liang', 'lastname': 'Cheng'}, 'date_of_birth': '1980-12-29', 'location': {'city': 'Göppingen'}, 'role': 'Geschäftsführer(in)'}\n",
"{'name': {'firstname': 'Astrid', 'lastname': 'Dörner-Rodeheger'}, 'date_of_birth': '1968-12-24', 'location': {'city': 'Beckum'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Jon', 'lastname': 'Lange'}, 'date_of_birth': '1978-04-25', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Matthias', 'lastname': 'Peters'}, 'date_of_birth': '1973-08-28', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Ralf', 'lastname': 'Frombach'}, 'date_of_birth': '1977-01-25', 'location': {'city': 'Werne'}, 'role': 'Prokurist(in)'}\n",
"{'name': {'firstname': 'Sven', 'lastname': 'Hommel'}, 'date_of_birth': '1979-04-22', 'location': {'city': 'Berlin'}, 'role': 'Prokurist(in)'}\n"
"A&A Amini Art GmbH\n",
"A&A Immo GmbH\n",
"A&P AUDITING GmbH Wirtschaftsprüfungsgesellschaft\n",
"A&QUA gemeinnützige Gesellschaft für Arbeit u. Qualifizierung mbH\n",
"a+b Asphalt- und Betonmischwerke GmbH & Co. KG\n",
"a+b Verwaltungsgesellschaft mbH\n",
"A+E Beteiligungs- und Handels-GmbH\n",
"A+W Systemhaus GmbH\n",
"A-S-D Kfz-Teile-Handel GmbH\n",
"A-TEAM Industrielles Roboterschweißen GmbH\n",
"A.C.C. Funk Taxi & Minicar e.K.\n",
"a.c.k. aqua concept GmbH Karlsruhe\n",
"A.C. Weiss GmbH & Co. KG\n",
"A.D.S. OHG\n",
"A.D. Glas- und Gebäudereinigung e.K.\n",
"A.E. Z-Line Taxi - und Shuttle-Service e.K.\n",
"A.F.Z. Automatisierung, Fördern, Zuführen GmbH\n",
"A.G. Zentral Michael Greising e.K.\n",
"A.H. Steuerberatungsgesellschaft mbH\n",
"A.I.V. SERVICES GmbH\n",
"A.I. Kommanditist-Gesellschaft mbH\n",
"A.KIein Immobilien KG\n",
"A.L.G. Christian Schmelzer\n",
"A.L.S. Architektonische Licht-Systeme GmbH\n",
"A.M.G. Motorenbau Hans Werner Aufrecht\n",
"A.M.P. Athos GmbH\n",
"A.N. Gartenbau GmbH\n",
"A.Q.U.A Services KG\n",
"A.R.S. GmbH Süd, Alt und Reststoffverwertung\n",
"A.S.G. Industrielackierungen GmbH\n",
"A.S.S. bikes and parts GmbH\n",
"A.S. Baubedarfvermittlung Gesellschaft mit beschränkter Haftung\n",
"A.T.C. Automotive GmbH\n",
"A. & S. Aigner und Schulz GmbH\n",
"A. + H. Weier GmbH\n",
"A. + K. Hertkorn OHG Möbel - Innenausbau\n",
"A. Abele GmbH\n",
"A. Baur Mineralöl-Abfertigungsspedition GmbH\n",
"A. Blum GmbH\n",
"A. Both GmbH\n",
"A. Both GmbH & Co. KG Werkzeugtechnik CNC Maschinenausrüstung\n",
"A. DINKIC GMBH\n",
"A. Elsbecker GmbH\n",
"A. Erglis GmbH\n",
"A. Frauenrath Landschaftsbau GmbH & Co. KG.\n",
"A. Gradmann GmbH & Co. KG\n",
"A. Hanhart GmbH & Co. KG\n",
"A. Hüglin - Putz und Stuck - Gesellschaft mit beschränkter Haftung\n",
"A. Illmann Zahntechnik GmbH\n",
"A. Junghanns Automatisierungs GmbH\n",
"A. Jung GmbH & Co.KG\n",
"A. Kolbinger GmbH Versicherungs-Makler\n",
"A. Kolckmann, Weberei und Kunststoffbeschichtungen GmbH\n",
"A. Kolckmann GmbH & Co. KG\n",
"A. Kuhner GmbH\n",
"A. Lipp GmbH\n",
"A. Müller Geschäftsführungs- GmbH\n",
"A. Müller GmbH & Co. KG\n",
"A. Nassal GmbH\n",
"A. Oster e.K.\n",
"A. Pfeiffer Zimmerei GmbH\n",
"A. Pfingsten KG\n",
"A. Pullmann GmbH\n",
"A. Randecker Wirtschafts- und Steuerberatungsgesellschaft mbH\n",
"A. Reinhard GmbH\n",
"A. Ritter GmbH\n",
"A. Sabadinowitsch Verwaltung GmbH\n",
"A. Sluka-Verwaltungsgesellschaft mit beschränkter Haftung\n",
"A. Sommer Finanzdienstleistungsvermittlung e.K.\n",
"A. Sorg GmbH & Co. KG\n",
"A. u. G Sexton GmbH\n",
"A. Umminger LUM-Air, Elektro- und Filtertechnik GmbH\n",
"A. Wankmüller GmbH & Co. KG\n",
"A. Ziemann Gesellschaft mit beschränkter Haftung\n",
"A. Zwisler e.K.\n",
"A & A Consulting GmbH\n",
"A & A Gipserbetrieb GmbH\n",
"a & b Beteiligungs-GmbH\n",
"A & B Gastronomie-Betriebe GmbH\n",
"A & C Aqua & Care Limited\n",
"A & F Lori GmbH\n",
"A & L Engineering Service GmbH\n",
"A & M Stanzformzubehör Olaf Abendroth GmbH\n",
"A & O Grundstücksverwaltungs GmbH & Co. KG\n",
"A & R Textilproduktion GmbH\n",
"A & S Bäder GmbH & Co.\n",
"A & S Vermögensverwaltungs GmbH\n",
"A & T Roth GmbH\n",
"A + A Aalsmeer Blumen - Bräutigam E. Kfr., Inh. Manuela Bräutigam\n",
"a + b Wohnbau GmbH\n",
"A + H Bauträger- und Verwaltungsgesellschaft mit beschränkter Haftung\n",
"A + M Verwaltungs-GmbH\n",
"A + P Baumaschinen GmbH & Co. KG\n",
"A + R Baumaschinen - Mietpark + -Vertriebs-GmbH\n",
"A + S Tierbedarf GmbH\n",
"A + Te Stabil-Bau GmbH\n",
"A + W. Sahm Bedachungs-GmbH\n",
"a - Vermögensverwaltungs-GmbH & Co. KG\n",
"A - Z Kreditvermittlungs-Gesellschaft mit beschränkter Haftung\n",
"A 2000 Industrie-Elektronik GmbH\n",
"A bis Z Verwaltungs GmbH\n",
"A B A S A GmbH Organisations - Planungsbüro für den Innenausbau\n",
"A B U - GmbH Altlasten Bauökologie Umweltmanagement\n",
"A F Fussbodentechnik GmbH\n",
"A L T E C GmbH\n",
"A l u f o r m Alucobondverarbeitungs-GmbH\n",
"A L Z Auto Licht und Zündung Service Gesellschaft mit beschränkter Haftung\n",
"a/m friseure GmbH Karlsruhe\n",
"a/m friseure GmbH Koblenz\n",
"a priori GmbH\n",
"a s k - Kunststoffe GmbH\n",
"A/S TRUCKS e.K.\n",
"A S Y S Automatic Systems Beteiligungs-GmbH\n",
"A u c h Gesellschaft mit beschränkter Haftung\n",
"a s k - Kunststoffe GmbH\n",
"A. Maier GmbH & Co. KG\n",
"\"A/D/L/E/R Steuerberatungsgesellschaft mbH\"\n",
"a | m | | medienservice e. k.\n"
]
}
],
"source": [
"import json\n",
"import xmltodict\n",
"\n",
    "# Parse every XML export listed in `files` and dump the parsed dict as JSON.\n",
    "# `files` is not defined in this cell; presumably it holds the XML file names\n",
    "# collected by an earlier cell - TODO confirm.\n",
    "# NOTE(review): every iteration writes to the same ./data/temp.json, so only the\n",
    "# file converted last is left on disk - confirm that this is intended.\n",
    "for file in files:\n",
    "    with open(\"./data/Unternehmensregister/\" + file, \"r\", encoding=\"utf-8\") as xml_file:\n",
    "        data = xmltodict.parse(xml_file.read())\n",
    "    with open(\"./data/temp.json\", \"w\", encoding=\"utf-8\") as json_file:\n",
    "        json_file.write(json.dumps(data))\n",
    "# Bulk-process every *.json file found directly in ./data/Unternehmensregister/.\n",
    "for file in glob.glob1(\"./data/Unternehmensregister/\", \"*.json\"):\n",
    "    path = os.path.join(\"./data/Unternehmensregister/\", file)\n",
    "    with open(path, \"r\", encoding=\"utf-8\") as file_object:\n",
    "        data = json.loads(file_object.read())\n",
    "\n",
    "    # NOTE(review): `keys` is not read again anywhere in this cell.\n",
    "    keys = dict.keys(data[\"XJustiz_Daten\"][\"Grunddaten\"])\n",
    "    # Company name and address, read from\n",
    "    # XJustiz_Daten/Fachdaten_Register/Basisdaten_Register/Rechtstraeger.\n",
    "    # NOTE(review): `base_info` is not read again in this cell; the value printed\n",
    "    # below comes from `result` instead - confirm whether this dict is still needed.\n",
    "    base_info = {\n",
    "        \"company_name\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
    "            \"Basisdaten_Register\"\n",
    "        ][\"Rechtstraeger\"][\"Bezeichnung\"][\"Bezeichnung_Aktuell\"],\n",
    "        \"location\": {\n",
    "            \"city\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
    "                \"Basisdaten_Register\"\n",
    "            ][\"Rechtstraeger\"][\"Anschrift\"][\"Ort\"],\n",
    "            \"zip_code\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
    "                \"Basisdaten_Register\"\n",
    "            ][\"Rechtstraeger\"][\"Anschrift\"][\"Postleitzahl\"],\n",
    "            \"street\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
    "                \"Basisdaten_Register\"\n",
    "            ][\"Rechtstraeger\"][\"Anschrift\"][\"Strasse\"],\n",
    "            \"house_number\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
    "                \"Basisdaten_Register\"\n",
    "            ][\"Rechtstraeger\"][\"Anschrift\"][\"Hausnummer\"],\n",
    "        },\n",
    "    }\n",
    "    # `map_unternehmensregister_json` is defined outside this cell; the code below\n",
    "    # relies on it returning a mapping with a \"base_info\" -> \"company_name\" entry.\n",
    "    result = map_unternehmensregister_json(data)\n",
    "    print(result[\"base_info\"][\"company_name\"])\n",
"\n",
" def parse_stakeholder(data: dict) -> list:\n",
" if \"Natuerliche_Person\" in data[\"Beteiligter\"]:\n",
" return {\n",
" \"name\": {\n",
" \"firstname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n",
" \"Voller_Name\"\n",
" ][\"Vorname\"],\n",
" \"lastname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n",
" \"Voller_Name\"\n",
" ][\"Nachname\"],\n",
" },\n",
" \"date_of_birth\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n",
" \"Geburt\"\n",
" ][\"Geburtsdatum\"],\n",
" \"location\": {\n",
" \"city\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Anschrift\"][\n",
" \"Ort\"\n",
" ]\n",
" },\n",
" \"role\": data[\"Rolle\"][\"Rollenbezeichnung\"][\"content\"],\n",
" }\n",
" return {}\n",
"\n",
    "    # Parse and print every entry of Grunddaten/Verfahrensdaten/Beteiligung of the\n",
    "    # current file; parse_stakeholder returns {} for entries without a\n",
    "    # Natuerliche_Person, so empty dicts are printed as well.\n",
    "    # NOTE(review): range(len(...)) combined with [i] assumes `Beteiligung` is a\n",
    "    # list - confirm this also holds for records with a single stakeholder.\n",
    "    for i in range(\n",
    "        len(data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"])\n",
    "    ):\n",
    "        people = parse_stakeholder(\n",
    "            data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"][i]\n",
    "        )\n",
    "        print(people)\n",
    "    # Leaves the enclosing file loop, so only the first file is processed.\n",
    "    break"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from pdf2image import convert_from_path\n",
"\n",
    "# Render the sample PDF into one image per page. 350 is passed as the second\n",
    "# positional argument of convert_from_path (presumably the DPI - TODO confirm).\n",
    "pdfs = r\"./data/test.pdf\"\n",
    "pages = convert_from_path(pdfs, 350)\n",
    "\n",
    "\n",
    "# Save the pages as ./data/Page_1.jpg, ./data/Page_2.jpg, ... (1-based numbering).\n",
    "for i, page in enumerate(pages):\n",
    "    image_name = f\"./data/Page_{i+1}.jpg\"\n",
    "    page.save(image_name, \"JPEG\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Handelsregister B des Abteilung B Nummer der Firma:\n",
"Amtsgerichts Hamm Wiedergabe des aktuellen HRB 5363\n",
"Registerinhalts\n",
"Abruf vom 07.06.2023 19:37\n",
"1. Anzahl der bisherigen Eintragungen:\n",
"51\n",
"2. a) Firma:\n",
"GEA Farm Technologies GmbH\n",
"b) Sitz, Niederlassung, inländische Geschäftsanschrift, empfangsberechtigte Person, Zweigniederlassungen:\n",
"Bönen\n",
"Geschäftsanschrift: Siemensstraße 25-27, 59199 Bönen\n",
"c) Gegenstand des Unternehmens:\n",
"Entwicklung, Herstellung und der Vertrieb von Landtechnik, insbesondere von Komponenten und Anlagen\n",
"(a) zur Gewinnung, Kühlung, Behandlung und Lagerung von Milch;\n",
"(b) für das Milchvieh-Herdenmanagement;\n",
"(c) zur Tierhygiene und Sicherung der Milchqualität und\n",
"(d) zur Aufstallung von Tieren;\n",
"sowie die Herstellung und der Vertrieb von Anlagen und Fahrzeugen zur Aufbereitung und zum Transport von Gülle.\n",
"3. Grund- oder Stammkapital:\n",
"5.115.000,00 EUR\n",
"4. a) Allgemeine Vertretungsregelung:\n",
"Ist nur ein Geschäftsführer bestellt, so vertritt er die Gesellschaft allein. Sind mehrere Geschäftsführer bestellt, so wird die\n",
"Gesellschaft durch zwei Geschäftsführer oder durch einen Geschäftsführer gemeinsam mit einem Prokuristen vertreten.\n",
"b) Vorstand, Leitungsorgan, geschäftsführende Direktoren, persönlich haftender Gesellschafter, Geschäftsführer,\n",
"Vertretungsberechtigte und besondere Vertretungsbefugnis:\n",
"Geschäftsführer: Basaran, Erkul, Erkrath, *06.05.1977\n",
"Geschäftsführer: Cheng, Liang, Göppingen, *29.12.1980\n",
"Geschäftsführer: Lauwers, Peter, Düsseldorf, *26.03.1970\n",
"5. Prokura:\n",
"Gesamtprokura gemeinsam mit einem Geschäftsführer oder einem anderen Prokuristen:\n",
"Barkmeyer, Ralf, Dortmund, *28.02.1974\n",
"Böttner, Henrik, Bochum, *07.11.1982\n",
"Dörner-Rodeheger, Astrid, Beckum, *24.12.1968\n",
"Frombach, Ralf, Werne, *25.01.1977\n",
"Gebing, Reinhard, Oelde, *26.04.1964\n",
"Hommel, Sven, Berlin, *22.04.1979\n",
"Kramps, Mark, Witten, *04.09.1967\n",
"Kreft, Markus, Wetter, *03.04.1966\n",
"\n"
]
}
],
"source": [
"import cv2\n",
"import pytesseract\n",
"\n",
    "# Read the first page image (./data/Page_1.jpg) for OCR.\n",
    "image_path = \"./data/Page_1.jpg\"\n",
    "image = cv2.imread(image_path)\n",
    "\n",
    "# Run Tesseract with the \"deu\" language data and the \"--psm 6\" option\n",
    "# (a page segmentation mode; see the Tesseract docs for its exact meaning).\n",
    "text = str(pytesseract.image_to_string(image, config=\"--psm 6\", lang=\"deu\"))\n",
    "print(text)\n",
    "# Persist the recognised text as UTF-8 so later steps can reuse it.\n",
    "with open(\"./data/Page_1.txt\", \"w\", encoding=\"utf-8\") as output_file:\n",
    "    output_file.write(text)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Geschäftsführer: Basaran, Erkul', 'Geschäftsführer: Cheng, Liang', 'Geschäftsführer: Lauwers, Peter']\n"
]
},
{
"data": {
"text/plain": [
"['Erkul Basaran', 'Liang Cheng', 'Peter Lauwers']"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"\n",
"\n",
"def get_managing_directors(text: str) -> list:\n",
" managing_directors_regex = r\"Geschäftsführer: [a-zA-ZäöüÄÖÜ]*, [a-zA-ZäöüÄÖÜ]*\"\n",
" hits = re.findall(managing_directors_regex, text)\n",
" print(hits)\n",
" return [\n",
" \" \".join(hit.replace(\"Geschäftsführer: \", \"\").replace(\",\", \"\").split(\" \")[::-1])\n",
" for hit in hits\n",
" ]\n",
"\n",
"\n",
"get_managing_directors(text)"
    "    # Build the export file name from the company name: blanks, slashes and pipes\n",
    "    # become underscores, double quotes are removed.\n",
    "    # NOTE(review): other characters that are invalid in file names on some\n",
    "    # platforms (e.g. ':' or '?') are not replaced - confirm whether they can occur.\n",
    "    name = (\n",
    "        result[\"base_info\"][\"company_name\"]\n",
    "        .replace(\" \", \"_\")\n",
    "        .replace(\"/\", \"_\")\n",
    "        .replace('\"', \"\")\n",
    "        .replace(\"|\", \"_\")\n",
    "    )\n",
    "    # Write the mapped result as UTF-8 JSON; ensure_ascii=False keeps non-ASCII\n",
    "    # characters such as umlauts unescaped in the export file.\n",
    "    # NOTE(review): open() does not create ./data/Unternehmensregister/export/ -\n",
    "    # the directory has to exist beforehand.\n",
    "    with open(\n",
    "        f\"./data/Unternehmensregister/export/{name}.json\", \"w+\", encoding=\"utf-8\"\n",
    "    ) as export_file:\n",
    "        json.dump(result, export_file, ensure_ascii=False)"
]
}
],

View File

@ -4,4 +4,5 @@ opencv-python
pdf2image
bs4
selenium
xmltodict
xmltodict
tqdm