From 058c16b3ff42ecabf06444e496718265f20ae4fb Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sun, 11 Jun 2023 13:11:44 +0200 Subject: [PATCH] Bulk process Unternehmensregister .xmls --- .../Unternehmensregister/notebook.ipynb | 704 ++++++++++++------ .../Unternehmensregister/requirements.txt | 3 +- 2 files changed, 494 insertions(+), 213 deletions(-) diff --git a/Jupyter/API-tests/Unternehmensregister/notebook.ipynb b/Jupyter/API-tests/Unternehmensregister/notebook.ipynb index 9aab687..de91308 100644 --- a/Jupyter/API-tests/Unternehmensregister/notebook.ipynb +++ b/Jupyter/API-tests/Unternehmensregister/notebook.ipynb @@ -18,7 +18,39 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def wait_for_download_condition(\n", + " path: str, num_files: int, pattern: str = \"*.xml\"\n", + ") -> bool:\n", + " return len(glob.glob1(path, pattern)) > num_files\n", + "\n", + "\n", + "def get_num_files(path: str, pattern: str = \"*.xml\") -> int:\n", + " return len(glob.glob1(path, pattern))\n", + "\n", + "\n", + "def rename_latest_file(path: str, filename: str, pattern: str = \"*.xml\"):\n", + " list_of_files = [os.path.join(path, file) for file in glob.glob1(path, pattern)]\n", + " latest_download = max(list_of_files, key=os.path.getctime)\n", + " os.rename(latest_download, os.path.join(path, filename))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -27,10 +59,36 @@ "text": [ "c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n" ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\notebook.ipynb Cell 5\u001b[0m in \u001b[0;36m3\n\u001b[0;32m 27\u001b[0m options\u001b[39m.\u001b[39madd_experimental_option(\u001b[39m\"\u001b[39m\u001b[39mprefs\u001b[39m\u001b[39m\"\u001b[39m, preferences)\n\u001b[0;32m 29\u001b[0m driver \u001b[39m=\u001b[39m webdriver\u001b[39m.\u001b[39mChrome(options\u001b[39m=\u001b[39moptions)\n\u001b[1;32m---> 31\u001b[0m driver\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mhttps://www.unternehmensregister.de/ureg/\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[0;32m 32\u001b[0m \u001b[39m# Accept Cookies\u001b[39;00m\n\u001b[0;32m 33\u001b[0m driver\u001b[39m.\u001b[39mfind_elements(\n\u001b[0;32m 34\u001b[0m By\u001b[39m.\u001b[39mXPATH, \u001b[39m'\u001b[39m\u001b[39m//button[text()=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mNur technisch notwendige Cookies akzeptieren\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m]\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m 35\u001b[0m )[\u001b[39m0\u001b[39m]\u001b[39m.\u001b[39mclick()\n", + "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:355\u001b[0m, in \u001b[0;36mWebDriver.get\u001b[1;34m(self, url)\u001b[0m\n\u001b[0;32m 353\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget\u001b[39m(\u001b[39mself\u001b[39m, url: \u001b[39mstr\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 354\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Loads a web page in the current browser session.\"\"\"\u001b[39;00m\n\u001b[1;32m--> 355\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mexecute(Command\u001b[39m.\u001b[39;49mGET, {\u001b[39m\"\u001b[39;49m\u001b[39murl\u001b[39;49m\u001b[39m\"\u001b[39;49m: url})\n", + "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:344\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m params:\n\u001b[0;32m 342\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msession_id\n\u001b[1;32m--> 344\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcommand_executor\u001b[39m.\u001b[39;49mexecute(driver_command, params)\n\u001b[0;32m 345\u001b[0m \u001b[39mif\u001b[39;00m response:\n\u001b[0;32m 346\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror_handler\u001b[39m.\u001b[39mcheck_response(response)\n", + "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:290\u001b[0m, in \u001b[0;36mRemoteConnection.execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 288\u001b[0m data \u001b[39m=\u001b[39m utils\u001b[39m.\u001b[39mdump_json(params)\n\u001b[0;32m 289\u001b[0m url \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_url\u001b[39m}\u001b[39;00m\u001b[39m{\u001b[39;00mpath\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m--> 290\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_request(command_info[\u001b[39m0\u001b[39;49m], url, body\u001b[39m=\u001b[39;49mdata)\n", + "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:311\u001b[0m, in \u001b[0;36mRemoteConnection._request\u001b[1;34m(self, method, url, body)\u001b[0m\n\u001b[0;32m 308\u001b[0m body \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 310\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mkeep_alive:\n\u001b[1;32m--> 311\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_conn\u001b[39m.\u001b[39;49mrequest(method, url, body\u001b[39m=\u001b[39;49mbody, headers\u001b[39m=\u001b[39;49mheaders)\n\u001b[0;32m 312\u001b[0m statuscode \u001b[39m=\u001b[39m response\u001b[39m.\u001b[39mstatus\n\u001b[0;32m 313\u001b[0m \u001b[39melse\u001b[39;00m:\n", + "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:78\u001b[0m, in \u001b[0;36mRequestMethods.request\u001b[1;34m(self, method, url, fields, headers, **urlopen_kw)\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_url(\n\u001b[0;32m 75\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 76\u001b[0m )\n\u001b[0;32m 77\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m---> 78\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_body(\n\u001b[0;32m 79\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 80\u001b[0m )\n", + "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:170\u001b[0m, in \u001b[0;36mRequestMethods.request_encode_body\u001b[1;34m(self, method, url, fields, headers, encode_multipart, multipart_boundary, **urlopen_kw)\u001b[0m\n\u001b[0;32m 167\u001b[0m extra_kw[\u001b[39m\"\u001b[39m\u001b[39mheaders\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mupdate(headers)\n\u001b[0;32m 168\u001b[0m extra_kw\u001b[39m.\u001b[39mupdate(urlopen_kw)\n\u001b[1;32m--> 170\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mextra_kw)\n", + "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\poolmanager.py:376\u001b[0m, in \u001b[0;36mPoolManager.urlopen\u001b[1;34m(self, method, url, redirect, **kw)\u001b[0m\n\u001b[0;32m 374\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 375\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 376\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, u\u001b[39m.\u001b[39mrequest_uri, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 378\u001b[0m redirect_location \u001b[39m=\u001b[39m redirect \u001b[39mand\u001b[39;00m response\u001b[39m.\u001b[39mget_redirect_location()\n\u001b[0;32m 379\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m redirect_location:\n", + "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:703\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m 700\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_prepare_proxy(conn)\n\u001b[0;32m 702\u001b[0m \u001b[39m# Make the request on the httplib connection object.\u001b[39;00m\n\u001b[1;32m--> 703\u001b[0m httplib_response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_request(\n\u001b[0;32m 704\u001b[0m conn,\n\u001b[0;32m 705\u001b[0m method,\n\u001b[0;32m 706\u001b[0m url,\n\u001b[0;32m 707\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout_obj,\n\u001b[0;32m 708\u001b[0m body\u001b[39m=\u001b[39;49mbody,\n\u001b[0;32m 709\u001b[0m headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[0;32m 710\u001b[0m chunked\u001b[39m=\u001b[39;49mchunked,\n\u001b[0;32m 711\u001b[0m )\n\u001b[0;32m 713\u001b[0m \u001b[39m# If we're going to release the connection in ``finally:``, then\u001b[39;00m\n\u001b[0;32m 714\u001b[0m \u001b[39m# the response doesn't need to know about the connection. Otherwise\u001b[39;00m\n\u001b[0;32m 715\u001b[0m \u001b[39m# it will also try to release it and we'll have a double-release\u001b[39;00m\n\u001b[0;32m 716\u001b[0m \u001b[39m# mess.\u001b[39;00m\n\u001b[0;32m 717\u001b[0m response_conn \u001b[39m=\u001b[39m conn \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m release_conn \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n", + "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:449\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[1;32m--> 449\u001b[0m six\u001b[39m.\u001b[39;49mraise_from(e, \u001b[39mNone\u001b[39;49;00m)\n\u001b[0;32m 450\u001b[0m \u001b[39mexcept\u001b[39;00m (SocketTimeout, BaseSSLError, SocketError) \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 451\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_raise_timeout(err\u001b[39m=\u001b[39me, url\u001b[39m=\u001b[39murl, timeout_value\u001b[39m=\u001b[39mread_timeout)\n", + "File \u001b[1;32m:3\u001b[0m, in \u001b[0;36mraise_from\u001b[1;34m(value, from_value)\u001b[0m\n", + "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:444\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 441\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mTypeError\u001b[39;00m:\n\u001b[0;32m 442\u001b[0m \u001b[39m# Python 3\u001b[39;00m\n\u001b[0;32m 443\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39;49mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[0;32m 449\u001b[0m six\u001b[39m.\u001b[39mraise_from(e, \u001b[39mNone\u001b[39;00m)\n", + "File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:1374\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1372\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 1373\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m-> 1374\u001b[0m response\u001b[39m.\u001b[39;49mbegin()\n\u001b[0;32m 1375\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m:\n\u001b[0;32m 1376\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n", + "File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:318\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 316\u001b[0m \u001b[39m# read until we get a non-100 response\u001b[39;00m\n\u001b[0;32m 317\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m--> 318\u001b[0m version, status, reason \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_status()\n\u001b[0;32m 319\u001b[0m \u001b[39mif\u001b[39;00m status \u001b[39m!=\u001b[39m CONTINUE:\n\u001b[0;32m 320\u001b[0m \u001b[39mbreak\u001b[39;00m\n", + "File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:279\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 278\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_read_status\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m--> 279\u001b[0m line \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfp\u001b[39m.\u001b[39;49mreadline(_MAXLINE \u001b[39m+\u001b[39;49m \u001b[39m1\u001b[39;49m), \u001b[39m\"\u001b[39m\u001b[39miso-8859-1\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 280\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(line) \u001b[39m>\u001b[39m _MAXLINE:\n\u001b[0;32m 281\u001b[0m \u001b[39mraise\u001b[39;00m LineTooLong(\u001b[39m\"\u001b[39m\u001b[39mstatus line\u001b[39m\u001b[39m\"\u001b[39m)\n", + "File \u001b[1;32mc:\\Python310\\lib\\socket.py:705\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 703\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[0;32m 704\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 705\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv_into(b)\n\u001b[0;32m 706\u001b[0m \u001b[39mexcept\u001b[39;00m timeout:\n\u001b[0;32m 707\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_timeout_occurred \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] } ], "source": [ - "import os\n", + "from tqdm import tqdm\n", "from pathlib import Path\n", "\n", "from selenium import webdriver\n", @@ -38,7 +96,7 @@ "from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.support import expected_conditions as EC\n", "\n", - "search_query = \"GEA Farm Technologies\"\n", + "search_query = \"A*\"\n", "\n", "options = webdriver.ChromeOptions()\n", "\n", @@ -55,6 +113,7 @@ " \"default_directory\": download_path,\n", " },\n", "}\n", + "options.add_argument(\"--headless=new\")\n", "options.add_experimental_option(\"prefs\", preferences)\n", "\n", "driver = webdriver.Chrome(options=options)\n", @@ -75,16 +134,28 @@ "wait.until(\n", " lambda driver: driver.current_url != \"https://www.unternehmensregister.de/ureg/\"\n", ")\n", - "## TODO Iterate over tabs\n", + "\n", "num_pages = int(\n", " driver.find_element(By.XPATH, '//*[@class=\"page_count\"]').text.split(\" \")[0]\n", ")\n", - "for page_index in range(num_pages):\n", + "\n", + "processed_companies = []\n", + "\n", + "for page_index in tqdm(range(num_pages)):\n", " # Find all \"Registerinformationen\"\n", " companies_tab = driver.find_elements(\n", " By.LINK_TEXT, \"Registerinformationen des Registergerichts\"\n", " )\n", - " for company_link in companies_tab:\n", + " company_names = [\n", + " elem.text\n", + " for elem in driver.find_elements(\n", + " By.XPATH, '//div[@class=\"company_result\"]/span/b'\n", + " )\n", + " ]\n", + " for index, company_link in enumerate(companies_tab):\n", + " company_name = company_names[index]\n", + " if company_name in processed_companies:\n", + " continue\n", " # Go to intermediary page\n", " company_link.click()\n", " # Trigger next redirect\n", @@ -101,14 +172,37 @@ " elems = driver.find_elements(By.TAG_NAME, \"input\")\n", " elems[-2].click()\n", "\n", + " wait.until(\n", + " EC.visibility_of_element_located((By.ID, \"paymentFormOverview:btnNext\"))\n", + " )\n", " driver.find_element(By.ID, \"paymentFormOverview:btnNext\").click()\n", + "\n", + " wait.until(\n", + " EC.visibility_of_element_located((By.LINK_TEXT, \"Zum Dokumentenkorb\"))\n", + " )\n", " driver.find_element(By.LINK_TEXT, \"Zum Dokumentenkorb\").click()\n", + "\n", + " num_files = get_num_files(\"./data/Unternehmensregister/\")\n", " driver.find_element(By.CLASS_NAME, \"download-wrapper\").click()\n", "\n", + " try:\n", + " wait.until(\n", + " lambda x: wait_for_download_condition(\n", + " \"./data/Unternehmensregister/\", num_files\n", + " )\n", + " )\n", + " rename_latest_file(\n", + " \"./data/Unternehmensregister/\",\n", + " f\"{company_name.replace(' ', '_').replace('/','_')}.xml\",\n", + " )\n", + " processed_companies.append(company_name)\n", + " except:\n", + " print(f\"Could not process {company_name}\")\n", " for i in range(6):\n", " driver.back()\n", " driver.find_element(By.XPATH, '//*[@class=\"fas fa-angle-right\"]').click()\n", - "driver.close()" + "driver.close()\n", + "print(processed_companies)" ] }, { @@ -121,243 +215,429 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['registerdocument-2023-06-09-14-05-01.xml',\n", - " 'registerdocument-2023-06-09-14-05-03.xml']" + "['A&A_Amini_Art_GmbH.xml',\n", + " 'A&A_Immo_GmbH.xml',\n", + " 'A&P_AUDITING_GmbH_Wirtschaftsprüfungsgesellschaft.xml',\n", + " 'A&QUA_gemeinnützige_Gesellschaft_für_Arbeit_u._Qualifizierung_mbH.xml',\n", + " 'a+b_Asphalt-_und_Betonmischwerke_GmbH_&_Co._KG.xml',\n", + " 'a+b_Verwaltungsgesellschaft_mbH.xml',\n", + " 'A+E_Beteiligungs-_und_Handels-GmbH.xml',\n", + " 'A+W_Systemhaus_GmbH.xml',\n", + " 'A-S-D_Kfz-Teile-Handel_GmbH.xml',\n", + " 'A-TEAM_Industrielles_Roboterschweißen_GmbH.xml',\n", + " 'A.C.C._Funk_Taxi_&_Minicar_e.K..xml',\n", + " 'a.c.k._aqua_concept_GmbH_Karlsruhe.xml',\n", + " 'A.C._Weiss_GmbH_&_Co._KG.xml',\n", + " 'A.D.S._OHG.xml',\n", + " 'A.D._Glas-_und_Gebäudereinigung_e.K..xml',\n", + " 'A.E._Z-Line_Taxi_-_und_Shuttle-Service_e.K..xml',\n", + " 'A.F.Z._Automatisierung,_Fördern,_Zuführen_GmbH.xml',\n", + " 'A.G._Zentral_Michael_Greising_e.K..xml',\n", + " 'A.H._Steuerberatungsgesellschaft_mbH.xml',\n", + " 'A.I.V._SERVICES_GmbH.xml',\n", + " 'A.I._Kommanditist-Gesellschaft_mbH.xml',\n", + " 'A.KIein_Immobilien_KG.xml',\n", + " 'A.L.G._Christian_Schmelzer.xml',\n", + " 'A.L.S._Architektonische_Licht-Systeme_GmbH.xml',\n", + " 'A.M.G._Motorenbau_Hans_Werner_Aufrecht.xml',\n", + " 'A.M.P._Athos_GmbH.xml',\n", + " 'A.N._Gartenbau_GmbH.xml',\n", + " 'A.Q.U.A_Services_KG.xml',\n", + " 'A.R.S._GmbH_Süd,_Alt_und_Reststoffverwertung.xml',\n", + " 'A.S.G._Industrielackierungen_GmbH.xml',\n", + " 'A.S.S._bikes_and_parts_GmbH.xml',\n", + " 'A.S._Baubedarfvermittlung_Gesellschaft_mit_beschränkter_Haftung.xml',\n", + " 'A.T.C._Automotive_GmbH.xml',\n", + " 'A._&_S._Aigner_und_Schulz_GmbH.xml',\n", + " 'A._+_H._Weier_GmbH.xml',\n", + " 'A._+_K._Hertkorn_OHG_Möbel_-_Innenausbau.xml',\n", + " 'A._Abele_GmbH.xml',\n", + " 'A._Baur_Mineralöl-Abfertigungsspedition_GmbH.xml',\n", + " 'A._Blum_GmbH.xml',\n", + " 'A._Both_GmbH.xml',\n", + " 'A._Both_GmbH_&_Co._KG_Werkzeugtechnik_CNC_Maschinenausrüstung.xml',\n", + " 'A._DINKIC_GMBH.xml',\n", + " 'A._Elsbecker_GmbH.xml',\n", + " 'A._Erglis_GmbH.xml',\n", + " 'A._Frauenrath_Landschaftsbau_GmbH_&_Co._KG..xml',\n", + " 'A._Gradmann_GmbH_&_Co._KG.xml',\n", + " 'A._Hanhart_GmbH_&_Co._KG.xml',\n", + " 'A._Hüglin_-_Putz_und_Stuck_-_Gesellschaft_mit_beschränkter_Haftung.xml',\n", + " 'A._Illmann_Zahntechnik_GmbH.xml',\n", + " 'A._Junghanns_Automatisierungs_GmbH.xml',\n", + " 'A._Jung_GmbH_&_Co._KG.xml',\n", + " 'A._Kolbinger_GmbH_Versicherungs-Makler.xml',\n", + " 'A._Kolckmann,_Weberei_und_Kunststoffbeschichtungen_GmbH.xml',\n", + " 'A._Kolckmann_GmbH_&_Co._KG.xml',\n", + " 'A._Kuhner_GmbH.xml',\n", + " 'A._Lipp_GmbH.xml',\n", + " 'A._Müller_Geschäftsführungs-_GmbH.xml',\n", + " 'A._Müller_GmbH_&_Co._KG.xml',\n", + " 'A._Nassal_GmbH.xml',\n", + " 'A._Oster_e.K..xml',\n", + " 'A._Pfeiffer_Zimmerei_GmbH.xml',\n", + " 'A._Pfingsten_KG.xml',\n", + " 'A._Pullmann_GmbH.xml',\n", + " 'A._Randecker_Wirtschafts-_und_Steuerberatungsgesellschaft_mbH.xml',\n", + " 'A._Reinhard_GmbH.xml',\n", + " 'A._Ritter_GmbH.xml',\n", + " 'A._Sabadinowitsch_Verwaltung_GmbH.xml',\n", + " 'A._Sluka-Verwaltungsgesellschaft_mit_beschränkter_Haftung.xml',\n", + " 'A._Sommer_Finanzdienstleistungsvermittlung_e.K..xml',\n", + " 'A._Sorg_GmbH_&_Co._KG.xml',\n", + " 'A._u._G_Sexton_GmbH.xml',\n", + " 'A._Umminger_LUM-Air,_Elektro-_und_Filtertechnik_GmbH.xml',\n", + " 'A._Wankmüller_GmbH_&_Co._KG.xml',\n", + " 'A._Ziemann_Gesellschaft_mit_beschränkter_Haftung.xml',\n", + " 'A._Zwisler_e.K..xml',\n", + " 'A_&_A_Consulting_GmbH.xml',\n", + " 'A_&_A_Gipserbetrieb_GmbH.xml',\n", + " 'a_&_b_Beteiligungs-GmbH.xml',\n", + " 'A_&_B_Gastronomie-Betriebe_GmbH.xml',\n", + " 'A_&_C_Aqua_&_Care_Limited.xml',\n", + " 'A_&_F_Lori_GmbH.xml',\n", + " 'A_&_L_Engineering_Service_GmbH.xml',\n", + " 'A_&_M_Stanzformzubehör_Olaf_Abendroth_GmbH.xml',\n", + " 'A_&_O_Grundstücksverwaltungs_GmbH_&_Co._KG.xml',\n", + " 'A_&_R_Textilproduktion_GmbH.xml',\n", + " 'A_&_S_Bäder_GmbH_&_Co..xml',\n", + " 'A_&_S_Vermögensverwaltungs_GmbH.xml',\n", + " 'A_&_T_Roth_GmbH.xml',\n", + " 'A_+_A_Aalsmeer_Blumen_-_Bräutigam_E._Kfr.,_Inh._Manuela_Bräutigam.xml',\n", + " 'a_+_b_Wohnbau_GmbH.xml',\n", + " 'A_+_H_Bauträger-_und_Verwaltungsgesellschaft_mit_beschränkter_Haftung.xml',\n", + " 'A_+_M_Verwaltungs-GmbH.xml',\n", + " 'A_+_P_Baumaschinen_GmbH_&_Co._KG.xml',\n", + " 'A_+_R_Baumaschinen_-_Mietpark_+_-Vertriebs-GmbH.xml',\n", + " 'A_+_S_Tierbedarf_GmbH.xml',\n", + " 'A_+_Te_Stabil-Bau_GmbH.xml',\n", + " 'A_+_W._Sahm_Bedachungs-GmbH.xml',\n", + " 'a_-_Vermögensverwaltungs-GmbH_&_Co._KG.xml',\n", + " 'A_-_Z_Kreditvermittlungs-Gesellschaft_mit_beschränkter_Haftung.xml',\n", + " 'A_2000_Industrie-Elektronik_GmbH.xml',\n", + " 'A_bis_Z_Verwaltungs_GmbH.xml',\n", + " 'A_B_A_S_A_GmbH_Organisations_-_Planungsbüro_für_den_Innenausbau.xml',\n", + " 'A_B_U_-_GmbH_Altlasten_Bauökologie_Umweltmanagement.xml',\n", + " 'A_F_Fussbodentechnik_GmbH.xml',\n", + " 'A_L_T_E_C_GmbH.xml',\n", + " 'A_l_u_f_o_r_m_Alucobondverarbeitungs-GmbH.xml',\n", + " 'A_L_Z_Auto_Licht_und_Zündung_Service_Gesellschaft_mit_beschränkter_Haftung.xml',\n", + " 'a_m_friseure_GmbH_Karlsruhe.xml',\n", + " 'a_m_friseure_GmbH_Koblenz.xml',\n", + " 'a_priori_GmbH.xml',\n", + " 'a_s_k_-_Kunststoffe_GmbH.xml',\n", + " 'A_S_TRUCKS_e.K..xml',\n", + " 'A_S_Y_S_Automatic_Systems_Beteiligungs-GmbH.xml',\n", + " 'A_u_c_h_Gesellschaft_mit_beschränkter_Haftung.xml',\n", + " 'export',\n", + " 'registerdocument-2023-06-11-12-41-30 (1).xml',\n", + " 'registerdocument-2023-06-11-12-41-30.xml',\n", + " 'registerdocument-2023-06-11-12-52-33.xml',\n", + " 'registerdocument-2023-06-11-12-52-41.xml']" ] }, - "execution_count": 119, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "import os\n", + "\n", "files = os.listdir(\"./data/Unternehmensregister\")\n", "files" ] }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import xmltodict\n", + "\n", + "\n", + "def transform_xml_to_json(source_dir: str, target_dir: str):\n", + " for file in glob.glob1(source_dir, \"*.xml\"):\n", + " source_path = os.path.join(source_dir, file)\n", + " target_path = os.path.join(target_dir, file.replace(\".xml\", \".json\"))\n", + "\n", + " with open(source_path, \"r\", encoding=\"utf-8\") as source_file:\n", + " data = xmltodict.parse(source_file.read().encode())\n", + " with open(target_path, \"w\", encoding=\"utf-8\") as json_file:\n", + " json_file.write(json.dumps(data))\n", + "\n", + "\n", + "transform_xml_to_json(\"./data/Unternehmensregister/\", \"./data/Unternehmensregister/\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "for file in glob.glob1(\"./data/Unternehmensregister/\", \"*.json\"):\n", + " path = os.path.join(\"./data/Unternehmensregister/\", file)\n", + " with open(path, \"r\", encoding=\"utf-8\") as file_object:\n", + " data = json.loads(file_object.read())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_stakeholder(data: dict) -> list:\n", + " if \"Natuerliche_Person\" in data[\"Beteiligter\"]:\n", + " return {\n", + " \"name\": {\n", + " \"firstname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Voller_Name\"][\n", + " \"Vorname\"\n", + " ],\n", + " \"lastname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Voller_Name\"][\n", + " \"Nachname\"\n", + " ],\n", + " },\n", + " \"date_of_birth\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Geburt\"][\n", + " \"Geburtsdatum\"\n", + " ]\n", + " if \"Geburt\" in data[\"Beteiligter\"][\"Natuerliche_Person\"]\n", + " else None,\n", + " \"location\": {\n", + " \"city\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Anschrift\"][\"Ort\"]\n", + " },\n", + " \"role\": data[\"Rolle\"][\"Rollenbezeichnung\"][\"content\"],\n", + " }\n", + " if \"Organisation\" in data[\"Beteiligter\"]:\n", + " return {\n", + " \"role\": \"Organisation\",\n", + " \"description\": data[\"Beteiligter\"][\"Organisation\"][\"Bezeichnung\"][\n", + " \"Bezeichnung_Aktuell\"\n", + " ],\n", + " \"location\": {\n", + " \"city\": data[\"Beteiligter\"][\"Organisation\"][\"Anschrift\"][\"Ort\"],\n", + " \"street\": data[\"Beteiligter\"][\"Organisation\"][\"Anschrift\"][\"Strasse\"]\n", + " if \"Strasse\" in data[\"Beteiligter\"][\"Organisation\"][\"Anschrift\"]\n", + " else None,\n", + " \"house_number\": data[\"Beteiligter\"][\"Organisation\"][\"Anschrift\"][\n", + " \"Hausnummer\"\n", + " ]\n", + " if \"Hausnummer\" in data[\"Beteiligter\"][\"Organisation\"][\"Anschrift\"]\n", + " else None,\n", + " \"zip_code\": data[\"Beteiligter\"][\"Organisation\"][\"Anschrift\"][\n", + " \"Postleitzahl\"\n", + " ],\n", + " },\n", + " }\n", + "\n", + "\n", + "def map_unternehmensregister_json(data: dict) -> dict:\n", + " result = {\"base_info\": None, \"relationships\": []}\n", + "\n", + " base_info = {\n", + " \"company_name\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n", + " \"Basisdaten_Register\"\n", + " ][\"Rechtstraeger\"][\"Bezeichnung\"][\"Bezeichnung_Aktuell\"],\n", + " \"location\": {\n", + " \"city\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\"Basisdaten_Register\"][\n", + " \"Rechtstraeger\"\n", + " ][\"Anschrift\"][\"Ort\"],\n", + " \"zip_code\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n", + " \"Basisdaten_Register\"\n", + " ][\"Rechtstraeger\"][\"Anschrift\"][\"Postleitzahl\"],\n", + " \"street\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n", + " \"Basisdaten_Register\"\n", + " ][\"Rechtstraeger\"][\"Anschrift\"][\"Strasse\"]\n", + " if \"Strasse\"\n", + " in data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\"Basisdaten_Register\"][\n", + " \"Rechtstraeger\"\n", + " ][\"Anschrift\"]\n", + " else None,\n", + " \"house_number\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n", + " \"Basisdaten_Register\"\n", + " ][\"Rechtstraeger\"][\"Anschrift\"][\"Hausnummer\"]\n", + " if \"Hausnummer\"\n", + " in data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\"Basisdaten_Register\"][\n", + " \"Rechtstraeger\"\n", + " ][\"Anschrift\"]\n", + " else None,\n", + " },\n", + " \"last_update\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\"Auszug\"][\n", + " \"letzte_Eintragung\"\n", + " ],\n", + " }\n", + " result[\"base_info\"] = base_info\n", + " for i in range(\n", + " len(data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"])\n", + " ):\n", + " people = parse_stakeholder(\n", + " data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"][i]\n", + " )\n", + " result[\"relationships\"].append(people)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{}\n", - "{}\n", - "{'name': {'firstname': 'Reinhard', 'lastname': 'Gebing'}, 'date_of_birth': '1964-04-26', 'location': {'city': 'Oelde'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Markus', 'lastname': 'Kreft'}, 'date_of_birth': '1966-04-03', 'location': {'city': 'Wetter'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Kai', 'lastname': 'Luntz'}, 'date_of_birth': '1970-12-04', 'location': {'city': 'Holzminden'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Thomas', 'lastname': 'Mader'}, 'date_of_birth': '1972-05-24', 'location': {'city': 'Rheda-Wiedenbrück'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Peter', 'lastname': 'Lauwers'}, 'date_of_birth': '1970-03-26', 'location': {'city': 'Düsseldorf'}, 'role': 'Geschäftsführer(in)'}\n", - "{'name': {'firstname': 'Erkul', 'lastname': 'Basaran'}, 'date_of_birth': '1977-05-06', 'location': {'city': 'Erkrath'}, 'role': 'Geschäftsführer(in)'}\n", - "{'name': {'firstname': 'Katja', 'lastname': 'Voß'}, 'date_of_birth': '1978-02-24', 'location': {'city': 'Rheda-Wiedenbrück'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Henrik', 'lastname': 'Böttner'}, 'date_of_birth': '1982-11-07', 'location': {'city': 'Bochum'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Ulrich', 'lastname': 'Raßenhövel'}, 'date_of_birth': '1969-04-16', 'location': {'city': 'Oelde'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Andreas', 'lastname': 'Naroska'}, 'date_of_birth': '1967-03-23', 'location': {'city': 'Herdecke'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Mark', 'lastname': 'Kramps'}, 'date_of_birth': '1967-09-04', 'location': {'city': 'Witten'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Ralf', 'lastname': 'Barkmeyer'}, 'date_of_birth': '1974-02-28', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Holger', 'lastname': 'Siegwarth'}, 'date_of_birth': '1967-05-13', 'location': {'city': 'Tönnisvorst'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Oliver', 'lastname': 'Liß'}, 'date_of_birth': '1981-04-13', 'location': {'city': 'Herne'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Liang', 'lastname': 'Cheng'}, 'date_of_birth': '1980-12-29', 'location': {'city': 'Göppingen'}, 'role': 'Geschäftsführer(in)'}\n", - "{'name': {'firstname': 'Astrid', 'lastname': 'Dörner-Rodeheger'}, 'date_of_birth': '1968-12-24', 'location': {'city': 'Beckum'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Jon', 'lastname': 'Lange'}, 'date_of_birth': '1978-04-25', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Matthias', 'lastname': 'Peters'}, 'date_of_birth': '1973-08-28', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Ralf', 'lastname': 'Frombach'}, 'date_of_birth': '1977-01-25', 'location': {'city': 'Werne'}, 'role': 'Prokurist(in)'}\n", - "{'name': {'firstname': 'Sven', 'lastname': 'Hommel'}, 'date_of_birth': '1979-04-22', 'location': {'city': 'Berlin'}, 'role': 'Prokurist(in)'}\n" + "A&A Amini Art GmbH\n", + "A&A Immo GmbH\n", + "A&P AUDITING GmbH Wirtschaftsprüfungsgesellschaft\n", + "A&QUA gemeinnützige Gesellschaft für Arbeit u. Qualifizierung mbH\n", + "a+b Asphalt- und Betonmischwerke GmbH & Co. KG\n", + "a+b Verwaltungsgesellschaft mbH\n", + "A+E Beteiligungs- und Handels-GmbH\n", + "A+W Systemhaus GmbH\n", + "A-S-D Kfz-Teile-Handel GmbH\n", + "A-TEAM Industrielles Roboterschweißen GmbH\n", + "A.C.C. Funk Taxi & Minicar e.K.\n", + "a.c.k. aqua concept GmbH Karlsruhe\n", + "A.C. Weiss GmbH & Co. KG\n", + "A.D.S. OHG\n", + "A.D. Glas- und Gebäudereinigung e.K.\n", + "A.E. Z-Line Taxi - und Shuttle-Service e.K.\n", + "A.F.Z. Automatisierung, Fördern, Zuführen GmbH\n", + "A.G. Zentral Michael Greising e.K.\n", + "A.H. Steuerberatungsgesellschaft mbH\n", + "A.I.V. SERVICES GmbH\n", + "A.I. Kommanditist-Gesellschaft mbH\n", + "A.KIein Immobilien KG\n", + "A.L.G. Christian Schmelzer\n", + "A.L.S. Architektonische Licht-Systeme GmbH\n", + "A.M.G. Motorenbau Hans Werner Aufrecht\n", + "A.M.P. Athos GmbH\n", + "A.N. Gartenbau GmbH\n", + "A.Q.U.A Services KG\n", + "A.R.S. GmbH Süd, Alt und Reststoffverwertung\n", + "A.S.G. Industrielackierungen GmbH\n", + "A.S.S. bikes and parts GmbH\n", + "A.S. Baubedarfvermittlung Gesellschaft mit beschränkter Haftung\n", + "A.T.C. Automotive GmbH\n", + "A. & S. Aigner und Schulz GmbH\n", + "A. + H. Weier GmbH\n", + "A. + K. Hertkorn OHG Möbel - Innenausbau\n", + "A. Abele GmbH\n", + "A. Baur Mineralöl-Abfertigungsspedition GmbH\n", + "A. Blum GmbH\n", + "A. Both GmbH\n", + "A. Both GmbH & Co. KG Werkzeugtechnik CNC Maschinenausrüstung\n", + "A. DINKIC GMBH\n", + "A. Elsbecker GmbH\n", + "A. Erglis GmbH\n", + "A. Frauenrath Landschaftsbau GmbH & Co. KG.\n", + "A. Gradmann GmbH & Co. KG\n", + "A. Hanhart GmbH & Co. KG\n", + "A. Hüglin - Putz und Stuck - Gesellschaft mit beschränkter Haftung\n", + "A. Illmann Zahntechnik GmbH\n", + "A. Junghanns Automatisierungs GmbH\n", + "A. Jung GmbH & Co.KG\n", + "A. Kolbinger GmbH Versicherungs-Makler\n", + "A. Kolckmann, Weberei und Kunststoffbeschichtungen GmbH\n", + "A. Kolckmann GmbH & Co. KG\n", + "A. Kuhner GmbH\n", + "A. Lipp GmbH\n", + "A. Müller Geschäftsführungs- GmbH\n", + "A. Müller GmbH & Co. KG\n", + "A. Nassal GmbH\n", + "A. Oster e.K.\n", + "A. Pfeiffer Zimmerei GmbH\n", + "A. Pfingsten KG\n", + "A. Pullmann GmbH\n", + "A. Randecker Wirtschafts- und Steuerberatungsgesellschaft mbH\n", + "A. Reinhard GmbH\n", + "A. Ritter GmbH\n", + "A. Sabadinowitsch Verwaltung GmbH\n", + "A. Sluka-Verwaltungsgesellschaft mit beschränkter Haftung\n", + "A. Sommer Finanzdienstleistungsvermittlung e.K.\n", + "A. Sorg GmbH & Co. KG\n", + "A. u. G Sexton GmbH\n", + "A. Umminger LUM-Air, Elektro- und Filtertechnik GmbH\n", + "A. Wankmüller GmbH & Co. KG\n", + "A. Ziemann Gesellschaft mit beschränkter Haftung\n", + "A. Zwisler e.K.\n", + "A & A Consulting GmbH\n", + "A & A Gipserbetrieb GmbH\n", + "a & b Beteiligungs-GmbH\n", + "A & B Gastronomie-Betriebe GmbH\n", + "A & C Aqua & Care Limited\n", + "A & F Lori GmbH\n", + "A & L Engineering Service GmbH\n", + "A & M Stanzformzubehör Olaf Abendroth GmbH\n", + "A & O Grundstücksverwaltungs GmbH & Co. KG\n", + "A & R Textilproduktion GmbH\n", + "A & S Bäder GmbH & Co.\n", + "A & S Vermögensverwaltungs GmbH\n", + "A & T Roth GmbH\n", + "A + A Aalsmeer Blumen - Bräutigam E. Kfr., Inh. Manuela Bräutigam\n", + "a + b Wohnbau GmbH\n", + "A + H Bauträger- und Verwaltungsgesellschaft mit beschränkter Haftung\n", + "A + M Verwaltungs-GmbH\n", + "A + P Baumaschinen GmbH & Co. KG\n", + "A + R Baumaschinen - Mietpark + -Vertriebs-GmbH\n", + "A + S Tierbedarf GmbH\n", + "A + Te Stabil-Bau GmbH\n", + "A + W. Sahm Bedachungs-GmbH\n", + "a - Vermögensverwaltungs-GmbH & Co. KG\n", + "A - Z Kreditvermittlungs-Gesellschaft mit beschränkter Haftung\n", + "A 2000 Industrie-Elektronik GmbH\n", + "A bis Z Verwaltungs GmbH\n", + "A B A S A GmbH Organisations - Planungsbüro für den Innenausbau\n", + "A B U - GmbH Altlasten Bauökologie Umweltmanagement\n", + "A F Fussbodentechnik GmbH\n", + "A L T E C GmbH\n", + "A l u f o r m Alucobondverarbeitungs-GmbH\n", + "A L Z Auto Licht und Zündung Service Gesellschaft mit beschränkter Haftung\n", + "a/m friseure GmbH Karlsruhe\n", + "a/m friseure GmbH Koblenz\n", + "a priori GmbH\n", + "a s k - Kunststoffe GmbH\n", + "A/S TRUCKS e.K.\n", + "A S Y S Automatic Systems Beteiligungs-GmbH\n", + "A u c h Gesellschaft mit beschränkter Haftung\n", + "a s k - Kunststoffe GmbH\n", + "A. Maier GmbH & Co. KG\n", + "\"A/D/L/E/R Steuerberatungsgesellschaft mbH\"\n", + "a | m | | medienservice e. k.\n" ] } ], "source": [ "import json\n", - "import xmltodict\n", "\n", - "for file in files:\n", - " with open(\"./data/Unternehmensregister/\" + file, \"r\", encoding=\"utf-8\") as xml_file:\n", - " data = xmltodict.parse(xml_file.read())\n", - " with open(\"./data/temp.json\", \"w\", encoding=\"utf-8\") as json_file:\n", - " json_file.write(json.dumps(data))\n", + "for file in glob.glob1(\"./data/Unternehmensregister/\", \"*.json\"):\n", + " path = os.path.join(\"./data/Unternehmensregister/\", file)\n", + " with open(path, \"r\", encoding=\"utf-8\") as file_object:\n", + " data = json.loads(file_object.read())\n", "\n", - " keys = dict.keys(data[\"XJustiz_Daten\"][\"Grunddaten\"])\n", - " base_info = {\n", - " \"company_name\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n", - " \"Basisdaten_Register\"\n", - " ][\"Rechtstraeger\"][\"Bezeichnung\"][\"Bezeichnung_Aktuell\"],\n", - " \"location\": {\n", - " \"city\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n", - " \"Basisdaten_Register\"\n", - " ][\"Rechtstraeger\"][\"Anschrift\"][\"Ort\"],\n", - " \"zip_code\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n", - " \"Basisdaten_Register\"\n", - " ][\"Rechtstraeger\"][\"Anschrift\"][\"Postleitzahl\"],\n", - " \"street\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n", - " \"Basisdaten_Register\"\n", - " ][\"Rechtstraeger\"][\"Anschrift\"][\"Strasse\"],\n", - " \"house_number\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n", - " \"Basisdaten_Register\"\n", - " ][\"Rechtstraeger\"][\"Anschrift\"][\"Hausnummer\"],\n", - " },\n", - " }\n", + " result = map_unternehmensregister_json(data)\n", + " print(result[\"base_info\"][\"company_name\"])\n", "\n", - " def parse_stakeholder(data: dict) -> list:\n", - " if \"Natuerliche_Person\" in data[\"Beteiligter\"]:\n", - " return {\n", - " \"name\": {\n", - " \"firstname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n", - " \"Voller_Name\"\n", - " ][\"Vorname\"],\n", - " \"lastname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n", - " \"Voller_Name\"\n", - " ][\"Nachname\"],\n", - " },\n", - " \"date_of_birth\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n", - " \"Geburt\"\n", - " ][\"Geburtsdatum\"],\n", - " \"location\": {\n", - " \"city\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Anschrift\"][\n", - " \"Ort\"\n", - " ]\n", - " },\n", - " \"role\": data[\"Rolle\"][\"Rollenbezeichnung\"][\"content\"],\n", - " }\n", - " return {}\n", - "\n", - " for i in range(\n", - " len(data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"])\n", - " ):\n", - " people = parse_stakeholder(\n", - " data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"][i]\n", - " )\n", - " print(people)\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "from pdf2image import convert_from_path\n", - "\n", - "pdfs = r\"./data/test.pdf\"\n", - "pages = convert_from_path(pdfs, 350)\n", - "\n", - "\n", - "for i, page in enumerate(pages):\n", - " image_name = f\"./data/Page_{i+1}.jpg\"\n", - " page.save(image_name, \"JPEG\")" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Handelsregister B des Abteilung B Nummer der Firma:\n", - "Amtsgerichts Hamm Wiedergabe des aktuellen HRB 5363\n", - "Registerinhalts\n", - "Abruf vom 07.06.2023 19:37\n", - "1. Anzahl der bisherigen Eintragungen:\n", - "51\n", - "2. a) Firma:\n", - "GEA Farm Technologies GmbH\n", - "b) Sitz, Niederlassung, inländische Geschäftsanschrift, empfangsberechtigte Person, Zweigniederlassungen:\n", - "Bönen\n", - "Geschäftsanschrift: Siemensstraße 25-27, 59199 Bönen\n", - "c) Gegenstand des Unternehmens:\n", - "Entwicklung, Herstellung und der Vertrieb von Landtechnik, insbesondere von Komponenten und Anlagen\n", - "(a) zur Gewinnung, Kühlung, Behandlung und Lagerung von Milch;\n", - "(b) für das Milchvieh-Herdenmanagement;\n", - "(c) zur Tierhygiene und Sicherung der Milchqualität und\n", - "(d) zur Aufstallung von Tieren;\n", - "sowie die Herstellung und der Vertrieb von Anlagen und Fahrzeugen zur Aufbereitung und zum Transport von Gülle.\n", - "3. Grund- oder Stammkapital:\n", - "5.115.000,00 EUR\n", - "4. a) Allgemeine Vertretungsregelung:\n", - "Ist nur ein Geschäftsführer bestellt, so vertritt er die Gesellschaft allein. Sind mehrere Geschäftsführer bestellt, so wird die\n", - "Gesellschaft durch zwei Geschäftsführer oder durch einen Geschäftsführer gemeinsam mit einem Prokuristen vertreten.\n", - "b) Vorstand, Leitungsorgan, geschäftsführende Direktoren, persönlich haftender Gesellschafter, Geschäftsführer,\n", - "Vertretungsberechtigte und besondere Vertretungsbefugnis:\n", - "Geschäftsführer: Basaran, Erkul, Erkrath, *06.05.1977\n", - "Geschäftsführer: Cheng, Liang, Göppingen, *29.12.1980\n", - "Geschäftsführer: Lauwers, Peter, Düsseldorf, *26.03.1970\n", - "5. Prokura:\n", - "Gesamtprokura gemeinsam mit einem Geschäftsführer oder einem anderen Prokuristen:\n", - "Barkmeyer, Ralf, Dortmund, *28.02.1974\n", - "Böttner, Henrik, Bochum, *07.11.1982\n", - "Dörner-Rodeheger, Astrid, Beckum, *24.12.1968\n", - "Frombach, Ralf, Werne, *25.01.1977\n", - "Gebing, Reinhard, Oelde, *26.04.1964\n", - "Hommel, Sven, Berlin, *22.04.1979\n", - "Kramps, Mark, Witten, *04.09.1967\n", - "Kreft, Markus, Wetter, *03.04.1966\n", - "\n" - ] - } - ], - "source": [ - "import cv2\n", - "import pytesseract\n", - "\n", - "image_path = \"./data/Page_1.jpg\"\n", - "image = cv2.imread(image_path)\n", - "\n", - "text = str(pytesseract.image_to_string(image, config=\"--psm 6\", lang=\"deu\"))\n", - "print(text)\n", - "with open(\"./data/Page_1.txt\", \"w\", encoding=\"utf-8\") as output_file:\n", - " output_file.write(text)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Geschäftsführer: Basaran, Erkul', 'Geschäftsführer: Cheng, Liang', 'Geschäftsführer: Lauwers, Peter']\n" - ] - }, - { - "data": { - "text/plain": [ - "['Erkul Basaran', 'Liang Cheng', 'Peter Lauwers']" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import re\n", - "\n", - "\n", - "def get_managing_directors(text: str) -> list:\n", - " managing_directors_regex = r\"Geschäftsführer: [a-zA-ZäöüÄÖÜ]*, [a-zA-ZäöüÄÖÜ]*\"\n", - " hits = re.findall(managing_directors_regex, text)\n", - " print(hits)\n", - " return [\n", - " \" \".join(hit.replace(\"Geschäftsführer: \", \"\").replace(\",\", \"\").split(\" \")[::-1])\n", - " for hit in hits\n", - " ]\n", - "\n", - "\n", - "get_managing_directors(text)" + " name = (\n", + " result[\"base_info\"][\"company_name\"]\n", + " .replace(\" \", \"_\")\n", + " .replace(\"/\", \"_\")\n", + " .replace('\"', \"\")\n", + " .replace(\"|\", \"_\")\n", + " )\n", + " with open(\n", + " f\"./data/Unternehmensregister/export/{name}.json\", \"w+\", encoding=\"utf-8\"\n", + " ) as export_file:\n", + " json.dump(result, export_file, ensure_ascii=False)" ] } ], diff --git a/Jupyter/API-tests/Unternehmensregister/requirements.txt b/Jupyter/API-tests/Unternehmensregister/requirements.txt index 80ede99..16b85b7 100644 --- a/Jupyter/API-tests/Unternehmensregister/requirements.txt +++ b/Jupyter/API-tests/Unternehmensregister/requirements.txt @@ -4,4 +4,5 @@ opencv-python pdf2image bs4 selenium -xmltodict \ No newline at end of file +xmltodict +tqdm \ No newline at end of file