{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Unternehmensregister" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Fetch Auszug" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import glob" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def wait_for_download_condition(\n", " path: str, num_files: int, pattern: str = \"*.xml\"\n", ") -> bool:\n", " return len(glob.glob1(path, pattern)) > num_files\n", "\n", "\n", "def get_num_files(path: str, pattern: str = \"*.xml\") -> int:\n", " return len(glob.glob1(path, pattern))\n", "\n", "\n", "def rename_latest_file(path: str, filename: str, pattern: str = \"*.xml\"):\n", " list_of_files = [os.path.join(path, file) for file in glob.glob1(path, pattern)]\n", " latest_download = max(list_of_files, key=os.path.getctime)\n", " os.rename(latest_download, os.path.join(path, filename))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/4192 [00:00111\u001b[0m \u001b[39mfor\u001b[39;00m i \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\u001b[39m6\u001b[39m):\n\u001b[0;32m 112\u001b[0m driver\u001b[39m.\u001b[39mback()\n\u001b[1;32m--> 113\u001b[0m driver\u001b[39m.\u001b[39;49mfind_element(By\u001b[39m.\u001b[39;49mXPATH, \u001b[39m'\u001b[39;49m\u001b[39m//*[@class=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mfas fa-angle-right\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m]\u001b[39;49m\u001b[39m'\u001b[39;49m)\u001b[39m.\u001b[39;49mclick()\n\u001b[0;32m 114\u001b[0m driver\u001b[39m.\u001b[39mclose()\n\u001b[0;32m 115\u001b[0m 
\u001b[39mprint\u001b[39m(processed_companies)\n", "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:94\u001b[0m, in \u001b[0;36mWebElement.click\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 92\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclick\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 93\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Clicks the element.\"\"\"\u001b[39;00m\n\u001b[1;32m---> 94\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_execute(Command\u001b[39m.\u001b[39;49mCLICK_ELEMENT)\n", "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webelement.py:395\u001b[0m, in \u001b[0;36mWebElement._execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 393\u001b[0m params \u001b[39m=\u001b[39m {}\n\u001b[0;32m 394\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39mid\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_id\n\u001b[1;32m--> 395\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_parent\u001b[39m.\u001b[39;49mexecute(command, params)\n", "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:344\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m params:\n\u001b[0;32m 342\u001b[0m params[\u001b[39m\"\u001b[39m\u001b[39msessionId\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msession_id\n\u001b[1;32m--> 344\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcommand_executor\u001b[39m.\u001b[39;49mexecute(driver_command, params)\n\u001b[0;32m 
345\u001b[0m \u001b[39mif\u001b[39;00m response:\n\u001b[0;32m 346\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror_handler\u001b[39m.\u001b[39mcheck_response(response)\n", "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:290\u001b[0m, in \u001b[0;36mRemoteConnection.execute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m 288\u001b[0m data \u001b[39m=\u001b[39m utils\u001b[39m.\u001b[39mdump_json(params)\n\u001b[0;32m 289\u001b[0m url \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_url\u001b[39m}\u001b[39;00m\u001b[39m{\u001b[39;00mpath\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m--> 290\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_request(command_info[\u001b[39m0\u001b[39;49m], url, body\u001b[39m=\u001b[39;49mdata)\n", "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py:311\u001b[0m, in \u001b[0;36mRemoteConnection._request\u001b[1;34m(self, method, url, body)\u001b[0m\n\u001b[0;32m 308\u001b[0m body \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 310\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mkeep_alive:\n\u001b[1;32m--> 311\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_conn\u001b[39m.\u001b[39;49mrequest(method, url, body\u001b[39m=\u001b[39;49mbody, headers\u001b[39m=\u001b[39;49mheaders)\n\u001b[0;32m 312\u001b[0m statuscode \u001b[39m=\u001b[39m response\u001b[39m.\u001b[39mstatus\n\u001b[0;32m 313\u001b[0m \u001b[39melse\u001b[39;00m:\n", "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:78\u001b[0m, in \u001b[0;36mRequestMethods.request\u001b[1;34m(self, method, url, fields, headers, **urlopen_kw)\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[39mreturn\u001b[39;00m 
\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_url(\n\u001b[0;32m 75\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 76\u001b[0m )\n\u001b[0;32m 77\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m---> 78\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrequest_encode_body(\n\u001b[0;32m 79\u001b[0m method, url, fields\u001b[39m=\u001b[39mfields, headers\u001b[39m=\u001b[39mheaders, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39murlopen_kw\n\u001b[0;32m 80\u001b[0m )\n", "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\request.py:170\u001b[0m, in \u001b[0;36mRequestMethods.request_encode_body\u001b[1;34m(self, method, url, fields, headers, encode_multipart, multipart_boundary, **urlopen_kw)\u001b[0m\n\u001b[0;32m 167\u001b[0m extra_kw[\u001b[39m\"\u001b[39m\u001b[39mheaders\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mupdate(headers)\n\u001b[0;32m 168\u001b[0m extra_kw\u001b[39m.\u001b[39mupdate(urlopen_kw)\n\u001b[1;32m--> 170\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mextra_kw)\n", "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\poolmanager.py:376\u001b[0m, in \u001b[0;36mPoolManager.urlopen\u001b[1;34m(self, method, url, redirect, **kw)\u001b[0m\n\u001b[0;32m 374\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, url, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 375\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 376\u001b[0m response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39murlopen(method, u\u001b[39m.\u001b[39mrequest_uri, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw)\n\u001b[0;32m 378\u001b[0m redirect_location \u001b[39m=\u001b[39m redirect \u001b[39mand\u001b[39;00m 
response\u001b[39m.\u001b[39mget_redirect_location()\n\u001b[0;32m 379\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m redirect_location:\n", "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:703\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m 700\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_prepare_proxy(conn)\n\u001b[0;32m 702\u001b[0m \u001b[39m# Make the request on the httplib connection object.\u001b[39;00m\n\u001b[1;32m--> 703\u001b[0m httplib_response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_request(\n\u001b[0;32m 704\u001b[0m conn,\n\u001b[0;32m 705\u001b[0m method,\n\u001b[0;32m 706\u001b[0m url,\n\u001b[0;32m 707\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout_obj,\n\u001b[0;32m 708\u001b[0m body\u001b[39m=\u001b[39;49mbody,\n\u001b[0;32m 709\u001b[0m headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[0;32m 710\u001b[0m chunked\u001b[39m=\u001b[39;49mchunked,\n\u001b[0;32m 711\u001b[0m )\n\u001b[0;32m 713\u001b[0m \u001b[39m# If we're going to release the connection in ``finally:``, then\u001b[39;00m\n\u001b[0;32m 714\u001b[0m \u001b[39m# the response doesn't need to know about the connection. 
Otherwise\u001b[39;00m\n\u001b[0;32m 715\u001b[0m \u001b[39m# it will also try to release it and we'll have a double-release\u001b[39;00m\n\u001b[0;32m 716\u001b[0m \u001b[39m# mess.\u001b[39;00m\n\u001b[0;32m 717\u001b[0m response_conn \u001b[39m=\u001b[39m conn \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m release_conn \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n", "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:449\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[1;32m--> 449\u001b[0m six\u001b[39m.\u001b[39;49mraise_from(e, \u001b[39mNone\u001b[39;49;00m)\n\u001b[0;32m 450\u001b[0m \u001b[39mexcept\u001b[39;00m (SocketTimeout, BaseSSLError, SocketError) \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 451\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_raise_timeout(err\u001b[39m=\u001b[39me, url\u001b[39m=\u001b[39murl, timeout_value\u001b[39m=\u001b[39mread_timeout)\n", "File \u001b[1;32m:3\u001b[0m, in \u001b[0;36mraise_from\u001b[1;34m(value, from_value)\u001b[0m\n", "File \u001b[1;32mc:\\Python310\\lib\\site-packages\\urllib3\\connectionpool.py:444\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 441\u001b[0m \u001b[39mexcept\u001b[39;00m 
\u001b[39mTypeError\u001b[39;00m:\n\u001b[0;32m 442\u001b[0m \u001b[39m# Python 3\u001b[39;00m\n\u001b[0;32m 443\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 444\u001b[0m httplib_response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39;49mgetresponse()\n\u001b[0;32m 445\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 446\u001b[0m \u001b[39m# Remove the TypeError from the exception chain in\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[39m# Python 3 (including for exceptions like SystemExit).\u001b[39;00m\n\u001b[0;32m 448\u001b[0m \u001b[39m# Otherwise it looks like a bug in the code.\u001b[39;00m\n\u001b[0;32m 449\u001b[0m six\u001b[39m.\u001b[39mraise_from(e, \u001b[39mNone\u001b[39;00m)\n", "File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:1374\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1372\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 1373\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m-> 1374\u001b[0m response\u001b[39m.\u001b[39;49mbegin()\n\u001b[0;32m 1375\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m:\n\u001b[0;32m 1376\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n", "File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:318\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 316\u001b[0m \u001b[39m# read until we get a non-100 response\u001b[39;00m\n\u001b[0;32m 317\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m--> 318\u001b[0m version, status, reason \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_status()\n\u001b[0;32m 319\u001b[0m \u001b[39mif\u001b[39;00m status \u001b[39m!=\u001b[39m CONTINUE:\n\u001b[0;32m 320\u001b[0m \u001b[39mbreak\u001b[39;00m\n", "File \u001b[1;32mc:\\Python310\\lib\\http\\client.py:279\u001b[0m, in 
# Scrape the structured register extract ("SI" document) of every company that
# matches `search_query` on unternehmensregister.de and store each one as
# <company name>.xml inside `download_path`.
search_query = "A*"
start_url = "https://www.unternehmensregister.de/ureg/"
# Number of browser-history steps that lead from the download page back to the
# search result list (one per page visited for a single company).
back_steps = 6

options = webdriver.ChromeOptions()

download_path = str(Path(Path.cwd() / "data" / "Unternehmensregister"))
print(download_path)

preferences = {
    "profile.default_content_settings.popups": 0,
    "safebrowsing.enabled": True,
    "download": {
        "directory_upgrade": True,
        "prompt_for_download": False,
        "extensions_to_open": "",
        "default_directory": download_path,
    },
}
options.add_argument("--headless=new")
options.add_experimental_option("prefs", preferences)

processed_companies = []

driver = webdriver.Chrome(options=options)
try:
    driver.get(start_url)
    # Accept Cookies
    driver.find_element(
        By.XPATH, '//button[text()="Nur technisch notwendige Cookies akzeptieren"]'
    ).click()
    # Enter search query
    driver.find_element(
        By.ID, "globalSearchForm:extendedResearchCompanyName"
    ).send_keys(search_query)
    # Trigger search
    driver.find_element(By.ID, "globalSearchForm:btnExecuteSearchOld").click()
    # Wait for results
    wait = WebDriverWait(driver, 5)
    wait.until(lambda driver: driver.current_url != start_url)

    num_pages = int(
        driver.find_element(By.XPATH, '//*[@class="page_count"]').text.split(" ")[0]
    )

    for page_index in tqdm(range(num_pages)):
        # Find all "Registerinformationen"
        companies_tab = driver.find_elements(
            By.LINK_TEXT, "Registerinformationen des Registergerichts"
        )
        company_names = [
            elem.text
            for elem in driver.find_elements(
                By.XPATH, '//div[@class="company_result"]/span/b'
            )
        ]
        for index, company_link in enumerate(companies_tab):
            company_name = company_names[index]
            if company_name in processed_companies:
                continue
            # Go to intermediary page
            company_link.click()
            # Trigger next redirect
            driver.find_element(
                By.LINK_TEXT, "Registerinformationen anzeigen"
            ).click()
            # Trigger SI download
            driver.find_element(By.LINK_TEXT, "SI").click()
            # Show shopping cart - TODO evaluate restructuring behaviour by
            # filling cart first and then bulk downloading
            wait.until(
                EC.visibility_of_element_located(
                    (By.LINK_TEXT, "Dokumentenkorb ansehen")
                )
            )
            driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
            # Get document
            # NOTE(review): the second-to-last <input> is presumably the button
            # that requests the document - confirm against the page markup.
            elems = driver.find_elements(By.TAG_NAME, "input")
            elems[-2].click()

            wait.until(
                EC.visibility_of_element_located(
                    (By.ID, "paymentFormOverview:btnNext")
                )
            )
            driver.find_element(By.ID, "paymentFormOverview:btnNext").click()

            wait.until(
                EC.visibility_of_element_located((By.LINK_TEXT, "Zum Dokumentenkorb"))
            )
            driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()

            # Remember how many files exist so the new download can be detected.
            num_files = get_num_files(download_path)
            driver.find_element(By.CLASS_NAME, "download-wrapper").click()

            try:
                wait.until(
                    lambda _: wait_for_download_condition(download_path, num_files)
                )
                rename_latest_file(
                    download_path,
                    f"{company_name.replace(' ', '_').replace('/','_')}.xml",
                )
                processed_companies.append(company_name)
            except Exception as error:
                # Best effort: a failed download must not abort the whole crawl.
                # `Exception` (instead of a bare except) lets KeyboardInterrupt
                # and SystemExit stop the run as the user expects.
                print(f"Could not process {company_name}: {error!r}")
            # Return to the result list before handling the next company.
            for _ in range(back_steps):
                driver.back()
        # Advance to the next result page; there is none after the last page.
        if page_index < num_pages - 1:
            driver.find_element(By.XPATH, '//*[@class="fas fa-angle-right"]').click()
finally:
    # Always end the browser session, even on errors or manual interruption,
    # so no headless Chrome/chromedriver process is left behind.
    driver.quit()
print(processed_companies)
'A&QUA_gemeinnützige_Gesellschaft_für_Arbeit_u._Qualifizierung_mbH.json',\n", " 'A&QUA_gemeinnützige_Gesellschaft_für_Arbeit_u._Qualifizierung_mbH.xml',\n", " 'a+b_Asphalt-_und_Betonmischwerke_GmbH_&_Co._KG.json',\n", " 'a+b_Asphalt-_und_Betonmischwerke_GmbH_&_Co._KG.xml',\n", " 'a+b_Verwaltungsgesellschaft_mbH.json',\n", " 'a+b_Verwaltungsgesellschaft_mbH.xml',\n", " 'A+E_Beteiligungs-_und_Handels-GmbH.json',\n", " 'A+E_Beteiligungs-_und_Handels-GmbH.xml',\n", " 'A+W_Systemhaus_GmbH.json',\n", " 'A+W_Systemhaus_GmbH.xml',\n", " 'A-S-D_Kfz-Teile-Handel_GmbH.json',\n", " 'A-S-D_Kfz-Teile-Handel_GmbH.xml',\n", " 'A-TEAM_Industrielles_Roboterschweißen_GmbH.json',\n", " 'A-TEAM_Industrielles_Roboterschweißen_GmbH.xml',\n", " 'A.C.C._Funk_Taxi_&_Minicar_e.K..json',\n", " 'A.C.C._Funk_Taxi_&_Minicar_e.K..xml',\n", " 'a.c.k._aqua_concept_GmbH_Karlsruhe.json',\n", " 'a.c.k._aqua_concept_GmbH_Karlsruhe.xml',\n", " 'A.C._Weiss_GmbH_&_Co._KG.json',\n", " 'A.C._Weiss_GmbH_&_Co._KG.xml',\n", " 'A.D.S._OHG.json',\n", " 'A.D.S._OHG.xml',\n", " 'A.D._Glas-_und_Gebäudereinigung_e.K..json',\n", " 'A.D._Glas-_und_Gebäudereinigung_e.K..xml',\n", " 'A.E._Z-Line_Taxi_-_und_Shuttle-Service_e.K..json',\n", " 'A.E._Z-Line_Taxi_-_und_Shuttle-Service_e.K..xml',\n", " 'A.F.Z._Automatisierung,_Fördern,_Zuführen_GmbH.json',\n", " 'A.F.Z._Automatisierung,_Fördern,_Zuführen_GmbH.xml',\n", " 'A.G._Zentral_Michael_Greising_e.K..json',\n", " 'A.G._Zentral_Michael_Greising_e.K..xml',\n", " 'A.H._Steuerberatungsgesellschaft_mbH.json',\n", " 'A.H._Steuerberatungsgesellschaft_mbH.xml',\n", " 'A.I.V._SERVICES_GmbH.json',\n", " 'A.I.V._SERVICES_GmbH.xml',\n", " 'A.I._Kommanditist-Gesellschaft_mbH.json',\n", " 'A.I._Kommanditist-Gesellschaft_mbH.xml',\n", " 'A.KIein_Immobilien_KG.json',\n", " 'A.KIein_Immobilien_KG.xml',\n", " 'A.L.G._Christian_Schmelzer.json',\n", " 'A.L.G._Christian_Schmelzer.xml',\n", " 'A.L.S._Architektonische_Licht-Systeme_GmbH.json',\n", " 
'A.L.S._Architektonische_Licht-Systeme_GmbH.xml',\n", " 'A.M.G._Motorenbau_Hans_Werner_Aufrecht.json',\n", " 'A.M.G._Motorenbau_Hans_Werner_Aufrecht.xml',\n", " 'A.M.P._Athos_GmbH.json',\n", " 'A.M.P._Athos_GmbH.xml',\n", " 'A.N._Gartenbau_GmbH.json',\n", " 'A.N._Gartenbau_GmbH.xml',\n", " 'A.Q.U.A_Services_KG.json',\n", " 'A.Q.U.A_Services_KG.xml',\n", " 'A.R.S._GmbH_Süd,_Alt_und_Reststoffverwertung.json',\n", " 'A.R.S._GmbH_Süd,_Alt_und_Reststoffverwertung.xml',\n", " 'A.S.G._Industrielackierungen_GmbH.json',\n", " 'A.S.G._Industrielackierungen_GmbH.xml',\n", " 'A.S.S._bikes_and_parts_GmbH.json',\n", " 'A.S.S._bikes_and_parts_GmbH.xml',\n", " 'A.S._Baubedarfvermittlung_Gesellschaft_mit_beschränkter_Haftung.json',\n", " 'A.S._Baubedarfvermittlung_Gesellschaft_mit_beschränkter_Haftung.xml',\n", " 'A.T.C._Automotive_GmbH.json',\n", " 'A.T.C._Automotive_GmbH.xml',\n", " 'A._&_S._Aigner_und_Schulz_GmbH.json',\n", " 'A._&_S._Aigner_und_Schulz_GmbH.xml',\n", " 'A._+_H._Weier_GmbH.json',\n", " 'A._+_H._Weier_GmbH.xml',\n", " 'A._+_K._Hertkorn_OHG_Möbel_-_Innenausbau.json',\n", " 'A._+_K._Hertkorn_OHG_Möbel_-_Innenausbau.xml',\n", " 'A._Abele_GmbH.json',\n", " 'A._Abele_GmbH.xml',\n", " 'A._Baur_Mineralöl-Abfertigungsspedition_GmbH.json',\n", " 'A._Baur_Mineralöl-Abfertigungsspedition_GmbH.xml',\n", " 'A._Blum_GmbH.json',\n", " 'A._Blum_GmbH.xml',\n", " 'A._Both_GmbH.json',\n", " 'A._Both_GmbH.xml',\n", " 'A._Both_GmbH_&_Co._KG_Werkzeugtechnik_CNC_Maschinenausrüstung.json',\n", " 'A._Both_GmbH_&_Co._KG_Werkzeugtechnik_CNC_Maschinenausrüstung.xml',\n", " 'A._DINKIC_GMBH.json',\n", " 'A._DINKIC_GMBH.xml',\n", " 'A._Elsbecker_GmbH.json',\n", " 'A._Elsbecker_GmbH.xml',\n", " 'A._Erglis_GmbH.json',\n", " 'A._Erglis_GmbH.xml',\n", " 'A._Frauenrath_Landschaftsbau_GmbH_&_Co._KG..json',\n", " 'A._Frauenrath_Landschaftsbau_GmbH_&_Co._KG..xml',\n", " 'A._Gradmann_GmbH_&_Co._KG.json',\n", " 'A._Gradmann_GmbH_&_Co._KG.xml',\n", " 'A._Hanhart_GmbH_&_Co._KG.json',\n", " 
'A._Hanhart_GmbH_&_Co._KG.xml',\n", " 'A._Hüglin_-_Putz_und_Stuck_-_Gesellschaft_mit_beschränkter_Haftung.json',\n", " 'A._Hüglin_-_Putz_und_Stuck_-_Gesellschaft_mit_beschränkter_Haftung.xml',\n", " 'A._Illmann_Zahntechnik_GmbH.json',\n", " 'A._Illmann_Zahntechnik_GmbH.xml',\n", " 'A._Junghanns_Automatisierungs_GmbH.json',\n", " 'A._Junghanns_Automatisierungs_GmbH.xml',\n", " 'A._Jung_GmbH_&_Co._KG.json',\n", " 'A._Jung_GmbH_&_Co._KG.xml',\n", " 'A._Kolbinger_GmbH_Versicherungs-Makler.json',\n", " 'A._Kolbinger_GmbH_Versicherungs-Makler.xml',\n", " 'A._Kolckmann,_Weberei_und_Kunststoffbeschichtungen_GmbH.json',\n", " 'A._Kolckmann,_Weberei_und_Kunststoffbeschichtungen_GmbH.xml',\n", " 'A._Kolckmann_GmbH_&_Co._KG.json',\n", " 'A._Kolckmann_GmbH_&_Co._KG.xml',\n", " 'A._Kuhner_GmbH.json',\n", " 'A._Kuhner_GmbH.xml',\n", " 'A._Lipp_GmbH.json',\n", " 'A._Lipp_GmbH.xml',\n", " 'A._Müller_Geschäftsführungs-_GmbH.json',\n", " 'A._Müller_Geschäftsführungs-_GmbH.xml',\n", " 'A._Müller_GmbH_&_Co._KG.json',\n", " 'A._Müller_GmbH_&_Co._KG.xml',\n", " 'A._Nassal_GmbH.json',\n", " 'A._Nassal_GmbH.xml',\n", " 'A._Oster_e.K..json',\n", " 'A._Oster_e.K..xml',\n", " 'A._Pfeiffer_Zimmerei_GmbH.json',\n", " 'A._Pfeiffer_Zimmerei_GmbH.xml',\n", " 'A._Pfingsten_KG.json',\n", " 'A._Pfingsten_KG.xml',\n", " 'A._Pullmann_GmbH.json',\n", " 'A._Pullmann_GmbH.xml',\n", " 'A._Randecker_Wirtschafts-_und_Steuerberatungsgesellschaft_mbH.json',\n", " 'A._Randecker_Wirtschafts-_und_Steuerberatungsgesellschaft_mbH.xml',\n", " 'A._Reinhard_GmbH.json',\n", " 'A._Reinhard_GmbH.xml',\n", " 'A._Ritter_GmbH.json',\n", " 'A._Ritter_GmbH.xml',\n", " 'A._Sabadinowitsch_Verwaltung_GmbH.json',\n", " 'A._Sabadinowitsch_Verwaltung_GmbH.xml',\n", " 'A._Sluka-Verwaltungsgesellschaft_mit_beschränkter_Haftung.json',\n", " 'A._Sluka-Verwaltungsgesellschaft_mit_beschränkter_Haftung.xml',\n", " 'A._Sommer_Finanzdienstleistungsvermittlung_e.K..json',\n", " 'A._Sommer_Finanzdienstleistungsvermittlung_e.K..xml',\n", " 
'A._Sorg_GmbH_&_Co._KG.json',\n", " 'A._Sorg_GmbH_&_Co._KG.xml',\n", " 'A._u._G_Sexton_GmbH.json',\n", " 'A._u._G_Sexton_GmbH.xml',\n", " 'A._Umminger_LUM-Air,_Elektro-_und_Filtertechnik_GmbH.json',\n", " 'A._Umminger_LUM-Air,_Elektro-_und_Filtertechnik_GmbH.xml',\n", " 'A._Wankmüller_GmbH_&_Co._KG.json',\n", " 'A._Wankmüller_GmbH_&_Co._KG.xml',\n", " 'A._Ziemann_Gesellschaft_mit_beschränkter_Haftung.json',\n", " 'A._Ziemann_Gesellschaft_mit_beschränkter_Haftung.xml',\n", " 'A._Zwisler_e.K..json',\n", " 'A._Zwisler_e.K..xml',\n", " 'A_&_A_Consulting_GmbH.json',\n", " 'A_&_A_Consulting_GmbH.xml',\n", " 'A_&_A_Gipserbetrieb_GmbH.json',\n", " 'A_&_A_Gipserbetrieb_GmbH.xml',\n", " 'a_&_b_Beteiligungs-GmbH.json',\n", " 'a_&_b_Beteiligungs-GmbH.xml',\n", " 'A_&_B_Gastronomie-Betriebe_GmbH.json',\n", " 'A_&_B_Gastronomie-Betriebe_GmbH.xml',\n", " 'A_&_C_Aqua_&_Care_Limited.json',\n", " 'A_&_C_Aqua_&_Care_Limited.xml',\n", " 'A_&_F_Lori_GmbH.json',\n", " 'A_&_F_Lori_GmbH.xml',\n", " 'A_&_L_Engineering_Service_GmbH.json',\n", " 'A_&_L_Engineering_Service_GmbH.xml',\n", " 'A_&_M_Stanzformzubehör_Olaf_Abendroth_GmbH.json',\n", " 'A_&_M_Stanzformzubehör_Olaf_Abendroth_GmbH.xml',\n", " 'A_&_O_Grundstücksverwaltungs_GmbH_&_Co._KG.json',\n", " 'A_&_O_Grundstücksverwaltungs_GmbH_&_Co._KG.xml',\n", " 'A_&_R_Textilproduktion_GmbH.json',\n", " 'A_&_R_Textilproduktion_GmbH.xml',\n", " 'A_&_S_Bäder_GmbH_&_Co..json',\n", " 'A_&_S_Bäder_GmbH_&_Co..xml',\n", " 'A_&_S_Vermögensverwaltungs_GmbH.json',\n", " 'A_&_S_Vermögensverwaltungs_GmbH.xml',\n", " 'A_&_T_Roth_GmbH.json',\n", " 'A_&_T_Roth_GmbH.xml',\n", " 'A_+_A_Aalsmeer_Blumen_-_Bräutigam_E._Kfr.,_Inh._Manuela_Bräutigam.json',\n", " 'A_+_A_Aalsmeer_Blumen_-_Bräutigam_E._Kfr.,_Inh._Manuela_Bräutigam.xml',\n", " 'a_+_b_Wohnbau_GmbH.json',\n", " 'a_+_b_Wohnbau_GmbH.xml',\n", " 'A_+_H_Bauträger-_und_Verwaltungsgesellschaft_mit_beschränkter_Haftung.json',\n", " 'A_+_H_Bauträger-_und_Verwaltungsgesellschaft_mit_beschränkter_Haftung.xml',\n", 
" 'A_+_M_Verwaltungs-GmbH.json',\n", " 'A_+_M_Verwaltungs-GmbH.xml',\n", " 'A_+_P_Baumaschinen_GmbH_&_Co._KG.json',\n", " 'A_+_P_Baumaschinen_GmbH_&_Co._KG.xml',\n", " 'A_+_R_Baumaschinen_-_Mietpark_+_-Vertriebs-GmbH.json',\n", " 'A_+_R_Baumaschinen_-_Mietpark_+_-Vertriebs-GmbH.xml',\n", " 'A_+_S_Tierbedarf_GmbH.json',\n", " 'A_+_S_Tierbedarf_GmbH.xml',\n", " 'A_+_Te_Stabil-Bau_GmbH.json',\n", " 'A_+_Te_Stabil-Bau_GmbH.xml',\n", " 'A_+_W._Sahm_Bedachungs-GmbH.json',\n", " 'A_+_W._Sahm_Bedachungs-GmbH.xml',\n", " 'a_-_Vermögensverwaltungs-GmbH_&_Co._KG.json',\n", " 'a_-_Vermögensverwaltungs-GmbH_&_Co._KG.xml',\n", " 'A_-_Z_Kreditvermittlungs-Gesellschaft_mit_beschränkter_Haftung.json',\n", " 'A_-_Z_Kreditvermittlungs-Gesellschaft_mit_beschränkter_Haftung.xml',\n", " 'A_2000_Industrie-Elektronik_GmbH.json',\n", " 'A_2000_Industrie-Elektronik_GmbH.xml',\n", " 'A_bis_Z_Verwaltungs_GmbH.json',\n", " 'A_bis_Z_Verwaltungs_GmbH.xml',\n", " 'A_B_A_S_A_GmbH_Organisations_-_Planungsbüro_für_den_Innenausbau.json',\n", " 'A_B_A_S_A_GmbH_Organisations_-_Planungsbüro_für_den_Innenausbau.xml',\n", " 'A_B_U_-_GmbH_Altlasten_Bauökologie_Umweltmanagement.json',\n", " 'A_B_U_-_GmbH_Altlasten_Bauökologie_Umweltmanagement.xml',\n", " 'A_F_Fussbodentechnik_GmbH.json',\n", " 'A_F_Fussbodentechnik_GmbH.xml',\n", " 'A_L_T_E_C_GmbH.json',\n", " 'A_L_T_E_C_GmbH.xml',\n", " 'A_l_u_f_o_r_m_Alucobondverarbeitungs-GmbH.json',\n", " 'A_l_u_f_o_r_m_Alucobondverarbeitungs-GmbH.xml',\n", " 'A_L_Z_Auto_Licht_und_Zündung_Service_Gesellschaft_mit_beschränkter_Haftung.json',\n", " 'A_L_Z_Auto_Licht_und_Zündung_Service_Gesellschaft_mit_beschränkter_Haftung.xml',\n", " 'a_m_friseure_GmbH_Karlsruhe.json',\n", " 'a_m_friseure_GmbH_Karlsruhe.xml',\n", " 'a_m_friseure_GmbH_Koblenz.json',\n", " 'a_m_friseure_GmbH_Koblenz.xml',\n", " 'a_priori_GmbH.json',\n", " 'a_priori_GmbH.xml',\n", " 'a_s_k_-_Kunststoffe_GmbH.json',\n", " 'a_s_k_-_Kunststoffe_GmbH.xml',\n", " 'A_S_TRUCKS_e.K..json',\n", " 
# List everything that is currently stored in the download directory.
files = os.listdir("./data/Unternehmensregister")
files


def transform_xml_to_json(source_dir: str, target_dir: str):
    """Convert every ``*.xml`` file in ``source_dir`` into a JSON file.

    Each document is parsed with ``xmltodict`` and written to ``target_dir``
    under the same base name with a ``.json`` extension.

    Args:
        source_dir: Directory that is searched (non-recursively) for XML files.
        target_dir: Directory that receives the JSON files.
    """
    # glob.glob(..., root_dir=...) replaces the undocumented, deprecated glob1.
    for file in glob.glob("*.xml", root_dir=source_dir):
        source_path = os.path.join(source_dir, file)
        # Swap only the extension: str.replace would rewrite every ".xml"
        # occurrence inside the name and leave a differently-cased extension
        # untouched.
        target_path = os.path.join(target_dir, os.path.splitext(file)[0] + ".json")

        # Hand the raw bytes to the parser so that the encoding declared in the
        # XML prolog is honoured instead of assuming UTF-8 up front.
        with open(source_path, "rb") as source_file:
            data = xmltodict.parse(source_file.read())
        with open(target_path, "w", encoding="utf-8") as json_file:
            json.dump(data, json_file)


transform_xml_to_json("./data/Unternehmensregister/", "./data/Unternehmensregister/")

# Load every generated JSON document; after the loop `data` holds the content
# of the last file that was read.
for file in glob.glob("*.json", root_dir="./data/Unternehmensregister/"):
    path = os.path.join("./data/Unternehmensregister/", file)
    with open(path, "r", encoding="utf-8") as file_object:
        data = json.load(file_object)
def parse_stakeholder(data: dict) -> "dict | None":
    """Map one ``Beteiligung`` entry to a flat stakeholder dictionary.

    Args:
        data: A single participation entry. ``data["Beteiligter"]`` must exist;
            for natural persons ``data["Rolle"]`` is read as well.

    Returns:
        For a natural person: ``name`` (first/last name), ``date_of_birth``
        (None when no ``Geburt`` entry exists), ``location`` (city) and ``role``.
        For an organisation: ``role`` (always ``"Organisation"``),
        ``description`` and ``location`` (city, street, house number, zip code;
        street and house number are None when absent).
        None when the entry is neither a natural person nor an organisation.

    Raises:
        KeyError: If a mandatory key is missing from ``data``.
    """
    participant = data["Beteiligter"]

    if "Natuerliche_Person" in participant:
        person = participant["Natuerliche_Person"]
        full_name = person["Voller_Name"]
        return {
            "name": {
                "firstname": full_name["Vorname"],
                "lastname": full_name["Nachname"],
            },
            "date_of_birth": person["Geburt"]["Geburtsdatum"]
            if "Geburt" in person
            else None,
            "location": {"city": person["Anschrift"]["Ort"]},
            "role": data["Rolle"]["Rollenbezeichnung"]["content"],
        }

    if "Organisation" in participant:
        organisation = participant["Organisation"]
        address = organisation["Anschrift"]
        return {
            "role": "Organisation",
            "description": organisation["Bezeichnung"]["Bezeichnung_Aktuell"],
            "location": {
                "city": address["Ort"],
                "street": address.get("Strasse"),
                "house_number": address.get("Hausnummer"),
                "zip_code": address["Postleitzahl"],
            },
        }

    # Unknown participant type: nothing to map.
    return None


def map_unternehmensregister_json(data: dict) -> "Company":
    """Build a ``Company`` from a parsed Unternehmensregister document.

    Args:
        data: The document as a nested dictionary with ``XJustiz_Daten`` as
            root key (the structure written by the XML-to-JSON conversion).

    Returns:
        ``Company(**result)`` where ``result`` carries the keys
        ``relationships``, ``id``, ``name``, ``location`` and ``last_update``.

    Raises:
        KeyError: If a mandatory key is missing from ``data``.
        IndexError: If the document lists fewer than two participations.
    """
    xjustiz = data["XJustiz_Daten"]
    proceedings = xjustiz["Grunddaten"]["Verfahrensdaten"]
    participations = proceedings["Beteiligung"]
    register_data = xjustiz["Fachdaten_Register"]
    legal_entity = register_data["Basisdaten_Register"]["Rechtstraeger"]
    address = legal_entity["Anschrift"]

    result = {"relationships": []}

    result["id"] = {
        "hr_number": proceedings["Instanzdaten"]["Aktenzeichen"],
        # The second participation is read as the register court.
        "district_court": participations[1]["Beteiligter"]["Organisation"][
            "Bezeichnung"
        ]["Bezeichnung_Aktuell"],
    }
    result["name"] = legal_entity["Bezeichnung"]["Bezeichnung_Aktuell"]

    result["location"] = {
        "city": address["Ort"],
        "zip_code": address["Postleitzahl"],
        "street": address.get("Strasse"),
        "house_number": address.get("Hausnummer"),
    }
    result["last_update"] = register_data["Auszug"]["letzte_Eintragung"]

    # Only entries from index 2 onwards become relationships; index 1 is the
    # court used above, index 0 is presumably the company itself - TODO confirm.
    for participation in participations[2:]:
        result["relationships"].append(parse_stakeholder(participation))
    return Company(**result)
GmbH\n", "A&P AUDITING GmbH Wirtschaftsprüfungsgesellschaft\n", "A&QUA gemeinnützige Gesellschaft für Arbeit u. Qualifizierung mbH\n", "a+b Asphalt- und Betonmischwerke GmbH & Co. KG\n", "a+b Verwaltungsgesellschaft mbH\n", "A+E Beteiligungs- und Handels-GmbH\n", "A+W Systemhaus GmbH\n", "A-S-D Kfz-Teile-Handel GmbH\n", "A-TEAM Industrielles Roboterschweißen GmbH\n", "A.C.C. Funk Taxi & Minicar e.K.\n", "a.c.k. aqua concept GmbH Karlsruhe\n", "A.C. Weiss GmbH & Co. KG\n", "A.D.S. OHG\n", "A.D. Glas- und Gebäudereinigung e.K.\n", "A.E. Z-Line Taxi - und Shuttle-Service e.K.\n", "A.F.Z. Automatisierung, Fördern, Zuführen GmbH\n", "A.G. Zentral Michael Greising e.K.\n", "A.H. Steuerberatungsgesellschaft mbH\n", "A.I.V. SERVICES GmbH\n", "A.I. Kommanditist-Gesellschaft mbH\n", "A.KIein Immobilien KG\n", "A.L.G. Christian Schmelzer\n", "A.L.S. Architektonische Licht-Systeme GmbH\n", "A.M.G. Motorenbau Hans Werner Aufrecht\n", "A.M.P. Athos GmbH\n", "A.N. Gartenbau GmbH\n", "A.Q.U.A Services KG\n", "A.R.S. GmbH Süd, Alt und Reststoffverwertung\n", "A.S.G. Industrielackierungen GmbH\n", "A.S.S. bikes and parts GmbH\n", "A.S. Baubedarfvermittlung Gesellschaft mit beschränkter Haftung\n", "A.T.C. Automotive GmbH\n", "A. & S. Aigner und Schulz GmbH\n", "A. + H. Weier GmbH\n", "A. + K. Hertkorn OHG Möbel - Innenausbau\n", "A. Abele GmbH\n", "A. Baur Mineralöl-Abfertigungsspedition GmbH\n", "A. Blum GmbH\n", "A. Both GmbH\n", "A. Both GmbH & Co. KG Werkzeugtechnik CNC Maschinenausrüstung\n", "A. DINKIC GMBH\n", "A. Elsbecker GmbH\n", "A. Erglis GmbH\n", "A. Frauenrath Landschaftsbau GmbH & Co. KG.\n", "A. Gradmann GmbH & Co. KG\n", "A. Hanhart GmbH & Co. KG\n", "A. Hüglin - Putz und Stuck - Gesellschaft mit beschränkter Haftung\n", "A. Illmann Zahntechnik GmbH\n", "A. Junghanns Automatisierungs GmbH\n", "A. Jung GmbH & Co.KG\n", "A. Kolbinger GmbH Versicherungs-Makler\n", "A. Kolckmann, Weberei und Kunststoffbeschichtungen GmbH\n", "A. Kolckmann GmbH & Co. KG\n", "A. 
Kuhner GmbH\n", "A. Lipp GmbH\n", "A. Müller Geschäftsführungs- GmbH\n", "A. Müller GmbH & Co. KG\n", "A. Nassal GmbH\n", "A. Oster e.K.\n", "A. Pfeiffer Zimmerei GmbH\n", "A. Pfingsten KG\n", "A. Pullmann GmbH\n", "A. Randecker Wirtschafts- und Steuerberatungsgesellschaft mbH\n", "A. Reinhard GmbH\n", "A. Ritter GmbH\n", "A. Sabadinowitsch Verwaltung GmbH\n", "A. Sluka-Verwaltungsgesellschaft mit beschränkter Haftung\n", "A. Sommer Finanzdienstleistungsvermittlung e.K.\n", "A. Sorg GmbH & Co. KG\n", "A. u. G Sexton GmbH\n", "A. Umminger LUM-Air, Elektro- und Filtertechnik GmbH\n", "A. Wankmüller GmbH & Co. KG\n", "A. Ziemann Gesellschaft mit beschränkter Haftung\n", "A. Zwisler e.K.\n", "A & A Consulting GmbH\n", "A & A Gipserbetrieb GmbH\n", "a & b Beteiligungs-GmbH\n", "A & B Gastronomie-Betriebe GmbH\n", "A & C Aqua & Care Limited\n", "A & F Lori GmbH\n", "A & L Engineering Service GmbH\n", "A & M Stanzformzubehör Olaf Abendroth GmbH\n", "A & O Grundstücksverwaltungs GmbH & Co. KG\n", "A & R Textilproduktion GmbH\n", "A & S Bäder GmbH & Co.\n", "A & S Vermögensverwaltungs GmbH\n", "A & T Roth GmbH\n", "A + A Aalsmeer Blumen - Bräutigam E. Kfr., Inh. Manuela Bräutigam\n", "a + b Wohnbau GmbH\n", "A + H Bauträger- und Verwaltungsgesellschaft mit beschränkter Haftung\n", "A + M Verwaltungs-GmbH\n", "A + P Baumaschinen GmbH & Co. KG\n", "A + R Baumaschinen - Mietpark + -Vertriebs-GmbH\n", "A + S Tierbedarf GmbH\n", "A + Te Stabil-Bau GmbH\n", "A + W. Sahm Bedachungs-GmbH\n", "a - Vermögensverwaltungs-GmbH & Co. 
# %% Export every mapped company as its own JSON file
import json
import dataclasses

SOURCE_DIR = "./data/Unternehmensregister/"
EXPORT_DIR = os.path.join(SOURCE_DIR, "export")

# Characters that must not end up in a file name. Spaces, "/" and "|" become
# "_" and double quotes are dropped, as before. The remaining characters that
# Windows rejects in file names are now mapped to "_" as well instead of
# making open() fail.
_FILENAME_TRANSLATION = str.maketrans(
    {
        " ": "_",
        "/": "_",
        "|": "_",
        '"': None,
        "\\": "_",
        ":": "_",
        "*": "_",
        "?": "_",
        "<": "_",
        ">": "_",
    }
)


def _export_file_stem(company_name: str) -> str:
    """Return ``company_name`` made safe for use as a file name (no extension)."""
    return company_name.translate(_FILENAME_TRANSLATION)


# The export directory is created on demand so a fresh checkout can run this.
os.makedirs(EXPORT_DIR, exist_ok=True)

for path in sorted(glob.glob(os.path.join(glob.escape(SOURCE_DIR), "*.json"))):
    with open(path, "r", encoding="utf-8") as file_object:
        data = json.load(file_object)

    company: Company = map_unternehmensregister_json(data)
    print(company.name)

    export_path = os.path.join(EXPORT_DIR, f"{_export_file_stem(company.name)}.json")
    # Companies that share a name write to the same path; the later one wins.
    with open(export_path, "w", encoding="utf-8") as export_file:
        # ensure_ascii=False keeps umlauts readable in the exported files.
        json.dump(dataclasses.asdict(company), export_file, ensure_ascii=False)