{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Unternehmensregister" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Fetch Auszug" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import glob" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def wait_for_download_condition(\n", " path: str, num_files: int, pattern: str = \"*.xml\"\n", ") -> bool:\n", " return len(glob.glob1(path, pattern)) > num_files\n", "\n", "\n", "def get_num_files(path: str, pattern: str = \"*.xml\") -> int:\n", " return len(glob.glob1(path, pattern))\n", "\n", "\n", "def rename_latest_file(path: str, filename: str, pattern: str = \"*.xml\"):\n", " list_of_files = [os.path.join(path, file) for file in glob.glob1(path, pattern)]\n", " latest_download = max(list_of_files, key=os.path.getctime)\n", " os.rename(latest_download, os.path.join(path, filename))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import os\n", "from pathlib import Path\n", "\n", "from selenium import webdriver\n", "\n", "\n", "def configure_webdriver(\n", " headless: bool = True, download_dir: list[str] = [\"data\", \"Unternehmensregister\"]\n", ") -> webdriver:\n", " options = webdriver.ChromeOptions()\n", "\n", " download_path = os.path.join(str(Path.cwd()), *download_dir)\n", " print(download_path)\n", "\n", " preferences = {\n", " \"profile.default_content_settings.popups\": 0,\n", " \"safebrowsing.enabled\": True,\n", " \"download\": {\n", " \"directory_upgrade\": True,\n", " \"prompt_for_download\": False,\n", " \"extensions_to_open\": \"\",\n", " \"default_directory\": download_path,\n", " },\n", " }\n", " if headless:\n", " options.add_argument(\"--headless=new\")\n", " options.add_experimental_option(\"prefs\", preferences)\n", "\n", " return webdriver.Chrome(options=options)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from tqdm import tqdm\n", "from selenium.webdriver.common.by import By\n", "from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.support import expected_conditions as EC\n", "\n", "\n", "def scrape(driver, query: str, dir: str):\n", " driver.get(\"https://www.unternehmensregister.de/ureg/\")\n", " # Accept Cookies\n", " driver.find_elements(\n", " By.XPATH, '//button[text()=\"Nur technisch notwendige Cookies akzeptieren\"]'\n", " )[0].click()\n", " # Enter search query\n", " driver.find_elements(By.ID, \"globalSearchForm:extendedResearchCompanyName\")[\n", " 0\n", " ].send_keys(query)\n", " # Trigger search\n", " driver.find_elements(By.ID, \"globalSearchForm:btnExecuteSearchOld\")[0].click()\n", " # Wait for results\n", " wait = WebDriverWait(driver, 5)\n", " wait.until(\n", " lambda driver: driver.current_url != \"https://www.unternehmensregister.de/ureg/\"\n", " )\n", "\n", " num_pages = int(\n", " driver.find_element(By.XPATH, '//*[@class=\"page_count\"]').text.split(\" \")[0]\n", " )\n", "\n", " processed_companies = []\n", "\n", " for page_index in tqdm(range(num_pages)):\n", " # Find all \"Registerinformationen\"\n", " companies_tab = driver.find_elements(\n", " By.LINK_TEXT, \"Registerinformationen des Registergerichts\"\n", " )\n", " company_names = [\n", " elem.text\n", " for elem in driver.find_elements(\n", " By.XPATH, '//div[@class=\"company_result\"]/span/b'\n", " )\n", " ]\n", " for index, company_link in enumerate(companies_tab):\n", " company_name = company_names[index]\n", " if company_name in processed_companies:\n", " continue\n", " # Go to intermediary page\n", " company_link.click()\n", " # Trigger next redirect\n", " driver.find_element(By.LINK_TEXT, \"Registerinformationen anzeigen\").click()\n", " # Trigger SI download\n", " driver.find_element(By.LINK_TEXT, \"SI\").click()\n", " # Show shopping cart - TODO evaluate restructuring behaviour by filling cart first and then bulk downloading\n", " wait.until(\n", " EC.visibility_of_element_located(\n", " (By.LINK_TEXT, \"Dokumentenkorb ansehen\")\n", " )\n", " )\n", " driver.find_element(By.LINK_TEXT, \"Dokumentenkorb ansehen\").click()\n", " # Get document\n", " xpath = \"//input[@type='submit']\"\n", " elems = driver.find_elements(By.TAG_NAME, \"input\")\n", " elems[-2].click()\n", "\n", " wait.until(\n", " EC.visibility_of_element_located((By.ID, \"paymentFormOverview:btnNext\"))\n", " )\n", " driver.find_element(By.ID, \"paymentFormOverview:btnNext\").click()\n", "\n", " wait.until(\n", " EC.visibility_of_element_located((By.LINK_TEXT, \"Zum Dokumentenkorb\"))\n", " )\n", " driver.find_element(By.LINK_TEXT, \"Zum Dokumentenkorb\").click()\n", "\n", " num_files = get_num_files(dir)\n", " driver.find_element(By.CLASS_NAME, \"download-wrapper\").click()\n", "\n", " try:\n", " wait.until(lambda x: wait_for_download_condition(dir, num_files))\n", " rename_latest_file(\n", " dir,\n", " f\"{company_name.replace(' ', '_').replace('/','_')}.xml\",\n", " )\n", " processed_companies.append(company_name)\n", " except:\n", " print(f\"Could not process {company_name}\")\n", " for i in range(6):\n", " driver.back()\n", " driver.find_element(By.XPATH, '//*[@class=\"fas fa-angle-right\"]').click()\n", " driver.close()\n", " print(processed_companies)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Unnamed: 0 | \n", "Name | \n", "IPO Status | \n", "Exchange | \n", "Listing ID | \n", "Founding Year | \n", "ISIC | \n", "Headquarters | \n", "Street | \n", "City | \n", "Postal Code | \n", "Phone | \n", "Website | \n", "End of fiscal year | \n", "2019 | \n", "2020 | \n", "2021 | \n", "CAGR* | \n", "Employees* | \n", "|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "Volkswagen | \n", "PUBLIC | \n", "XETR | \n", "VOW3 | \n", "1984.0 | \n", "2910 - Motor Vehicles | \n", "Germany | \n", "Berliner Ring 2 | \n", "Wolfsburg | \n", "38440 | \n", "+49-536190 | \n", "www.volkswagenag.com | \n", "NaN | \n", "12/31 | \n", "282839.18 | \n", "254299.32 | \n", "296086.81 | \n", "0.020497 | \n", "672800.0 | \n", "
1 | \n", "2 | \n", "Mercedes-Benz Group | \n", "PUBLIC | \n", "XMUN | \n", "DAI | \n", "1987.0 | \n", "2910 - Motor Vehicles | \n", "Germany | \n", "Mercedes Street 120 | \n", "Stuttgart | \n", "70372 | \n", "+49-7111795256 | \n", "www.group.mercedes-benz.com | \n", "NaN | \n", "12/31 | \n", "193400.10 | \n", "138942.51 | \n", "158449.05 | \n", "-0.071117 | \n", "172425.0 | \n", "
2 | \n", "3 | \n", "Deutsche Telekom | \n", "PUBLIC | \n", "XFRA | \n", "DTE | \n", "1990.0 | \n", "61 - Telecommunications | \n", "Germany | \n", "Friedrich-Ebert-Allee 140 | \n", "Bonn | \n", "53113 | \n", "+49-2281814949 | \n", "www.telekom.com | \n", "NaN | \n", "12/31 | \n", "90125.38 | \n", "115235.87 | \n", "128753.98 | \n", "0.110871 | \n", "216528.0 | \n", "
3 | \n", "4 | \n", "Bmw | \n", "PUBLIC | \n", "XDUS | \n", "BMW3 | \n", "1984.0 | \n", "2910 - Motor Vehicles | \n", "Germany | \n", "Petuelring 130 | \n", "Munich | \n", "80788 | \n", "+49-8938224272 | \n", "www.bmwgroup.com | \n", "NaN | \n", "12/31 | \n", "117613.06 | \n", "113581.49 | \n", "133364.51 | \n", "0.048403 | \n", "118909.0 | \n", "
4 | \n", "5 | \n", "Deutsche Post | \n", "PUBLIC | \n", "XMUN | \n", "DPW | \n", "1989.0 | \n", "5320 - Courier Services | \n", "Germany | \n", "Charles-De-Gaulle-Straße 20 | \n", "Bonn | \n", "53113 | \n", "+49-22818263636 | \n", "www.deutschepost.de | \n", "NaN | \n", "12/31 | \n", "70914.68 | \n", "76119.57 | \n", "96739.44 | \n", "0.091205 | \n", "592263.0 | \n", "