From d69368318fd12443fc45bb722416483c4852ea9c Mon Sep 17 00:00:00 2001 From: TrisNol Date: Fri, 9 Jun 2023 13:01:46 +0200 Subject: [PATCH] Download Unternehmensregister export via Selenium --- .../API-tests/Unternehmensregister/.gitignore | 1 + .../Unternehmensregister/notebook.ipynb | 337 ++++++++++++++++++ .../Unternehmensregister/requirements.txt | 6 + 3 files changed, 344 insertions(+) create mode 100644 Jupyter/API-tests/Unternehmensregister/.gitignore create mode 100644 Jupyter/API-tests/Unternehmensregister/notebook.ipynb create mode 100644 Jupyter/API-tests/Unternehmensregister/requirements.txt diff --git a/Jupyter/API-tests/Unternehmensregister/.gitignore b/Jupyter/API-tests/Unternehmensregister/.gitignore new file mode 100644 index 0000000..60baa9c --- /dev/null +++ b/Jupyter/API-tests/Unternehmensregister/.gitignore @@ -0,0 +1 @@ +data/* diff --git a/Jupyter/API-tests/Unternehmensregister/notebook.ipynb b/Jupyter/API-tests/Unternehmensregister/notebook.ipynb new file mode 100644 index 0000000..e8b75a7 --- /dev/null +++ b/Jupyter/API-tests/Unternehmensregister/notebook.ipynb @@ -0,0 +1,337 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Unternehmensregister" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch Auszug" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n" + ] + } + ], + "source": [ + "import os\n", + "from pathlib import Path\n", + "\n", + "from selenium import webdriver\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "\n", + "search_query = \"A*\"\n", + "\n", + "options = webdriver.ChromeOptions()\n", + "\n", + "download_path = str(Path(Path.cwd() / \"data\" / \"Unternehmensregister\"))\n", + "print(download_path)\n", + "\n", + "preferences = {\n", + " \"profile.default_content_settings.popups\": 0,\n", + " \"safebrowsing.enabled\": True,\n", + " \"download\": {\n", + " \"directory_upgrade\": True,\n", + " \"prompt_for_download\": False,\n", + " \"extensions_to_open\": \"\",\n", + " \"default_directory\": download_path,\n", + " },\n", + "}\n", + "options.add_experimental_option(\"prefs\", preferences)\n", + "\n", + "driver = webdriver.Chrome(options=options)\n", + "\n", + "driver.get(\"https://www.unternehmensregister.de/ureg/\")\n", + "# Accept Cookies\n", + "driver.find_elements(\n", + " By.XPATH, '//button[text()=\"Nur technisch notwendige Cookies akzeptieren\"]'\n", + ")[0].click()\n", + "# Enter search query\n", + "driver.find_elements(By.ID, \"globalSearchForm:extendedResearchCompanyName\")[\n", + " 0\n", + "].send_keys(search_query)\n", + "# Trigger search\n", + "driver.find_elements(By.ID, \"globalSearchForm:btnExecuteSearchOld\")[0].click()\n", + "# Wait for results\n", + "wait = WebDriverWait(driver, 5)\n", + "wait.until(\n", + " lambda driver: driver.current_url != \"https://www.unternehmensregister.de/ureg/\"\n", + ")\n", + "## TODO Iterate over tabs\n", + "# Find all \"Registerinformationen\"\n", + "companies_tab = driver.find_elements(\n", + " By.LINK_TEXT, \"Registerinformationen des Registergerichts\"\n", + ")\n", + "for company_link in companies_tab:\n", + " # Go to intermediary page\n", + " company_link.click()\n", + " # Trigger next redirect\n", + " driver.find_element(By.LINK_TEXT, \"Registerinformationen anzeigen\").click()\n", + " # Trigger SI download\n", + " driver.find_element(By.LINK_TEXT, \"SI\").click()\n", + " # Show shopping cart - TODO evaluate restructuring behaviour by filling cart first and then bulk downloading\n", + " wait.until(\n", + " EC.visibility_of_element_located((By.LINK_TEXT, \"Dokumentenkorb ansehen\"))\n", + " )\n", + " driver.find_element(By.LINK_TEXT, \"Dokumentenkorb ansehen\").click()\n", + " # Get document\n", + " xpath = \"//input[@type='submit']\"\n", + " elems = driver.find_elements(By.TAG_NAME, \"input\")\n", + " elems[-2].click()\n", + "\n", + " driver.find_element(By.ID, \"paymentFormOverview:btnNext\").click()\n", + " driver.find_element(By.LINK_TEXT, \"Zum Dokumentenkorb\").click()\n", + " driver.find_element(By.CLASS_NAME, \"download-wrapper\").click()\n", + "\n", + " for i in range(6):\n", + " driver.back()\n", + "\n", + "driver.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import requests\n", + "\n", + "session = requests.Session()\n", + "session.cookies[\"cc\"] = \"1686301974-69f11760d466bcea-10\"\n", + "session.headers.update(\n", + " {\n", + " \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9\",\n", + " \"Accept-Encoding\": \"gzip, deflate, br\",\n", + " \"Accept-Language\": \"de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7,et;q=0.6,pl;q=0.5\",\n", + " \"Cache-Control\": \"max-age=0\",\n", + " \"Connection\": \"keep-alive\",\n", + " \"DNT\": \"1\",\n", + " \"Host\": \"www.unternehmensregister.de\",\n", + " \"Pragma\": \"no-cache\",\n", + " \"Referer\": \"https://www.unternehmensregister.de/ureg/\",\n", + " \"sec-ch-ua-mobile\": \"?0\",\n", + " \"Sec-Fetch-Dest\": \"document\",\n", + " \"Sec-Fetch-Mode\": \"navigate\",\n", + " \"Sec-Fetch-Site\": \"same-origin\",\n", + " \"Sec-Fetch-User\": \"?1\",\n", + " \"Upgrade-Insecure-Requests\": \"1\",\n", + " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36\",\n", + " \"Content-Type\": \"application/x-www-form-urlencoded\",\n", + " }\n", + ")\n", + "\n", + "\n", + "# Get session cookie\n", + "response = session.get(\"https://www.unternehmensregister.de\")\n", + "jsessionid = re.findall(r\";jsessionid=[A-Z0-9]*.web[0-9]{2}.[0-9]\", response.text)[\n", + " 0\n", + "].replace(\";jsessionid=\", \"\")\n", + "print(jsessionid)\n", + "with open(\"temp.html\", \"w\", encoding=\"utf-8\") as file:\n", + " file.write(response.text)\n", + "print(session.cookies)\n", + "# Go to search page\n", + "response = session.get(\"https://www.unternehmensregister.de/ureg/\")\n", + "# Start search\n", + "response = session.post(\n", + " f\"https://www.unternehmensregister.de/ureg/index.html;jsessionid={jsessionid}\",\n", + " data={\n", + " \"globalSearchForm\": \"globalSearchForm\",\n", + " \"globalSearchForm:extendedResearchCompanyName\": \"A*\",\n", + " },\n", + ")\n", + "print(response.status_code)\n", + "# Get results\n", + "response = session.get(\n", + " f\"https://www.unternehmensregister.de/ureg/result.html;jsessionid={jsessionid}\"\n", + ")\n", + "with open(\"temp.html\", \"w\", encoding=\"utf-8\") as file:\n", + " file.write(response.text)\n", + "\n", + "print(session.cookies)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Unternehmensregister:\n", + " def __init__(self):\n", + " self.session = requests.Session()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analyze Auszug" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "from pdf2image import convert_from_path\n", + "\n", + "pdfs = r\"./data/test.pdf\"\n", + "pages = convert_from_path(pdfs, 350)\n", + "\n", + "\n", + "for i, page in enumerate(pages):\n", + " image_name = f\"./data/Page_{i+1}.jpg\"\n", + " page.save(image_name, \"JPEG\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Handelsregister B des Abteilung B Nummer der Firma:\n", + "Amtsgerichts Hamm Wiedergabe des aktuellen HRB 5363\n", + "Registerinhalts\n", + "Abruf vom 07.06.2023 19:37\n", + "1. Anzahl der bisherigen Eintragungen:\n", + "51\n", + "2. a) Firma:\n", + "GEA Farm Technologies GmbH\n", + "b) Sitz, Niederlassung, inländische Geschäftsanschrift, empfangsberechtigte Person, Zweigniederlassungen:\n", + "Bönen\n", + "Geschäftsanschrift: Siemensstraße 25-27, 59199 Bönen\n", + "c) Gegenstand des Unternehmens:\n", + "Entwicklung, Herstellung und der Vertrieb von Landtechnik, insbesondere von Komponenten und Anlagen\n", + "(a) zur Gewinnung, Kühlung, Behandlung und Lagerung von Milch;\n", + "(b) für das Milchvieh-Herdenmanagement;\n", + "(c) zur Tierhygiene und Sicherung der Milchqualität und\n", + "(d) zur Aufstallung von Tieren;\n", + "sowie die Herstellung und der Vertrieb von Anlagen und Fahrzeugen zur Aufbereitung und zum Transport von Gülle.\n", + "3. Grund- oder Stammkapital:\n", + "5.115.000,00 EUR\n", + "4. a) Allgemeine Vertretungsregelung:\n", + "Ist nur ein Geschäftsführer bestellt, so vertritt er die Gesellschaft allein. Sind mehrere Geschäftsführer bestellt, so wird die\n", + "Gesellschaft durch zwei Geschäftsführer oder durch einen Geschäftsführer gemeinsam mit einem Prokuristen vertreten.\n", + "b) Vorstand, Leitungsorgan, geschäftsführende Direktoren, persönlich haftender Gesellschafter, Geschäftsführer,\n", + "Vertretungsberechtigte und besondere Vertretungsbefugnis:\n", + "Geschäftsführer: Basaran, Erkul, Erkrath, *06.05.1977\n", + "Geschäftsführer: Cheng, Liang, Göppingen, *29.12.1980\n", + "Geschäftsführer: Lauwers, Peter, Düsseldorf, *26.03.1970\n", + "5. Prokura:\n", + "Gesamtprokura gemeinsam mit einem Geschäftsführer oder einem anderen Prokuristen:\n", + "Barkmeyer, Ralf, Dortmund, *28.02.1974\n", + "Böttner, Henrik, Bochum, *07.11.1982\n", + "Dörner-Rodeheger, Astrid, Beckum, *24.12.1968\n", + "Frombach, Ralf, Werne, *25.01.1977\n", + "Gebing, Reinhard, Oelde, *26.04.1964\n", + "Hommel, Sven, Berlin, *22.04.1979\n", + "Kramps, Mark, Witten, *04.09.1967\n", + "Kreft, Markus, Wetter, *03.04.1966\n", + "\n" + ] + } + ], + "source": [ + "import cv2\n", + "import pytesseract\n", + "\n", + "image_path = \"./data/Page_1.jpg\"\n", + "image = cv2.imread(image_path)\n", + "\n", + "text = str(pytesseract.image_to_string(image, config=\"--psm 6\", lang=\"deu\"))\n", + "print(text)\n", + "with open(\"./data/Page_1.txt\", \"w\", encoding=\"utf-8\") as output_file:\n", + " output_file.write(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Geschäftsführer: Basaran, Erkul', 'Geschäftsführer: Cheng, Liang', 'Geschäftsführer: Lauwers, Peter']\n" + ] + }, + { + "data": { + "text/plain": [ + "['Erkul Basaran', 'Liang Cheng', 'Peter Lauwers']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "\n", + "\n", + "def get_managing_directors(text: str) -> list:\n", + " managing_directors_regex = r\"Geschäftsführer: [a-zA-ZäöüÄÖÜ]*, [a-zA-ZäöüÄÖÜ]*\"\n", + " hits = re.findall(managing_directors_regex, text)\n", + " print(hits)\n", + " return [\n", + " \" \".join(hit.replace(\"Geschäftsführer: \", \"\").replace(\",\", \"\").split(\" \")[::-1])\n", + " for hit in hits\n", + " ]\n", + "\n", + "\n", + "get_managing_directors(text)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.7" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Jupyter/API-tests/Unternehmensregister/requirements.txt b/Jupyter/API-tests/Unternehmensregister/requirements.txt new file mode 100644 index 0000000..aacd7f6 --- /dev/null +++ b/Jupyter/API-tests/Unternehmensregister/requirements.txt @@ -0,0 +1,6 @@ +ocrmypdf +pytesseract +opencv-python +pdf2image +bs4 +selenium \ No newline at end of file