mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-22 11:32:53 +02:00
Download Unternehmensregister export via Selenium
This commit is contained in:
parent
a9101aef2f
commit
d69368318f
1
Jupyter/API-tests/Unternehmensregister/.gitignore
vendored
Normal file
1
Jupyter/API-tests/Unternehmensregister/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
data/*
|
337
Jupyter/API-tests/Unternehmensregister/notebook.ipynb
Normal file
337
Jupyter/API-tests/Unternehmensregister/notebook.ipynb
Normal file
@ -0,0 +1,337 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Unternehmensregister"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Fetch Auszug"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 74,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.common.by import By\n",
|
||||
"from selenium.webdriver.support.ui import WebDriverWait\n",
|
||||
"from selenium.webdriver.support import expected_conditions as EC\n",
|
||||
"\n",
|
||||
"search_query = \"A*\"\n",
|
||||
"\n",
|
||||
"options = webdriver.ChromeOptions()\n",
|
||||
"\n",
|
||||
"download_path = str(Path(Path.cwd() / \"data\" / \"Unternehmensregister\"))\n",
|
||||
"print(download_path)\n",
|
||||
"\n",
|
||||
"preferences = {\n",
|
||||
" \"profile.default_content_settings.popups\": 0,\n",
|
||||
" \"safebrowsing.enabled\": True,\n",
|
||||
" \"download\": {\n",
|
||||
" \"directory_upgrade\": True,\n",
|
||||
" \"prompt_for_download\": False,\n",
|
||||
" \"extensions_to_open\": \"\",\n",
|
||||
" \"default_directory\": download_path,\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"options.add_experimental_option(\"prefs\", preferences)\n",
|
||||
"\n",
|
||||
"driver = webdriver.Chrome(options=options)\n",
|
||||
"\n",
|
||||
"driver.get(\"https://www.unternehmensregister.de/ureg/\")\n",
|
||||
"# Accept Cookies\n",
|
||||
"driver.find_elements(\n",
|
||||
" By.XPATH, '//button[text()=\"Nur technisch notwendige Cookies akzeptieren\"]'\n",
|
||||
")[0].click()\n",
|
||||
"# Enter search query\n",
|
||||
"driver.find_elements(By.ID, \"globalSearchForm:extendedResearchCompanyName\")[\n",
|
||||
" 0\n",
|
||||
"].send_keys(search_query)\n",
|
||||
"# Trigger search\n",
|
||||
"driver.find_elements(By.ID, \"globalSearchForm:btnExecuteSearchOld\")[0].click()\n",
|
||||
"# Wait for results\n",
|
||||
"wait = WebDriverWait(driver, 5)\n",
|
||||
"wait.until(\n",
|
||||
" lambda driver: driver.current_url != \"https://www.unternehmensregister.de/ureg/\"\n",
|
||||
")\n",
|
||||
"## TODO Iterate over tabs\n",
|
||||
"# Find all \"Registerinformationen\"\n",
|
||||
"companies_tab = driver.find_elements(\n",
|
||||
" By.LINK_TEXT, \"Registerinformationen des Registergerichts\"\n",
|
||||
")\n",
|
||||
"for company_link in companies_tab:\n",
|
||||
" # Go to intermediary page\n",
|
||||
" company_link.click()\n",
|
||||
" # Trigger next redirect\n",
|
||||
" driver.find_element(By.LINK_TEXT, \"Registerinformationen anzeigen\").click()\n",
|
||||
" # Trigger SI download\n",
|
||||
" driver.find_element(By.LINK_TEXT, \"SI\").click()\n",
|
||||
" # Show shopping cart - TODO evaluate restructuring behaviour by filling cart first and then bulk downloading\n",
|
||||
" wait.until(\n",
|
||||
" EC.visibility_of_element_located((By.LINK_TEXT, \"Dokumentenkorb ansehen\"))\n",
|
||||
" )\n",
|
||||
" driver.find_element(By.LINK_TEXT, \"Dokumentenkorb ansehen\").click()\n",
|
||||
" # Get document\n",
|
||||
" xpath = \"//input[@type='submit']\"\n",
|
||||
" elems = driver.find_elements(By.TAG_NAME, \"input\")\n",
|
||||
" elems[-2].click()\n",
|
||||
"\n",
|
||||
" driver.find_element(By.ID, \"paymentFormOverview:btnNext\").click()\n",
|
||||
" driver.find_element(By.LINK_TEXT, \"Zum Dokumentenkorb\").click()\n",
|
||||
" driver.find_element(By.CLASS_NAME, \"download-wrapper\").click()\n",
|
||||
"\n",
|
||||
" for i in range(6):\n",
|
||||
" driver.back()\n",
|
||||
"\n",
|
||||
"driver.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"session = requests.Session()\n",
|
||||
"session.cookies[\"cc\"] = \"1686301974-69f11760d466bcea-10\"\n",
|
||||
"session.headers.update(\n",
|
||||
" {\n",
|
||||
" \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9\",\n",
|
||||
" \"Accept-Encoding\": \"gzip, deflate, br\",\n",
|
||||
" \"Accept-Language\": \"de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7,et;q=0.6,pl;q=0.5\",\n",
|
||||
" \"Cache-Control\": \"max-age=0\",\n",
|
||||
" \"Connection\": \"keep-alive\",\n",
|
||||
" \"DNT\": \"1\",\n",
|
||||
" \"Host\": \"www.unternehmensregister.de\",\n",
|
||||
" \"Pragma\": \"no-cache\",\n",
|
||||
" \"Referer\": \"https://www.unternehmensregister.de/ureg/\",\n",
|
||||
" \"sec-ch-ua-mobile\": \"?0\",\n",
|
||||
" \"Sec-Fetch-Dest\": \"document\",\n",
|
||||
" \"Sec-Fetch-Mode\": \"navigate\",\n",
|
||||
" \"Sec-Fetch-Site\": \"same-origin\",\n",
|
||||
" \"Sec-Fetch-User\": \"?1\",\n",
|
||||
" \"Upgrade-Insecure-Requests\": \"1\",\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36\",\n",
|
||||
" \"Content-Type\": \"application/x-www-form-urlencoded\",\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Get session cookie\n",
|
||||
"response = session.get(\"https://www.unternehmensregister.de\")\n",
|
||||
"jsessionid = re.findall(r\";jsessionid=[A-Z0-9]*.web[0-9]{2}.[0-9]\", response.text)[\n",
|
||||
" 0\n",
|
||||
"].replace(\";jsessionid=\", \"\")\n",
|
||||
"print(jsessionid)\n",
|
||||
"with open(\"temp.html\", \"w\", encoding=\"utf-8\") as file:\n",
|
||||
" file.write(response.text)\n",
|
||||
"print(session.cookies)\n",
|
||||
"# Go to search page\n",
|
||||
"response = session.get(\"https://www.unternehmensregister.de/ureg/\")\n",
|
||||
"# Start search\n",
|
||||
"response = session.post(\n",
|
||||
" f\"https://www.unternehmensregister.de/ureg/index.html;jsessionid={jsessionid}\",\n",
|
||||
" data={\n",
|
||||
" \"globalSearchForm\": \"globalSearchForm\",\n",
|
||||
" \"globalSearchForm:extendedResearchCompanyName\": \"A*\",\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"print(response.status_code)\n",
|
||||
"# Get results\n",
|
||||
"response = session.get(\n",
|
||||
" f\"https://www.unternehmensregister.de/ureg/result.html;jsessionid={jsessionid}\"\n",
|
||||
")\n",
|
||||
"with open(\"temp.html\", \"w\", encoding=\"utf-8\") as file:\n",
|
||||
" file.write(response.text)\n",
|
||||
"\n",
|
||||
"print(session.cookies)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class Unternehmensregister:\n",
|
||||
" def __init__(self):\n",
|
||||
" self.session = requests.Session()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Analyze Auszug"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pdf2image import convert_from_path\n",
|
||||
"\n",
|
||||
"pdfs = r\"./data/test.pdf\"\n",
|
||||
"pages = convert_from_path(pdfs, 350)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"for i, page in enumerate(pages):\n",
|
||||
" image_name = f\"./data/Page_{i+1}.jpg\"\n",
|
||||
" page.save(image_name, \"JPEG\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Handelsregister B des Abteilung B Nummer der Firma:\n",
|
||||
"Amtsgerichts Hamm Wiedergabe des aktuellen HRB 5363\n",
|
||||
"Registerinhalts\n",
|
||||
"Abruf vom 07.06.2023 19:37\n",
|
||||
"1. Anzahl der bisherigen Eintragungen:\n",
|
||||
"51\n",
|
||||
"2. a) Firma:\n",
|
||||
"GEA Farm Technologies GmbH\n",
|
||||
"b) Sitz, Niederlassung, inländische Geschäftsanschrift, empfangsberechtigte Person, Zweigniederlassungen:\n",
|
||||
"Bönen\n",
|
||||
"Geschäftsanschrift: Siemensstraße 25-27, 59199 Bönen\n",
|
||||
"c) Gegenstand des Unternehmens:\n",
|
||||
"Entwicklung, Herstellung und der Vertrieb von Landtechnik, insbesondere von Komponenten und Anlagen\n",
|
||||
"(a) zur Gewinnung, Kühlung, Behandlung und Lagerung von Milch;\n",
|
||||
"(b) für das Milchvieh-Herdenmanagement;\n",
|
||||
"(c) zur Tierhygiene und Sicherung der Milchqualität und\n",
|
||||
"(d) zur Aufstallung von Tieren;\n",
|
||||
"sowie die Herstellung und der Vertrieb von Anlagen und Fahrzeugen zur Aufbereitung und zum Transport von Gülle.\n",
|
||||
"3. Grund- oder Stammkapital:\n",
|
||||
"5.115.000,00 EUR\n",
|
||||
"4. a) Allgemeine Vertretungsregelung:\n",
|
||||
"Ist nur ein Geschäftsführer bestellt, so vertritt er die Gesellschaft allein. Sind mehrere Geschäftsführer bestellt, so wird die\n",
|
||||
"Gesellschaft durch zwei Geschäftsführer oder durch einen Geschäftsführer gemeinsam mit einem Prokuristen vertreten.\n",
|
||||
"b) Vorstand, Leitungsorgan, geschäftsführende Direktoren, persönlich haftender Gesellschafter, Geschäftsführer,\n",
|
||||
"Vertretungsberechtigte und besondere Vertretungsbefugnis:\n",
|
||||
"Geschäftsführer: Basaran, Erkul, Erkrath, *06.05.1977\n",
|
||||
"Geschäftsführer: Cheng, Liang, Göppingen, *29.12.1980\n",
|
||||
"Geschäftsführer: Lauwers, Peter, Düsseldorf, *26.03.1970\n",
|
||||
"5. Prokura:\n",
|
||||
"Gesamtprokura gemeinsam mit einem Geschäftsführer oder einem anderen Prokuristen:\n",
|
||||
"Barkmeyer, Ralf, Dortmund, *28.02.1974\n",
|
||||
"Böttner, Henrik, Bochum, *07.11.1982\n",
|
||||
"Dörner-Rodeheger, Astrid, Beckum, *24.12.1968\n",
|
||||
"Frombach, Ralf, Werne, *25.01.1977\n",
|
||||
"Gebing, Reinhard, Oelde, *26.04.1964\n",
|
||||
"Hommel, Sven, Berlin, *22.04.1979\n",
|
||||
"Kramps, Mark, Witten, *04.09.1967\n",
|
||||
"Kreft, Markus, Wetter, *03.04.1966\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import cv2\n",
|
||||
"import pytesseract\n",
|
||||
"\n",
|
||||
"image_path = \"./data/Page_1.jpg\"\n",
|
||||
"image = cv2.imread(image_path)\n",
|
||||
"\n",
|
||||
"text = str(pytesseract.image_to_string(image, config=\"--psm 6\", lang=\"deu\"))\n",
|
||||
"print(text)\n",
|
||||
"with open(\"./data/Page_1.txt\", \"w\", encoding=\"utf-8\") as output_file:\n",
|
||||
" output_file.write(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['Geschäftsführer: Basaran, Erkul', 'Geschäftsführer: Cheng, Liang', 'Geschäftsführer: Lauwers, Peter']\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Erkul Basaran', 'Liang Cheng', 'Peter Lauwers']"
|
||||
]
|
||||
},
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_managing_directors(text: str) -> list:\n",
|
||||
" managing_directors_regex = r\"Geschäftsführer: [a-zA-ZäöüÄÖÜ]*, [a-zA-ZäöüÄÖÜ]*\"\n",
|
||||
" hits = re.findall(managing_directors_regex, text)\n",
|
||||
" print(hits)\n",
|
||||
" return [\n",
|
||||
" \" \".join(hit.replace(\"Geschäftsführer: \", \"\").replace(\",\", \"\").split(\" \")[::-1])\n",
|
||||
" for hit in hits\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"get_managing_directors(text)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.7"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
6
Jupyter/API-tests/Unternehmensregister/requirements.txt
Normal file
6
Jupyter/API-tests/Unternehmensregister/requirements.txt
Normal file
@ -0,0 +1,6 @@
|
||||
ocrmypdf
|
||||
pytesseract
|
||||
opencv-python
|
||||
pdf2image
|
||||
bs4
|
||||
selenium
|
Loading…
x
Reference in New Issue
Block a user