338 lines
12 KiB
Plaintext

{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Unternehmensregister"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fetch Auszug"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n"
]
}
],
"source": [
"import os\n",
"from pathlib import Path\n",
"\n",
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"\n",
"SEARCH_URL = \"https://www.unternehmensregister.de/ureg/\"\n",
"REGISTER_LINK = (By.LINK_TEXT, \"Registerinformationen des Registergerichts\")\n",
"WAIT_TIMEOUT_SECONDS = 5\n",
"# Number of browser-history steps used to get from the download page back to\n",
"# the result list (value kept from the original flow).\n",
"BACK_STEPS_TO_RESULTS = 6\n",
"\n",
"search_query = \"A*\"\n",
"\n",
"download_path = str(Path.cwd() / \"data\" / \"Unternehmensregister\")\n",
"print(download_path)\n",
"\n",
"# Chrome profile preferences: save downloads to download_path without prompting.\n",
"preferences = {\n",
"    \"profile.default_content_settings.popups\": 0,\n",
"    \"safebrowsing.enabled\": True,\n",
"    \"download\": {\n",
"        \"directory_upgrade\": True,\n",
"        \"prompt_for_download\": False,\n",
"        \"extensions_to_open\": \"\",\n",
"        \"default_directory\": download_path,\n",
"    },\n",
"}\n",
"options = webdriver.ChromeOptions()\n",
"options.add_experimental_option(\"prefs\", preferences)\n",
"\n",
"driver = webdriver.Chrome(options=options)\n",
"wait = WebDriverWait(driver, WAIT_TIMEOUT_SECONDS)\n",
"\n",
"\n",
"def click_when_ready(locator):\n",
"    \"\"\"Wait until the element at ``locator`` is clickable, then click it.\n",
"\n",
"    Raises ``selenium.common.exceptions.TimeoutException`` when the element\n",
"    does not become clickable within ``WAIT_TIMEOUT_SECONDS``.\n",
"    \"\"\"\n",
"    wait.until(EC.element_to_be_clickable(locator)).click()\n",
"\n",
"\n",
"try:\n",
"    driver.get(SEARCH_URL)\n",
"    # Accept Cookies\n",
"    click_when_ready(\n",
"        (By.XPATH, '//button[text()=\"Nur technisch notwendige Cookies akzeptieren\"]')\n",
"    )\n",
"    # Enter search query\n",
"    wait.until(\n",
"        EC.visibility_of_element_located(\n",
"            (By.ID, \"globalSearchForm:extendedResearchCompanyName\")\n",
"        )\n",
"    ).send_keys(search_query)\n",
"    # Trigger search\n",
"    click_when_ready((By.ID, \"globalSearchForm:btnExecuteSearchOld\"))\n",
"    # Wait for results\n",
"    wait.until(lambda driver: driver.current_url != SEARCH_URL)\n",
"    ## TODO Iterate over tabs\n",
"    # Count all \"Registerinformationen\" links on the result page\n",
"    company_count = len(driver.find_elements(*REGISTER_LINK))\n",
"    for company_index in range(company_count):\n",
"        # Element references obtained before leaving the result page become stale\n",
"        # once the page is shown again, so the links are located anew every round.\n",
"        wait.until(\n",
"            lambda driver: len(driver.find_elements(*REGISTER_LINK)) > company_index\n",
"        )\n",
"        # Go to intermediary page\n",
"        driver.find_elements(*REGISTER_LINK)[company_index].click()\n",
"        # Trigger next redirect\n",
"        click_when_ready((By.LINK_TEXT, \"Registerinformationen anzeigen\"))\n",
"        # Trigger SI download\n",
"        click_when_ready((By.LINK_TEXT, \"SI\"))\n",
"        # Show shopping cart - TODO evaluate restructuring behaviour by filling cart first and then bulk downloading\n",
"        click_when_ready((By.LINK_TEXT, \"Dokumentenkorb ansehen\"))\n",
"        # Get document\n",
"        # NOTE(review): relies on the wanted control being the second-to-last\n",
"        # <input> on the page - confirm and replace with a stable locator.\n",
"        driver.find_elements(By.TAG_NAME, \"input\")[-2].click()\n",
"\n",
"        click_when_ready((By.ID, \"paymentFormOverview:btnNext\"))\n",
"        click_when_ready((By.LINK_TEXT, \"Zum Dokumentenkorb\"))\n",
"        click_when_ready((By.CLASS_NAME, \"download-wrapper\"))\n",
"\n",
"        for _ in range(BACK_STEPS_TO_RESULTS):\n",
"            driver.back()\n",
"finally:\n",
"    # quit() ends the whole WebDriver session even when a step above failed.\n",
"    driver.quit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import requests\n",
"\n",
"BASE_URL = \"https://www.unternehmensregister.de\"\n",
"# Upper bound in seconds for each HTTP call; requests waits indefinitely without one.\n",
"REQUEST_TIMEOUT_SECONDS = 30\n",
"# NOTE(review): the unescaped '.' characters match any character - confirm\n",
"# whether literal dots are intended before tightening the pattern.\n",
"JSESSIONID_REGEX = r\";jsessionid=[A-Z0-9]*.web[0-9]{2}.[0-9]\"\n",
"\n",
"session = requests.Session()\n",
"# NOTE(review): hard-coded cookie value, presumably copied from a browser\n",
"# session - confirm the site still accepts it.\n",
"session.cookies[\"cc\"] = \"1686301974-69f11760d466bcea-10\"\n",
"session.headers.update(\n",
"    {\n",
"        \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9\",\n",
"        \"Accept-Encoding\": \"gzip, deflate, br\",\n",
"        \"Accept-Language\": \"de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7,et;q=0.6,pl;q=0.5\",\n",
"        \"Cache-Control\": \"max-age=0\",\n",
"        \"Connection\": \"keep-alive\",\n",
"        \"DNT\": \"1\",\n",
"        \"Host\": \"www.unternehmensregister.de\",\n",
"        \"Pragma\": \"no-cache\",\n",
"        \"Referer\": \"https://www.unternehmensregister.de/ureg/\",\n",
"        \"sec-ch-ua-mobile\": \"?0\",\n",
"        \"Sec-Fetch-Dest\": \"document\",\n",
"        \"Sec-Fetch-Mode\": \"navigate\",\n",
"        \"Sec-Fetch-Site\": \"same-origin\",\n",
"        \"Sec-Fetch-User\": \"?1\",\n",
"        \"Upgrade-Insecure-Requests\": \"1\",\n",
"        \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36\",\n",
"        \"Content-Type\": \"application/x-www-form-urlencoded\",\n",
"    }\n",
")\n",
"\n",
"\n",
"# Get session cookie\n",
"response = session.get(BASE_URL, timeout=REQUEST_TIMEOUT_SECONDS)\n",
"response.raise_for_status()\n",
"session_id_hits = re.findall(JSESSIONID_REGEX, response.text)\n",
"if not session_id_hits:\n",
"    # Fail with context instead of a bare IndexError on an empty result list.\n",
"    raise RuntimeError(\n",
"        f\"No jsessionid found in the response from {BASE_URL}; \"\n",
"        \"the page markup may have changed.\"\n",
"    )\n",
"jsessionid = session_id_hits[0].replace(\";jsessionid=\", \"\")\n",
"print(jsessionid)\n",
"with open(\"temp.html\", \"w\", encoding=\"utf-8\") as file:\n",
"    file.write(response.text)\n",
"print(session.cookies)\n",
"# Go to search page\n",
"response = session.get(f\"{BASE_URL}/ureg/\", timeout=REQUEST_TIMEOUT_SECONDS)\n",
"# Start search\n",
"response = session.post(\n",
"    f\"{BASE_URL}/ureg/index.html;jsessionid={jsessionid}\",\n",
"    data={\n",
"        \"globalSearchForm\": \"globalSearchForm\",\n",
"        \"globalSearchForm:extendedResearchCompanyName\": \"A*\",\n",
"    },\n",
"    timeout=REQUEST_TIMEOUT_SECONDS,\n",
")\n",
"print(response.status_code)\n",
"# Get results\n",
"response = session.get(\n",
"    f\"{BASE_URL}/ureg/result.html;jsessionid={jsessionid}\",\n",
"    timeout=REQUEST_TIMEOUT_SECONDS,\n",
")\n",
"response.raise_for_status()\n",
"# Overwrites the landing page dump written above with the result page.\n",
"with open(\"temp.html\", \"w\", encoding=\"utf-8\") as file:\n",
"    file.write(response.text)\n",
"\n",
"print(session.cookies)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class Unternehmensregister:\n",
"    \"\"\"Placeholder client that currently only owns a ``requests.Session``.\"\"\"\n",
"\n",
"    def __init__(self) -> None:\n",
"        # Each instance gets its own session object.\n",
"        self.session = requests.Session()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Analyze Auszug"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from pdf2image import convert_from_path\n",
"\n",
"RENDER_DPI = 350\n",
"\n",
"pdfs = r\"./data/test.pdf\"\n",
"# Render each page of the PDF to an image at RENDER_DPI.\n",
"pages = convert_from_path(pdfs, RENDER_DPI)\n",
"\n",
"# Store every page as ./data/Page_<n>.jpg, numbered from 1.\n",
"for page_number, page in enumerate(pages, start=1):\n",
"    page.save(f\"./data/Page_{page_number}.jpg\", \"JPEG\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Handelsregister B des Abteilung B Nummer der Firma:\n",
"Amtsgerichts Hamm Wiedergabe des aktuellen HRB 5363\n",
"Registerinhalts\n",
"Abruf vom 07.06.2023 19:37\n",
"1. Anzahl der bisherigen Eintragungen:\n",
"51\n",
"2. a) Firma:\n",
"GEA Farm Technologies GmbH\n",
"b) Sitz, Niederlassung, inländische Geschäftsanschrift, empfangsberechtigte Person, Zweigniederlassungen:\n",
"Bönen\n",
"Geschäftsanschrift: Siemensstraße 25-27, 59199 Bönen\n",
"c) Gegenstand des Unternehmens:\n",
"Entwicklung, Herstellung und der Vertrieb von Landtechnik, insbesondere von Komponenten und Anlagen\n",
"(a) zur Gewinnung, Kühlung, Behandlung und Lagerung von Milch;\n",
"(b) für das Milchvieh-Herdenmanagement;\n",
"(c) zur Tierhygiene und Sicherung der Milchqualität und\n",
"(d) zur Aufstallung von Tieren;\n",
"sowie die Herstellung und der Vertrieb von Anlagen und Fahrzeugen zur Aufbereitung und zum Transport von Gülle.\n",
"3. Grund- oder Stammkapital:\n",
"5.115.000,00 EUR\n",
"4. a) Allgemeine Vertretungsregelung:\n",
"Ist nur ein Geschäftsführer bestellt, so vertritt er die Gesellschaft allein. Sind mehrere Geschäftsführer bestellt, so wird die\n",
"Gesellschaft durch zwei Geschäftsführer oder durch einen Geschäftsführer gemeinsam mit einem Prokuristen vertreten.\n",
"b) Vorstand, Leitungsorgan, geschäftsführende Direktoren, persönlich haftender Gesellschafter, Geschäftsführer,\n",
"Vertretungsberechtigte und besondere Vertretungsbefugnis:\n",
"Geschäftsführer: Basaran, Erkul, Erkrath, *06.05.1977\n",
"Geschäftsführer: Cheng, Liang, Göppingen, *29.12.1980\n",
"Geschäftsführer: Lauwers, Peter, Düsseldorf, *26.03.1970\n",
"5. Prokura:\n",
"Gesamtprokura gemeinsam mit einem Geschäftsführer oder einem anderen Prokuristen:\n",
"Barkmeyer, Ralf, Dortmund, *28.02.1974\n",
"Böttner, Henrik, Bochum, *07.11.1982\n",
"Dörner-Rodeheger, Astrid, Beckum, *24.12.1968\n",
"Frombach, Ralf, Werne, *25.01.1977\n",
"Gebing, Reinhard, Oelde, *26.04.1964\n",
"Hommel, Sven, Berlin, *22.04.1979\n",
"Kramps, Mark, Witten, *04.09.1967\n",
"Kreft, Markus, Wetter, *03.04.1966\n",
"\n"
]
}
],
"source": [
"import cv2\n",
"import pytesseract\n",
"\n",
"image_path = \"./data/Page_1.jpg\"\n",
"image = cv2.imread(image_path)\n",
"# cv2.imread reports a missing or unreadable file by returning None, not by raising.\n",
"if image is None:\n",
"    raise FileNotFoundError(f\"Could not read image: {image_path}\")\n",
"\n",
"# OCR the page with the \"deu\" language data; \"--psm 6\" is handed to Tesseract as-is.\n",
"text = str(pytesseract.image_to_string(image, config=\"--psm 6\", lang=\"deu\"))\n",
"print(text)\n",
"# Keep the raw OCR result next to the page image.\n",
"with open(\"./data/Page_1.txt\", \"w\", encoding=\"utf-8\") as output_file:\n",
"    output_file.write(text)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Geschäftsführer: Basaran, Erkul', 'Geschäftsführer: Cheng, Liang', 'Geschäftsführer: Lauwers, Peter']\n"
]
},
{
"data": {
"text/plain": [
"['Erkul Basaran', 'Liang Cheng', 'Peter Lauwers']"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"\n",
"\n",
"def get_managing_directors(text: str) -> list:\n",
"    \"\"\"Extract managing directors from the text of a register excerpt.\n",
"\n",
"    Looks for entries of the form ``Geschäftsführer: <last name>, <first name>``\n",
"    and returns each one as ``\"<first name> <last name>\"``, in order of\n",
"    appearance. Name parts may contain any Unicode letters (including ``ß`` and\n",
"    accented characters) and may consist of several words joined by a single\n",
"    space, hyphen or apostrophe.\n",
"\n",
"    Args:\n",
"        text: Text of the excerpt, e.g. OCR output.\n",
"\n",
"    Returns:\n",
"        List of names; empty when no entry is found.\n",
"    \"\"\"\n",
"    # [^\\W\\d_] matches any Unicode letter: a word character that is neither a\n",
"    # digit nor an underscore.\n",
"    name_part = r\"[^\\W\\d_]+(?:[ '-][^\\W\\d_]+)*\"\n",
"    managing_directors_regex = rf\"Geschäftsführer: ({name_part}), ({name_part})\"\n",
"    return [\n",
"        f\"{first_name} {last_name}\"\n",
"        for last_name, first_name in re.findall(managing_directors_regex, text)\n",
"    ]\n",
"\n",
"\n",
"get_managing_directors(text)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}