mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-22 04:32:53 +02:00
387 lines
16 KiB
Plaintext
387 lines
16 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Unternehmensregister"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Fetch Auszug"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 118,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"c:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\Jupyter\\API-tests\\Unternehmensregister\\data\\Unternehmensregister\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import os\n",
|
|
"from pathlib import Path\n",
|
|
"\n",
|
|
"from selenium import webdriver\n",
|
|
"from selenium.webdriver.common.by import By\n",
|
|
"from selenium.webdriver.support.ui import WebDriverWait\n",
|
|
"from selenium.webdriver.support import expected_conditions as EC\n",
|
|
"\n",
|
|
"search_query = \"GEA Farm Technologies\"\n",
|
|
"\n",
|
|
"options = webdriver.ChromeOptions()\n",
|
|
"\n",
|
|
"download_path = str(Path(Path.cwd() / \"data\" / \"Unternehmensregister\"))\n",
|
|
"print(download_path)\n",
|
|
"\n",
|
|
"preferences = {\n",
|
|
" \"profile.default_content_settings.popups\": 0,\n",
|
|
" \"safebrowsing.enabled\": True,\n",
|
|
" \"download\": {\n",
|
|
" \"directory_upgrade\": True,\n",
|
|
" \"prompt_for_download\": False,\n",
|
|
" \"extensions_to_open\": \"\",\n",
|
|
" \"default_directory\": download_path,\n",
|
|
" },\n",
|
|
"}\n",
|
|
"options.add_experimental_option(\"prefs\", preferences)\n",
|
|
"\n",
|
|
"driver = webdriver.Chrome(options=options)\n",
|
|
"\n",
|
|
"driver.get(\"https://www.unternehmensregister.de/ureg/\")\n",
|
|
"# Accept Cookies\n",
|
|
"driver.find_elements(\n",
|
|
" By.XPATH, '//button[text()=\"Nur technisch notwendige Cookies akzeptieren\"]'\n",
|
|
")[0].click()\n",
|
|
"# Enter search query\n",
|
|
"driver.find_elements(By.ID, \"globalSearchForm:extendedResearchCompanyName\")[\n",
|
|
" 0\n",
|
|
"].send_keys(search_query)\n",
|
|
"# Trigger search\n",
|
|
"driver.find_elements(By.ID, \"globalSearchForm:btnExecuteSearchOld\")[0].click()\n",
|
|
"# Wait for results\n",
|
|
"wait = WebDriverWait(driver, 5)\n",
|
|
"wait.until(\n",
|
|
" lambda driver: driver.current_url != \"https://www.unternehmensregister.de/ureg/\"\n",
|
|
")\n",
|
|
"# TODO: Iterate over tabs\n",
|
|
"num_pages = int(\n",
|
|
" driver.find_element(By.XPATH, '//*[@class=\"page_count\"]').text.split(\" \")[0]\n",
|
|
")\n",
|
|
"for page_index in range(num_pages):\n",
|
|
" # Find all \"Registerinformationen\"\n",
|
|
" companies_tab = driver.find_elements(\n",
|
|
" By.LINK_TEXT, \"Registerinformationen des Registergerichts\"\n",
|
|
" )\n",
|
|
" for company_link in companies_tab:\n",
|
|
" # Go to intermediary page\n",
|
|
" company_link.click()\n",
|
|
" # Trigger next redirect\n",
|
|
" driver.find_element(By.LINK_TEXT, \"Registerinformationen anzeigen\").click()\n",
|
|
" # Trigger SI download\n",
|
|
" driver.find_element(By.LINK_TEXT, \"SI\").click()\n",
|
|
" # Show shopping cart. TODO: evaluate restructuring the flow to fill the cart first and then bulk-download\n",
|
|
" wait.until(\n",
|
|
" EC.visibility_of_element_located((By.LINK_TEXT, \"Dokumentenkorb ansehen\"))\n",
|
|
" )\n",
|
|
" driver.find_element(By.LINK_TEXT, \"Dokumentenkorb ansehen\").click()\n",
|
|
" # Get document\n",
|
|
" xpath = \"//input[@type='submit']\"\n",
|
|
" elems = driver.find_elements(By.TAG_NAME, \"input\")\n",
|
|
" elems[-2].click()\n",
|
|
"\n",
|
|
" driver.find_element(By.ID, \"paymentFormOverview:btnNext\").click()\n",
|
|
" driver.find_element(By.LINK_TEXT, \"Zum Dokumentenkorb\").click()\n",
|
|
" driver.find_element(By.CLASS_NAME, \"download-wrapper\").click()\n",
|
|
"\n",
|
|
" for i in range(6):\n",
|
|
" driver.back()\n",
|
|
" driver.find_element(By.XPATH, '//*[@class=\"fas fa-angle-right\"]').click()\n",
|
|
"driver.close()"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Analyze Auszug"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 119,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['registerdocument-2023-06-09-14-05-01.xml',\n",
|
|
" 'registerdocument-2023-06-09-14-05-03.xml']"
|
|
]
|
|
},
|
|
"execution_count": 119,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"files = os.listdir(\"./data/Unternehmensregister\")\n",
|
|
"files"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 135,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{}\n",
|
|
"{}\n",
|
|
"{'name': {'firstname': 'Reinhard', 'lastname': 'Gebing'}, 'date_of_birth': '1964-04-26', 'location': {'city': 'Oelde'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Markus', 'lastname': 'Kreft'}, 'date_of_birth': '1966-04-03', 'location': {'city': 'Wetter'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Kai', 'lastname': 'Luntz'}, 'date_of_birth': '1970-12-04', 'location': {'city': 'Holzminden'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Thomas', 'lastname': 'Mader'}, 'date_of_birth': '1972-05-24', 'location': {'city': 'Rheda-Wiedenbrück'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Peter', 'lastname': 'Lauwers'}, 'date_of_birth': '1970-03-26', 'location': {'city': 'Düsseldorf'}, 'role': 'Geschäftsführer(in)'}\n",
|
|
"{'name': {'firstname': 'Erkul', 'lastname': 'Basaran'}, 'date_of_birth': '1977-05-06', 'location': {'city': 'Erkrath'}, 'role': 'Geschäftsführer(in)'}\n",
|
|
"{'name': {'firstname': 'Katja', 'lastname': 'Voß'}, 'date_of_birth': '1978-02-24', 'location': {'city': 'Rheda-Wiedenbrück'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Henrik', 'lastname': 'Böttner'}, 'date_of_birth': '1982-11-07', 'location': {'city': 'Bochum'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Ulrich', 'lastname': 'Raßenhövel'}, 'date_of_birth': '1969-04-16', 'location': {'city': 'Oelde'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Andreas', 'lastname': 'Naroska'}, 'date_of_birth': '1967-03-23', 'location': {'city': 'Herdecke'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Mark', 'lastname': 'Kramps'}, 'date_of_birth': '1967-09-04', 'location': {'city': 'Witten'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Ralf', 'lastname': 'Barkmeyer'}, 'date_of_birth': '1974-02-28', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Holger', 'lastname': 'Siegwarth'}, 'date_of_birth': '1967-05-13', 'location': {'city': 'Tönnisvorst'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Oliver', 'lastname': 'Liß'}, 'date_of_birth': '1981-04-13', 'location': {'city': 'Herne'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Liang', 'lastname': 'Cheng'}, 'date_of_birth': '1980-12-29', 'location': {'city': 'Göppingen'}, 'role': 'Geschäftsführer(in)'}\n",
|
|
"{'name': {'firstname': 'Astrid', 'lastname': 'Dörner-Rodeheger'}, 'date_of_birth': '1968-12-24', 'location': {'city': 'Beckum'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Jon', 'lastname': 'Lange'}, 'date_of_birth': '1978-04-25', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Matthias', 'lastname': 'Peters'}, 'date_of_birth': '1973-08-28', 'location': {'city': 'Dortmund'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Ralf', 'lastname': 'Frombach'}, 'date_of_birth': '1977-01-25', 'location': {'city': 'Werne'}, 'role': 'Prokurist(in)'}\n",
|
|
"{'name': {'firstname': 'Sven', 'lastname': 'Hommel'}, 'date_of_birth': '1979-04-22', 'location': {'city': 'Berlin'}, 'role': 'Prokurist(in)'}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import json\n",
|
|
"import xmltodict\n",
|
|
"\n",
|
|
"for file in files:\n",
|
|
" with open(\"./data/Unternehmensregister/\" + file, \"r\", encoding=\"utf-8\") as xml_file:\n",
|
|
" data = xmltodict.parse(xml_file.read())\n",
|
|
" with open(\"./data/temp.json\", \"w\", encoding=\"utf-8\") as json_file:\n",
|
|
" json_file.write(json.dumps(data))\n",
|
|
"\n",
|
|
" keys = dict.keys(data[\"XJustiz_Daten\"][\"Grunddaten\"])\n",
|
|
" base_info = {\n",
|
|
" \"company_name\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
|
" \"Basisdaten_Register\"\n",
|
|
" ][\"Rechtstraeger\"][\"Bezeichnung\"][\"Bezeichnung_Aktuell\"],\n",
|
|
" \"location\": {\n",
|
|
" \"city\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
|
" \"Basisdaten_Register\"\n",
|
|
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Ort\"],\n",
|
|
" \"zip_code\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
|
" \"Basisdaten_Register\"\n",
|
|
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Postleitzahl\"],\n",
|
|
" \"street\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
|
" \"Basisdaten_Register\"\n",
|
|
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Strasse\"],\n",
|
|
" \"house_number\": data[\"XJustiz_Daten\"][\"Fachdaten_Register\"][\n",
|
|
" \"Basisdaten_Register\"\n",
|
|
" ][\"Rechtstraeger\"][\"Anschrift\"][\"Hausnummer\"],\n",
|
|
" },\n",
|
|
" }\n",
|
|
"\n",
|
|
" def parse_stakeholder(data: dict) -> dict:\n",
|
|
" if \"Natuerliche_Person\" in data[\"Beteiligter\"]:\n",
|
|
" return {\n",
|
|
" \"name\": {\n",
|
|
" \"firstname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n",
|
|
" \"Voller_Name\"\n",
|
|
" ][\"Vorname\"],\n",
|
|
" \"lastname\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n",
|
|
" \"Voller_Name\"\n",
|
|
" ][\"Nachname\"],\n",
|
|
" },\n",
|
|
" \"date_of_birth\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\n",
|
|
" \"Geburt\"\n",
|
|
" ][\"Geburtsdatum\"],\n",
|
|
" \"location\": {\n",
|
|
" \"city\": data[\"Beteiligter\"][\"Natuerliche_Person\"][\"Anschrift\"][\n",
|
|
" \"Ort\"\n",
|
|
" ]\n",
|
|
" },\n",
|
|
" \"role\": data[\"Rolle\"][\"Rollenbezeichnung\"][\"content\"],\n",
|
|
" }\n",
|
|
" return {}\n",
|
|
"\n",
|
|
" for i in range(\n",
|
|
" len(data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"])\n",
|
|
" ):\n",
|
|
" people = parse_stakeholder(\n",
|
|
" data[\"XJustiz_Daten\"][\"Grunddaten\"][\"Verfahrensdaten\"][\"Beteiligung\"][i]\n",
|
|
" )\n",
|
|
" print(people)\n",
|
|
" break"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from pdf2image import convert_from_path\n",
|
|
"\n",
|
|
"pdfs = r\"./data/test.pdf\"\n",
|
|
"pages = convert_from_path(pdfs, 350)\n",
|
|
"\n",
|
|
"\n",
|
|
"for i, page in enumerate(pages):\n",
|
|
" image_name = f\"./data/Page_{i+1}.jpg\"\n",
|
|
" page.save(image_name, \"JPEG\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Handelsregister B des Abteilung B Nummer der Firma:\n",
|
|
"Amtsgerichts Hamm Wiedergabe des aktuellen HRB 5363\n",
|
|
"Registerinhalts\n",
|
|
"Abruf vom 07.06.2023 19:37\n",
|
|
"1. Anzahl der bisherigen Eintragungen:\n",
|
|
"51\n",
|
|
"2. a) Firma:\n",
|
|
"GEA Farm Technologies GmbH\n",
|
|
"b) Sitz, Niederlassung, inländische Geschäftsanschrift, empfangsberechtigte Person, Zweigniederlassungen:\n",
|
|
"Bönen\n",
|
|
"Geschäftsanschrift: Siemensstraße 25-27, 59199 Bönen\n",
|
|
"c) Gegenstand des Unternehmens:\n",
|
|
"Entwicklung, Herstellung und der Vertrieb von Landtechnik, insbesondere von Komponenten und Anlagen\n",
|
|
"(a) zur Gewinnung, Kühlung, Behandlung und Lagerung von Milch;\n",
|
|
"(b) für das Milchvieh-Herdenmanagement;\n",
|
|
"(c) zur Tierhygiene und Sicherung der Milchqualität und\n",
|
|
"(d) zur Aufstallung von Tieren;\n",
|
|
"sowie die Herstellung und der Vertrieb von Anlagen und Fahrzeugen zur Aufbereitung und zum Transport von Gülle.\n",
|
|
"3. Grund- oder Stammkapital:\n",
|
|
"5.115.000,00 EUR\n",
|
|
"4. a) Allgemeine Vertretungsregelung:\n",
|
|
"Ist nur ein Geschäftsführer bestellt, so vertritt er die Gesellschaft allein. Sind mehrere Geschäftsführer bestellt, so wird die\n",
|
|
"Gesellschaft durch zwei Geschäftsführer oder durch einen Geschäftsführer gemeinsam mit einem Prokuristen vertreten.\n",
|
|
"b) Vorstand, Leitungsorgan, geschäftsführende Direktoren, persönlich haftender Gesellschafter, Geschäftsführer,\n",
|
|
"Vertretungsberechtigte und besondere Vertretungsbefugnis:\n",
|
|
"Geschäftsführer: Basaran, Erkul, Erkrath, *06.05.1977\n",
|
|
"Geschäftsführer: Cheng, Liang, Göppingen, *29.12.1980\n",
|
|
"Geschäftsführer: Lauwers, Peter, Düsseldorf, *26.03.1970\n",
|
|
"5. Prokura:\n",
|
|
"Gesamtprokura gemeinsam mit einem Geschäftsführer oder einem anderen Prokuristen:\n",
|
|
"Barkmeyer, Ralf, Dortmund, *28.02.1974\n",
|
|
"Böttner, Henrik, Bochum, *07.11.1982\n",
|
|
"Dörner-Rodeheger, Astrid, Beckum, *24.12.1968\n",
|
|
"Frombach, Ralf, Werne, *25.01.1977\n",
|
|
"Gebing, Reinhard, Oelde, *26.04.1964\n",
|
|
"Hommel, Sven, Berlin, *22.04.1979\n",
|
|
"Kramps, Mark, Witten, *04.09.1967\n",
|
|
"Kreft, Markus, Wetter, *03.04.1966\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import cv2\n",
|
|
"import pytesseract\n",
|
|
"\n",
|
|
"image_path = \"./data/Page_1.jpg\"\n",
|
|
"image = cv2.imread(image_path)\n",
|
|
"\n",
|
|
"text = str(pytesseract.image_to_string(image, config=\"--psm 6\", lang=\"deu\"))\n",
|
|
"print(text)\n",
|
|
"with open(\"./data/Page_1.txt\", \"w\", encoding=\"utf-8\") as output_file:\n",
|
|
" output_file.write(text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"['Geschäftsführer: Basaran, Erkul', 'Geschäftsführer: Cheng, Liang', 'Geschäftsführer: Lauwers, Peter']\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['Erkul Basaran', 'Liang Cheng', 'Peter Lauwers']"
|
|
]
|
|
},
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import re\n",
|
|
"\n",
|
|
"\n",
|
|
"def get_managing_directors(text: str) -> list:\n",
|
|
" managing_directors_regex = r\"Geschäftsführer: [a-zA-ZäöüÄÖÜ]*, [a-zA-ZäöüÄÖÜ]*\"\n",
|
|
" hits = re.findall(managing_directors_regex, text)\n",
|
|
" print(hits)\n",
|
|
" return [\n",
|
|
" \" \".join(hit.replace(\"Geschäftsführer: \", \"\").replace(\",\", \"\").split(\" \")[::-1])\n",
|
|
" for hit in hits\n",
|
|
" ]\n",
|
|
"\n",
|
|
"\n",
|
|
"get_managing_directors(text)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.7"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|