mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-06-21 15:13:55 +02:00
checkpoint(data-ingestion): Move Unternehmensregister code to .py
This commit is contained in:
@ -1,192 +0,0 @@
|
||||
"""Unternehmensregister Scraping."""
|
||||
import glob
|
||||
import logging
|
||||
import multiprocessing
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support import expected_conditions as ec
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from tqdm import tqdm
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
|
||||
def scrape(query: str, download_dir: list[str]) -> None:
    """Fetch results from Unternehmensregister for given query.

    Runs the search in a headless Chrome session and walks every result page.
    For each result with a "Registerinformationen des Registergerichts" link
    it follows the "SI" link, orders the document via the document cart,
    downloads it and renames the download to
    ``<alphanumeric characters of the company name>.xml``.

    Args:
        query (str): Search Query (RegEx supported)
        download_dir (list[str]): Path components, relative to the current
            working directory, of the directory to place output files in
    """
    start_url = "https://www.unternehmensregister.de/ureg/"
    download_path = os.path.join(str(Path.cwd()), *download_dir)

    options = webdriver.ChromeOptions()
    preferences = {
        "profile.default_content_settings.popups": 0,
        "safebrowsing.enabled": True,
        "download": {
            "directory_upgrade": True,
            "prompt_for_download": False,
            "extensions_to_open": "",
            "default_directory": download_path,
        },
    }
    options.add_argument("--headless=new")
    options.add_experimental_option("prefs", preferences)

    driver = webdriver.Chrome(options=options)
    # Everything after the driver is created runs inside try/finally so the
    # browser session is always released, even when a lookup fails.
    try:
        driver.get(start_url)
        # Accept Cookies
        driver.find_element(
            By.XPATH, '//button[text()="Nur technisch notwendige Cookies akzeptieren"]'
        ).click()
        # Enter search query
        driver.find_element(
            By.ID, "globalSearchForm:extendedResearchCompanyName"
        ).send_keys(query)
        # Trigger search
        driver.find_element(By.ID, "globalSearchForm:btnExecuteSearchOld").click()
        # Wait for results: the URL changes once the result page is loaded
        wait = WebDriverWait(driver, 15)
        wait.until(lambda drv: drv.current_url != start_url)

        num_pages = int(
            driver.find_element(By.XPATH, '//*[@class="page_count"]').text.split(" ")[0]
        )

        # Names already downloaded in this run; only used for membership tests.
        processed_companies = set()

        for page in tqdm(range(num_pages)):
            # Find all "Registerinformationen"
            companies_tab = driver.find_elements(
                By.LINK_TEXT, "Registerinformationen des Registergerichts"
            )
            company_names = [
                elem.text
                for elem in driver.find_elements(
                    By.XPATH, '//div[@class="company_result"]/span/b'
                )
            ]
            # NOTE(review): links and names are paired purely by position; this
            # assumes every result on the page has exactly one link - confirm.
            for company_link, company_name in zip(companies_tab, company_names):
                if company_name in processed_companies:
                    continue
                # Go to intermediary page
                company_link.click()
                # Trigger next redirect
                driver.find_element(
                    By.LINK_TEXT, "Registerinformationen anzeigen"
                ).click()
                # Trigger SI download
                driver.find_element(By.LINK_TEXT, "SI").click()
                # Show shopping cart
                wait.until(
                    ec.visibility_of_element_located(
                        (By.LINK_TEXT, "Dokumentenkorb ansehen")
                    )
                )
                driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
                # Get document
                elems = driver.find_elements(By.TAG_NAME, "input")
                # presumably the second-to-last <input> is the order button;
                # verify against the page markup
                elems[-2].click()

                wait.until(
                    ec.visibility_of_element_located(
                        (By.ID, "paymentFormOverview:btnNext")
                    )
                )
                driver.find_element(By.ID, "paymentFormOverview:btnNext").click()

                wait.until(
                    ec.visibility_of_element_located(
                        (By.LINK_TEXT, "Zum Dokumentenkorb")
                    )
                )
                driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()

                # Remember the file count before the download starts so its
                # completion can be detected by the count increasing.
                num_files = get_num_files(download_path)
                driver.find_element(By.CLASS_NAME, "download-wrapper").click()

                try:
                    # WebDriverWait.until needs a callable that it polls with
                    # the driver; passing the evaluated bool raises TypeError.
                    wait.until(
                        lambda _drv: wait_for_download_condition(
                            download_path, num_files
                        )
                    )
                    file_name = "".join(e for e in company_name if e.isalnum()) + ".xml"
                    rename_latest_file(
                        download_path,
                        file_name,
                    )
                    processed_companies.add(company_name)
                except Exception:
                    # Best effort: a failed download must not abort the whole
                    # run, but keep the traceback so the cause is visible.
                    logger.warning("Exception caught in Scraping", exc_info=True)
                finally:
                    # presumably six history steps lead back to the result list
                    for _ in range(6):
                        driver.back()
            # Advance to the next result page; nothing follows the last one.
            if page < num_pages - 1:
                driver.find_element(
                    By.XPATH, '//*[@class="fas fa-angle-right"]'
                ).click()
    finally:
        # quit() ends the whole WebDriver session, not just the current window.
        driver.quit()
|
||||
|
||||
|
||||
def wait_for_download_condition(
    path: str, num_files: int, pattern: str = "*.xml"
) -> bool:
    """Check whether a directory holds more matching files than before.

    The result is a plain bool evaluated at call time. To poll it with
    Selenium's ``WebDriverWait.until`` it has to be wrapped in a callable,
    e.g. ``lambda _: wait_for_download_condition(path, num_files)``.

    Args:
        path (str): Directory path
        num_files (int): Number of matching files counted earlier
        pattern (str, optional): File pattern. Defaults to "*.xml".

    Returns:
        bool: True if the directory now contains more than ``num_files``
            files matching ``pattern``
    """
    # glob.escape keeps glob metacharacters in the directory name literal, so
    # only ``pattern`` is interpreted as a pattern.
    matches = glob.glob(os.path.join(glob.escape(path), pattern))
    return len(matches) > num_files
|
||||
|
||||
|
||||
def get_num_files(path: str, pattern: str = "*.xml") -> int:
    """Get number of files in directory.

    Args:
        path (str): Directory to scan
        pattern (str, optional): File pattern. Defaults to "*.xml".

    Returns:
        int: Number of entries directly inside ``path`` matching ``pattern``
            (0 if the directory does not exist)
    """
    # glob.escape keeps glob metacharacters in the directory name literal, so
    # only ``pattern`` is interpreted as a pattern.
    return len(glob.glob(os.path.join(glob.escape(path), pattern)))
|
||||
|
||||
|
||||
def rename_latest_file(path: str, filename: str, pattern: str = "*.xml") -> None:
    """Rename file in dir with latest change date.

    Among the files in ``path`` matching ``pattern``, the one with the
    greatest ``os.path.getctime`` value is moved to ``path/filename``. An
    existing file of that name is overwritten.

    Args:
        path (str): Dir to check
        filename (str): New name of the file (relative to ``path``)
        pattern (str, optional): File pattern. Defaults to "*.xml".

    Raises:
        ValueError: If no file in ``path`` matches ``pattern``.
    """
    # glob.escape keeps glob metacharacters in the directory name literal, so
    # only ``pattern`` is interpreted as a pattern.
    list_of_files = glob.glob(os.path.join(glob.escape(path), pattern))
    if not list_of_files:
        raise ValueError(f"No file matching {pattern!r} found in {path!r}")
    latest_download = max(list_of_files, key=os.path.getctime)
    # os.replace overwrites an existing target on every platform, whereas
    # os.rename fails on Windows when the target already exists.
    os.replace(latest_download, os.path.join(path, filename))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run one scrape per company name listed in the
    # "Toplist" sheet of the Excel file, five worker processes at a time.
    import pandas as pd

    df_relevant_companies = pd.read_excel(
        "./data/study_id42887_top-100-unternehmen-deutschland.xlsx",
        sheet_name="Toplist",
        skiprows=1,
    )
    # Drop rows without a company name
    df_relevant_companies = df_relevant_companies[df_relevant_companies["Name"].notna()]

    batch_size = 5
    # One (query, download_dir) tuple per company; each company gets its own
    # download directory named after the stripped company name.
    params = [
        (query, ["data", "Unternehmensregister", "scraping", query.strip()])
        for query in df_relevant_companies.Name
    ]

    # The context manager releases the worker processes even if a task raises.
    # starmap blocks until every task has finished, so leaving the block does
    # not cut any work short.
    with multiprocessing.Pool(processes=batch_size) as pool:
        # Map the scrape function to the parameter list using the Pool
        pool.starmap(scrape, params)
|
@ -3970,10 +3970,6 @@
|
||||
"import re\n",
|
||||
"from aki_prj23_transparenzregister.models.company import Company\n",
|
||||
"\n",
|
||||
"content = {\n",
|
||||
" \"type\": \"Person | Company\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def parse_stakeholder(data: dict) -> list:\n",
|
||||
" if \"Natuerliche_Person\" in data[\"Beteiligter\"]:\n",
|
||||
@ -4427,6 +4423,36 @@
|
||||
"service = CompanyMongoService(connector)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['Die Gesellschaft hat am 31.03.2022 mit der BayWa Aktiengesellschaft mit dem Sitz in München (Amtsgericht München HRB 4921) ']\n",
|
||||
"['Zwischen der E.ON Kraftwerke GmbH mit dem Sitz in Hannover (Amtsgericht Hannover HRB 58691) ']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"texts = [\n",
|
||||
" \"\"\"\n",
|
||||
"Die Gesellschaft hat am 31.03.2022 mit der BayWa Aktiengesellschaft mit dem Sitz in M\\u00fcnchen (Amtsgericht M\\u00fcnchen HRB 4921) als herrschender Gesellschaft einen Gewinnabf\\u00fchrungsvertrag geschlossen. \n",
|
||||
"Die Gesellschafterversammlung hat mit Beschluss vom 31.03.2022 zugestimmt.\"\n",
|
||||
"\"\"\",\n",
|
||||
" \"\"\"Zwischen der E.ON Kraftwerke GmbH mit dem Sitz in Hannover (Amtsgericht Hannover HRB 58691) als herrschender Gesellschaft und der Gesellschaft als beherrschter Gesellschaft ist am 26.10.2004 und 08.11.2004 ein Beherrschungs- und Gewinnabf\\u00fchrungsvertrag abgeschlossen worden. \n",
|
||||
"Die Gesellschafterversammlung der herrschenden Gesellschaft hat dem Vertrag am 08.11.2004 und die Gesellschafterversammlung der beherrschten Gesellschaft hat dem Vertrag am 08.11.2004 zugestimmt.\"\"\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"for text in texts:\n",
|
||||
" print(re.findall(r\"(.*)als herrschender Gesellschaft\", text))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
|
Reference in New Issue
Block a user