From 9fa46aac292f0c2cf0d3e2bbf54a6135eee955f3 Mon Sep 17 00:00:00 2001 From: Tristan Nolde Date: Mon, 20 Nov 2023 20:09:11 +0100 Subject: [PATCH] hotfix: Resolve issue in scrape process (#415) --- .../unternehmensregister/extract.py | 85 +++++++++++-------- 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py index 73f3d44..fde0e48 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py @@ -12,7 +12,7 @@ from selenium.webdriver.support.ui import WebDriverWait from tqdm import tqdm -def scrape( +def scrape( # noqa: PLR0915 query: str, download_dir: str, full_match: bool = False, @@ -80,42 +80,55 @@ def scrape( ) ] for index, company_link in enumerate(companies_tab): - company_name = company_names[index] - if company_name in processed_companies or ( - full_match is True and company_name != query - ): - continue - # Go to intermediary page - company_link.click() - # Trigger next redirect - driver.find_element(By.LINK_TEXT, "Registerinformationen anzeigen").click() - # Trigger SI download - driver.find_element(By.LINK_TEXT, "SI").click() - # Show shopping cart - wait.until( - ec.visibility_of_element_located( - (By.LINK_TEXT, "Dokumentenkorb ansehen") - ) - ) - driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click() - # Get document - elems = driver.find_elements(By.TAG_NAME, "input") - elems[-2].click() - - wait.until( - ec.visibility_of_element_located((By.ID, "paymentFormOverview:btnNext")) - ) - driver.find_element(By.ID, "paymentFormOverview:btnNext").click() - - wait.until( - ec.visibility_of_element_located((By.LINK_TEXT, "Zum Dokumentenkorb")) - ) - driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click() - - num_files = get_num_files(download_path) - driver.find_element(By.CLASS_NAME, "download-wrapper").click() + pages_navigated = 0 try: + company_name = company_names[index] + if company_name in processed_companies or ( + full_match is True and company_name != query + ): + continue + # Go to intermediary page + company_link.click() + pages_navigated += 1 + # Trigger next redirect + driver.find_element( + By.LINK_TEXT, "Registerinformationen anzeigen" + ).click() + pages_navigated += 1 + # Trigger SI download + driver.find_element(By.LINK_TEXT, "SI").click() + pages_navigated += 1 + # Show shopping cart + wait.until( + ec.visibility_of_element_located( + (By.LINK_TEXT, "Dokumentenkorb ansehen") + ) + ) + driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click() + pages_navigated += 1 + # Get document + elems = driver.find_elements(By.TAG_NAME, "input") + elems[-2].click() + pages_navigated += 1 + wait.until( + ec.visibility_of_element_located( + (By.ID, "paymentFormOverview:btnNext") + ) + ) + driver.find_element(By.ID, "paymentFormOverview:btnNext").click() + pages_navigated += 1 + + wait.until( + ec.visibility_of_element_located( + (By.LINK_TEXT, "Zum Dokumentenkorb") + ) + ) + driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click() + + num_files = get_num_files(download_path) + driver.find_element(By.CLASS_NAME, "download-wrapper").click() + # wait.until( lambda x: wait_for_download_condition(download_path, num_files) # type: ignore ) @@ -128,7 +141,7 @@ def scrape( except Exception: logger.warning("Exception caught in Scraping") finally: - for _ in range(6): + for _ in range(pages_navigated): # should be 6 driver.back() if company_name == query and full_match is True: break # noqa: B012