hotfix: Resolve issue in scrape process (#415)

This commit is contained in:
Tristan Nolde 2023-11-20 20:09:11 +01:00 committed by GitHub
parent cf1c8ea508
commit 9fa46aac29
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -12,7 +12,7 @@ from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm from tqdm import tqdm
def scrape( def scrape( # noqa: PLR0915
query: str, query: str,
download_dir: str, download_dir: str,
full_match: bool = False, full_match: bool = False,
@ -80,6 +80,9 @@ def scrape(
) )
] ]
for index, company_link in enumerate(companies_tab): for index, company_link in enumerate(companies_tab):
pages_navigated = 0
try:
company_name = company_names[index] company_name = company_names[index]
if company_name in processed_companies or ( if company_name in processed_companies or (
full_match is True and company_name != query full_match is True and company_name != query
@ -87,10 +90,15 @@ def scrape(
continue continue
# Go to intermediary page # Go to intermediary page
company_link.click() company_link.click()
pages_navigated += 1
# Trigger next redirect # Trigger next redirect
driver.find_element(By.LINK_TEXT, "Registerinformationen anzeigen").click() driver.find_element(
By.LINK_TEXT, "Registerinformationen anzeigen"
).click()
pages_navigated += 1
# Trigger SI download # Trigger SI download
driver.find_element(By.LINK_TEXT, "SI").click() driver.find_element(By.LINK_TEXT, "SI").click()
pages_navigated += 1
# Show shopping cart # Show shopping cart
wait.until( wait.until(
ec.visibility_of_element_located( ec.visibility_of_element_located(
@ -98,24 +106,29 @@ def scrape(
) )
) )
driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click() driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
pages_navigated += 1
# Get document # Get document
elems = driver.find_elements(By.TAG_NAME, "input") elems = driver.find_elements(By.TAG_NAME, "input")
elems[-2].click() elems[-2].click()
pages_navigated += 1
wait.until( wait.until(
ec.visibility_of_element_located((By.ID, "paymentFormOverview:btnNext")) ec.visibility_of_element_located(
(By.ID, "paymentFormOverview:btnNext")
)
) )
driver.find_element(By.ID, "paymentFormOverview:btnNext").click() driver.find_element(By.ID, "paymentFormOverview:btnNext").click()
pages_navigated += 1
wait.until( wait.until(
ec.visibility_of_element_located((By.LINK_TEXT, "Zum Dokumentenkorb")) ec.visibility_of_element_located(
(By.LINK_TEXT, "Zum Dokumentenkorb")
)
) )
driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click() driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()
num_files = get_num_files(download_path) num_files = get_num_files(download_path)
driver.find_element(By.CLASS_NAME, "download-wrapper").click() driver.find_element(By.CLASS_NAME, "download-wrapper").click()
#
try:
wait.until( wait.until(
lambda x: wait_for_download_condition(download_path, num_files) # type: ignore lambda x: wait_for_download_condition(download_path, num_files) # type: ignore
) )
@ -128,7 +141,7 @@ def scrape(
except Exception: except Exception:
logger.warning("Exception caught in Scraping") logger.warning("Exception caught in Scraping")
finally: finally:
for _ in range(6): for _ in range(pages_navigated): # should be 6
driver.back() driver.back()
if company_name == query and full_match is True: if company_name == query and full_match is True:
break # noqa: B012 break # noqa: B012