hotfix: Resolve issue in scrape process (#415)

This commit is contained in:
Tristan Nolde 2023-11-20 20:09:11 +01:00 committed by GitHub
parent cf1c8ea508
commit 9fa46aac29
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -12,7 +12,7 @@ from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
def scrape(
def scrape( # noqa: PLR0915
query: str,
download_dir: str,
full_match: bool = False,
@@ -80,42 +80,55 @@ def scrape(
)
]
for index, company_link in enumerate(companies_tab):
company_name = company_names[index]
if company_name in processed_companies or (
full_match is True and company_name != query
):
continue
# Go to intermediary page
company_link.click()
# Trigger next redirect
driver.find_element(By.LINK_TEXT, "Registerinformationen anzeigen").click()
# Trigger SI download
driver.find_element(By.LINK_TEXT, "SI").click()
# Show shopping cart
wait.until(
ec.visibility_of_element_located(
(By.LINK_TEXT, "Dokumentenkorb ansehen")
)
)
driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
# Get document
elems = driver.find_elements(By.TAG_NAME, "input")
elems[-2].click()
wait.until(
ec.visibility_of_element_located((By.ID, "paymentFormOverview:btnNext"))
)
driver.find_element(By.ID, "paymentFormOverview:btnNext").click()
wait.until(
ec.visibility_of_element_located((By.LINK_TEXT, "Zum Dokumentenkorb"))
)
driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()
num_files = get_num_files(download_path)
driver.find_element(By.CLASS_NAME, "download-wrapper").click()
pages_navigated = 0
try:
company_name = company_names[index]
if company_name in processed_companies or (
full_match is True and company_name != query
):
continue
# Go to intermediary page
company_link.click()
pages_navigated += 1
# Trigger next redirect
driver.find_element(
By.LINK_TEXT, "Registerinformationen anzeigen"
).click()
pages_navigated += 1
# Trigger SI download
driver.find_element(By.LINK_TEXT, "SI").click()
pages_navigated += 1
# Show shopping cart
wait.until(
ec.visibility_of_element_located(
(By.LINK_TEXT, "Dokumentenkorb ansehen")
)
)
driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
pages_navigated += 1
# Get document
elems = driver.find_elements(By.TAG_NAME, "input")
elems[-2].click()
pages_navigated += 1
wait.until(
ec.visibility_of_element_located(
(By.ID, "paymentFormOverview:btnNext")
)
)
driver.find_element(By.ID, "paymentFormOverview:btnNext").click()
pages_navigated += 1
wait.until(
ec.visibility_of_element_located(
(By.LINK_TEXT, "Zum Dokumentenkorb")
)
)
driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()
num_files = get_num_files(download_path)
driver.find_element(By.CLASS_NAME, "download-wrapper").click()
#
wait.until(
lambda x: wait_for_download_condition(download_path, num_files) # type: ignore
)
@@ -128,7 +141,7 @@ def scrape(
except Exception:
logger.warning("Exception caught in Scraping")
finally:
for _ in range(6):
for _ in range(pages_navigated): # should be 6
driver.back()
if company_name == query and full_match is True:
break # noqa: B012