mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-25 00:32:33 +02:00
hotfix: Resolve issue in scrape process (#415)
This commit is contained in:
parent
cf1c8ea508
commit
9fa46aac29
@ -12,7 +12,7 @@ from selenium.webdriver.support.ui import WebDriverWait
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def scrape(
|
||||
def scrape( # noqa: PLR0915
|
||||
query: str,
|
||||
download_dir: str,
|
||||
full_match: bool = False,
|
||||
@ -80,6 +80,9 @@ def scrape(
|
||||
)
|
||||
]
|
||||
for index, company_link in enumerate(companies_tab):
|
||||
pages_navigated = 0
|
||||
|
||||
try:
|
||||
company_name = company_names[index]
|
||||
if company_name in processed_companies or (
|
||||
full_match is True and company_name != query
|
||||
@ -87,10 +90,15 @@ def scrape(
|
||||
continue
|
||||
# Go to intermediary page
|
||||
company_link.click()
|
||||
pages_navigated += 1
|
||||
# Trigger next redirect
|
||||
driver.find_element(By.LINK_TEXT, "Registerinformationen anzeigen").click()
|
||||
driver.find_element(
|
||||
By.LINK_TEXT, "Registerinformationen anzeigen"
|
||||
).click()
|
||||
pages_navigated += 1
|
||||
# Trigger SI download
|
||||
driver.find_element(By.LINK_TEXT, "SI").click()
|
||||
pages_navigated += 1
|
||||
# Show shopping cart
|
||||
wait.until(
|
||||
ec.visibility_of_element_located(
|
||||
@ -98,24 +106,29 @@ def scrape(
|
||||
)
|
||||
)
|
||||
driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
|
||||
pages_navigated += 1
|
||||
# Get document
|
||||
elems = driver.find_elements(By.TAG_NAME, "input")
|
||||
elems[-2].click()
|
||||
|
||||
pages_navigated += 1
|
||||
wait.until(
|
||||
ec.visibility_of_element_located((By.ID, "paymentFormOverview:btnNext"))
|
||||
ec.visibility_of_element_located(
|
||||
(By.ID, "paymentFormOverview:btnNext")
|
||||
)
|
||||
)
|
||||
driver.find_element(By.ID, "paymentFormOverview:btnNext").click()
|
||||
pages_navigated += 1
|
||||
|
||||
wait.until(
|
||||
ec.visibility_of_element_located((By.LINK_TEXT, "Zum Dokumentenkorb"))
|
||||
ec.visibility_of_element_located(
|
||||
(By.LINK_TEXT, "Zum Dokumentenkorb")
|
||||
)
|
||||
)
|
||||
driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()
|
||||
|
||||
num_files = get_num_files(download_path)
|
||||
driver.find_element(By.CLASS_NAME, "download-wrapper").click()
|
||||
|
||||
try:
|
||||
#
|
||||
wait.until(
|
||||
lambda x: wait_for_download_condition(download_path, num_files) # type: ignore
|
||||
)
|
||||
@ -128,7 +141,7 @@ def scrape(
|
||||
except Exception:
|
||||
logger.warning("Exception caught in Scraping")
|
||||
finally:
|
||||
for _ in range(6):
|
||||
for _ in range(pages_navigated): # should be 6
|
||||
driver.back()
|
||||
if company_name == query and full_match is True:
|
||||
break # noqa: B012
|
||||
|
Loading…
x
Reference in New Issue
Block a user