Mirror of https://github.com/fhswf/aki_prj23_transparenzregister.git (synced 2025-04-25 12:42:33 +02:00)
hotfix: Resolve issue in scrape process (#415)
This commit is contained in:
parent cf1c8ea508
commit 9fa46aac29
@@ -12,7 +12,7 @@ from selenium.webdriver.support.ui import WebDriverWait
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
def scrape(
|
def scrape( # noqa: PLR0915
|
||||||
query: str,
|
query: str,
|
||||||
download_dir: str,
|
download_dir: str,
|
||||||
full_match: bool = False,
|
full_match: bool = False,
|
||||||
@@ -80,6 +80,9 @@ def scrape(
|
|||||||
)
|
)
|
||||||
]
|
]
|
||||||
for index, company_link in enumerate(companies_tab):
|
for index, company_link in enumerate(companies_tab):
|
||||||
|
pages_navigated = 0
|
||||||
|
|
||||||
|
try:
|
||||||
company_name = company_names[index]
|
company_name = company_names[index]
|
||||||
if company_name in processed_companies or (
|
if company_name in processed_companies or (
|
||||||
full_match is True and company_name != query
|
full_match is True and company_name != query
|
||||||
@@ -87,10 +90,15 @@ def scrape(
|
|||||||
continue
|
continue
|
||||||
# Go to intermediary page
|
# Go to intermediary page
|
||||||
company_link.click()
|
company_link.click()
|
||||||
|
pages_navigated += 1
|
||||||
# Trigger next redirect
|
# Trigger next redirect
|
||||||
driver.find_element(By.LINK_TEXT, "Registerinformationen anzeigen").click()
|
driver.find_element(
|
||||||
|
By.LINK_TEXT, "Registerinformationen anzeigen"
|
||||||
|
).click()
|
||||||
|
pages_navigated += 1
|
||||||
# Trigger SI download
|
# Trigger SI download
|
||||||
driver.find_element(By.LINK_TEXT, "SI").click()
|
driver.find_element(By.LINK_TEXT, "SI").click()
|
||||||
|
pages_navigated += 1
|
||||||
# Show shopping cart
|
# Show shopping cart
|
||||||
wait.until(
|
wait.until(
|
||||||
ec.visibility_of_element_located(
|
ec.visibility_of_element_located(
|
||||||
@@ -98,24 +106,29 @@ def scrape(
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
|
driver.find_element(By.LINK_TEXT, "Dokumentenkorb ansehen").click()
|
||||||
|
pages_navigated += 1
|
||||||
# Get document
|
# Get document
|
||||||
elems = driver.find_elements(By.TAG_NAME, "input")
|
elems = driver.find_elements(By.TAG_NAME, "input")
|
||||||
elems[-2].click()
|
elems[-2].click()
|
||||||
|
pages_navigated += 1
|
||||||
wait.until(
|
wait.until(
|
||||||
ec.visibility_of_element_located((By.ID, "paymentFormOverview:btnNext"))
|
ec.visibility_of_element_located(
|
||||||
|
(By.ID, "paymentFormOverview:btnNext")
|
||||||
|
)
|
||||||
)
|
)
|
||||||
driver.find_element(By.ID, "paymentFormOverview:btnNext").click()
|
driver.find_element(By.ID, "paymentFormOverview:btnNext").click()
|
||||||
|
pages_navigated += 1
|
||||||
|
|
||||||
wait.until(
|
wait.until(
|
||||||
ec.visibility_of_element_located((By.LINK_TEXT, "Zum Dokumentenkorb"))
|
ec.visibility_of_element_located(
|
||||||
|
(By.LINK_TEXT, "Zum Dokumentenkorb")
|
||||||
|
)
|
||||||
)
|
)
|
||||||
driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()
|
driver.find_element(By.LINK_TEXT, "Zum Dokumentenkorb").click()
|
||||||
|
|
||||||
num_files = get_num_files(download_path)
|
num_files = get_num_files(download_path)
|
||||||
driver.find_element(By.CLASS_NAME, "download-wrapper").click()
|
driver.find_element(By.CLASS_NAME, "download-wrapper").click()
|
||||||
|
#
|
||||||
try:
|
|
||||||
wait.until(
|
wait.until(
|
||||||
lambda x: wait_for_download_condition(download_path, num_files) # type: ignore
|
lambda x: wait_for_download_condition(download_path, num_files) # type: ignore
|
||||||
)
|
)
|
||||||
@@ -128,7 +141,7 @@ def scrape(
|
|||||||
except Exception:
|
except Exception:
|
||||||
logger.warning("Exception caught in Scraping")
|
logger.warning("Exception caught in Scraping")
|
||||||
finally:
|
finally:
|
||||||
for _ in range(6):
|
for _ in range(pages_navigated): # should be 6
|
||||||
driver.back()
|
driver.back()
|
||||||
if company_name == query and full_match is True:
|
if company_name == query and full_match is True:
|
||||||
break # noqa: B012
|
break # noqa: B012
|
||||||
|
Loading…
x
Reference in New Issue
Block a user