Browse Source

edits to DarkDock crawler

main
westernmeadow 6 months ago
parent
commit
381e8be4e0
1 changed file with 6 additions and 29 deletions
  1. +6
    -29
      MarketPlaces/DarkDock/crawler_selenium.py

+ 6
- 29
MarketPlaces/DarkDock/crawler_selenium.py View File

@@ -231,9 +231,7 @@ def crawlMarket(driver):
try:
has_next_page = True
count = 1 # Number of pages traversed
maxPages = ''
count = 2
while has_next_page:
@@ -247,21 +245,6 @@ def crawlMarket(driver):
html = driver.page_source
savePage(driver, html, linksToCrawl[i] + f"page{count}")
# Get the number of maxPages if maxPages isn't fetched yet
if maxPages == '':
try:
# Wait 30 seconds or until element loads
WebDriverWait(driver, 30).until(
EC.presence_of_element_located((By.XPATH, '//div[@class="pages"]//a[last()]'))
)
# fetches the element that gives the total number of pages in a category
maxPages = int(driver.find_element(By.XPATH, '//div[@class="pages"]//a[last()]').text)
print(f"Total number of Pages: {maxPages}")
except Exception as e:
print(f"Element not found: {str(e)}")
# Parse the product/description pages
list = descriptionPages(html)
for item in list:
@@ -275,11 +258,9 @@ def crawlMarket(driver):
# Go back to the previous category page
driver.back()
# # Add a break for testing if we are checking only the first description/product page
# break
# # Add a break for testing if we are checking only the first description/product page
# break
#
# # Add a break for testing based on how many numbers of pages to test
# if count == 3:
# break
@@ -287,7 +268,8 @@ def crawlMarket(driver):
# Try to find the next page
try:
link = f"{baseCategoryLink}/{count}/"
print("\tCurrent Page :", f"{link}")
driver.find_element(By.XPATH, f'//a[@href="{urlparse.urlparse(link).path}"]')
if link == "":
raise NoSuchElementException
count += 1
@@ -295,11 +277,6 @@ def crawlMarket(driver):
except NoSuchElementException:
has_next_page = False
# If reached the number of maxPages stop crawling the current category
if count > maxPages:
print("Max Pages reached")
has_next_page = False
except Exception as e:
print(link, e)
i += 1


Loading…
Cancel
Save