|
|
@@ -231,9 +231,7 @@ def crawlMarket(driver): |
|
|
|
|
|
|
|
try: |
|
|
|
has_next_page = True |
|
|
|
count = 1 # Number of pages traversed |
|
|
|
maxPages = '' |
|
|
|
|
|
|
|
count = 2 |
|
|
|
|
|
|
|
while has_next_page: |
|
|
|
|
|
|
@@ -247,21 +245,6 @@ def crawlMarket(driver): |
|
|
|
html = driver.page_source |
|
|
|
savePage(driver, html, linksToCrawl[i] + f"page{count}") |
|
|
|
|
|
|
|
# Get the number of maxPages if maxPages isn't fetched yet |
|
|
|
if maxPages == '': |
|
|
|
try: |
|
|
|
# Wait 30 seconds or until element loads |
|
|
|
WebDriverWait(driver, 30).until( |
|
|
|
EC.presence_of_element_located((By.XPATH, '//div[@class="pages"]//a[last()]')) |
|
|
|
) |
|
|
|
# fetches the element that gives the total number of pages in a category |
|
|
|
maxPages = int(driver.find_element(By.XPATH, '//div[@class="pages"]//a[last()]').text) |
|
|
|
print(f"Total number of Pages: {maxPages}") |
|
|
|
except Exception as e: |
|
|
|
print(f"Element not found: {str(e)}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Parse the product/description pages |
|
|
|
list = descriptionPages(html) |
|
|
|
for item in list: |
|
|
@@ -275,11 +258,9 @@ def crawlMarket(driver): |
|
|
|
# Go back to the previous category page |
|
|
|
driver.back() |
|
|
|
|
|
|
|
# # Add a break for testing if we are checking only the first description/product page |
|
|
|
# break |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# # Add a break for testing if we are checking only the first description/product page |
|
|
|
# break |
|
|
|
# |
|
|
|
# # Add a break for testing based on how many numbers of pages to test |
|
|
|
# if count == 3: |
|
|
|
# break |
|
|
@@ -287,7 +268,8 @@ def crawlMarket(driver): |
|
|
|
# Try to find the next page |
|
|
|
try: |
|
|
|
link = f"{baseCategoryLink}/{count}/" |
|
|
|
print("\tCurrent Page :", f"{link}") |
|
|
|
driver.find_element(By.XPATH, f'//a[@href="{urlparse.urlparse(link).path}"]') |
|
|
|
|
|
|
|
if link == "": |
|
|
|
raise NoSuchElementException |
|
|
|
count += 1 |
|
|
@@ -295,11 +277,6 @@ def crawlMarket(driver): |
|
|
|
except NoSuchElementException: |
|
|
|
has_next_page = False |
|
|
|
|
|
|
|
# If reached the number of maxPages stop crawling the current category |
|
|
|
if count > maxPages: |
|
|
|
print("Max Pages reached") |
|
|
|
has_next_page = False |
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
print(link, e) |
|
|
|
i += 1 |
|
|
|