diff --git a/Forums/AbyssForum/crawler_selenium.py b/Forums/AbyssForum/crawler_selenium.py
index 573cd13..32678dc 100644
--- a/Forums/AbyssForum/crawler_selenium.py
+++ b/Forums/AbyssForum/crawler_selenium.py
@@ -191,86 +191,66 @@ def crawlForum(driver):
     print("Crawling the AbyssForum forum")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
     i = 0
-    count = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
 
             while has_next_page:
-                list = topicPages(html)
-                for item in list:
-                    itemURL = urlparse.urljoin(baseURL, str(item))
-                    try:
-                        driver.get(itemURL)
-                    except:
-                        driver.refresh()
-                    savePage(driver.page_source, item)
-                    driver.back()
-                '''
-                #variable to check if there is a next page for the topic
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
+                topics = topicPages(html)
+                for topic in topics:
                     has_next_topic_page = True
                     counter = 1
+                    page = topic
 
-                    # check if there is a next page for the topics
                     while has_next_topic_page:
-                        # try to access next page of th topic
-                        itemURL = urlparse.urljoin(baseURL, str(item))
+                        itemURL = urlparse.urljoin(baseURL, str(page))
                         try:
                             driver.get(itemURL)
                         except:
                             driver.refresh()
-                        savePage(driver.page_source, item)
+                        savePage(driver.page_source, topic + f"page{counter}")
 
-                        # if there is a next page then go and save....
-                        # next page in the topic?
-                        try:
-                            temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div')  # /html/body/div/div[2]/div/div[2]/div/
-                            item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')  #/html/body/div/div[2]/div/div[2]/div
+                        # comment out
+                        if counter == 2:
+                            break
+                        try:
+                            temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/div[3]')
+                            item = temp.find_element(by=By.CLASS_NAME, value='button button-icon-only').get_attribute('href')
 
                             if item == "":
                                 raise NoSuchElementException
-                                has_next_topic_page = False
-                            else:
-                                counter += 1
+                            counter += 1
+
                         except NoSuchElementException:
                             has_next_topic_page = False
 
                     # end of loop
                     for i in range(counter):
                         driver.back()
-                '''
+                # comment out
                 break
 
                 # comment out
                 if count == 1:
-                    count = 0
                     break
 
                 try:
                     link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
-
                     if link == "":
                         raise NoSuchElementException
                     count += 1
 
                 except NoSuchElementException:
@@ -280,10 +260,6 @@ def crawlForum(driver):
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
-
     input("Crawling AbyssForum forum done sucessfully. Press ENTER to continue\n")
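A review note on the new AbyssForum topic pagination: `By.CLASS_NAME` expects a single class, and Selenium's Python bindings rewrite the compound value `'button button-icon-only'` into the CSS descendant selector `.button button-icon-only`, which will not match the intended element, so every topic will fall into the `NoSuchElementException` branch after one page. A CSS selector states the intent directly. Minimal sketch, reusing the container XPath from the patch; the helper name `next_topic_href` is mine, and the assumption that the next-page anchor carries both classes comes from the patch itself:

```python
# Reviewer sketch, not part of the patch: locate the next-page link of a topic.
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def next_topic_href(driver, container_xpath='/html/body/div[2]/div[2]/div[2]/div[3]'):
    """Return the next-page href of the current topic, or None when it ends."""
    try:
        container = driver.find_element(By.XPATH, container_xpath)
        # '.button.button-icon-only' requires both classes on the same element,
        # which is what the compound CLASS_NAME value was trying to express.
        anchor = container.find_element(By.CSS_SELECTOR, '.button.button-icon-only')
        return anchor.get_attribute('href') or None
    except NoSuchElementException:
        return None
```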
diff --git a/MarketPlaces/AnonymousMarketplace/crawler_selenium.py b/MarketPlaces/AnonymousMarketplace/crawler_selenium.py
index 70c5315..d09469f 100644
--- a/MarketPlaces/AnonymousMarketplace/crawler_selenium.py
+++ b/MarketPlaces/AnonymousMarketplace/crawler_selenium.py
@@ -202,24 +202,23 @@ def crawlForum(driver):
     print("Crawling the AnonymousMarketplace market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -231,24 +230,17 @@ def crawlForum(driver):
                     driver.back()
 
                 # comment out
-                # break
+                break
 
                 # comment out
-                # if count == 20:
-                #     count = 0
-                #     break
+                if count == 1:
+                    break
 
                 #left in in case site changes
                 try:
                     link = ""
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -258,9 +250,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling AnonymousMarketplace forum done sucessfully. Press ENTER to continue\n")
diff --git a/MarketPlaces/Apocalypse/crawler_selenium.py b/MarketPlaces/Apocalypse/crawler_selenium.py
index 28d6a0f..134f4d8 100644
--- a/MarketPlaces/Apocalypse/crawler_selenium.py
+++ b/MarketPlaces/Apocalypse/crawler_selenium.py
@@ -216,24 +216,23 @@ def crawlForum(driver):
     print("Crawling the Apocalypse market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -245,11 +244,10 @@ def crawlForum(driver):
                     driver.back()
 
                 # comment out
-                # break
+                break
 
                 # comment out
-                if count == 20:
-                    count = 0
+                if count == 1:
                     break
 
                 try:
@@ -257,12 +255,6 @@
                         '/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -272,9 +264,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling Apocalypse forum done sucessfully. Press ENTER to continue\n")
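AnonymousMarketplace and Apocalypse (and every marketplace below) now share one loop shape: the listing fetch sits at the top of `while has_next_page`, `count` is reset per interested link, and the redundant fetch-after-pagination block is gone because the next iteration re-fetches anyway. A minimal sketch of that shape with the site-specific pieces passed in; `page_links`, `next_href`, and `save_page` are placeholder parameters, not helpers from this repo:

```python
# Sketch of the loop shape this patch converges on (assumed names, not repo code).
from urllib.parse import urljoin
from selenium.common.exceptions import NoSuchElementException

def crawl_listing(driver, base_url, link, page_links, next_href, save_page, max_pages=2):
    has_next_page = True
    count = 0
    while has_next_page:
        try:
            driver.get(link)            # fetch at the top of the loop...
        except Exception:
            driver.refresh()
        html = driver.page_source
        save_page(html, link)           # ...so pagination only has to update `link`

        for item in page_links(html):   # productPages()/topicPages() equivalent
            try:
                driver.get(urljoin(base_url, str(item)))
            except Exception:
                driver.refresh()
            save_page(driver.page_source, item)
            driver.back()

        if count == max_pages - 1:      # the "# comment out / if count == 1" guard
            break
        try:
            link = next_href(driver)    # site-specific locator, may raise
            if not link:
                raise NoSuchElementException("no next page")
            count += 1
        except NoSuchElementException:
            has_next_page = False
```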
diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py
index af1623a..5ce0101 100644
--- a/MarketPlaces/BlackPyramid/crawler_selenium.py
+++ b/MarketPlaces/BlackPyramid/crawler_selenium.py
@@ -220,26 +220,25 @@ def crawlForum(driver):
     print("Crawling the BlackPyramid market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a')
-                clicker.click() # open tab with url
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a')
+                    clicker.click() # open tab with url
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -255,7 +254,6 @@
 
                 # comment out
                 if count == 1:
-                    count = 0
                     break
 
                 try:
@@ -263,12 +261,6 @@
                         '/html/body/center/div[4]/div/div[3]/div[23]/div[2]/input[1]')
                     if clicker == "":
                         raise NoSuchElementException
-                    try:
-                        clicker.click()
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -278,9 +270,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling BlackPyramid forum done sucessfully. Press ENTER to continue\n")
diff --git a/MarketPlaces/CityMarket/crawler_selenium.py b/MarketPlaces/CityMarket/crawler_selenium.py
index 1384c18..ff30bf0 100644
--- a/MarketPlaces/CityMarket/crawler_selenium.py
+++ b/MarketPlaces/CityMarket/crawler_selenium.py
@@ -221,24 +221,23 @@ def crawlForum(driver):
     print("Crawling the CityMarket market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -254,7 +253,6 @@
 
                 # comment out
                 if count == 1:
-                    count = 0
                     break
 
                 try:
@@ -262,12 +260,6 @@
                         '/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -277,9 +269,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling CityMarket forum done sucessfully. Press ENTER to continue\n")
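BlackPyramid is the odd one out above: its "next" control is an `<input>` kept as a WebElement rather than an href. Note that `if clicker == "":` compares a WebElement against a string and is never true, so pagination actually ends only when `find_element` itself raises. A sketch of the click-based step with that made explicit; the helper name and the `WebDriverException` fallback are mine:

```python
# Sketch: click-based pagination where no href is available to driver.get().
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, WebDriverException

def go_next_by_click(driver, next_xpath):
    """Click the next-page control; return False when there is no next page."""
    try:
        clicker = driver.find_element(By.XPATH, next_xpath)
    except NoSuchElementException:
        return False                 # the guard that actually fires
    try:
        clicker.click()
    except WebDriverException:
        driver.refresh()             # mirror the patch's refresh-on-failure
    return True
```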
diff --git a/MarketPlaces/CypherMarketplace/crawler_selenium.py b/MarketPlaces/CypherMarketplace/crawler_selenium.py
index aa587c4..120ed32 100644
--- a/MarketPlaces/CypherMarketplace/crawler_selenium.py
+++ b/MarketPlaces/CypherMarketplace/crawler_selenium.py
@@ -214,24 +214,23 @@ def crawlForum(driver):
     print("Crawling the CypherMarketplace market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -247,7 +246,6 @@
 
                 # comment out
                 if count == 1:
-                    count = 0
                     break
 
                 try:
@@ -256,12 +254,6 @@
                     link = temp.find_element(by=By.TAG_NAME, value='page-link').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -271,9 +263,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling CypherMarketplace forum done sucessfully. Press ENTER to continue\n")
diff --git a/MarketPlaces/DarkFox/crawler_selenium.py b/MarketPlaces/DarkFox/crawler_selenium.py
index 3967567..61927d6 100644
--- a/MarketPlaces/DarkFox/crawler_selenium.py
+++ b/MarketPlaces/DarkFox/crawler_selenium.py
@@ -239,46 +239,47 @@ def crawlForum(driver):
     print("Crawling the DarkFox market")
 
     linksToCrawl = getInterestedLinks()
-    # visited = set(linksToCrawl)
-    # initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
-        if count >= 500:
-            break
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
-                    itemURL = str(item)
+                    itemURL = urlparse.urljoin(baseURL, str(item))
                     try:
                         driver.get(itemURL)
                     except:
                         driver.refresh()
                     savePage(driver.page_source, item)
                     driver.back()
-                    count += 1
+
+                # comment out
+                break
+
+                # comment out
+                if count == 0:
+                    break
 
                 try:
                     link = driver.find_element(by=By.XPATH, value=
                         '/html/body/main/div/div[2]/div/div[2]/div/div/div/nav/a[2]').get_attribute('href')
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
+                    if link == "":
+                        raise NoSuchElementException
+                    count += 1
+
                 except NoSuchElementException:
                     has_next_page = False
 
@@ -286,9 +287,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling BestCardingWorld forum done sucessfully. Press ENTER to continue\n")
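The one behavioral fix in DarkFox above is `itemURL = urlparse.urljoin(baseURL, str(item))` replacing `itemURL = str(item)`: product hrefs scraped from a listing may be relative, and `driver.get()` needs an absolute URL. `urljoin` also passes absolute URLs through untouched, so the change is safe either way. The `.onion` address below is a placeholder, not the real market URL:

```python
from urllib.parse import urljoin

baseURL = "http://darkfox.example.onion/"                   # placeholder base
print(urljoin(baseURL, "product/42"))                       # relative -> absolute
print(urljoin(baseURL, "http://darkfox.example.onion/p/7")) # absolute passes through
```

A side note on CypherMarketplace: `By.TAG_NAME, value='page-link'` searches for an element literally named `<page-link>`; if the target is the usual Bootstrap `class="page-link"` anchor (an assumption about the site's markup), the lookup belongs in `By.CLASS_NAME` or a CSS selector instead.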
diff --git a/MarketPlaces/DarkMatter/crawler_selenium.py b/MarketPlaces/DarkMatter/crawler_selenium.py
index 21f6035..6d6986a 100644
--- a/MarketPlaces/DarkMatter/crawler_selenium.py
+++ b/MarketPlaces/DarkMatter/crawler_selenium.py
@@ -205,26 +205,24 @@ def crawlForum(driver):
     print("Crawling the DarkMatter market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
 
-                list = productPages(html)
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+                list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
                     try:
@@ -239,7 +237,6 @@
 
                 # comment out
                 if count == 1:
-                    count = 0
                     break
 
                 try:
@@ -248,12 +245,6 @@
                     link = a.get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -263,9 +254,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling DarkMatter forum done sucessfully. Press ENTER to continue\n")
diff --git a/MarketPlaces/DarkTor/crawler_selenium.py b/MarketPlaces/DarkTor/crawler_selenium.py
index 3c44dc7..d84de5c 100644
--- a/MarketPlaces/DarkTor/crawler_selenium.py
+++ b/MarketPlaces/DarkTor/crawler_selenium.py
@@ -201,24 +201,23 @@ def crawlForum(driver):
     print("Crawling the DarkTor market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -230,23 +229,16 @@
                     driver.back()
 
                 # comment out
-                # break
+                break
 
                 # comment out
-                if count == 30:
-                    count = 0
+                if count == 1:
                     break
 
                 try:
                     link = ""
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -256,9 +248,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling DarkTor forum done sucessfully. Press ENTER to continue\n")
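DarkTor, like AnonymousMarketplace earlier, has no known pagination selector, so the patch keeps a deliberate stub: `link = ""` followed by `raise NoSuchElementException`, which routes single-page sites through the same exit as real pagination; note that `raise` on the exception class instantiates it implicitly. The idiom in isolation (`next_page_href` is an illustrative name, not repo code):

```python
from selenium.common.exceptions import NoSuchElementException

def next_page_href(driver):
    link = ""   # no pagination selector known yet; left in in case site changes
    if link == "":
        raise NoSuchElementException  # raising the class creates the instance
    return link
```

In `crawlForum` this raise is caught by the enclosing `except NoSuchElementException:` and simply flips `has_next_page` to False.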
diff --git a/MarketPlaces/DigitalThriftShop/crawler_selenium.py b/MarketPlaces/DigitalThriftShop/crawler_selenium.py
index e602a4f..88b460f 100644
--- a/MarketPlaces/DigitalThriftShop/crawler_selenium.py
+++ b/MarketPlaces/DigitalThriftShop/crawler_selenium.py
@@ -204,24 +204,23 @@ def crawlForum(driver):
     print("Crawling the DigitalThriftShop market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
        print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -237,7 +236,6 @@
 
                 # comment out
                 if count == 1:
-                    count = 0
                     break
 
                 try:
@@ -245,12 +243,6 @@
                         '/html/body/div[1]/div[2]/div/div[2]/main/div[1]/nav/ul/li[5]/a').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -260,9 +252,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling DigitalThriftShop forum done sucessfully. Press ENTER to continue\n")
diff --git a/MarketPlaces/LionMarketplace/crawler_selenium.py b/MarketPlaces/LionMarketplace/crawler_selenium.py
index 3310aca..d969235 100644
--- a/MarketPlaces/LionMarketplace/crawler_selenium.py
+++ b/MarketPlaces/LionMarketplace/crawler_selenium.py
@@ -212,24 +212,23 @@ def crawlForum(driver):
     print("Crawling the LionMarketplace market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -245,7 +244,6 @@
 
                 # comment out
                 if count == 1:
-                    count = 0
                     break
 
                 try:
@@ -253,12 +251,6 @@
                         '/html/body/div[2]/div[2]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -268,9 +260,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling LionMarketplace forum done sucessfully. Press ENTER to continue\n")
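DigitalThriftShop and LionMarketplace both locate "next" with a fully absolute XPath (`/html/body/...`), which breaks on any cosmetic layout change. A hedged alternative is to try semantically stable selectors first and keep the absolute path as a last resort; neither market is confirmed to expose `rel="next"`, so the first two selectors below are assumptions:

```python
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def find_next_href(driver, fallback_xpath):
    """Try stable pagination selectors before the brittle absolute XPath."""
    candidates = (
        (By.CSS_SELECTOR, 'a[rel="next"]'),   # assumed; common pagination markup
        (By.LINK_TEXT, 'Next'),               # assumed anchor text
        (By.XPATH, fallback_xpath),           # the patch's absolute path
    )
    for by, selector in candidates:
        try:
            return driver.find_element(by, selector).get_attribute('href')
        except NoSuchElementException:
            continue
    raise NoSuchElementException("no next-page link found")
```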
diff --git a/MarketPlaces/M00nkeyMarket/crawler_selenium.py b/MarketPlaces/M00nkeyMarket/crawler_selenium.py
index 2f651a5..6816cd3 100644
--- a/MarketPlaces/M00nkeyMarket/crawler_selenium.py
+++ b/MarketPlaces/M00nkeyMarket/crawler_selenium.py
@@ -217,24 +217,23 @@ def crawlForum(driver):
     print("Crawling the M00nkeyMarket market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -246,24 +245,16 @@
                     driver.back()
 
                 # comment out
-                # break
+                break
 
                 # comment out
                 if count == 1:
-                    count = 0
                     break
 
                 try:
                     link = driver.find_element(by=By.LINK_TEXT, value='Next ›').get_attribute('href')
-
                     if link == "":
                         raise NoSuchElementException
                     count += 1
 
                 except NoSuchElementException:
@@ -273,9 +264,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling M00nkeyMarket done sucessfully. Press ENTER to continue\n")
diff --git a/MarketPlaces/MikesGrandStore/crawler_selenium.py b/MarketPlaces/MikesGrandStore/crawler_selenium.py
index f24dce0..bb7d1f8 100644
--- a/MarketPlaces/MikesGrandStore/crawler_selenium.py
+++ b/MarketPlaces/MikesGrandStore/crawler_selenium.py
@@ -227,24 +227,23 @@ def crawlForum(driver):
     print("Crawling the MikesGrandStore market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -256,24 +255,17 @@
                     driver.back()
 
                 # comment out
-                # break
+                break
 
                 # comment out
-                # if count == 1:
-                #     count = 0
-                #     break
+                if count == 1:
+                    break
 
                 try:
                     link = driver.find_element(by=By.XPATH, value=
                         '/html/body/div[1]/main/div/div[1]/div/div[3]/nav/ul/li[6]/a').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -283,9 +275,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling MikesGrandStore forum done sucessfully. Press ENTER to continue\n")
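M00nkeyMarket matches the pager by its visible text, `By.LINK_TEXT, value='Next ›'`. LINK_TEXT is an exact match, including the `›` glyph, so a looser `PARTIAL_LINK_TEXT` fallback can survive minor template edits. Sketch, assuming the anchor keeps containing the word "Next":

```python
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def m00nkey_next_href(driver):
    try:
        return driver.find_element(By.LINK_TEXT, 'Next ›').get_attribute('href')
    except NoSuchElementException:
        # assumption: the anchor text always contains "Next"
        return driver.find_element(By.PARTIAL_LINK_TEXT, 'Next').get_attribute('href')
```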
diff --git a/MarketPlaces/ThiefWorld/crawler_selenium.py b/MarketPlaces/ThiefWorld/crawler_selenium.py
index 52e8f89..1111c4d 100644
--- a/MarketPlaces/ThiefWorld/crawler_selenium.py
+++ b/MarketPlaces/ThiefWorld/crawler_selenium.py
@@ -211,24 +211,23 @@ def crawlForum(driver):
     print("Crawling the ThiefWorld market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -243,8 +242,7 @@
                 break
 
                 # comment out
-                if count == 20:
-                    count = 0
+                if count == 1:
                     break
 
                 try:
@@ -252,12 +250,6 @@
                         '/html/body/div/div[1]/div/div/div[2]/div[3]/div/ul/li[13]/a').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -267,9 +259,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling ThiefWorld forum done sucessfully. Press ENTER to continue\n")
diff --git a/MarketPlaces/Tor2door/crawler_selenium.py b/MarketPlaces/Tor2door/crawler_selenium.py
index a299c71..964c574 100644
--- a/MarketPlaces/Tor2door/crawler_selenium.py
+++ b/MarketPlaces/Tor2door/crawler_selenium.py
@@ -228,25 +228,23 @@ def crawlForum(driver):
     print("Crawling the Tor2door market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
     i = 0
-    count = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
-
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -256,12 +254,12 @@
                         driver.refresh()
                     savePage(driver.page_source, item)
                     driver.back()
 
+                # comment out
                 break
 
                 # comment out
                 if count == 1:
-                    count = 0
                     break
 
                 try:
@@ -269,15 +267,8 @@
                         '/html/body/main/div/div/div[2]/div[11]/div/nav')
                     a = nav.find_element(by=By.LINK_TEXT, value="›")
                     link = a.get_attribute('href')
-
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -287,9 +278,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling Tor2door market done sucessfully. Press ENTER to continue\n")
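Tor2door's locator above is the most robust of the text-based ones: it first narrows to the pagination `<nav>`, then matches `›` only inside it, so stray arrow glyphs elsewhere on the page cannot be picked up. The same two-step scoping in isolation; the nav XPath is copied from the patch and the wrapper name is mine:

```python
from selenium.webdriver.common.by import By

def tor2door_next_href(driver):
    nav = driver.find_element(By.XPATH, '/html/body/main/div/div/div[2]/div[11]/div/nav')
    return nav.find_element(By.LINK_TEXT, '›').get_attribute('href')  # scoped search
```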
diff --git a/MarketPlaces/TorBay/crawler_selenium.py b/MarketPlaces/TorBay/crawler_selenium.py
index e59f49e..0861e82 100644
--- a/MarketPlaces/TorBay/crawler_selenium.py
+++ b/MarketPlaces/TorBay/crawler_selenium.py
@@ -198,24 +198,23 @@ def crawlForum(driver):
     print("Crawling the TorBay Market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -226,25 +225,18 @@
                     savePage(driver.page_source, item)
                     driver.back()
 
-                # #comment out
-                # break
-                #
-                # # # comment out
-                # if count == 1:
-                #     count = 0
-                #     break
+                # comment out
+                break
+
+                # comment out
+                if count == 1:
+                    break
 
                 try:
                     link = driver.find_element(by=By.XPATH, value=
                         '/html/body/section/div/div/div[2]/div/div[2]/ul/li[3]/a').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -254,9 +246,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling TorBay forum done sucessfully. Press ENTER to continue\n")
diff --git a/MarketPlaces/TorMarket/crawler_selenium.py b/MarketPlaces/TorMarket/crawler_selenium.py
index 35be864..0528a05 100644
--- a/MarketPlaces/TorMarket/crawler_selenium.py
+++ b/MarketPlaces/TorMarket/crawler_selenium.py
@@ -201,24 +201,23 @@ def crawlForum(driver):
     print("Crawling the TorMarket market")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
                 list = productPages(html)
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
@@ -234,7 +233,6 @@
 
                 # comment out
                 if count == 1:
-                    count = 0
                     break
 
                 try:
@@ -242,12 +240,6 @@
                         '/html/body/div[2]/div/div/div[1]/main/nav/ul/li[5]/a').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -257,9 +249,6 @@
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling TorMarket forum done sucessfully. Press ENTER to continue\n")
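One cross-cutting observation to close: every crawler in this patch ships with both debug breaks active, the bare `break` after the item loop and the `if count == 1: break` page guard behind it, so a run saves exactly one listing page per interested link and never paginates. If that is the intended debug default, a module-level limit would say so explicitly and remove the need to edit each file. A sketch, not part of the patch:

```python
# Hypothetical debug knob replacing the paired "# comment out" breaks.
MAX_LISTING_PAGES = 1   # None = follow pagination to the end

def reached_page_limit(count):
    """True once the number of saved listing pages hits the configured limit."""
    return MAX_LISTING_PAGES is not None and count + 1 >= MAX_LISTING_PAGES
```

Inside `while has_next_page:`, a single `if reached_page_limit(count): break` after the item loop would then replace both commented guards: `MAX_LISTING_PAGES = 1` reproduces the bare `break`, and `2` reproduces the `if count == 1` guard.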