Khoi 1 year ago
commit 4133f340cd
17 changed files with 199 additions and 393 deletions
  1. +22 -46  Forums/AbyssForum/crawler_selenium.py
  2. +12 -23  MarketPlaces/AnonymousMarketplace/crawler_selenium.py
  3. +11 -22  MarketPlaces/Apocalypse/crawler_selenium.py
  4. +11 -22  MarketPlaces/BlackPyramid/crawler_selenium.py
  5. +9 -20   MarketPlaces/CityMarket/crawler_selenium.py
  6. +9 -20   MarketPlaces/CypherMarketplace/crawler_selenium.py
  7. +21 -23  MarketPlaces/DarkFox/crawler_selenium.py
  8. +9 -21   MarketPlaces/DarkMatter/crawler_selenium.py
  9. +11 -22  MarketPlaces/DarkTor/crawler_selenium.py
  10. +9 -20  MarketPlaces/DigitalThriftShop/crawler_selenium.py
  11. +9 -20  MarketPlaces/LionMarketplace/crawler_selenium.py
  12. +10 -22 MarketPlaces/M00nkeyMarket/crawler_selenium.py
  13. +12 -23 MarketPlaces/MikesGrandStore/crawler_selenium.py
  14. +10 -21 MarketPlaces/ThiefWorld/crawler_selenium.py
  15. +10 -22 MarketPlaces/Tor2door/crawler_selenium.py
  16. +15 -26 MarketPlaces/TorBay/crawler_selenium.py
  17. +9 -20  MarketPlaces/TorMarket/crawler_selenium.py

+22 -46  Forums/AbyssForum/crawler_selenium.py

@@ -191,86 +191,66 @@ def crawlForum(driver):
print("Crawling the AbyssForum forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
'''
#variable to check if there is a next page for the topic
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
# check if there is a next page for the topics
while has_next_topic_page:
# try to access the next page of the topic
itemURL = urlparse.urljoin(baseURL, str(item))
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
savePage(driver.page_source, topic + f"page{counter}")
# if there is a next page then go and save....
# next page in the topic?
try:
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div') # /html/body/div/div[2]/div/div[2]/div/
item = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') #/html/body/div/div[2]/div/div[2]/div
# comment out
if counter == 2:
break
try:
temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/div[3]')
item = temp.find_element(by=By.CLASS_NAME, value='button button-icon-only').get_attribute('href')
if item == "":
raise NoSuchElementException
has_next_topic_page = False
else:
counter += 1
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# end of loop
for i in range(counter):
driver.back()
'''
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -280,10 +260,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling AbyssForum forum done sucessfully. Press ENTER to continue\n")


+12 -23  MarketPlaces/AnonymousMarketplace/crawler_selenium.py

@@ -202,24 +202,23 @@ def crawlForum(driver):
print("Crawling the AnonymousMarketplace market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -231,24 +230,17 @@ def crawlForum(driver):
driver.back()
# comment out
# break
break
# comment out
# if count == 20:
# count = 0
# break
if count == 1:
break
#left in in case site changes
try:
link = ""
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -258,9 +250,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling AnonymousMarketplace forum done sucessfully. Press ENTER to continue\n")


+11 -22  MarketPlaces/Apocalypse/crawler_selenium.py

@@ -216,24 +216,23 @@ def crawlForum(driver):
print("Crawling the Apocalypse market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -245,11 +244,10 @@ def crawlForum(driver):
driver.back()
# comment out
# break
break
# comment out
if count == 20:
count = 0
if count == 1:
break
try:
@@ -257,12 +255,6 @@ def crawlForum(driver):
'/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -272,9 +264,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Apocalypse forum done sucessfully. Press ENTER to continue\n")


+11 -22  MarketPlaces/BlackPyramid/crawler_selenium.py

@@ -220,26 +220,25 @@ def crawlForum(driver):
print("Crawling the BlackPyramid market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a')
clicker.click() # open tab with url
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a')
clicker.click() # open tab with url
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -255,7 +254,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@@ -263,12 +261,6 @@ def crawlForum(driver):
'/html/body/center/div[4]/div/div[3]/div[23]/div[2]/input[1]')
if clicker == "":
raise NoSuchElementException
try:
clicker.click()
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -278,9 +270,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling BlackPyramid forum done sucessfully. Press ENTER to continue\n")


+9 -20  MarketPlaces/CityMarket/crawler_selenium.py

@@ -221,24 +221,23 @@ def crawlForum(driver):
print("Crawling the CityMarket market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -254,7 +253,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@@ -262,12 +260,6 @@ def crawlForum(driver):
'/html/body/div[1]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -277,9 +269,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling CityMarket forum done sucessfully. Press ENTER to continue\n")


+9 -20  MarketPlaces/CypherMarketplace/crawler_selenium.py

@@ -214,24 +214,23 @@ def crawlForum(driver):
print("Crawling the CypherMarketplace market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -247,7 +246,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@@ -256,12 +254,6 @@ def crawlForum(driver):
link = temp.find_element(by=By.TAG_NAME, value='page-link').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -271,9 +263,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling CypherMarketplace forum done sucessfully. Press ENTER to continue\n")


+21 -23  MarketPlaces/DarkFox/crawler_selenium.py

@@ -239,46 +239,47 @@ def crawlForum(driver):
print("Crawling the DarkFox market")
linksToCrawl = getInterestedLinks()
# visited = set(linksToCrawl)
# initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
if count >= 500:
break
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = str(item)
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
count += 1
# comment out
break
# comment out
if count == 0:
break
try:
link = driver.find_element(by=By.XPATH, value=
'/html/body/main/div/div[2]/div/div[2]/div/div/div/nav/a[2]').get_attribute('href')
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
@@ -286,9 +287,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling BestCardingWorld forum done sucessfully. Press ENTER to continue\n")


+9 -21  MarketPlaces/DarkMatter/crawler_selenium.py

@@ -205,26 +205,24 @@ def crawlForum(driver):
print("Crawling the DarkMatter market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
list = productPages(html)
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
@@ -239,7 +237,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@@ -248,12 +245,6 @@ def crawlForum(driver):
link = a.get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -263,9 +254,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling DarkMatter forum done sucessfully. Press ENTER to continue\n")


+11 -22  MarketPlaces/DarkTor/crawler_selenium.py

@@ -201,24 +201,23 @@ def crawlForum(driver):
print("Crawling the DarkTor market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -230,23 +229,16 @@ def crawlForum(driver):
driver.back()
# comment out
# break
break
# comment out
if count == 30:
count = 0
if count == 1:
break
try:
link = ""
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -256,9 +248,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling DarkTor forum done sucessfully. Press ENTER to continue\n")


+9 -20  MarketPlaces/DigitalThriftShop/crawler_selenium.py

@@ -204,24 +204,23 @@ def crawlForum(driver):
print("Crawling the DigitalThriftShop market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -237,7 +236,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@@ -245,12 +243,6 @@ def crawlForum(driver):
'/html/body/div[1]/div[2]/div/div[2]/main/div[1]/nav/ul/li[5]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -260,9 +252,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling DigitalThriftShop forum done sucessfully. Press ENTER to continue\n")


+9 -20  MarketPlaces/LionMarketplace/crawler_selenium.py

@@ -212,24 +212,23 @@ def crawlForum(driver):
print("Crawling the LionMarketplace market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -245,7 +244,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@@ -253,12 +251,6 @@ def crawlForum(driver):
'/html/body/div[2]/div[2]/div/div[2]/nav/ul/li[5]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -268,9 +260,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling LionMarketplace forum done sucessfully. Press ENTER to continue\n")


+10 -22  MarketPlaces/M00nkeyMarket/crawler_selenium.py

@@ -215,24 +215,23 @@ def crawlForum(driver):
print("Crawling the M00nkeyMarket market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(BASE_URL, str(item))
@@ -244,24 +243,16 @@ def crawlForum(driver):
driver.back()
# comment out
# break
break
# comment out
if count == 1:
count = 0
break
try:
link = driver.find_element(by=By.LINK_TEXT, value='Next ›').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -271,9 +262,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling M00nkeyMarket done sucessfully. Press ENTER to continue\n")


+12 -23  MarketPlaces/MikesGrandStore/crawler_selenium.py

@@ -227,24 +227,23 @@ def crawlForum(driver):
print("Crawling the MikesGrandStore market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -256,24 +255,17 @@ def crawlForum(driver):
driver.back()
# comment out
# break
break
# comment out
# if count == 1:
# count = 0
# break
if count == 1:
break
try:
link = driver.find_element(by=By.XPATH, value=
'/html/body/div[1]/main/div/div[1]/div/div[3]/nav/ul/li[6]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -283,9 +275,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling MikesGrandStore forum done sucessfully. Press ENTER to continue\n")


+10 -21  MarketPlaces/ThiefWorld/crawler_selenium.py

@@ -211,24 +211,23 @@ def crawlForum(driver):
print("Crawling the ThiefWorld market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -243,8 +242,7 @@ def crawlForum(driver):
break
# comment out
if count == 20:
count = 0
if count == 1:
break
try:
@@ -252,12 +250,6 @@ def crawlForum(driver):
'/html/body/div/div[1]/div/div/div[2]/div[3]/div/ul/li[13]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -267,9 +259,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling ThiefWorld forum done sucessfully. Press ENTER to continue\n")


+10 -22  MarketPlaces/Tor2door/crawler_selenium.py

@@ -228,25 +228,23 @@ def crawlForum(driver):
print("Crawling the Tor2door market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -256,12 +254,12 @@ def crawlForum(driver):
driver.refresh()
savePage(driver.page_source, item)
driver.back()
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
@@ -269,15 +267,8 @@ def crawlForum(driver):
'/html/body/main/div/div/div[2]/div[11]/div/nav')
a = nav.find_element(by=By.LINK_TEXT, value="")
link = a.get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -287,9 +278,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Tor2door market done sucessfully. Press ENTER to continue\n")


+15 -26  MarketPlaces/TorBay/crawler_selenium.py

@@ -198,24 +198,23 @@ def crawlForum(driver):
print("Crawling the TorBay Market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -226,25 +225,18 @@ def crawlForum(driver):
savePage(driver.page_source, item)
driver.back()
# #comment out
# break
#
# # # comment out
# if count == 1:
# count = 0
# break
# comment out
break
# comment out
if count == 1:
break
try:
link = driver.find_element(by=By.XPATH, value=
'/html/body/section/div/div/div[2]/div/div[2]/ul/li[3]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -254,9 +246,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling TorBay forum done sucessfully. Press ENTER to continue\n")


+9 -20  MarketPlaces/TorMarket/crawler_selenium.py

@@ -201,24 +201,23 @@ def crawlForum(driver):
print("Crawling the TorMarket market")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
@@ -234,7 +233,6 @@ def crawlForum(driver):
# comment out
if count == 1:
count = 0
break
try:
@@ -242,12 +240,6 @@ def crawlForum(driver):
'/html/body/div[2]/div/div/div[1]/main/nav/ul/li[5]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
@@ -257,9 +249,6 @@ def crawlForum(driver):
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling TorMarket forum done sucessfully. Press ENTER to continue\n")

