@ -34,7 +34,6 @@ BASE_URL = 'http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onio
# Opens Tor Browser, crawls the website, parses the saved pages, then closes Tor.
# Acts as the main method for the crawler; another function at the end of this file calls it.
def startCrawling ( ) :
opentor ( )
mktName = getMKTName ( )
driver = getAccess ( )
@ -44,25 +43,11 @@ def startCrawling():
crawlForum ( driver )
except Exception as e :
print ( driver . current_url , e )
closeto r ( driver )
closeDrive r ( driver )
new_parse ( mktName , BASE_URL , False )
# Starts the browser binary configured under the TOR section and stores the
# spawned process id in the module-level `pid`.
# Waits a fixed 7.5 seconds, then blocks until the operator presses ENTER.
def opentor():
    global pid
    # Imported at call time, as in the original code.
    from MarketPlaces.Initialization.markets_mining import config

    print(" Connecting Tor... ")
    binary_path = config.get(' TOR ', ' firefox_binary_path ')
    browser_proc = subprocess.Popen(binary_path)
    pid = browser_proc.pid
    # Fixed pause before asking the operator to confirm the connection.
    time.sleep(7.5)
    input(' Tor Connected. Press ENTER to continue \n ')
# Returns the name of the website
#return: name of site in string type
def getMKTName ( ) :
@ -79,7 +64,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closeto r ( driver ) :
def closeDrive r ( driver ) :
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -96,7 +81,6 @@ def createFFDriver():
ff_binary = FirefoxBinary ( config . get ( ' TOR ' , ' firefox_binary_path ' ) )
ff_prof = FirefoxProfile ( config . get ( ' TOR ' , ' firefox_profile_path ' ) )
ff_prof . set_preference ( " places.history.enabled " , False )
ff_prof . set_preference ( " privacy.clearOnShutdown.offlineApps " , True )
@ -107,7 +91,7 @@ def createFFDriver():
ff_prof . set_preference ( " network.cookie.lifetimePolicy " , 2 )
# ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue
# ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue
ff_prof . set_preference ( " permissions.default.image " , 1 )
ff_prof . set_preference ( " permissions.default.image " , 3 )
ff_prof . set_preference ( " browser.download.folderList " , 2 )
ff_prof . set_preference ( " browser.download.manager.showWhenStarting " , False )
ff_prof . set_preference ( " browser.helperApps.neverAsk.saveToDisk " , " text/plain " )
@ -123,6 +107,8 @@ def createFFDriver():
driver = webdriver . Firefox ( firefox_binary = ff_binary , firefox_profile = ff_prof , service = service )
driver . maximize_window ( )
return driver
@ -146,15 +132,13 @@ def login(driver):
input ( " Press ENTER when CAPTCHA is completed and page is loaded \n " )
# wait for page to show up (This Xpath may need to change based on different seed url)
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage ( page , url ) :
cleanPage = cleanHTML ( page )
def savePage ( driver , page , url ) :
cleanPage = cleanHTML ( driver , page )
filePath = getFullPathName ( url )
# filePath = getFullPathName("Hello")
os . makedirs ( os . path . dirname ( filePath ) , exist_ok = True )
with open ( filePath , ' wb ' ) as file :
file . write ( cleanPage . encode ( ' utf-8 ' ) )
# open(filePath, 'wb').write(cleanPage.encode('utf-8'))
open ( filePath , ' wb ' ) . write ( cleanPage . encode ( ' utf-8 ' ) )
return
@ -191,16 +175,16 @@ def getInterestedLinks():
links = [ ]
# # services
links . append ( ' http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870 ')
# links.append(' http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# # software & malware
# software & malware
links . append ( ' http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870 ' )
# # fraud
links . append ( ' http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870 ')
# links.append(' http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870')
# # guides
links . append ( ' http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089 ')
# links.append(' http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089')
return links
@ -227,27 +211,27 @@ def crawlForum(driver):
except :
driver . refresh ( )
html = driver . page_source
savePage ( html , link )
savePage ( driver , html , link )
list = productPages ( html )
for item in list :
itemURL = urlparse . urljoin ( BASE_URL , str ( item ) )
try :
time . sleep ( 1.5 ) # to keep from detecting click speed
# time.sleep(1.5) # to keep from detecting click speed
driver . get ( itemURL )
except :
driver . refresh ( )
savePage ( driver . page_source , item )
time . sleep ( 1.5 )
savePage ( driver , driver . page_source , item )
# time.sleep(1.5 )
driver . back ( )
# to keep from detecting click speed
# # comment out
# break
#
# # comment out
# if count == 1 :
# break
# comment out
break
# comment out
if count == 1 :
break
try :
# nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]')