
added new prepare_parser.py and cleaned output for marketplaces

main | westernmeadow | 1 year ago | commit 1a1e887d35
33 changed files with 653 additions and 589 deletions
 1. +0   -0    Forums/BestCardingWorld/__init__.py
 2. +0   -0    Forums/CryptBB/__init__.py
 3. BIN        Forums/CryptBB/captcha.png
 4. +1   -1    Forums/Initialization/forums_mining.py
 5. +11  -7    Forums/Initialization/prepare_parser.py
 6. +11  -9    MarketPlaces/AnonymousMarketplace/crawler_selenium.py
 7. +5   -5    MarketPlaces/AnonymousMarketplace/parser.py
 8. +15  -13   MarketPlaces/Apocalypse/crawler_selenium.py
 9. +16  -9    MarketPlaces/BlackPyramid/crawler_selenium.py
10. +16  -14   MarketPlaces/CityMarket/crawler_selenium.py
11. +6   -4    MarketPlaces/CypherMarketplace/crawler_selenium.py
12. BIN        MarketPlaces/DarkFox/captcha.png
13. +10  -6    MarketPlaces/DarkFox/crawler_selenium.py
14. +18  -18   MarketPlaces/DarkMatter/crawler_selenium.py
15. +10  -8    MarketPlaces/DarkTor/crawler_selenium.py
16. +16  -15   MarketPlaces/DigitalThriftShop/crawler_selenium.py
17. +26  -24   MarketPlaces/HiddenMarket/crawler_selenium.py
18. +8   -1    MarketPlaces/Initialization/marketsList.txt
19. +25  -8    MarketPlaces/Initialization/markets_mining.py
20. +262 -230  MarketPlaces/Initialization/prepare_parser.py
21. +19  -17   MarketPlaces/LionMarketplace/crawler_selenium.py
22. +17  -15   MarketPlaces/M00nkeyMarket/crawler_selenium.py
23. +17  -43   MarketPlaces/MikesGrandStore/crawler_selenium.py
24. +26  -17   MarketPlaces/Nexus/crawler_selenium.py
25. +3   -3    MarketPlaces/Nexus/parser.py
26. +33  -40   MarketPlaces/RobinhoodMarket/crawler_selenium.py
27. +7   -5    MarketPlaces/ThiefWorld/crawler_selenium.py
28. BIN        MarketPlaces/Tor2door/captcha.png
29. +6   -4    MarketPlaces/Tor2door/crawler_selenium.py
30. +15  -13   MarketPlaces/TorBay/crawler_selenium.py
31. +20  -18   MarketPlaces/TorMarket/crawler_selenium.py
32. +33  -41   MarketPlaces/ViceCity/crawler_selenium.py
33. +1   -1    setup.ini

+0 -0  Forums/BestCardingWorld/__init__.py


+0 -0  Forums/CryptBB/__init__.py


BIN  Forums/CryptBB/captcha.png (Width: 200 | Height: 60 | Size: 16 KiB)

+1 -1  Forums/Initialization/forums_mining.py

@@ -135,7 +135,7 @@ if __name__ == '__main__':
     elif forum == 'Libre':
         crawlerLibre()
 
-    print("Scraping process completed!")
+    print("\nScraping process completed!")


+11 -7  Forums/Initialization/prepare_parser.py

@@ -112,8 +112,6 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
     try:
-        rw = []
-
         if forum == "BestCardingWorld":
             rw = bestcardingworld_listing_parser(soup)
         elif forum == "Cardingleaks":
@@ -128,16 +126,19 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
             rw = procrax_listing_parser(soup)
         elif forum == "Libre":
             rw = libre_listing_parser(soup)
+        else:
+            print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
+            raise Exception
 
         return rw
     except:
         incrementError()
-        print("There was a problem to read the file " + listingFile + " in the listing section!")
+        print("There was a problem to parse the file " + listingFile + " in the listing section!")
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to read the file " + listingFile + " in the Listing section.\n")
+                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
         return None
@@ -145,8 +146,6 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
     try:
-        rmm = []
-
         if forum == "BestCardingWorld":
             rmm = bestcardingworld_description_parser(soup)
         elif forum == "Cardingleaks":
@@ -161,6 +160,9 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
             rmm = procrax_description_parser(soup)
         elif forum == "Libre":
             rmm = libre_description_parser(soup)
+        else:
+            print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
+            raise Exception
 
         return rmm
     except:
@@ -239,7 +241,9 @@ def new_parse(forum, url, createLog):
             logFile = open(mainDir + f"/{CURRENT_DATE}/" + forum + "_" + CURRENT_DATE + ".log", "w")
         except:
             print("Could not open log file!")
-            raise SystemExit
+            createLog = False
+            logFile = None
+            # raise SystemExit
     else:
         logFile = None
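Note on the pattern: both parsers now fail loudly on an unmapped forum instead of silently returning the empty rw/rmm that the deleted initializers provided. A dictionary dispatch is an equivalent, more compact way to get the same guarantee; the sketch below is illustrative, with stub parsers standing in for the real Forums.*.parser imports.

    # Illustrative sketch only: stubs stand in for the real
    # bestcardingworld_listing_parser / libre_listing_parser imports.
    def bestcardingworld_listing_parser(soup):
        return []

    def libre_listing_parser(soup):
        return []

    LISTING_PARSERS = {
        "BestCardingWorld": bestcardingworld_listing_parser,
        "Libre": libre_listing_parser,
    }

    def parse_listing(forum, soup):
        if forum not in LISTING_PARSERS:
            # same effect as the new else-branch: abort loudly instead of
            # returning an empty list that hides the missing parser
            raise Exception("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
        return LISTING_PARSERS[forum](soup)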


+11 -9  MarketPlaces/AnonymousMarketplace/crawler_selenium.py

@@ -32,19 +32,19 @@ baseURL = 'http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
+    # opentor()
     mktName = getMKTName()
     driver = getAccess()
 
     if driver != 'down':
         try:
-            login(driver)
+            # login(driver)
             crawlForum(driver)
         except Exception as e:
             print(driver.current_url, e)
         closetor(driver)
 
-    # new_parse(mktName, baseURL, False)
+    new_parse(mktName, baseURL, True)
 
 
 # Opens Tor Browser
@@ -104,7 +104,7 @@ def createFFDriver():
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
     ff_prof.set_preference("network.dns.disablePrefetch", True)
     ff_prof.set_preference("network.http.sendRefererHeader", 0)
-    ff_prof.set_preference("permissions.default.image", 2)
+    ff_prof.set_preference("permissions.default.image", 1)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
     ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@@ -120,6 +120,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
@@ -185,11 +187,11 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # carding
-    links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/carding/')
+    # # carding
+    # links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/carding/')
     # # hacked paypal
-    links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacked-paypal-accounts/')
-    # # hacking services
+    # links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacked-paypal-accounts/')
+    # hacking services
     links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacking-services/')
 
     return links
@@ -250,7 +252,7 @@ def crawlForum(driver):
             print(link, e)
             i += 1
 
-    input("Crawling AnonymousMarketplace forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the AnonymousMarketplace market done.")
 
 
 # Returns 'True' if the link is a description link
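The permissions.default.image flip from 2 to 1 is the functional change here: in Firefox, 2 blocks all images and 1 loads them, so product thumbnails and image captchas render again. A condensed sketch of the driver setup these crawlers share, written against Selenium 4's Options API; the SOCKS port 9150 and the binary path are assumptions, not values from this repo's setup.ini.

    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options

    def create_tor_driver(binary_path="/usr/bin/firefox"):
        opts = Options()
        opts.binary_location = binary_path                       # assumed path
        opts.set_preference("network.proxy.type", 1)             # manual proxy settings
        opts.set_preference("network.proxy.socks", "127.0.0.1")  # Tor SOCKS host (assumed)
        opts.set_preference("network.proxy.socks_port", 9150)    # Tor Browser default (assumed)
        opts.set_preference("permissions.default.image", 1)      # 1 = load images; 2 would block them
        driver = webdriver.Firefox(options=opts)
        driver.maximize_window()  # added across crawlers in this commit
        return driver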


+5 -5  MarketPlaces/AnonymousMarketplace/parser.py

@@ -88,14 +88,14 @@ def anonymousMarketplace_listing_parser(soup: Tag):
     href = []  # 20 Product_Links
 
-    product_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-4"}).find_all("li")
+    product_list: ResultSet[Tag] = soup.find("ul", {"class": "product_list_widget"}).find_all("li")
     for item in product_list:
         item_href = item.find("a").get("href")
         href.append(item_href)
 
-        item_name = item.find("h2", {"class": "woocommerce-loop-product__title"}).text
-        name.append(cleanString(item_name.strip()))
+        item_name = item.find("span", {"class": "product-title"}).text
+        name.append(cleanString(item_name.strip()))
 
         item_rating = item.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
         rating_item.append(cleanString(item_rating.strip()))
@@ -167,10 +167,10 @@ def anonymous_links_parser(soup):
     # Returning all links that should be visited by the Crawler
     href = []
-    listing = soup.find('ul', {"class": "products columns-4"}).findAll('li')
+    listing = soup.find('ul', {"class": "product_list_widget"}).findAll('li')
 
     for a in listing:
-        bae = a.find('a', {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}, href=True)
+        bae = a.find('a', href=True)
         link = bae['href']
         href.append(link)
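The selector change tracks the market's move from WooCommerce's products columns-4 grid to its product_list_widget sidebar markup, with product names in a span instead of an h2. A toy check of the updated selectors against fabricated HTML:

    # The HTML snippet is fabricated for the test; selectors match the diff.
    from bs4 import BeautifulSoup

    html = """
    <ul class="product_list_widget">
      <li><a href="http://example.onion/p/1"><span class="product-title">Item one</span></a></li>
      <li><a href="http://example.onion/p/2"><span class="product-title">Item two</span></a></li>
    </ul>
    """
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find("ul", {"class": "product_list_widget"}).find_all("li"):
        link = item.find("a", href=True)["href"]
        name = item.find("span", {"class": "product-title"}).text.strip()
        print(name, "->", link)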


+15 -13  MarketPlaces/Apocalypse/crawler_selenium.py

@@ -34,17 +34,17 @@ baseURL = 'http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion
 def startCrawling():
     # opentor()
     mktName = getMKTName()
-    # driver = getAccess()
+    driver = getAccess()
 
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)
 
-    new_parse(mktName, baseURL, False)
+    new_parse(mktName, baseURL, True)
 
 
 # Opens Tor Browser
@@ -120,6 +120,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
@@ -201,8 +203,8 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # Hacking Services
-    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/19')
+    # # Hacking Services
+    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/19')
     # software and malware
     links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/30')
@@ -244,7 +246,7 @@ def crawlForum(driver):
                         driver.back()
 
                 # comment out
-                break
+                # break
 
                 # comment out
                 if count == 1:
@@ -264,7 +266,7 @@ def crawlForum(driver):
             print(link, e)
             i += 1
 
-    input("Crawling Apocalypse forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the Apocalypse market done.")
 
 
 # Returns 'True' if the link is a description link


+16 -9  MarketPlaces/BlackPyramid/crawler_selenium.py

@@ -26,8 +26,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse
 from MarketPlaces.BlackPyramid.parser import blackpyramid_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML
 
-config = configparser.ConfigParser()
-config.read('../../setup.ini')
-
 counter = 1
 baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/'
@@ -35,8 +33,8 @@ baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
-    # mktName = getMKTName()
+    # opentor()
+    mktName = getMKTName()
     driver = getAccess()
 
     if driver != 'down':
@@ -47,12 +45,14 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)
 
-    # new_parse(forumName, baseURL, False)
+    new_parse(mktName, baseURL, True)
 
 
 # Opens Tor Browser
 #prompts for ENTER input to continue
 def opentor():
+    from MarketPlaces.Initialization.markets_mining import config
+
     global pid
     print("Connecting Tor...")
     pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@@ -91,6 +91,8 @@ def closetor(driver):
 # Creates FireFox 'driver' and configure its 'Profile'
 # to use Tor proxy and socket
 def createFFDriver():
+    from MarketPlaces.Initialization.markets_mining import config
+
     ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
     ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@@ -119,6 +121,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
@@ -171,12 +175,14 @@ def savePage(driver, page, url):
 # Gets the full path of the page to be saved along with its appropriate file name
 #@param: raw url as crawler crawls through every site
 def getFullPathName(url):
-    from MarketPlaces.Initialization.markets_mining import CURRENT_DATE
+    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
+
+    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
     fileName = getNameFromURL(url)
     if isDescriptionLink(url):
-        fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html'
+        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
     else:
-        fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html'
+        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
     return fullPath
@@ -210,6 +216,7 @@ def getInterestedLinks():
     # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/')
     # # Services
     # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/')
+
     return links
@@ -270,7 +277,7 @@ def crawlForum(driver):
             print(link, e)
             i += 1
 
-    input("Crawling BlackPyramid forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the BlackPyramid market done.")
 
 
 # Returns 'True' if the link is a description link
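BlackPyramid drops its module-level ConfigParser, which re-read ../../setup.ini relative to whatever the working directory happened to be, and instead imports the shared config from markets_mining inside each function that needs it; deferring the import to call time also avoids a circular import with the initialization package. A self-contained toy of that pattern, with a fake module standing in for MarketPlaces.Initialization.markets_mining:

    import configparser
    import sys
    import types

    # stand-in for MarketPlaces/Initialization/markets_mining.py
    mining = types.ModuleType("markets_mining")
    mining.config = configparser.ConfigParser()
    mining.config.read_string("[TOR]\nfirefox_binary_path = /usr/bin/firefox\n")
    sys.modules["markets_mining"] = mining

    # stand-in for a crawler helper that now imports config lazily
    def firefox_binary_path():
        from markets_mining import config  # resolved at call time, not at module import
        return config.get("TOR", "firefox_binary_path")

    print(firefox_binary_path())  # /usr/bin/firefox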


+16 -14  MarketPlaces/CityMarket/crawler_selenium.py

@@ -33,8 +33,8 @@ baseURL = 'http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
-    # mktName = getMKTName()
+    # opentor()
+    mktName = getMKTName()
     driver = getAccess()
 
     if driver != 'down':
@@ -45,7 +45,7 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)
 
-    # new_parse(forumName, baseURL, False)
+    new_parse(mktName, baseURL, True)
 
 
 # Opens Tor Browser
@@ -121,6 +121,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
@@ -198,18 +200,18 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # Hiring hacker
-    links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=3')
+    # # Hiring hacker
+    # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=3')
     # virus and malware
     links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=15')
-    # ddos
-    links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=16')
-    # software
-    links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=17')
-    # botnets
-    links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=18')
-    # hacking service
-    links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=31')
+    # # ddos
+    # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=16')
+    # # software
+    # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=17')
+    # # botnets
+    # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=18')
+    # # hacking service
+    # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=31')
 
     return links
@@ -269,7 +271,7 @@ def crawlForum(driver):
             print(link, e)
             i += 1
 
-    input("Crawling CityMarket forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the CityMarket market done.")
 
 
 # Returns 'True' if the link is a description link


+6 -4  MarketPlaces/CypherMarketplace/crawler_selenium.py

@@ -32,8 +32,8 @@ baseURL = 'http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
-    # mktName = getMKTName()
+    # opentor()
+    mktName = getMKTName()
     driver = getAccess()
 
     if driver != 'down':
@@ -44,7 +44,7 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)
 
-    # new_parse(forumName, baseURL, False)
+    new_parse(mktName, baseURL, True)
 
 
 # Opens Tor Browser
@@ -120,6 +120,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
@@ -263,7 +265,7 @@ def crawlForum(driver):
             print(link, e)
             i += 1
 
-    input("Crawling CypherMarketplace forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the CypherMarketplace market done.")
 
 
 # Returns 'True' if the link is a description link


BIN  MarketPlaces/DarkFox/captcha.png (Width: 150 | Height: 150 | Size: 55 KiB)

+10 -6  MarketPlaces/DarkFox/crawler_selenium.py

@@ -30,7 +30,7 @@ baseURL = 'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
+    # opentor()
     mktName = getMKTName()
     driver = getAccess()
@@ -42,7 +42,7 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)
 
-    new_parse(mktName, baseURL, False)
+    new_parse(mktName, baseURL, True)
 
 
 # Opens Tor Browser
@@ -124,6 +124,7 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+    driver.maximize_window()
 
     return driver
@@ -145,6 +146,7 @@ def getAccess():
 # then allows for manual solving of captcha in the terminal
 #@param: current selenium web driver
 def captcha(driver):
+    '''
     # wait for captcha page show up
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div/div/form/button[1]")))
@@ -168,6 +170,9 @@ def captcha(driver):
     # click the verify(submit) button
     driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()
+    '''
+
+    input("Press ENTER when CAPTCHA is completed\n")
 
     # wait for listing page show up (This Xpath may need to change based on different seed url)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
@@ -220,8 +225,7 @@ def getInterestedLinks():
     # # Digital Products
     # links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
     # Software and Malware
-    # links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
-    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc?page=15')
+    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
     # # Services
     # links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
     # # Miscellaneous
@@ -270,7 +274,7 @@ def crawlForum(driver):
                     break
 
                 # comment out
-                if count == 0:
+                if count == 1:
                     break
 
             try:
@@ -287,7 +291,7 @@ def crawlForum(driver):
             print(link, e)
             i += 1
 
-    input("Crawling BestCardingWorld forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the DarkFox market done.")
 
 
 # Returns 'True' if the link is a description link
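DarkFox's scripted captcha branch (screenshotting the puzzle and submitting the answer) is fenced off inside a docstring, and the crawler now just blocks until a human solves the captcha in the browser. The same pattern in minimal form; the post-captcha XPath here is illustrative:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def solve_captcha_manually(driver, ready_xpath="//div[@id='main']"):
        # a human completes the challenge in the visible browser window
        input("Press ENTER when CAPTCHA is completed\n")
        # then block until the page behind the captcha has actually rendered
        WebDriverWait(driver, 100).until(
            EC.visibility_of_element_located((By.XPATH, ready_xpath)))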


+18 -18  MarketPlaces/DarkMatter/crawler_selenium.py

@@ -32,7 +32,7 @@ baseURL = 'http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
+    # opentor()
     mktName = getMKTName()
     driver = getAccess()
@@ -44,7 +44,7 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)
 
-    new_parse(mktName, baseURL, False)
+    new_parse(mktName, baseURL, True)
 
 
 # Opens Tor Browser
@@ -121,6 +121,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
@@ -185,15 +187,15 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # digital fraud software
-    links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=76')
-    # legit
-    links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=78')
+    # # digital fraud software
+    # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=76')
+    # # legit
+    # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=78')
     # # hack guides
-    links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=94')
+    # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=94')
     # # services
-    links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=117')
-    # # software/malware
+    # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=117')
+    # software/malware
     links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=121')
 
     return links
@@ -236,16 +238,14 @@ def crawlForum(driver):
                         driver.back()
                         # to keep from detecting click speed
 
-                # # comment out
-                # break
-                #
-                # # comment out
-                # if count == 1:
-                #     break
+                # comment out
+                break
+
+                # comment out
+                if count == 1:
+                    break
 
             try:
-                # nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]')
-                # a = nav.find_element(by=By.LINK_TEXT, value=">")
                 link = driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href')
                 if link == "":
                     raise NoSuchElementException
@@ -258,7 +258,7 @@ def crawlForum(driver):
             print(link, e)
             i += 1
 
-    input("Crawling DarkMatter forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the DarkMatter market done.")
 
 
 # Returns 'True' if the link is a description link


+10 -8  MarketPlaces/DarkTor/crawler_selenium.py

@@ -31,8 +31,8 @@ baseURL = 'http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
-    # mktName = getMKTName()
+    # opentor()
+    mktName = getMKTName()
     driver = getAccess()
 
     if driver != 'down':
@@ -43,7 +43,7 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)
 
-    # new_parse(forumName, baseURL, False)
+    new_parse(mktName, baseURL, True)
 
 
 # Opens Tor Browser
@@ -119,6 +119,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
@@ -186,10 +188,10 @@ def getInterestedLinks():
     # Hacking
     links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/hacking-services/')
-    # Carding
-    links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/carding/')
-    # hacked paypals
-    links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/hacked-paypal-accounts/')
+    # # Carding
+    # links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/carding/')
+    # # hacked paypals
+    # links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/hacked-paypal-accounts/')
 
     return links
@@ -248,7 +250,7 @@ def crawlForum(driver):
             print(link, e)
             i += 1
 
-    input("Crawling DarkTor forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the DarkTor market done.")
 
 
 # Returns 'True' if the link is a description link


+16 -15  MarketPlaces/DigitalThriftShop/crawler_selenium.py

@@ -34,17 +34,17 @@ baseURL = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion
 def startCrawling():
     # opentor()
     mktName = getMKTName()
-    # driver = getAccess()
+    driver = getAccess()
 
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)
 
-    new_parse(mktName, baseURL, False)
+    new_parse(mktName, baseURL, True)
 
 
 # Opens Tor Browser
@@ -91,7 +91,6 @@ def closetor(driver):
 def createFFDriver():
     from MarketPlaces.Initialization.markets_mining import config
-
     ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
 
     ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@@ -120,6 +119,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
@@ -189,8 +190,8 @@ def getInterestedLinks():
     links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/botnets/')
     # # data leak
     # links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/dataleak/')
-    # databases
-    links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/databases/')
+    # # databases
+    # links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/databases/')
     # # ransomware
     # links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/ransomware/')
     # # rats
@@ -234,10 +235,10 @@ def crawlForum(driver):
                         driver.back()
 
                 # comment out
-                # break
+                break
 
                 # comment out
-                if count == 10:
+                if count == 1:
                     break
 
             try:
@@ -254,7 +255,7 @@ def crawlForum(driver):
             print(link, e)
             i += 1
 
-    input("Crawling DigitalThriftShop forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the DigitalThriftShop market done.")
 
 
 # Returns 'True' if the link is a description link


+26 -24  MarketPlaces/HiddenMarket/crawler_selenium.py

@@ -29,7 +29,7 @@ baseURL = 'http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
+    # opentor()
     marketName = getMKTName()
     driver = getAccess()
@@ -41,7 +41,7 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)
 
-    new_parse(marketName, baseURL, False)
+    new_parse(marketName, baseURL, True)
 
 
 # Opens Tor Browser
@@ -161,6 +161,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
@@ -211,27 +213,27 @@ def getInterestedLinks():
     links = []
 
     # # Civil Software
-    links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/civil_softwares')
+    # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/civil_softwares')
     # # Tutorials - Carding
-    links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/carding')
-    # # Digital - Hacks
+    # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/carding')
+    # Digital - Hacks
     links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/hacks')
-    # Digital - Exploit Kit
-    links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/exploit_kit')
+    # # Digital - Exploit Kit
+    # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/exploit_kit')
     # # 0Day
-    links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/0day')
-    # Digital Forensics
-    links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/digital_forensics')
-    # Tutorials - Mining
-    links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/mining')
-    # Tutorials - Worms
-    links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/worms')
-    # Tutorials - Viruses
-    links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/viruses')
-    # Tutorials - Trojans
-    links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/trojans')
-    # Tutorials - Botnets
-    links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/botnets')
+    # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/0day')
+    # # Digital Forensics
+    # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/digital_forensics')
+    # # Tutorials - Mining
+    # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/mining')
+    # # Tutorials - Worms
+    # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/worms')
+    # # Tutorials - Viruses
+    # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/viruses')
+    # # Tutorials - Trojans
+    # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/trojans')
+    # # Tutorials - Botnets
+    # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/botnets')
 
     return links
@@ -275,11 +277,11 @@ def crawlForum(driver):
                         driver.back()
 
                 # comment out
-                # break
+                break
 
                 # comment out
-                # if count == 2:
-                #     break
+                if count == 1:
+                    break
 
             try:
                 pageCount += 1
@@ -296,7 +298,7 @@ def crawlForum(driver):
             print(link, e)
             i += 1
 
-    input("Crawling HiddenMarket market done sucessfully. Press ENTER to continue\n")
+    print("Crawling the HiddenMarket market done.")
 
 
 # Returns 'True' if the link is Topic link


+8 -1  MarketPlaces/Initialization/marketsList.txt

@@ -1 +1,8 @@
-HiddenMarket
+Apocalypse
+DarkMatter
+DigitalThriftShop
+HiddenMarket
+Nexus
+Robinhood
+TorBay
+ViceCity

+25 -8  MarketPlaces/Initialization/markets_mining.py

@@ -4,7 +4,6 @@ __author__ = 'DarkWeb'
 Starting point of the Darkweb Markets Mining
 '''
 
-import os
 from datetime import *
 from MarketPlaces.DarkFox.crawler_selenium import crawler as crawlerDarkFox
 from MarketPlaces.Tor2door.crawler_selenium import crawler as crawlerTor2door
@@ -24,9 +23,11 @@ from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity
 from MarketPlaces.HiddenMarket.crawler_selenium import crawler as crawlerHiddenMarket
 from MarketPlaces.RobinhoodMarket.crawler_selenium import crawler as crawlerRobinhoodMarket
 from MarketPlaces.Nexus.crawler_selenium import crawler as crawlerNexus
+from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
 
 import configparser
 import time
+import os
+import subprocess
 
 config = configparser.ConfigParser()
 config.read('../../setup.ini')
@@ -71,18 +72,34 @@ def createDirectory(mkt):
         os.mkdir(descReadDir)
 
 
+# Opens Tor Browser
+def opentor():
+    global pid
+    print("Connecting Tor...")
+    pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
+    pid = pro.pid
+    # time.sleep(7.5)
+    input('Press ENTER when Tor is connected to continue')
+    return
+
+
 if __name__ == '__main__':
+    # opentor()
+
     mktsList = getMarkets()
 
     for mkt in mktsList:
         mkt = mkt.replace('\n','')
 
-        print(f"Creating listing and description directories of {mkt} ...")
+        print("\nCreating listing and description directories ... for " + mkt)
         createDirectory(mkt)
-        time.sleep(5)
-        input("Directories created successfully. Press ENTER to continue\n")
+        print("Directories created.")
 
         if mkt == "DarkFox":
-            # for base in json["DarkFox"]["base"]:
-            #     if crawlerDarkFox(base["url"], base["categories"]):
-            #         break
             crawlerDarkFox()
         elif mkt == 'Tor2door':
             crawlerTor2door()
@@ -118,7 +135,7 @@ if __name__ == '__main__':
             crawlerRobinhoodMarket()
         elif mkt == "Nexus":
             crawlerNexus()
+        elif mkt == "CypherMarketplace":
+            crawlerCypher()
 
-    print("Scraping process completed successfully!")
+    print("\nScraping process completed!")

+262 -230  MarketPlaces/Initialization/prepare_parser.py

@@ -20,9 +20,12 @@ from MarketPlaces.TorMarket.parser import *
 from MarketPlaces.HiddenMarket.parser import *
 from MarketPlaces.RobinhoodMarket.parser import *
 from MarketPlaces.Nexus.parser import *
+from MarketPlaces.MikesGrandStore.parser import *
 from MarketPlaces.Classifier.classify_product import predict
 
+nError = 0
+
 
 def mergePages(rmm, rec):
@@ -82,13 +85,182 @@ def persist_data(url, row, cur):
     create_items(cur, row, marketPlace, vendor)
 
 
+def incrementError():
+    global nError
+    nError += 1
+
+
+def read_file(filePath, createLog, logFile):
+    try:
+        html = codecs.open(filePath.strip('\n'), encoding='utf8')
+        soup = BeautifulSoup(html, "html.parser")
+        html.close()
+        return soup
+    except:
+        try:
+            html = open(filePath.strip('\n'))
+            soup = BeautifulSoup(html, "html.parser")
+            html.close()
+            return soup
+        except:
+            incrementError()
+            print("There was a problem to read the file " + filePath)
+            if createLog:
+                logFile.write(
+                    str(nError) + ". There was a problem to read the file " + filePath + "\n")
+            return None
+
+
+def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
+    try:
+        if marketPlace == "DarkFox":
+            rw = darkfox_listing_parser(soup)
+        elif marketPlace == "Tor2door":
+            rw = tor2door_listing_parser(soup)
+        elif marketPlace == "Apocalypse":
+            rw = apocalypse_listing_parser(soup)
+        elif marketPlace == "ThiefWorld":
+            rw = thiefWorld_listing_parser(soup)
+        elif marketPlace == "AnonymousMarketplace":
+            rw = anonymousMarketplace_listing_parser(soup)
+        elif marketPlace == "ViceCity":
+            rw = vicecity_listing_parser(soup)
+        elif marketPlace == "TorBay":
+            rw = torbay_listing_parser(soup)
+        elif marketPlace == "M00nkeyMarket":
+            rw = m00nkey_listing_parser(soup)
+        elif marketPlace == "HiddenMarket":
+            rw = hiddenmarket_listing_parser(soup)
+        elif marketPlace == "DarkMatter":
+            rw = darkmatter_listing_parser(soup)
+        elif marketPlace == "DigitalThriftShop":
+            rw = digitalThriftShop_listing_parser(soup)
+        elif marketPlace == "LionMarketplace":
+            rw = lionmarketplace_listing_parser(soup)
+        elif marketPlace == "TorMarket":
+            rw = tormarket_listing_parser(soup)
+        elif marketPlace == "RobinhoodMarket":
+            rw = Robinhood_listing_parser(soup)
+        elif marketPlace == "Nexus":
+            rw = nexus_listing_parser(soup)
+        elif marketPlace == "MikesGrandStore":
+            rw = mikesGrandStore_listing_parser(soup)
+        else:
+            print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
+            raise Exception
+        return rw
+    except:
+        incrementError()
+        print("There was a problem to parse the file " + listingFile + " in the listing section!")
+        traceback.print_exc()
+        if createLog:
+            logFile.write(
+                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
+        return None
+
+
+def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
+    try:
+        if marketPlace == "DarkFox":
+            rmm = darkfox_description_parser(soup)
+        elif marketPlace == "Tor2door":
+            rmm = tor2door_description_parser(soup)
+        elif marketPlace == "Apocalypse":
+            rmm = apocalypse_description_parser(soup)
+        elif marketPlace == "ThiefWorld":
+            rmm = thiefWorld_description_parser(soup)
+        elif marketPlace == "AnonymousMarketplace":
+            rmm = anonymousMarketplace_description_parser(soup)
+        elif marketPlace == "ViceCity":
+            rmm = vicecity_description_parser(soup)
+        elif marketPlace == "TorBay":
+            rmm = torbay_description_parser(soup)
+        elif marketPlace == "M00nkeyMarket":
+            rmm = m00nkey_description_parser(soup)
+        elif marketPlace == "HiddenMarket":
+            rmm = hiddenmarket_description_parser(soup)
+        elif marketPlace == "DarkMatter":
+            rmm = darkmatter_description_parser(soup)
+        elif marketPlace == "DigitalThriftShop":
+            rmm = digitalThriftShop_description_parser(soup)
+        elif marketPlace == "LionMarketplace":
+            rmm = lionmarketplace_description_parser(soup)
+        elif marketPlace == "TorMarket":
+            rmm = tormarket_description_parser(soup)
+        elif marketPlace == "RobinhoodMarket":
+            rmm = Robinhood_description_parser(soup)
+        elif marketPlace == "Nexus":
+            rmm = nexus_description_parser(soup)
+        elif marketPlace == "MikesGrandStore":
+            rmm = mikesGrandStore_description_parser(soup)
+        else:
+            print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
+            raise Exception
+        return rmm
+    except:
+        incrementError()
+        print("There was a problem to parse the file " + descriptionFile + " in the Description section!")
+        traceback.print_exc()
+        if createLog:
+            logFile.write(
+                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
+        return None
+
+
+def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):
+    try:
+        persist_data(url, tuple(rec), cur)
+        con.commit()
+        return True
+    except:
+        con.rollback()
+
+        trace = traceback.format_exc()
+
+        if trace.find("already exists") == -1:
+            incrementError()
+            print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
+            traceback.print_exc()
+            if createLog:
+                logFile.write(
+                    str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
+            return False
+        else:
+            return True
+
+
+def move_file(filePath, createLog, logFile):
+    # source = line2.replace(os.path.basename(line2), "") + filename
+    source = filePath
+    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
+
+    try:
+        shutil.move(source, destination)
+        return True
+    except:
+        print("There was a problem to move the file " + filePath)
+        incrementError()
+        if createLog:
+            logFile.write(
+                str(nError) + ". There was a problem to move the file " + filePath + "\n")
+        return False
+
+
 def new_parse(marketPlace, url, createLog):
 
     from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
 
-    print("Parsing the " + marketPlace + " marketplace and conduct data classification to store the information in the database.")
-
-    # ini = time.time()
+    print("Parsing the " + marketPlace + " market and conduct data classification to store the information in the database.")
 
     # Connecting to the database
     con = connectDataBase()
@@ -97,271 +269,131 @@ def new_parse(marketPlace, url, createLog):
     # Creating the tables (The database should be created manually)
     create_database(cur, con)
 
-    nError = 0
-
-    lines = []  # listing pages
-    lns = []  # description pages
-    detPage = {}
-
-    #Creating the log file for each Market Place
-    if createLog:
-        if not os.path.exists("./" + marketPlace + "/Logs/" + marketPlace + "_" + CURRENT_DATE + ".log"):
-            logFile = open("./" + marketPlace + "/Logs/" + marketPlace + "_" + CURRENT_DATE + ".log", "w")
-        else:
-            print("Files of the date " + CURRENT_DATE + " from the Market Place " + marketPlace +
-                  " were already read. Delete the referent information in the Data Base and also delete the log file"
-                  " in the _Logs folder to read files from this Market Place of this date again.")
-            raise SystemExit
-
-    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + marketPlace + "/HTML_Pages")
+    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + marketPlace + "/HTML_Pages")
+
+    # Creating the log file for each Forum
+    if createLog:
+        try:
+            logFile = open(mainDir + f"/{CURRENT_DATE}/" + marketPlace + "_" + CURRENT_DATE + ".log", "w")
+        except:
+            print("Could not open log file!")
+            createLog = False
+            logFile = None
+            # raise SystemExit
+    else:
+        logFile = None
 
     # Reading the Listing Html Pages
-    for fileListing in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')):
-        lines.append(fileListing)
-
-    # Reading the Description Html Pages
-    for fileDescription in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", '*.html')):
-        lns.append(fileDescription)
-
-    # Parsing the Description Pages and put the tag's content into a dictionary (Hash table)
-    for index, line2 in enumerate(lns):
-
-        print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns)))
-
-        try:
-            html = codecs.open(line2.strip('\n'), encoding='utf8')
-            soup = BeautifulSoup(html, "html.parser")
-            html.close()
-        except:
-            try:
-                html = open(line2.strip('\n'))
-                soup = BeautifulSoup(html, "html.parser")
-                html.close()
-            except:
-                nError += 1
-                print("There was a problem to read the file " + line2 + " in the Description section!")
-                if createLog:
-                    logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section.\n")
-                continue
-
-        try:
-            if marketPlace == "DarkFox":
-                rmm = darkfox_description_parser(soup)
-            elif marketPlace == "Tor2door":
-                rmm = tor2door_description_parser(soup)
-            elif marketPlace == "Apocalypse":
-                rmm = apocalypse_description_parser(soup)
-            elif marketPlace == "ThiefWorld":
-                rmm = thiefWorld_description_parser(soup)
-            elif marketPlace == "AnonymousMarketplace":
-                rmm = anonymousMarketplace_description_parser(soup)
-            elif marketPlace == "ViceCity":
-                rmm = vicecity_description_parser(soup)
-            elif marketPlace == "TorBay":
-                rmm = torbay_description_parser(soup)
-            elif marketPlace == "M00nkeyMarket":
-                rmm = m00nkey_description_parser(soup)
-            elif marketPlace == "HiddenMarket":
-                rmm = hiddenmarket_description_parser(soup)
-            elif marketPlace == "DarkMatter":
-                rmm = darkmatter_description_parser(soup)
-            elif marketPlace == "DigitalThriftShop":
-                rmm = digitalThriftShop_description_parser(soup)
-            elif marketPlace == "LionMarketplace":
-                rmm = lionmarketplace_description_parser(soup)
-            elif marketPlace == "TorMarket":
-                rmm = tormarket_description_parser(soup)
-            elif marketPlace == "RobinhoodMarket":
-                rmm = Robinhood_description_parser(soup)
-            elif marketPlace == "Nexus":
-                rmm = nexus_description_parser(soup)
-
-            # key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2]
-            key = u"Url:" + os.path.basename(line2).replace(".html", "")
-
-            # save file address with description record in memory
-            detPage[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
-        except Exception as e:
-            raise e
-            nError += 1
-            print("There was a problem to parse the file " + line2 + " in the Description section!")
-            if createLog:
-                logFile.write(str(nError) + ". There was a problem to parse the file " + line2 + " in the Description section.\n")
-
-    # Parsing the Listing Pages and put the tag's content into a list
-    for index, line1 in enumerate(lines):
-
-        print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(line1) + "', index= " + str(index + 1) + " ... " + str(len(lines)))
-
-        readError = False
-        try:
-            html = codecs.open(line1.strip('\n'), encoding='utf8')
-            soup = BeautifulSoup(html, "html.parser")
-            html.close()
-        except:
-            try:
-                html = open(line1.strip('\n'))
-                soup = BeautifulSoup(html, "html.parser")
-                html.close()
-            except Exception as e:
-                raise e
-                nError += 1
-                print("There was a problem to read the file " + line1 + " in the Listing section!")
-                if createLog:
-                    logFile.write(str(nError) + ". There was a problem to read the file " + line1 + " in the Listing section.\n")
-                readError = True
-
-        if not readError:
-            parseError = False
-            try:
-                if marketPlace == "DarkFox":
-                    rw = darkfox_listing_parser(soup)
-                elif marketPlace == "Tor2door":
-                    rw = tor2door_listing_parser(soup)
-                elif marketPlace == "Apocalypse":
-                    rw = apocalypse_listing_parser(soup)
-                elif marketPlace == "ThiefWorld":
-                    rw = thiefWorld_listing_parser(soup)
-                elif marketPlace == "AnonymousMarketplace":
-                    rw = anonymousMarketplace_listing_parser(soup)
-                elif marketPlace == "ViceCity":
-                    rw = vicecity_listing_parser(soup)
-                elif marketPlace == "TorBay":
-                    rw = torbay_listing_parser(soup)
-                elif marketPlace == "M00nkeyMarket":
-                    rw = m00nkey_listing_parser(soup)
-                elif marketPlace == "HiddenMarket":
-                    rw = hiddenmarket_listing_parser(soup)
-                elif marketPlace == "DarkMatter":
-                    rw = darkmatter_listing_parser(soup)
-                elif marketPlace == "DigitalThriftShop":
-                    rw = digitalThriftShop_listing_parser(soup)
-                elif marketPlace == "LionMarketplace":
-                    rw = lionmarketplace_listing_parser(soup)
-                elif marketPlace == "TorMarket":
-                    rw = tormarket_listing_parser(soup)
-                elif marketPlace == "RobinhoodMarket":
-                    rw = Robinhood_listing_parser(soup)
-                elif marketPlace == "Nexus":
-                    rw = nexus_listing_parser(soup)
-                else:
-                    parseError = True
-            except Exception as e:
-                nError += 1
-                print("There was a problem to parse the file " + line1 + " in the listing section!")
-                if createLog:
-                    logFile.write(
-                        str(nError) + ". There was a problem to parse the file " + line1 + " in the Listing section.\n")
-                parseError = True
-
-            if not parseError:
-                persistError = False
-                moveError = False
-                num_in_db = 0
-                num_persisted_moved = 0
-
-                for rec in rw:
-                    rec = rec.split(',')
-
-                    # if len(detPage) > 0: #It was created here just because Zeroday Market does not have Description Pages
-                    # key = rec[23]
-
-                    # key = u"Pr:" + rec[1].upper()[:list_lim1] + u" Vendor:" + rec[18].upper()[:list_lim2]
-                    key = u"Url:" + cleanLink(rec[20])
-
-                    # if the associated description page is parsed
-                    if key in detPage:
-
-                        # rec = mergePages(detPage, rec)
-
-                        # Combining the information from Listing and Description Pages
-                        rmm = detPage[key]['rmm']
-                        rec = mergePages(rmm, rec)
-
-                        # Append to the list the classification of the product
-                        # rec.append(str(predict(rec[1], rec[5], language='markets')))
-                        rec.append(str(predict(rec[4], rec[5], language='sup_english')))
-
-                        # Persisting the information in the database
-                        try:
-                            persist_data(url, tuple(rec), cur)
-                            con.commit()
-                        except Exception as e:
-                            trace = traceback.format_exc()
-                            if trace.find("already exists") == -1:
-                                nError += 1
-                                print("There was a problem to persist the file " + detPage[key]['filename'] + " in the database!")
-                                if createLog:
-                                    logFile.write(
-                                        str(nError) + ". There was a problem to persist the file " + detPage[key]['filename'] + " in the database.\n")
-                                persistError = True
-                            con.rollback()
-
-                        if not persistError:
-                            # move description files of completed folder
-                            source = line2.replace(os.path.basename(line2), "") + detPage[key]['filename']
-                            destination = line2.replace(os.path.basename(line2), "") + r'Read/'
-                            try:
-                                shutil.move(source, destination)
-                                num_persisted_moved += 1
-                            except:
-                                print("There was a problem to move the file " + detPage[key]['filename'] + " in the Description section!")
-                                nError += 1
-                                if createLog:
-                                    logFile.write(
-                                        str(nError) + ". There was a problem to move the file " + detPage[key]['filename'] + " in the Description section!.\n")
-                                moveError = True
-
-                    # if the associated description page is not read or not parsed
-                    else:
-                        # query database
-                        # if the product already exists:
-                        #     num_in_db += 1
-                        pass
-
-                # if number of products on listing page is equal to
-                # the number of merged, persisted, and moved products plus
-                # the number of products already in the database
-                if not persistError and not moveError and len(rw) == (num_persisted_moved + num_in_db):
-                    # move listing file to completed folder
-                    source = line1
-                    destination = line1.replace(os.path.basename(line1), "") + r'Read/'
-                    try:
-                        shutil.move(source, destination)
-                    except:
-                        nError += 1
-                        print("There was a problem to move the file " + line1 + " in the Listing section!")
-                        if createLog:
-                            logFile.write(str(nError) + ". There was a problem to move the file " + line1 + " in the Listing section!.\n")
-
-    # g.close ()
-
-    if createLog:
-        logFile.close()
+    listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
+
+    for listingIndex, listingFile in enumerate(listings):
+
+        print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(listingFile) + "', index= " + str(
+            listingIndex + 1) + " ... " + str(len(listings)))
+
+        listingSoup = read_file(listingFile, createLog, logFile)
+
+        # listing flags
+        doParseListing = listingSoup is not None
+        doDescription = False
+
+        readDescriptionError = False
+        parseDescriptionError = False
+        persistDescriptionError = False
+        moveDescriptionError = False
+        findDescriptionError = False
+
+        rw = []
+
+        if doParseListing:
+
+            rw = parse_listing(marketPlace, listingFile, listingSoup, createLog, logFile)
+
+            doDescription = rw is not None
+
+        if doDescription:
+
+            nFound = 0
+
+            for rec in rw:
+
+                rec = rec.split(',')
+
+                descriptionPattern = cleanLink(rec[20]) + ".html"
+
+                # Reading the associated description Html Pages
+                descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))
+
+                nFound += len(descriptions)
+
+                for descriptionIndex, descriptionFile in enumerate(descriptions):
+
+                    print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(
+                        descriptionFile) + "', index= " + str(descriptionIndex + 1) + " ... " + str(len(descriptions)))
+
+                    descriptionSoup = read_file(descriptionFile, createLog, logFile)
+
+                    # description flags
+                    doParseDescription = descriptionSoup is not None
+                    doPersistRecord = False
+                    doMoveDescription = False
+
+                    rmm = []
+
+                    if doParseDescription:
+
+                        rmm = parse_description(marketPlace, descriptionFile, descriptionSoup, createLog, logFile)
+
+                        doPersistRecord = rmm is not None
+
+                    else:
+                        readDescriptionError = True
+                        parseDescriptionError = True
+
+                    if doPersistRecord:
+
+                        # Combining the information from Listing and Description Pages
+                        rec = mergePages(rmm, rec)
+
+                        # Append to the list the classification of the topic
+                        rec.append(str(predict(rec[4], rec[5], language='sup_english')))
+
+                        # Persisting the information in the database
+                        persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile,
+                                                        descriptionFile)
+
+                        doMoveDescription = persistSuccess
+
+                    else:
+                        parseDescriptionError = True
+
+                    if doMoveDescription:
+
+                        # move description files of completed folder
+                        moveSuccess = move_file(descriptionFile, createLog, logFile)
+
+                        if not moveSuccess:
+                            moveDescriptionError = True
+
+                    else:
+                        moveDescriptionError = True
+
+            if not (nFound > 0):
+                findDescriptionError = True
+
+                incrementError()
+                print(f"There was a problem to locate the file(s) for {listingFile} in the Description section!")
+                if createLog:
+                    logFile.write(
+                        str(nError) + f". There was a problem to locate the file(s) for {listingFile}"
+                                      f" in the Description section!\n")
# end = time.time()
if not (readDescriptionError or parseDescriptionError or persistDescriptionError
or moveDescriptionError or findDescriptionError):
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
# finalTime = float(end-ini)
if createLog:
logFile.close()
# print (marketPlace + " Parsing Perfomed Succesfully in %.2f" %finalTime + "!")
input("Parsing the " + marketPlace + " marketplace and data classification done successfully. Press ENTER to continue\n")
print("Parsing the " + marketPlace + " market and data classification done.")

+ 19
- 17
MarketPlaces/LionMarketplace/crawler_selenium.py View File

@ -31,19 +31,19 @@ baseURL = 'http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor # Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later #acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling(): def startCrawling():
opentor()
# opentor()
mktName = getMKTName() mktName = getMKTName()
driver = getAccess() driver = getAccess()
if driver != 'down': if driver != 'down':
try: try:
login(driver)
# login(driver)
crawlForum(driver) crawlForum(driver)
except Exception as e: except Exception as e:
print(driver.current_url, e) print(driver.current_url, e)
closetor(driver) closetor(driver)
new_parse(mktName, baseURL, False)
new_parse(mktName, baseURL, True)
# Opens Tor Browser # Opens Tor Browser
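Across every crawler in this commit the third argument to new_parse flips from False to True. Judging by prepare_parser.py, where a createLog flag gates the logFile writes, this turns parse-error logging on for real runs; that reading is inferred from the code, the signature is not documented here.

    # inferred signature: new_parse(marketPlace, url, createLog)
    new_parse(mktName, baseURL, True)             # parse and write a per-run error log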
@ -103,7 +103,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True) ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -119,6 +119,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver return driver
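driver.maximize_window() is also added to createFFDriver in nearly every crawler here. It is a standard Selenium call that maximizes the browser window; for scraping, a consistent full-size viewport can keep responsive layouts, and therefore the hard-coded XPaths these crawlers rely on, stable. That rationale is a guess, the diff gives none.

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
    driver.maximize_window()                      # consistent, full-size viewport
    return driver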
@ -187,12 +189,12 @@ def getInterestedLinks():
# Software/Malware # Software/Malware
links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/16') links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/16')
# Carding
links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/20')
# Hacking
links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/ba142ac0-c7e7-11ec-9bd1-fdd89c3d3f91')
# tutorial
links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/19')
# # Carding
# links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/20')
# # Hacking
# links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/ba142ac0-c7e7-11ec-9bd1-fdd89c3d3f91')
# # tutorial
# links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/19')
return links return links
@ -231,12 +233,12 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item) savePage(driver, driver.page_source, item)
driver.back() driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
# comment out
break
# comment out
if count == 1:
break
try: try:
link = driver.find_element(by=By.XPATH, value= link = driver.find_element(by=By.XPATH, value=
@ -252,7 +254,7 @@ def crawlForum(driver):
print(link, e) print(link, e)
i += 1 i += 1
input("Crawling LionMarketplace forum done sucessfully. Press ENTER to continue\n")
print("Crawling the LionMarketplace market done.")
# Returns 'True' if the link is a description link # Returns 'True' if the link is a description link


+ 17
- 15
MarketPlaces/M00nkeyMarket/crawler_selenium.py View File

@ -34,17 +34,17 @@ MARKET_NAME = 'M00nkeyMarket'
#acts like the main method for the crawler, another function at the end of this code calls this function later #acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling(): def startCrawling():
# opentor() # opentor()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
driver = getAccess()
new_parse(MARKET_NAME, BASE_URL, False)
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
new_parse(MARKET_NAME, BASE_URL, True)
# Opens Tor Browser # Opens Tor Browser
@ -120,6 +120,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver return driver
@ -203,7 +205,7 @@ def getInterestedLinks():
# software # software
links.append('http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/search/subcategories?subcategory=30') links.append('http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/search/subcategories?subcategory=30')
# # guides # # guides
links.append('http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/search/subcategories?subcategory=17')
# links.append('http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/search/subcategories?subcategory=17')
return links return links
@ -243,11 +245,11 @@ def crawlForum(driver):
driver.back() driver.back()
# comment out # comment out
# break
break
# comment out # comment out
# if count == 1:
# break
if count == 1:
break
try: try:
link = driver.find_element(by=By.LINK_TEXT, value='Next ›').get_attribute('href') link = driver.find_element(by=By.LINK_TEXT, value='Next ›').get_attribute('href')
@ -262,7 +264,7 @@ def crawlForum(driver):
print(link, e) print(link, e)
i += 1 i += 1
input("Crawling M00nkeyMarket done sucessfully. Press ENTER to continue\n")
print("Crawling the M00nkeyMarket done.")
# Returns 'True' if the link is a description link # Returns 'True' if the link is a description link
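The uncommented break statements above cap the crawl at one product and one listing page, which reads like a test limit left switched on. A hypothetical equivalent inside crawlForum's paging loop with an explicit flag (TEST_MODE is not in the project, and the item loop follows the pattern shared by these crawlers) keeps that intent visible without editing comments:

    TEST_MODE = True                              # hypothetical flag, not in the project
    for item in list:
        itemURL = urlparse.urljoin(BASE_URL, str(item))
        try:
            driver.get(itemURL)
        except:
            driver.refresh()
        savePage(driver, driver.page_source, item)
        driver.back()
        if TEST_MODE:
            break                                 # stop after the first product
    if TEST_MODE and count == 1:
        break                                     # stop after the first listing page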


+ 17
- 43
MarketPlaces/MikesGrandStore/crawler_selenium.py View File

@ -31,47 +31,19 @@ baseURL = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor # Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later #acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling(): def startCrawling():
print("Welcome to the darkweb pipeline. Do you want to run:")
print("[A] Entire pipeline\t[B] Crawler only\t[C] Parser only")
choice = input()
while choice not in {'A', 'B', 'C'}:
print("Choose the options below only!")
print("[A] Entire pipeline\t[B] Crawler only\t[C] Parser only")
choice = input()
if choice == 'A':
opentor()
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
new_parse(mktName, baseURL, False)
if choice == 'B':
opentor()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
if choice == 'C':
mktName = getMKTName()
new_parse(mktName, baseURL, False)
# opentor()
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser # Opens Tor Browser
@ -131,7 +103,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True) ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -147,6 +119,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver return driver
@ -275,7 +249,7 @@ def crawlForum(driver):
print(link, e) print(link, e)
i += 1 i += 1
input("Crawling MikesGrandStore forum done sucessfully. Press ENTER to continue\n")
print("Crawling the MikesGrandStore market done.")
# Returns 'True' if the link is a description link # Returns 'True' if the link is a description link


+ 26
- 17
MarketPlaces/Nexus/crawler_selenium.py View File

@ -31,7 +31,7 @@ baseURL = 'http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor # Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later #acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling(): def startCrawling():
opentor()
# opentor()
mktName = getMKTName() mktName = getMKTName()
driver = getAccess() driver = getAccess()
@ -40,9 +40,9 @@ def startCrawling():
crawlForum(driver) crawlForum(driver)
except Exception as e: except Exception as e:
print(driver.current_url, e) print(driver.current_url, e)
closetor(driver)
closetor(driver)
new_parse(mktName, baseURL, False)
new_parse(mktName, baseURL, True)
# Opens Tor Browser # Opens Tor Browser
#prompts for ENTER input to continue #prompts for ENTER input to continue
@ -116,6 +116,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver return driver
@ -131,8 +133,8 @@ def getAccess():
driver.close() driver.close()
return 'down' return 'down'
def savePage(page, url):
cleanPage = cleanHTML(page)
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url) filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True) os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8')) open(filePath, 'wb').write(cleanPage.encode('utf-8'))
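savePage and cleanHTML both gain a driver parameter here, and every call site in crawlForum is updated to match. The diff does not say why cleanHTML needs the driver; a plausible guess is access to live page context such as driver.current_url, but that is only an inference.

    # call sites after this commit (taken from the hunks below):
    savePage(driver, html, link)
    savePage(driver, driver.page_source, item)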
@ -173,14 +175,14 @@ def getInterestedLinks():
# Bot nets # Bot nets
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/botnets/') links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/botnets/')
# Rats
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/rats/')
# Ransomware
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/ransomware/')
# Other Malware
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/outros-malware/')
# Hacking Tools & Scripting
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/ferramentas-de-hacking-scripts/')
# # Rats
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/rats/')
# # Ransomware
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/ransomware/')
# # Other Malware
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/outros-malware/')
# # Hacking Tools & Scripting
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/ferramentas-de-hacking-scripts/')
return links return links
@ -207,7 +209,7 @@ def crawlForum(driver):
except: except:
driver.refresh() driver.refresh()
html = driver.page_source html = driver.page_source
savePage(html, link)
savePage(driver, html, link)
list = productPages(html) list = productPages(html)
for item in list: for item in list:
@ -216,9 +218,16 @@ def crawlForum(driver):
driver.get(itemURL) driver.get(itemURL)
except: except:
driver.refresh() driver.refresh()
savePage(driver.page_source, item)
savePage(driver, driver.page_source, item)
driver.back() driver.back()
# comment out
break
# comment out
if count == 1:
break
try: try:
link = driver.find_element(by=By.XPATH, value= link = driver.find_element(by=By.XPATH, value=
'/html/body/div[1]/div[2]/div/div/main/nav/ul/li[3]/a').get_attribute('href') '/html/body/div[1]/div[2]/div/div/main/nav/ul/li[3]/a').get_attribute('href')
@ -233,7 +242,7 @@ def crawlForum(driver):
print(link, e) print(link, e)
i += 1 i += 1
input("Crawling Nexus done sucessfully. Press ENTER to continue\n")
print("Crawling the Nexus market done.")
# Returns 'True' if the link is a description link # Returns 'True' if the link is a description link
@ -263,5 +272,5 @@ def productPages(html):
def crawler(): def crawler():
startCrawling() startCrawling()
print("Crawling and Parsing Nexus .... DONE!")
# print("Crawling and Parsing Nexus .... DONE!")

+ 3
- 3
MarketPlaces/Nexus/parser.py View File

@ -107,7 +107,7 @@ def nexus_listing_parser(soup):
# Finding the name of the product # Finding the name of the product
name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
name_of_product_cleaned = cleanString(name_of_product.strip()) name_of_product_cleaned = cleanString(name_of_product.strip())
print(name_of_product_cleaned)
# print(name_of_product_cleaned)
name.append(name_of_product_cleaned) name.append(name_of_product_cleaned)
#finding the URL #finding the URL
try: try:
@ -135,8 +135,8 @@ def nexus_listing_parser(soup):
qLeft.append("-1") qLeft.append("-1")
shipFrom.append("-1") shipFrom.append("-1")
shipTo.append("-1") shipTo.append("-1")
print("Done! moving onto the next product!")
print(len(shipTo))
# print("Done! moving onto the next product!")
# print(len(shipTo))
nm += 1 nm += 1
except AttributeError as e: except AttributeError as e:
print("I'm somewhere I don't belong. I'm going to leave") print("I'm somewhere I don't belong. I'm going to leave")


+ 33
- 40
MarketPlaces/RobinhoodMarket/crawler_selenium.py View File

@ -1,7 +1,7 @@
__author__ = 'chris' __author__ = 'chris'
''' '''
WeTheNorth Market Crawler (Selenium)
RobinhoodMarket Market Crawler (Selenium)
''' '''
from selenium import webdriver from selenium import webdriver
@ -23,8 +23,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.RobinhoodMarket.parser import Robinhood_links_parser from MarketPlaces.RobinhoodMarket.parser import Robinhood_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML from MarketPlaces.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1 counter = 1
baseURL = 'http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/' baseURL = 'http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/'
@ -34,15 +32,14 @@ def startCrawling():
# Opening tor beforehand gives "Tor exited during startup error" # Opening tor beforehand gives "Tor exited during startup error"
# opentor() # opentor()
marketName = getMarketName()
marketName = getMKTName()
driver = getAccess() driver = getAccess()
# Captcha
input("Press ENTER when website has loaded")
if driver != 'down': if driver != 'down':
try: try:
# Captcha
input("Press ENTER when website has loaded")
# Robinhood doesn't need login # Robinhood doesn't need login
# login(driver) # login(driver)
crawlForum(driver) crawlForum(driver)
@ -50,11 +47,13 @@ def startCrawling():
print(driver.current_url, e) print(driver.current_url, e)
closetor(driver) closetor(driver)
new_parse(marketName, baseURL, False)
new_parse(marketName, baseURL, True)
# Opens Tor Browser # Opens Tor Browser
def opentor(): def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid global pid
print("Connecting Tor...") print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@ -70,7 +69,7 @@ def login(driver):
# Returns the name of the website # Returns the name of the website
def getMarketName():
def getMKTName():
name = 'RobinhoodMarket' name = 'RobinhoodMarket'
return name return name
@ -96,6 +95,8 @@ def closetor(driver):
# Creates FireFox 'driver' and configure its 'Profile' # Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket # to use Tor proxy and socket
def createFFDriver(): def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@ -124,13 +125,14 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver return driver
def getAccess(): def getAccess():
url = getFixedURL() url = getFixedURL()
driver = createFFDriver() driver = createFFDriver()
input('Tor Connected. Press ENTER to continue\n')
try: try:
driver.get(url) driver.get(url)
return driver return driver
@ -150,12 +152,14 @@ def savePage(driver, page, url):
# Gets the full path of the page to be saved along with its appropriate file name # Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url): def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import CURRENT_DATE
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url) fileName = getNameFromURL(url)
if isDescriptionLink(url): if isDescriptionLink(url):
fullPath = r'..\RobinhoodMarket\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else: else:
fullPath = r'..\RobinhoodMarket\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath return fullPath
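One nit on the new paths: they still splice Windows separators into os.path.join (CURRENT_DATE + r'\\Description\\' + fileName), so the result is Windows-only. A fully portable equivalent, offered as an alternative rather than what the project does, lets join supply every separator:

    fullPath = os.path.join(mainDir, CURRENT_DATE, 'Description', fileName + '.html')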
@ -174,8 +178,8 @@ def getInterestedLinks():
# Hacking # Hacking
links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/hacking/') links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/hacking/')
# Other Software
links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/')
# # Other Software
# links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/')
return links return links
@ -184,25 +188,24 @@ def crawlForum(driver):
print("Crawling the Robinhood market") print("Crawling the Robinhood market")
linksToCrawl = getInterestedLinks() linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0 i = 0
count = 0
while i < len(linksToCrawl): while i < len(linksToCrawl):
link = linksToCrawl[i] link = linksToCrawl[i]
print('Crawling :', link) print('Crawling :', link)
try: try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
has_next_page = True has_next_page = True
count = 0
while has_next_page: while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html) list = productPages(html)
for item in list: for item in list:
@ -213,27 +216,20 @@ def crawlForum(driver):
driver.refresh() driver.refresh()
savePage(driver, driver.page_source, item) savePage(driver, driver.page_source, item)
driver.back() driver.back()
# comment out # comment out
# break
break
# comment out # comment out
# if count == 1:
# count = 0
# break
if count == 1:
break
# go to next page of market # go to next page of market
try: try:
nav = driver.find_element(by=By.XPATH, value="//a[@class='next page-numbers']") nav = driver.find_element(by=By.XPATH, value="//a[@class='next page-numbers']")
link = nav.get_attribute('href') link = nav.get_attribute('href')
if link == "": if link == "":
raise NoSuchElementException raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
count += 1 count += 1
except NoSuchElementException: except NoSuchElementException:
@ -243,10 +239,7 @@ def crawlForum(driver):
print(link, e) print(link, e)
i += 1 i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Robinhood market done successfully. Press ENTER to continue\n")
print("Crawling the Robinhood market done.")
# Returns 'True' if the link is Topic link # Returns 'True' if the link is Topic link
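RobinhoodMarket (and ViceCity below) also drop the module-level configparser read of ../../setup.ini in favor of importing the shared config object from markets_mining inside each function that needs it. A deferred import like this is the usual way to avoid a circular import between the initializer and the crawlers it loads, and it stops the config path from depending on the current working directory; both motives are inferred from the change rather than stated in it.

    def createFFDriver():
        from MarketPlaces.Initialization.markets_mining import config   # deferred import
        ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
        ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))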


+ 7
- 5
MarketPlaces/ThiefWorld/crawler_selenium.py View File

@ -1,7 +1,7 @@
__author__ = 'Helium' __author__ = 'Helium'
''' '''
ThiefWorld Forum Crawler (Selenium)
ThiefWorld Market Crawler (Selenium)
''' '''
from selenium import webdriver from selenium import webdriver
@ -32,7 +32,7 @@ baseURL = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor # Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later #acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling(): def startCrawling():
opentor()
# opentor()
mktName = getMKTName() mktName = getMKTName()
driver = getAccess() driver = getAccess()
@ -44,7 +44,7 @@ def startCrawling():
print(driver.current_url, e) print(driver.current_url, e)
closetor(driver) closetor(driver)
# new_parse(mktName, baseURL, False)
new_parse(mktName, baseURL, True)
# Opens Tor Browser # Opens Tor Browser
@ -104,7 +104,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True) ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -120,6 +120,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver return driver
@ -260,7 +262,7 @@ def crawlForum(driver):
print(link, e) print(link, e)
i += 1 i += 1
input("Crawling ThiefWorld forum done sucessfully. Press ENTER to continue\n")
print("Crawling the ThiefWorld market done.")
# Returns 'True' if the link is a description link # Returns 'True' if the link is a description link


BIN
MarketPlaces/Tor2door/captcha.png View File

Before After
Width: 120  |  Height: 38  |  Size: 3.3 KiB

+ 6
- 4
MarketPlaces/Tor2door/crawler_selenium.py View File

@ -29,8 +29,8 @@ baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion
# Opens Tor Browser, crawls the website # Opens Tor Browser, crawls the website
def startCrawling(): def startCrawling():
opentor()
# marketName = getMKTName()
# opentor()
marketName = getMKTName()
driver = getAccess() driver = getAccess()
if driver != 'down': if driver != 'down':
@ -41,7 +41,7 @@ def startCrawling():
print(driver.current_url, e) print(driver.current_url, e)
closetor(driver) closetor(driver)
# new_parse(marketName, baseURL, False)
new_parse(marketName, baseURL, True)
# Opens Tor Browser # Opens Tor Browser
@ -161,6 +161,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver return driver
@ -278,7 +280,7 @@ def crawlForum(driver):
print(link, e) print(link, e)
i += 1 i += 1
input("Crawling Tor2door market done sucessfully. Press ENTER to continue\n")
print("Crawling the Tor2door market done.")
# Returns 'True' if the link is Topic link # Returns 'True' if the link is Topic link


+ 15
- 13
MarketPlaces/TorBay/crawler_selenium.py View File

@ -34,17 +34,17 @@ baseURL = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion
def startCrawling(): def startCrawling():
# opentor() # opentor()
mktName = getMKTName() mktName = getMKTName()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
#
new_parse(mktName, baseURL, False)
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser # Opens Tor Browser
@ -120,6 +120,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' #the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
@ -230,7 +232,7 @@ def crawlForum(driver):
# comment out # comment out
if count == 1: if count == 1:
break
break
try: try:
link = driver.find_element(by=By.XPATH, value= link = driver.find_element(by=By.XPATH, value=
@ -246,7 +248,7 @@ def crawlForum(driver):
print(link, e) print(link, e)
i += 1 i += 1
input("Crawling TorBay forum done sucessfully. Press ENTER to continue\n")
print("Crawling the TorBay market done.")
# Returns 'True' if the link is a description link # Returns 'True' if the link is a description link


+ 20
- 18
MarketPlaces/TorMarket/crawler_selenium.py View File

@ -33,17 +33,17 @@ baseURL = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion
def startCrawling(): def startCrawling():
# opentor() # opentor()
mktName = getMKTName() mktName = getMKTName()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
driver = getAccess()
new_parse(mktName, baseURL, False)
if driver != 'down':
try:
# login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser # Opens Tor Browser
@ -103,7 +103,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True) ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False) ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -119,6 +119,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver return driver
@ -184,12 +186,12 @@ def getNameFromURL(url):
def getInterestedLinks(): def getInterestedLinks():
links = [] links = []
# Hacking Tutorials
links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/hacking/')
# # Malware
# # Hacking Tutorials
# links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/hacking/')
# Malware
links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/malware/') links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/malware/')
# # Hacking Services # # Hacking Services
links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/hacking-services/')
# links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/hacking-services/')
return links return links
@ -232,8 +234,8 @@ def crawlForum(driver):
break break
# comment out # comment out
# if count == 1:
# break
if count == 1:
break
try: try:
link = driver.find_element(by=By.XPATH, value= link = driver.find_element(by=By.XPATH, value=
@ -249,7 +251,7 @@ def crawlForum(driver):
print(link, e) print(link, e)
i += 1 i += 1
input("Crawling TorMarket forum done sucessfully. Press ENTER to continue\n")
print("Crawling the TorMarket market done.")
# Returns 'True' if the link is a description link # Returns 'True' if the link is a description link


+ 33
- 41
MarketPlaces/ViceCity/crawler_selenium.py View File

@ -25,8 +25,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.ViceCity.parser import vicecity_links_parser from MarketPlaces.ViceCity.parser import vicecity_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML from MarketPlaces.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1 counter = 1
baseURL = 'http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/' baseURL = 'http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/'
@ -34,7 +32,7 @@ baseURL = 'http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor # Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later #acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling(): def startCrawling():
opentor()
# opentor()
mktName = getMKTName() mktName = getMKTName()
driver = getAccess() driver = getAccess()
@ -46,12 +44,14 @@ def startCrawling():
print(driver.current_url, e) print(driver.current_url, e)
closetor(driver) closetor(driver)
new_parse(mktName, baseURL, False)
new_parse(mktName, baseURL, True)
# Opens Tor Browser # Opens Tor Browser
#prompts for ENTER input to continue #prompts for ENTER input to continue
def opentor(): def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid global pid
print("Connecting Tor...") print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@ -90,6 +90,8 @@ def closetor(driver):
# Creates FireFox 'driver' and configure its 'Profile' # Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket # to use Tor proxy and socket
def createFFDriver(): def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@ -118,6 +120,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' #the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
@ -140,9 +144,9 @@ def login(driver):
# wait for first captcha page to show up (This Xpath may need to change based on different seed url) # wait for first captcha page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div/form/div/div[1]"))) (By.XPATH, "/html/body/div/div/form/div/div[1]")))
input("Press Enter once captcha done (dont press done)")
input("Press Enter once captcha done")
#clicks button after captcha is inputted #clicks button after captcha is inputted
driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button').click()
# driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button').click()
#wait for login page to show up #wait for login page to show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
@ -152,9 +156,9 @@ def login(driver):
userBox.send_keys('ct1234') userBox.send_keys('ct1234')
#waits for second catpcha to be inputted by user #waits for second catpcha to be inputted by user
input("Press Enter once captcha done (dont press continue)")
input("Press Enter once captcha done")
#clicks on continue #clicks on continue
driver.find_element(by=By.XPATH, value='/html/body/div/div/div/form/input[2]').click()
# driver.find_element(by=By.XPATH, value='/html/body/div/div/div/form/input[2]').click()
#waits for password to show #waits for password to show
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
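The captcha flow is inverted in these two hunks: the script no longer clicks the form's Done/Continue buttons itself, so the operator now solves the captcha and submits it on the page before pressing Enter, after which the script simply waits for the next page's element. The resulting pattern:

    input("Press Enter once captcha done")        # operator solves and submits on the page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "...")))                       # XPath of the next page's element, as in the hunks above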
@ -220,12 +224,12 @@ def getInterestedLinks():
# Digital - Fraud Software, Has Hacking and Guides # Digital - Fraud Software, Has Hacking and Guides
links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=150') links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=150')
# Digital - Guides and Tutorials
links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=94')
# Carding Services
links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=155')
# Digital - Other (half junk half random stuff like: bots, rats, viruses, and guides)
links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=153')
# # Digital - Guides and Tutorials
# links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=94')
# # Carding Services
# links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=155')
# # Digital - Other (half junk half random stuff like: bots, rats, viruses, and guides)
# links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=153')
return links return links
@ -237,26 +241,24 @@ def crawlForum(driver):
print("Crawling the ViceCity Market") print("Crawling the ViceCity Market")
linksToCrawl = getInterestedLinks() linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
count = 0
i = 0 i = 0
while i < len(linksToCrawl): while i < len(linksToCrawl):
link = linksToCrawl[i] link = linksToCrawl[i]
print('Crawling :', link) print('Crawling :', link)
try: try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
has_next_page = True has_next_page = True
count = 0
while has_next_page: while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html) list = productPages(html)
j = 0
for item in list: for item in list:
itemURL = urlparse.urljoin(baseURL, str(item)) itemURL = urlparse.urljoin(baseURL, str(item))
try: try:
@ -268,25 +270,18 @@ def crawlForum(driver):
time.sleep(2.5) # so site doesnt crash time.sleep(2.5) # so site doesnt crash
driver.back() driver.back()
#comment out
# break
# comment out
break
# # comment out
# if count == 1:
# count = 0
# break
# comment out
if count == 1:
break
try: try:
temp = driver.find_element(by=By.CLASS_NAME, value='pagination') temp = driver.find_element(by=By.CLASS_NAME, value='pagination')
link = temp.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') link = temp.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if link == "": if link == "":
raise NoSuchElementException raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
count += 1 count += 1
except NoSuchElementException: except NoSuchElementException:
@ -296,10 +291,7 @@ def crawlForum(driver):
print(link, e) print(link, e)
i += 1 i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling ViceCity done sucessfully. Press ENTER to continue\n")
print("Crawling the ViceCity market done.")
# Returns 'True' if the link is a description link # Returns 'True' if the link is a description link


+ 1
- 1
setup.ini View File

@ -15,4 +15,4 @@ password = password
database = darkweb_markets_forums database = darkweb_markets_forums
[Encryption] [Encryption]
secret = "password"
secret = password
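This one-character-looking change matters: configparser does not strip quotation marks, so the old line handed the literal value "password" (quotes included) to config.get. A quick demonstration:

    import configparser

    config = configparser.ConfigParser()
    config.read_string('[Encryption]\nsecret = "password"\n')
    print(config.get('Encryption', 'secret'))     # prints "password" with the quotes
    config.read_string('[Encryption]\nsecret = password\n')
    print(config.get('Encryption', 'secret'))     # prints password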
