commit 34af539238 (author unknown, 1 year ago)
59 changed files with 2479 additions and 1170 deletions
  1. .idea/DW_Pipeline_Test.iml (+16, -3)
  2. .idea/misc.xml (+1, -1)
  3. Forums/AbyssForum/crawler_selenium.py (+14, -26)
  4. Forums/Altenens/crawler_selenium.py (+9, -21)
  5. Forums/BestCardingWorld/crawler_selenium.py (+11, -24)
  6. Forums/BestCardingWorld/parser.py (+26, -24)
  7. Forums/Cardingleaks/crawler_selenium.py (+15, -27)
  8. Forums/CryptBB/crawler_selenium.py (+13, -25)
  9. Forums/CryptBB/parser.py (+1, -1)
  10. Forums/DB_Connection/db_connection.py (+0, -1)
  11. Forums/HiddenAnswers/crawler_selenium.py (+13, -25)
  12. Forums/HiddenAnswers/parser.py (+5, -2)
  13. Forums/Initialization/forumsList.txt (+0, -1)
  14. Forums/Initialization/prepare_parser.py (+5, -0)
  15. Forums/Libre/crawler_selenium.py (+14, -26)
  16. Forums/OnniForums/crawler_selenium.py (+12, -25)
  17. Forums/OnniForums/parser.py (+8, -7)
  18. Forums/Procrax/crawler_selenium.py (+13, -25)
  19. MarketPlaces/AnonymousMarketplace/crawler_selenium.py (+8, -26)
  20. MarketPlaces/AnonymousMarketplace/parser.py (+17, -20)
  21. MarketPlaces/Apocalypse/crawler_selenium.py (+14, -20)
  22. MarketPlaces/Apocalypse/parser.py (+23, -8)
  23. MarketPlaces/BlackPyramid/crawler_selenium.py (+3, -18)
  24. MarketPlaces/CityMarket/crawler_selenium.py (+3, -18)
  25. MarketPlaces/CypherMarketplace/crawler_selenium.py (+3, -18)
  26. MarketPlaces/DB_Connection/db_connection.py (+46, -31)
  27. MarketPlaces/DarkBazar/crawler_selenium.py (+262, -0)
  28. MarketPlaces/DarkBazar/parser.py (+289, -0)
  29. MarketPlaces/DarkFox/crawler_selenium.py (+6, -21)
  30. MarketPlaces/DarkMatter/crawler_selenium.py (+6, -23)
  31. MarketPlaces/DarkMatter/parser.py (+106, -134)
  32. MarketPlaces/DarkTor/crawler_selenium.py (+4, -18)
  33. MarketPlaces/DigitalThriftShop/crawler_selenium.py (+3, -17)
  34. MarketPlaces/DigitalThriftShop/parser.py (+19, -6)
  35. MarketPlaces/HiddenMarket/crawler_selenium.py (+7, -21)
  36. MarketPlaces/HiddenMarket/parser.py (+21, -7)
  37. MarketPlaces/Initialization/marketsList.txt (+4, -0)
  38. MarketPlaces/Initialization/markets_mining.py (+6, -0)
  39. MarketPlaces/Initialization/prepare_parser.py (+16, -2)
  40. MarketPlaces/LionMarketplace/crawler_selenium.py (+4, -19)
  41. MarketPlaces/LionMarketplace/parser.py (+120, -153)
  42. MarketPlaces/M00nkeyMarket/crawler_selenium.py (+4, -19)
  43. MarketPlaces/MetaVerseMarket/crawler_selenium.py (+291, -0)
  44. MarketPlaces/MetaVerseMarket/parser.py (+285, -0)
  45. MarketPlaces/MikesGrandStore/crawler_selenium.py (+4, -18)
  46. MarketPlaces/Nexus/crawler_selenium.py (+23, -27)
  47. MarketPlaces/Nexus/parser.py (+66, -46)
  48. MarketPlaces/PabloEscobarMarket/crawler_selenium.py (+256, -0)
  49. MarketPlaces/PabloEscobarMarket/parser.py (+241, -0)
  50. MarketPlaces/RobinhoodMarket/crawler_selenium.py (+11, -25)
  51. MarketPlaces/RobinhoodMarket/parser.py (+35, -14)
  52. MarketPlaces/ThiefWorld/crawler_selenium.py (+5, -20)
  53. MarketPlaces/ThiefWorld/parser.py (+1, -1)
  54. MarketPlaces/Tor2door/crawler_selenium.py (+2, -16)
  55. MarketPlaces/TorBay/crawler_selenium.py (+4, -18)
  56. MarketPlaces/TorMarket/crawler_selenium.py (+10, -27)
  57. MarketPlaces/TorMarket/parser.py (+49, -52)
  58. MarketPlaces/Utilities/utilities.py (+19, -21)
  59. MarketPlaces/ViceCity/crawler_selenium.py (+7, -22)

.idea/DW_Pipeline_Test.iml  (+16, -3)

@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="jdk" jdkName="C:\Users\calsyslab\anaconda3" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">
@ -10,10 +10,23 @@
<list>
<option value="$MODULE_DIR$/Forums/BestCardingWorld" />
<option value="$MODULE_DIR$/Forums/CryptBB" />
<option value="$MODULE_DIR$/MarketPlaces/DarkFox" />
<option value="$MODULE_DIR$/MarketPlaces/Tor2door" />
<option value="$MODULE_DIR$/Forums/OnniForums" />
<option value="$MODULE_DIR$/MarketPlaces/ThiefWorld" />
<option value="$MODULE_DIR$/MarketPlaces/Apocalypse" />
<option value="$MODULE_DIR$/MarketPlaces/DarkMatter" />
<option value="$MODULE_DIR$/MarketPlaces/DigitalThriftShop" />
<option value="$MODULE_DIR$/MarketPlaces/HiddenMarket" />
<option value="$MODULE_DIR$/MarketPlaces/LionMarketplace" />
<option value="$MODULE_DIR$/MarketPlaces/Nexus" />
<option value="$MODULE_DIR$/MarketPlaces/RobinhoodMarket" />
<option value="$MODULE_DIR$/MarketPlaces/TorBay" />
<option value="$MODULE_DIR$/MarketPlaces/TorMarket" />
<option value="$MODULE_DIR$/MarketPlaces/ViceCity" />
<option value="$MODULE_DIR$/Forums/Altenens" />
<option value="$MODULE_DIR$/Forums/Cardingleaks" />
<option value="$MODULE_DIR$/Forums/HiddenAnswers" />
<option value="$MODULE_DIR$/Forums/Libre" />
<option value="$MODULE_DIR$/Forums/Procrax" />
</list>
</option>
</component>

.idea/misc.xml  (+1, -1)

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="C:\Users\John Wick\anaconda3" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="C:\Users\calsyslab\anaconda3" project-jdk-type="Python SDK" />
</project>

Forums/AbyssForum/crawler_selenium.py  (+14, -26)

@ -30,32 +30,18 @@ baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
forumName = getForumName()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
driver = getAccess()
new_parse(forumName, baseURL, True)
# Opens Tor Browser
def opentor():
from Forums.Initialization.forums_mining import config
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
new_parse(forumName, baseURL, True)
# Login using premade account credentials and do login captcha manually
@ -78,7 +64,7 @@ def getFixedURL():
# Closes Tor Browser
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -121,6 +107,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
@ -241,14 +229,14 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:
break
try:
link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
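The same refactor recurs across every forum crawler in this commit: the interactive opentor() step is dropped, closetor() is renamed to closeDriver(), and the previously commented-out crawl flow is re-enabled. Reassembled from the hunks above, the entry point now reads roughly as follows (a sketch of the common shape, not a verbatim copy of any one file):

def startCrawling():
    forumName = getForumName()
    driver = getAccess()

    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(forumName, baseURL, True)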


Forums/Altenens/crawler_selenium.py  (+9, -21)

@ -30,7 +30,6 @@ baseURL = 'https://altenens.is/'
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
forumName = getForumName()
driver = getAccess()
@ -40,22 +39,9 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
# new_parse(forumName, baseURL, True)
# Opens Tor Browser
def opentor():
from Forums.Initialization.forums_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
new_parse(forumName, baseURL, True)
# Login using premade account credentials and do login captcha manually
@ -93,7 +79,7 @@ def getFixedURL():
# Closes Tor Browser
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -118,8 +104,8 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@ -136,6 +122,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
@ -253,7 +241,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:
@ -272,7 +260,7 @@ def crawlForum(driver):
print(link, e)
i += 1
input("Crawling Altenens forum done successfully. Press ENTER to continue\n")
print("Crawling the Altenens forum done.")
# Returns 'True' if the link is Topic link, may need to change for every website
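A note on the Firefox preference touched here and in most other crawler files: permissions.default.image moves to 3 (it was 1 or 2 depending on the crawler). A minimal sketch of the setting, with the value semantics as documented for Firefox (the profile object is a placeholder):

from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

ff_prof = FirefoxProfile()
# 1 = load all images, 2 = block all images, 3 = block only third-party images.
# This commit appears to standardize on 3, so images served by the site itself still load.
ff_prof.set_preference("permissions.default.image", 3)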


Forums/BestCardingWorld/crawler_selenium.py  (+11, -24)

@ -27,7 +27,6 @@ baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
forumName = getForumName()
driver = getAccess()
@ -36,25 +35,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(forumName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from Forums.Initialization.forums_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getForumName():
@ -71,7 +56,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -98,7 +83,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)#might need to turn off
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -114,6 +99,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
@ -238,8 +225,8 @@ def crawlForum(driver):
try:
nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[4]/ul')
li = nav.find_element_by_class_name('next')
page = li.find_element_by_tag_name('a').get_attribute('href')
li = nav.find_element(by=By.CLASS_NAME, value='next')
page = li.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
@ -252,7 +239,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:
@ -260,8 +247,8 @@ def crawlForum(driver):
try:
bar = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[3]/ul')
next = bar.find_element_by_class_name('next')
link = next.find_element_by_tag_name('a').get_attribute('href')
next = bar.find_element(by=By.CLASS_NAME, value='next')
link = next.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
@ -273,7 +260,7 @@ def crawlForum(driver):
print(link, e)
i += 1
input("Crawling BestCardingWorld forum done sucessfully. Press ENTER to continue\n")
print("Crawling the BestCardingWorld forum done.")
# Returns 'True' if the link is a description link
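The pagination lookups above are the Selenium 4 migration in miniature: the find_element_by_* helpers were removed in Selenium 4, so the crawler switches to find_element(by=..., value=...). A side-by-side sketch (nav is the pagination container already located in the hunk):

from selenium.webdriver.common.by import By

# Selenium 3 style (no longer available in Selenium 4):
#   li = nav.find_element_by_class_name('next')
#   page = li.find_element_by_tag_name('a').get_attribute('href')

# Selenium 4 style, as used after this commit:
li = nav.find_element(by=By.CLASS_NAME, value='next')
page = li.find_element(by=By.TAG_NAME, value='a').get_attribute('href')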


Forums/BestCardingWorld/parser.py  (+26, -24)

@ -152,7 +152,7 @@ def bestcardingworld_description_parser(soup):
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
# Sending the results
@ -166,15 +166,17 @@ def bestcardingworld_description_parser(soup):
#return: 'row' that contains a variety of lists that each hold info on the listing page
def bestcardingworld_listing_parser(soup):
nm = 0 # this variable should receive the number of topics
topic = [] # 1 all topics
board = "-1" # 2 board name (the previous level of the topic in the Forum categorization tree.
nm = 0 # *this variable should receive the number of topics
forum = "BestCardingWorld" # 0 *forum name
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
view = [] # 3 number of views of each topic
post = [] # 4 number of posts of each topic
user = [] # 5 all users of each topic
addDate = [] # 6 when the topic was created (difficult to find)
href = [] # 16 this variable should receive all cleaned urls (we will use this to do the marge between Listing and Description pages)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
# Finding the board (should be just one)
@ -187,7 +189,12 @@ def bestcardingworld_listing_parser(soup):
itopics = soup.find('ul', {"class": "topiclist topics"}).findAll('div',{"class": "list-inner"})
replies = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "posts"})
views = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "views"})
view = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "views"})
# Counting how many topics we have found so far
nm = len(itopics)
index = 0
for itopic in itopics:
@ -213,10 +220,6 @@ def bestcardingworld_listing_parser(soup):
topics = itopic.find('a', {"class": "topictitle"}).text
topic.append(cleanString(topics))
# Counting how many topics we have found so far
nm = len(topic)
# Adding the url to the list of urls
link = itopic.find('a', {"class": "topictitle"}).get('href')
link = cleanLink(link)
@ -224,18 +227,18 @@ def bestcardingworld_listing_parser(soup):
# Finding the author of the topic
ps = itopic.find('div', {"class":"responsive-hide"}).find('a', {"class": "username-coloured"}).text
author = ps.strip()
user.append(cleanString(author))
user = ps.strip()
author.append(cleanString(user))
# Finding the number of replies
posts = replies[index].text.split()[0]
posts = posts.strip()
post.append(cleanString(posts))
post = replies[index].text.split()[0]
post = post.strip()
posts.append(cleanString(post))
# Finding the number of Views
tview = views[index].text.split()[0]
tview = view[index].text.split()[0]
tview = tview.strip()
view.append(cleanString(tview))
views.append(cleanString(tview))
# If no information about when the topic was added, just assign "-1" to the variable
#CryptBB doesn't show when topic was first posted on listing page
@ -245,10 +248,9 @@ def bestcardingworld_listing_parser(soup):
addDate.append(date_time_obj)
#addDate.append("-1")
index += 1
return organizeTopics("BestCardingWorld", nm, topic, board, view, post, user, addDate, href)
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
#called by the crawler to get description links on a listing page


Forums/Cardingleaks/crawler_selenium.py  (+15, -27)

@ -32,32 +32,18 @@ baseURL = 'https://leaks.ws/'
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
forumName = getForumName()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
driver = getAccess()
new_parse(forumName, baseURL, True)
# Opens Tor Browser
def opentor():
from Forums.Initialization.forums_mining import config
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
new_parse(forumName, baseURL, True)
# Login using premade account credentials and do login captcha manually
@ -101,7 +87,7 @@ def getFixedURL():
# Closes Tor Browser
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -144,6 +130,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
@ -159,7 +147,7 @@ def getAccess():
# Saves the crawled html page
def savePage(page, url):
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
@ -242,7 +230,7 @@ def crawlForum(driver):
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, topic + f"page{counter}") # very important
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
@ -261,7 +249,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:
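savePage() now takes the driver as its first argument so that cleanHTML() can post-process the page against the live session, and the call sites change accordingly (savePage(driver, driver.page_source, ...)). The visible part of the new helper, per the hunk above, is:

def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    # ... the write of cleanPage to filePath is unchanged and not shown in the hunk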


Forums/CryptBB/crawler_selenium.py  (+13, -25)

@ -28,32 +28,18 @@ baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
forumName = getForumName()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
driver = getAccess()
new_parse(forumName, baseURL, True)
# Opens Tor Browser
def opentor():
from Forums.Initialization.forums_mining import config
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
new_parse(forumName, baseURL, True)
# Login using premade account credentials and do login captcha manually
@ -119,7 +105,7 @@ def getFixedURL():
# Closes Tor Browser
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -162,6 +148,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
@ -289,7 +277,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:


Forums/CryptBB/parser.py  (+1, -1)

@ -124,7 +124,7 @@ def cryptBB_description_parser(soup):
stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday+ ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "hours ago" in dt:
elif "hour ago" in dt or "hours ago" in dt:
day = day.strftime('%m-%d-%Y')
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
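The fix above widens the relative-timestamp branch so "1 hour ago" is caught as well as "N hours ago". Pulled together, the date handling in cryptBB_description_parser looks roughly like this (dt is the raw date string and postarea the post's container tag; a sketch, not the full function):

from datetime import datetime, timedelta

yesterday = (datetime.now() - timedelta(days=1)).strftime('%m-%d-%Y')

if "Yesterday" in dt:
    stime = dt.replace('Yesterday,', '').strip()
    date_time_obj = datetime.strptime(yesterday + ', ' + stime, '%m-%d-%Y, %I:%M %p')
elif "hour ago" in dt or "hours ago" in dt:
    # relative times carry the absolute timestamp in the nested span's title attribute
    title = postarea.find('span', {"class": "post_date"}).find('span')['title']
    date_time_obj = datetime.strptime(title, '%m-%d-%Y, %I:%M %p')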


Forums/DB_Connection/db_connection.py  (+0, -1)

@ -2,7 +2,6 @@ __author__ = 'DarkWeb'
import psycopg2
import traceback
import configparser
def connectDataBase():


Forums/HiddenAnswers/crawler_selenium.py  (+13, -25)

@ -30,32 +30,18 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
forumName = getForumName()
# driver: webdriver.Firefox = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
driver: webdriver.Firefox = getAccess()
new_parse(forumName, baseURL, True)
# Opens Tor Browser
def opentor():
from Forums.Initialization.forums_mining import config
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
new_parse(forumName, baseURL, True)
# Login using premade account credentials and do login captcha manually
@ -78,7 +64,7 @@ def getFixedURL():
# Closes Tor Browser
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -121,6 +107,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
@ -235,7 +223,7 @@ def crawlForum(driver: webdriver.Firefox):
driver.back()
# comment out
break
# break
# comment out
if count == 1:


Forums/HiddenAnswers/parser.py  (+5, -2)

@ -127,15 +127,18 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
if date_posted.find("day") > 0:
datetime_obj = datetime.now() - timedelta(days=1)
else:
datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
try:
datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
except ValueError:
datetime_obj = datetime.strptime(f"{date_posted}", "%b %d, %Y")
addDate.append(datetime_obj)
#this link will be cleaned
listing_href = queries.find("div", {"class": "qa-q-item-title"}).find("a").get("href")
href.append(listing_href)
#need to change this method
nm = len(topic)
return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate)
#need to change this method
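The ValueError fallback added above covers the two date formats the listing page uses; isolated, the logic is (helper name hypothetical):

from datetime import datetime, date

def parse_listing_date(date_posted: str) -> datetime:
    # The listing shows either "Jun 14" (no year, parsed against the current year)
    # or "Jun 14, 2022" (full date); try the first format, fall back on ValueError.
    try:
        return datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
    except ValueError:
        return datetime.strptime(date_posted, "%b %d, %Y")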


Forums/Initialization/forumsList.txt  (+0, -1)

@ -1,4 +1,3 @@
AbyssForum
Altenens
BestCardingWorld
Cardingleaks


Forums/Initialization/prepare_parser.py  (+5, -0)

@ -12,6 +12,7 @@ from Forums.OnniForums.parser import *
from Forums.Altenens.parser import *
from Forums.Procrax.parser import *
from Forums.Libre.parser import *
from Forums.HiddenAnswers.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@ -126,6 +127,8 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
rw = procrax_listing_parser(soup)
elif forum == "Libre":
rw = libre_listing_parser(soup)
elif forum == "HiddenAnswers":
rw = HiddenAnswers_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@ -160,6 +163,8 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
rmm = procrax_description_parser(soup)
elif forum == "Libre":
rmm = libre_description_parser(soup)
elif forum == "HiddenAnswers":
rmm = HiddenAnswers_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception


Forums/Libre/crawler_selenium.py  (+14, -26)

@ -28,32 +28,18 @@ baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
forumName = getForumName()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
driver = getAccess()
new_parse(forumName, baseURL, True)
# Opens Tor Browser
def opentor():
from Forums.Initialization.forums_mining import config
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
new_parse(forumName, baseURL, True)
# Login using premade account credentials and do login captcha manually
@ -101,7 +87,7 @@ def getFixedURL():
# Closes Tor Browser
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -144,6 +130,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
@ -255,7 +243,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:
@ -275,7 +263,7 @@ def crawlForum(driver):
print(link, e)
i += 1
input("Crawling the Libre forum done.")
print("Crawling the Libre forum done.")
# Returns 'True' if the link is Topic link, may need to change for every website


Forums/OnniForums/crawler_selenium.py  (+12, -25)

@ -31,32 +31,18 @@ baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
forumName = getForumName()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
driver = getAccess()
new_parse(forum=forumName, url=baseURL, createLog=True)
# Opens Tor Browser
def opentor():
from Forums.Initialization.forums_mining import config
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
new_parse(forum=forumName, url=baseURL, createLog=True)
# Login using premade account credentials and do login captcha manually
@ -96,7 +82,7 @@ def getFixedURL():
# Closes Tor Browser
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -139,6 +125,7 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
@ -267,7 +254,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:


Forums/OnniForums/parser.py  (+8, -7)

@ -139,12 +139,14 @@ def onniForums_listing_parser(soup: BeautifulSoup):
nm = len(thread_arrays)
for thread in thread_arrays: #getting the information from the posts and sorting them into the arrays defined above
try:
post_subject: str = thread.find("span",{"class": "subject_new"}).text #getting the topic
body = thread.find("span",{"class": "subject_new"})
try:
post_subject: str = body.text #getting the topic
except AttributeError:
post_subject: str = thread.find("span",{"class": "subject_old"}).text
body = thread.find("span",{"class": "subject_old"})
post_subject: str = body.text
post_subject_cleaned = cleanString(post_subject.strip())
topic.append(post_subject_cleaned)
@ -163,9 +165,8 @@ def onniForums_listing_parser(soup: BeautifulSoup):
author = thread.find("span",{"class" : "author smalltext"}).text
author_cleaned = cleanString(author.strip())
user.append(author_cleaned)
reply_anchor = thread.find_all("td", {"align": "center"})[2].find('a')
thread_link = reply_anchor.get('href')
thread_link = body.find('a').get('href')
href.append(thread_link)
return organizeTopics(
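Net effect of the change above: the subject span (subject_new, falling back to subject_old for already-read threads) is looked up once and reused, and the thread link is taken from the anchor inside that same span instead of the third center-aligned cell. A condensed sketch (thread, cleanString, topic, href as in the parser):

body = thread.find("span", {"class": "subject_new"})
try:
    post_subject = body.text
except AttributeError:
    body = thread.find("span", {"class": "subject_old"})
    post_subject = body.text

topic.append(cleanString(post_subject.strip()))
href.append(body.find('a').get('href'))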


Forums/Procrax/crawler_selenium.py  (+13, -25)

@ -32,16 +32,15 @@ FORUM_NAME = 'Procrax'
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(
forum=FORUM_NAME,
@ -50,19 +49,6 @@ def startCrawling():
)
# Opens Tor Browser
def opentor():
from Forums.Initialization.forums_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Login using premade account credentials and do login captcha manually
def login(driver):
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
@ -97,7 +83,7 @@ def getFixedURL():
# Closes Tor Browser
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -140,6 +126,8 @@ def createFFDriver():
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
@ -257,7 +245,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:


MarketPlaces/AnonymousMarketplace/crawler_selenium.py  (+8, -26)

@ -32,7 +32,6 @@ baseURL = 'http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -42,25 +41,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -77,7 +62,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -104,7 +89,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -146,6 +131,7 @@ def login(driver):
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.ID, "woocommerce_product_categories-2")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
@ -187,12 +173,8 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
# # carding
# links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/carding/')
# # hacked paypal
# links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacked-paypal-accounts/')
# hacking services
links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacking-services/')
# home
links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/')
return links
@ -232,7 +214,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:
@ -240,7 +222,7 @@ def crawlForum(driver):
#left in in case site changes
try:
link = ""
link = driver.find_element(by=By.LINK_TEXT, value="").get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1


MarketPlaces/AnonymousMarketplace/parser.py  (+17, -20)

@ -41,12 +41,12 @@ def anonymousMarketplace_description_parser(soup: Tag):
describe_output += div.text
describe = cleanString(describe_output.strip())
product_ratings: Tag = soup.find("div", {"class": "star-rating"})
product_ratings: Tag = soup.find("div", {"class": "woocommerce-product-rating"})
product_reviews = product_ratings.find("div", {"class": "woocommerce-product-rating"}).find("strong", {"class": "rating"}).text
product_reviews = product_ratings.find("span", {"class": "rating"}).text
reviews = cleanString(product_reviews.strip())
product_star_rating = product_ratings.find("span", {"class": "rating"}).text
product_star_rating = product_ratings.find("strong", {"class": "rating"}).text
rating_item = cleanString(product_star_rating.strip())
product_price = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text
@ -86,15 +86,16 @@ def anonymousMarketplace_listing_parser(soup: Tag):
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
product_list: ResultSet[Tag] = soup.find("ul", {"class": "product_list_widget"}).find_all("li")
woo = soup.find('div', {"class": "woocommerce"})
product_list = woo.find('ul', {"class": "products columns-4"}, recursive=False).find_all('li')
for item in product_list:
item_href = item.find("a").get("href")
item_href = item.find("a", recursive=False).get("href")
href.append(item_href)
item_name = item.find("span", {"class": "product-title"}).text
item_name = item.find("h2").text
name.append(cleanString(item_name.strip()))
item_rating = item.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
@ -103,14 +104,11 @@ def anonymousMarketplace_listing_parser(soup: Tag):
try:
item_price = item.find("span", {"class": "woocommerce-Price-amount amount"}).text
item_price = item_price.replace("$", "").strip()
USD.append(item_price)
USD.append(cleanNumbers(item_price))
except AttributeError:
USD.append("-1")
vendor.append("Anonymous")
vendor.append("AnonymousMarketplace")
rating_vendor.append("-1")
success.append("-1")
CVE.append("-1")
@ -153,10 +151,6 @@ def anonymousMarketplace_listing_parser(soup: Tag):
shipTo=shipTo,
href=href
)
#called by the crawler to get description links on a listing page
@ -167,10 +161,13 @@ def anonymous_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find('ul', {"class": "product_list_widget"}).findAll('li')
woo = soup.find('div', {"class": "woocommerce"})
listing = woo.find('ul', {"class": "products columns-4"}, recursive=False).find_all('li')
for a in listing:
bae = a.find('a', href=True)
bae = a.find('a', href=True, recursive=False)
link = bae['href']
href.append(link)
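Both the listing parser and the links parser now walk the standard WooCommerce product grid instead of the product_list_widget sidebar. A sketch of the traversal, with the selectors taken from the hunks above (html is a placeholder for a saved listing page):

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')
woo = soup.find('div', {"class": "woocommerce"})
products = woo.find('ul', {"class": "products columns-4"}, recursive=False).find_all('li')

for item in products:
    link = item.find('a', recursive=False).get('href')    # product page URL
    title = item.find('h2').text.strip()                   # product name
    price_tag = item.find('span', {"class": "woocommerce-Price-amount amount"})
    price = price_tag.text.replace('$', '').strip() if price_tag else '-1'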


MarketPlaces/Apocalypse/crawler_selenium.py  (+14, -20)

@ -32,7 +32,6 @@ baseURL = 'http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -47,20 +46,6 @@ def startCrawling():
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -104,7 +89,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 1)##
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -162,6 +147,7 @@ def login(driver):
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[1]/div[2]/div[1]/div[1]/a[13]")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
@ -203,8 +189,12 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
# # Hacking Services
# links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/19')
# # Digital Goods
# links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
# # Fraud
# links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
# # Services
# links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
# software and malware
links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/30')
@ -243,7 +233,11 @@ def crawlForum(driver):
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back()
# driver.back()
try:
driver.get(link)
except:
driver.refresh()
# comment out
# break
@ -282,7 +276,7 @@ def isDescriptionLink(url):
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'subcategory' in url:
if 'category' in url:
return True
return False


MarketPlaces/Apocalypse/parser.py  (+23, -8)

@ -30,7 +30,9 @@ def apocalypse_description_parser(soup: Tag):
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
content: Tag = soup.find("div", {'id': "article_page"})
product_name = content.find("p", {"class": "list-group-item text-center mb-0 box"}).text
@ -38,7 +40,11 @@ def apocalypse_description_parser(soup: Tag):
product_description = content.find("pre").text
describe = cleanString(product_description.strip())
# Finding Product Image
image = soup.find('div', {'class': 'col-md-7 text-center'}).find('img')
image = image.get('src').split('base64,')[-1]
product_reviews_list: Tag = content.find("table", {"class": "table product_reviews"}) \
.find_all("li")
@ -72,7 +78,7 @@ def apocalypse_description_parser(soup: Tag):
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
@ -103,15 +109,21 @@ def apocalypse_listing_parser(soup: Tag):
vendor = [] # 18 Vendor
rating = [] # 19 Vendor_Rating
success = [] # 20 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
listings: ResultSet[Tag] = soup.find("div", {"class": "col-lg-9 my-4"}).find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
for prod in listings:
product_name = prod.find('h5', {"class": "art_title"}).text
name.append(cleanString(product_name.strip()))
# Finding Product Image
product_image = prod.find('img', {'class': 'customHeight'})
product_image = product_image.get('src').split('base64,')[-1]
image.append(product_image)
CVE.append("-1")
MS.append("-1")
@ -124,6 +136,7 @@ def apocalypse_listing_parser(soup: Tag):
EURO.append("-1")
shipTo.append("-1")
success.append("-1")
image_vendor.append("-1")
product_price = prod.find("span", {"class": "priceP"}).text
USD.append(cleanString(product_price.strip()))
@ -161,7 +174,7 @@ def apocalypse_listing_parser(soup: Tag):
rating.append(cleanString(product_vendor_rating.strip()))
except Exception as e:
raise e
product_href = prod.find('a').get('href')
href.append(product_href)
@ -190,7 +203,9 @@ def apocalypse_listing_parser(soup: Tag):
qLeft=qLeft,
shipFrom=shipFrom,
shipTo=shipTo,
href=href
href=href,
image=image,
image_vendor=image_vendor
)
#called by the crawler to get description links on a listing page
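The new image fields store the raw base64 payload of the inline data URI rather than a URL; isolated, the extraction added above amounts to (helper name hypothetical, prod is one listing tile as a bs4 Tag):

def extract_base64_image(prod):
    img = prod.find('img', {'class': 'customHeight'})
    if img is None:
        return '-1'
    # src looks like "data:image/png;base64,<payload>"; keep only the payload
    return img.get('src').split('base64,')[-1]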


MarketPlaces/BlackPyramid/crawler_selenium.py  (+3, -18)

@ -33,7 +33,6 @@ baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -43,25 +42,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -78,7 +63,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -105,7 +90,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")


MarketPlaces/CityMarket/crawler_selenium.py  (+3, -18)

@ -33,7 +33,6 @@ baseURL = 'http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -43,25 +42,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -78,7 +63,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -105,7 +90,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 1)##
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")


MarketPlaces/CypherMarketplace/crawler_selenium.py  (+3, -18)

@ -32,7 +32,6 @@ baseURL = 'http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -42,25 +41,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -77,7 +62,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -104,7 +89,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")


MarketPlaces/DB_Connection/db_connection.py  (+46, -31)

@ -3,6 +3,7 @@ __author__ = 'DarkWeb'
import psycopg2
import traceback
import configparser
from MarketPlaces.Utilities.utilities import *
def connectDataBase():
@ -146,7 +147,7 @@ def create_marketPlace(cur, row, url):
sql = "Insert into marketplaces (market_id, name_market, url_market, dateinserted_market) " \
"Values (%s, %s, %s, %s)"
recset = [marketId, row[0], url, row[21]]
recset = [marketId, row[0], url, row[23]]
cur.execute(sql, recset)
@ -165,13 +166,15 @@ def create_vendor(cur, row, marketId):
if newVendor:
sql = "Insert into vendors (vendor_id, market_id, name_vendor, rating_vendor, successfultransactions_vendor, dateinserted_vendor) Values (%s, %s, %s, %s, %s, %s)"
sql = "Insert into vendors (vendor_id, market_id, name_vendor, rating_vendor, successfultransactions_vendor, image_vendor, dateinserted_vendor) " \
"Values (%s, %s, %s, %s, %s, %s, %s)"
recset = [vendorId, marketId,
row[1],
row[2] if row[2] != '-1' else None,
row[3] if row[3] != '-1' else None,
row[21]]
row[21] if row[21] != '-1' else None,
row[23]]
cur.execute(sql, recset)
@ -183,24 +186,30 @@ def create_vendor(cur, row, marketId):
recset = cur.fetchall()
# decode_decrypt_image_in_base64(recset[0][5])
if (str(recset[0][3]) != str(row[2] if row[2] != '-1' else None) or # there was a change in the vendor information
str(recset[0][4]) != str(row[3] if row[3] != '-1' else None)):
str(recset[0][4]) != str(row[3] if row[3] != '-1' else None) or
str(recset[0][5]) != str(row[21] if row[21] != '-1' else None)):
sql = "Insert into vendors_history (vendor_id, market_id, name_vendor, rating_vendor, successfultransactions_vendor, dateinserted_vendor) Values (%s, %s, %s, %s, %s, %s)"
sql = "Insert into vendors_history (vendor_id, market_id, name_vendor, rating_vendor, successfultransactions_vendor, image_vendor, dateinserted_vendor) " \
"Values (%s, %s, %s, %s, %s, %s, %s)"
recset = [vendorId, marketId,
recset[0][2],
recset[0][3],
recset[0][4],
recset[0][5]]
recset[0][5],
recset[0][6]]
cur.execute(sql, recset)
sql = "Update vendors set rating_vendor = %(rating_vendor)s, successfultransactions_vendor = %(successfultransactions_vendor)s, " \
"dateinserted_vendor = %(dateinserted_vendor)s where vendor_id = %(vendorId)s"
"image_vendor = %(image_vendor)s, dateinserted_vendor = %(dateinserted_vendor)s where vendor_id = %(vendorId)s"
cur.execute(sql, {'rating_vendor': row[2] if row[2] != '-1' else None,
'successfultransactions_vendor': row[3] if row[3] != '-1' else None,
'dateinserted_vendor': row[21],
'image_vendor': row[21] if row[21] != '-1' else None,
'dateinserted_vendor': row[23],
'vendorId': vendorId})
return vendorId
@ -220,9 +229,9 @@ def create_items(cur, row, marketId, vendorId):
sql = "Insert into items (item_id, market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \
"views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \
"quantityleft_item, shippedfrom_item, shippedto_item, href_item, lastseen_item, dateinserted_item, " \
"quantityleft_item, shippedfrom_item, shippedto_item, lastseen_item, image_item, href_item, dateinserted_item, " \
"classification_item) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " \
"%s, %s, %s, %s)"
"%s, %s, %s, %s, %s)"
recset = [itemId, marketId, vendorId,
row[4],
@ -241,10 +250,11 @@ def create_items(cur, row, marketId, vendorId):
row[17] if row[17] != '-1' else None,
row[18] if row[18] != '-1' else None,
row[19] if row[19] != '-1' else None,
row[23],
row[20] if row[20] != '-1' else None,
row[21],
row[21],
row[22]]
row[22] if row[22] != '-1' else None,
row[23],
row[24]]
cur.execute(sql, recset)
@ -256,19 +266,22 @@ def create_items(cur, row, marketId, vendorId):
recset = cur.fetchall()
#decode_decrypt_image_in_base64(recset[0][20])
if (str(recset[0][4]) != str(row[5] if row[5] != '-1' else None) or str(recset[0][5]) != str(row[6] if row[6] != '-1' else None) or
str(recset[0][6]) != str(row[7] if row[7] != '-1' else None) or str(recset[0][7]) != str(row[8] if row[8] != '-1' else None) or
str(recset[0][8]) != str(row[9] if row[9] != '-1' else None) or str(recset[0][9]) != str(row[10] if row[10] != '-1' else None) or
str(recset[0][10]) != str(row[11] if row[11] != '-1' else None) or str(recset[0][11]) != str(row[12] if row[12] != '-1' else None) or
str(recset[0][12]) != str(row[13] if row[13] != '-1' else None) or str(recset[0][13]) != str(row[14] if row[14] != '-1' else None) or
str(recset[0][14]) != str(row[15] if row[15] != '-1' else None) or str(recset[0][15]) != str(row[16] if row[16] != '-1' else None) or
str(recset[0][16]) != str(row[17] if row[17] != '-1' else None) or str(recset[0][17]) != str(row[18] if row[18] != '-1' else None)):
str(recset[0][16]) != str(row[17] if row[17] != '-1' else None) or str(recset[0][17]) != str(row[18] if row[18] != '-1' else None) or
str(recset[0][18]) != str(row[19] if row[19] != '-1' else None) or str(recset[0][20]) != str(row[20] if row[20] != '-1' else None)):
sql = "Insert into items_history (item_id, market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \
"views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \
"quantityleft_item, shippedfrom_item, shippedto_item, href_item, lastseen_item, dateinserted_item, " \
"quantityleft_item, shippedfrom_item, shippedto_item, lastseen_item, image_item, href_item, dateinserted_item, " \
"classification_item) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " \
"%s, %s, %s, %s)"
"%s, %s, %s, %s, %s)"
recset = [itemId, marketId, vendorId,
recset[0][3],
@ -290,7 +303,8 @@ def create_items(cur, row, marketId, vendorId):
recset[0][19],
recset[0][20],
recset[0][21],
recset[0][22]]
recset[0][22],
recset[0][23]]
cur.execute(sql, recset)
@ -299,7 +313,7 @@ def create_items(cur, row, marketId, vendorId):
"rating_item = %(rating_item)s, dateadded_item = %(dateadded_item)s, btc_item = %(btc_item)s, " \
"usd_item = %(usd_item)s, euro_item = %(euro_item)s, quantitysold_item = %(quantitysold_item)s, " \
"quantityleft_item = %(quantityleft_item)s, shippedfrom_item = %(shippedfrom_item)s, shippedto_item = %(shippedto_item)s, " \
"lastseen_item = %(lastseen_item)s, dateinserted_item = %(dateinserted_item)s where item_id = %(itemId)s"
"lastseen_item = %(lastseen_item)s, image_item = %(image_item)s, dateinserted_item = %(dateinserted_item)s where item_id = %(itemId)s"
cur.execute(sql, {'description_item': row[5] if row[5] != '-1' else None,
'cve_item': row[6] if row[6] != '-1' else None,
@ -316,8 +330,9 @@ def create_items(cur, row, marketId, vendorId):
'quantityleft_item': row[17] if row[17] != '-1' else None,
'shippedfrom_item': row[18] if row[18] != '-1' else None,
'shippedto_item': row[19] if row[19] != '-1' else None,
'dateinserted_item': row[21],
'lastseen_item': row[21],
'dateinserted_item': row[23],
'lastseen_item': row[23],
'image_item': row[20],
'itemId': itemId})
@ -325,7 +340,7 @@ def create_items(cur, row, marketId, vendorId):
sql = "Update items set lastseen_item = %(lastseen_item)s where item_id = %(itemId)s"
cur.execute(sql, {'lastseen_item': row[21],
cur.execute(sql, {'lastseen_item': row[23],
'itemId': itemId})
return itemId
@ -344,8 +359,8 @@ def create_database(cur, con):
sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \
"varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \
"null, dateinserted_vendor timestamp(6) with time zone not null, constraint vendors_pk primary key (" \
"vendor_id), constraint vendors_market_id_fkey foreign key (market_id) references marketplaces (" \
"null, image_vendor character varying(1000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \
"constraint vendors_pk primary key (vendor_id), constraint vendors_market_id_fkey foreign key (market_id) references marketplaces (" \
"market_id))"
cur.execute(sql)
@ -354,8 +369,8 @@ def create_database(cur, con):
sql = "create table vendors_history(vendor_id integer not null, market_id integer not null, name_vendor " \
"character varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor " \
"integer null, dateinserted_vendor timestamp(6) with time zone not null, constraint vendors_history_pk " \
"primary key (vendor_id, dateinserted_vendor), constraint vendors_history_vendor_id_fkey foreign key (" \
"integer null, image_vendor character varying(1000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \
"constraint vendors_history_pk primary key (vendor_id, dateinserted_vendor), constraint vendors_history_vendor_id_fkey foreign key (" \
"vendor_id) references vendors (vendor_id), constraint vendors_history_market_id_fkey foreign key (" \
"market_id) references marketplaces (market_id))"
cur.execute(sql)
@ -367,9 +382,9 @@ def create_database(cur, con):
"character varying(25) null, btc_item character varying(255) null, usd_item character varying(255) " \
"null, euro_item character varying(255) null, quantitysold_item integer null, quantityleft_item " \
"character varying(255) null, shippedfrom_item character varying(255) null, shippedto_item character " \
"varying(255) null, href_item character varying(255) not null, lastseen_item timestamp(6) with time zone " \
"not null, dateinserted_item timestamp(6) with time zone not null, classification_item double " \
"precision not null, constraint items_pk primary key (item_id), constraint " \
"varying(255) null, lastseen_item timestamp(6) with time zone not null, image_item character varying(1000000) null, " \
"href_item character varying(255) not null, dateinserted_item timestamp(6) with time zone not null, " \
"classification_item double precision not null, constraint items_pk primary key (item_id), constraint " \
"items_market_id_fkey foreign key (market_id) references marketplaces (market_id),constraint " \
"items_vendor_id_fkey foreign key (vendor_id) references vendors (vendor_id))"
cur.execute(sql)
@ -384,9 +399,9 @@ def create_database(cur, con):
"character varying(25) null, btc_item character varying(255) null, usd_item character varying(255) " \
"null, euro_item character varying(255) null, quantitysold_item integer null, quantityleft_item " \
"character varying(255) null, shippedfrom_item character varying(255) null, shippedto_item character " \
"varying(255) null, href_item character varying(255) not null, lastseen_item timestamp(6) with time zone " \
"not null, dateinserted_item timestamp(6) with time zone not null, classification_item double " \
"precision not null, constraint items_history_pk primary key (item_id, dateinserted_item), " \
"varying(255) null, lastseen_item timestamp(6) with time zone not null, image_item character varying(1000000) null, " \
"href_item character varying(255) not null, dateinserted_item timestamp(6) with time zone not null, " \
"classification_item double precision not null, constraint items_history_pk primary key (item_id, dateinserted_item), " \
"constraint items_history_market_id_fkey foreign key (market_id) references marketplaces (market_id), " \
"constraint items_history_vendor_id_fkey foreign key (vendor_id) references vendors (vendor_id), " \
"constraint items_history_item_id_fkey foreign key (item_id) references items (item_id))"


+ 262
- 0
MarketPlaces/DarkBazar/crawler_selenium.py

@ -0,0 +1,262 @@
__author__ = 'DarkWeb'
'''
DarkBazar Marketplace Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkBazar.parser import darkbazar_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/'
def startCrawling():
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Returns the name of the website
def getMKTName():
name = 'DarkBazar'
return name
# Return the base link of the website
def getFixedURL():
url = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates the Firefox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
# The driver 'gets' the url, attempting to reach the site; if it cannot be accessed, returns 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
def login(driver):
input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]')
# Username here
usernameBox.send_keys('aliciamykeys')
passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]')
# Password here
passwordBox.send_keys('aliciawherearemykey$')
# session time
session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select'))
session_select.select_by_visible_text('Session 60min')
input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n")
# wait for the listing page to show up (this XPath may need to change for a different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="submit"]')))
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
def getMKTName() -> str:
name = 'DarkBazar'
return name
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if name == '':
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# # Digital Goods
# links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=4')
# Services
links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=5')
return links
def crawlForum(driver):
print("Crawling the DarkBazar market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back()
# comment out
# break
# comment out
if count == 1:
break
try:
link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the DarkBazar market done.")
# Returns 'True' if the link is a description (product) page link; may need to change for every website
def isDescriptionLink(url):
if 'item' in url:
return True
return False
# Returns 'True' if the link is a listing page link; may need to change for every website
def isListingLink(url):
if 'category=' in url:
return True
return False
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return darkbazar_links_parser(soup)
def crawler():
startCrawling()

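To exercise just this new crawler outside of markets_mining, a minimal sketch (assumes the TOR / geckodriver paths in the project configuration are already set, since createFFDriver reads them at run time):
from MarketPlaces.DarkBazar.crawler_selenium import crawler

if __name__ == "__main__":
    crawler()  # runs startCrawling(): Tor-proxied Firefox, manual CAPTCHA/login, crawl, then new_parse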
+ 289
- 0
MarketPlaces/DarkBazar/parser.py

@ -0,0 +1,289 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# Parses description pages: takes the html of a description page as a soup object and extracts the info it needs
# The scraped info is stored in separate fields, which are returned after being organized
# @param: soup object of a description page's html
# return: 'row', a tuple holding the info scraped from the description page
def darkbazar_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
# Finding Product Name
divmb = soup.findAll('div', {'class': "mb-1"})
name = divmb[0].text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
# Finding Vendor
vendor = divmb[1].find('a').text.strip()
# Finding Vendor Rating
temp = soup.find('div', {'class': ""}).text
temp = temp.split('(')
rating = temp[0].replace("Vendor's Review : ", "")
rating = rating.replace("%", "")
rating_vendor = rating.strip()
# Finding the Product Rating and Number of Product Reviews
reviews = temp[2].replace(" review)", "")
reviews = reviews.strip()
temp = temp[1].split(")")
rating = temp[1].replace("Product Review : ", "")
rating = rating.replace("%", "")
rating_item = rating.strip()
# Finding Prices
USD = soup.find('div', {'class': "h3 text-primary"}).text.strip()
# Finding the Product Category
pmb = soup.findAll('p', {'class': "mb-1"})
category = pmb[-1].text
category = category.replace("Category: ", "").strip()
# Finding the Product Quantity Available
left = divmb[-1].text
left = left.split(",", 1)[1]
left = left.replace("in stock", "")
left = left.strip()
# Finding Number Sold
sold = divmb[-1].text
sold = sold.split(",", 1)[0]
sold = sold.replace("sold", "")
sold = sold.strip()
# Finding Shipment Information (Origin)
shipFrom = pmb[0].text
shipFrom = shipFrom.replace("Ships from: ", "").strip()
# Finding Shipment Information (Destination)
shipTo = pmb[1].text
shipTo = shipTo.replace("Ships to: ", "").strip()
# Finding the Product description
cardbody = soup.findAll('div', {'class': "card-body"})
describe = cardbody[1].text.strip()
# Finding Product Image
image = soup.find('div', {'class': 'product-primary'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
# Parses listing pages: takes the html of a listing page as a soup object and extracts the info it needs
# The scraped info is stored in separate lists, which are returned after being organized
# @param: soup object of a listing page's html
# return: 'row' that contains a variety of lists, each holding info on the products of the listing page
def darkbazar_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "DarkBazar" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
listing = soup.findAll('div', {"id": "itembox"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
lb = a.findAll('div', {"id": "littlebox"})
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Product
product = lb[1].find('a').text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
# Finding Product Image
product_image = a.find('img')
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
# Finding Prices
price = lb[-1].find('div', {"class": "mb-1"}).text
price = price.replace("$","")
price = price.strip()
USD.append(price)
# Finding the Vendor
vendor_name = lb[-1].find("a").text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
image_vendor.append("-1")
# Finding the Category
cat = lb[-1].find("span").text
cat = cat.replace("class:", "")
cat = cat.strip()
category.append(cat)
span = lb[0].findAll("span")
# Finding Number of Views
num = span[0].text
num = num.replace("views:", "")
num = num.strip()
views.append(num)
# Finding Number Sold
num = span[2].text
num = num.replace("Sold:", "")
num = num.strip()
sold.append(num)
# Finding Quantity Left
quant = span[1].text
quant = quant.replace("stock:", "")
quant = quant.strip()
qLeft.append(quant)
# add shipping information
ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
shipFrom.append(ship[0].replace("Ship from ", "").strip())
shipTo.append(ship[1].replace("to ", "").strip())
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue = "-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue = cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue = "-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue = me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
# Called by the crawler to get the description links on a listing page
# @param: soup object of the correct html page (a listing page)
# return: list of description links from a listing page
def darkbazar_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"id": "itembox"})
# for a in listing:
# bae = a.find('a', {"class": "text-info"}, href=True)
# link = bae['href']
# href.append(link)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
href.append(link)
return href

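A quick offline check of the new parsers against a saved listing page; the file name below is hypothetical, the real pages are written by savePage under the shared HTML_Pages folder:
from bs4 import BeautifulSoup
from MarketPlaces.DarkBazar.parser import darkbazar_links_parser, darkbazar_listing_parser

with open("listing_sample.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

print(darkbazar_links_parser(soup))    # description-page hrefs found on the listing
rows = darkbazar_listing_parser(soup)  # organized per-product fields for the DB layer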
+ 6
- 21
MarketPlaces/DarkFox/crawler_selenium.py

@ -30,7 +30,6 @@ baseURL = 'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -40,25 +39,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -81,7 +66,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -108,10 +93,10 @@ def createFFDriver():
# ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
# ff_prof.set_preference("permissions.default.image", 2)
# ff_prof.set_preference("browser.download.folderList", 2)
# ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
# ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')

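For reference, Firefox's permissions.default.image preference takes 1 = load all images, 2 = block all images, 3 = block only third-party images. Standardizing this preference to 3 across the crawlers lets each market's own product thumbnails load, which the parsers now need in order to capture image_item and image_vendor, while still skipping third-party content. A minimal sketch of the relevant lines (profile path omitted; the crawlers load it from the project config):
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

ff_prof = FirefoxProfile()
# 1 = allow all images, 2 = block all, 3 = block third-party images only
ff_prof.set_preference("permissions.default.image", 3)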

+ 6
- 23
MarketPlaces/DarkMatter/crawler_selenium.py

@ -32,7 +32,6 @@ baseURL = 'http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -42,25 +41,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -77,7 +62,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -94,7 +79,6 @@ def createFFDriver():
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
@ -105,7 +89,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
#ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue
#ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -229,17 +213,16 @@ def crawlForum(driver):
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
time.sleep(1.5) # to keep from detecting click speed
time.sleep(3) # to keep from detecting click speed
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
time.sleep(1.5)
time.sleep(3) # to keep from detecting click speed
driver.back()
# to keep from detecting click speed
# comment out
break
# break
# comment out
if count == 1:


+ 106
- 134
MarketPlaces/DarkMatter/parser.py

@ -34,36 +34,29 @@ def darkmatter_description_parser(soup):
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
# 0 *Vendor_Name
try:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[3].find('a').text
name = cleanString(temp2.strip())
vendor = cleanString(temp2.strip())
except:
try:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[4].find('a').text
name = cleanString(temp2.strip())
except:
print("vendor")
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[4].find('a').text
vendor = cleanString(temp2.strip())
# product name
try:
name = soup.find('div', {'class', 'title-h2'}).text
name = cleanString(name.strip())
except:
print("name")
name = soup.find('div', {'class', 'title-h2'}).text
name = cleanString(name.strip())
#product description
try:
temp = soup.find('pre', {'class', 'description'}).text
temp = temp.replace('\n', ' ')
describe = cleanString(temp.strip())
except:
print("description")
temp = soup.find('pre', {'class', 'description'}).text
temp = temp.replace('\n', ' ')
describe = cleanString(temp.strip())
#product category
try:
@ -75,48 +68,42 @@ def darkmatter_description_parser(soup):
temp2 = temp[4].find('a').text
category = cleanString(temp2.strip())
except:
try:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[5].find('th').text
temp2 = cleanString(temp2.strip)
if (temp2 == "Category"):
temp2 = temp[5].find('a').text
category = cleanString(temp2.strip())
except:
print('category')
# usd
try:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[1].find('td').text
temp2 = temp2.replace(' USD', '')
USD = cleanString(temp2)
except:
print('USD')
# 15 Product_QuantitySold
try:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[5].find('th').text
temp2 = cleanString(temp2)
temp3 = temp[6].find('th').text
temp3 = cleanString(temp3)
if (temp2 == "Sold"):
temp2 = temp[5].find('td').text
sold = cleanString(temp2.strip())
elif (temp3 == "Sold"):
temp2 = temp[6].find('td').text
sold = cleanString(temp2.strip())
except:
print('sold')
temp2 = cleanString(temp2.strip())
if (temp2 == "Category"):
temp2 = temp[5].find('a').text
category = cleanString(temp2.strip())
# usd
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[1].find('td').text
temp2 = temp2.replace(' USD', '')
USD = cleanString(temp2)
# 15 Product_QuantitySold
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[5].find('th').text
temp2 = cleanString(temp2)
temp3 = temp[6].find('th').text
temp3 = cleanString(temp3)
if (temp2 == "Sold"):
temp2 = temp[5].find('td').text
sold = cleanString(temp2.strip())
elif (temp3 == "Sold"):
temp2 = temp[6].find('td').text
sold = cleanString(temp2.strip())
# Finding Product Image
image = soup.find('td', {"class": "vtop"}).find('img').get('src')
image = image.split('base64,')[-1]
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
@ -129,34 +116,36 @@ def darkmatter_description_parser(soup):
def darkmatter_listing_parser(soup):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "DarkMatter" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
category = [] # 4 Product_Category
describe = [] # 5 Product_Description
escrow = [] # 6 Vendor_Warranty
views = [] # 7 Product_Number_Of_Views
reviews = [] # 8 Product_Number_Of_Reviews
addDate = [] # 9 Product_AddDate
rating_item = [] # 11 Product_Rating
lastSeen = [] # 10 Product_LastViewDate
BTC = [] # 11 Product_BTC_SellingPrice
USD = [] # 12 Product_USD_SellingPrice
EURO = [] # 13 Product_EURO_SellingPrice
sold = [] # 14 Product_QuantitySold
qLeft =[] # 15 Product_QuantityLeft
shipFrom = [] # 16 Product_ShippedFrom
shipTo = [] # 17 Product_ShippedTo
vendor = [] # 18 Vendor
rating = [] # 19 Vendor_Rating
success = [] # 20 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
nm = 0 # *Total_Products (Should be Integer)
mktName = "DarkMatter" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
names = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", "colspan": "3"})
left = soup.find('div', {"class": "content"}).findAll('table', {"class": "vtable"})
right = soup.find('div', {"class": "content"}).findAll('td', {"class": "vtop centered"})
images = soup.find('div', {"class": "content"}).findAll('td', {"class": "vcentered"})
# vtop centered
count = 0
@ -165,18 +154,15 @@ def darkmatter_listing_parser(soup):
for a in names:
# product name
try:
temp = a.find('a').text
if ("pcs x " in temp):
index = temp.index("pcs x ")
result = temp[index + len("pcs x "):]
name.append(cleanString(result))
elif("pks x " in temp):
index = temp.index("pks x ")
result = temp[index + len("pks x "):]
name.append(cleanString(temp))
except Exception as e:
print("product name", e)
temp = a.find('a').text
if ("pcs x " in temp):
index = temp.index("pcs x ")
result = temp[index + len("pcs x "):]
name.append(cleanString(result))
elif("pks x " in temp):
index = temp.index("pks x ")
result = temp[index + len("pks x "):]
name.append(cleanString(result))
CVE.append("-1")
MS.append("-1")
@ -186,74 +172,60 @@ def darkmatter_listing_parser(soup):
length_2 = len(temp2) - 1
# category
try:
temp = temp2[1].find('td').text
category.append(cleanString(temp.strip()))
except:
print('category')
temp = temp2[1].find('td').text
category.append(cleanString(temp.strip()))
describe.append("-1")
escrow.append("-1")
#escrow.append("-1")
views.append("-1")
reviews.append("-1")
addDate.append("-1")
lastSeen.append("-1")
#lastSeen.append("-1")
BTC.append("-1")
image_vendor.append("-1")
# usd
try:
temp3 = right[count*2].find('span').text
temp = temp3.replace(' USD', '')
USD.append(cleanString(temp))
except:
print('USD')
temp3 = right[count*2].find('span').text
temp = temp3.replace(' USD', '')
USD.append(cleanString(temp))
EURO.append("-1")
# 14 Product_QuantitySold
try:
temp3 = temp2[length_2].find('th').text
temp3 = cleanString(temp3)
if (temp3 == "Sold:"):
temp = temp2[length_2].find('td').text
sold.append(cleanString(temp.strip()))
else:
sold.append("-1")
except Exception as e:
temp3 = temp2[length_2].find('th').text
temp3 = cleanString(temp3)
if (temp3 == "Sold:"):
temp = temp2[length_2].find('td').text
sold.append(cleanString(temp.strip()))
else:
sold.append("-1")
print('sold', e)
qLeft.append("-1")
shipFrom.append("-1")
# ship to
try:
temp3 = temp2[length_2].find('th').text
temp3 = cleanString(temp3)
if (temp3 == "Ship To:"):
temp = temp2[length_2].find('td').text
shipTo.append(cleanString(temp.strip()))
else:
shipTo.append("-1")
except Exception as e:
temp3 = temp2[length_2].find('th').text
temp3 = cleanString(temp3)
if (temp3 == "Ship To:"):
temp = temp2[length_2].find('td').text
shipTo.append(cleanString(temp.strip()))
else:
shipTo.append("-1")
print('shopto')
# vendor
try:
temp = temp2[0].find('a').text
vendor.append(cleanString(temp.strip()))
except:
print('vendor')
temp = temp2[0].find('a').text
vendor.append(cleanString(temp.strip()))
# add product rating (stars)
rating.append("-1")
success.append("-1")
try:
temp = a.find('a').get('href')
href.append(temp)
except:
print('href')
temp = a.find('a').get('href')
href.append(temp)
# Finding Product Image
product_image = images[count*2].find('img').get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
count += 1
@ -261,7 +233,7 @@ def darkmatter_listing_parser(soup):
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
#called by the crawler to get description links on a listing page

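All of the new image fields store only the payload that follows 'base64,' in the page's inline data URI. A small sketch (field contents assumed) of writing such a payload back out as an image file:
import base64

def save_embedded_image(b64_payload: str, out_path: str) -> None:
    # b64_payload is what image_item / image_vendor hold after split('base64,')[-1]
    with open(out_path, "wb") as f:
        f.write(base64.b64decode(b64_payload))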

+ 4
- 18
MarketPlaces/DarkTor/crawler_selenium.py

@ -31,7 +31,6 @@ baseURL = 'http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -41,25 +40,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -76,7 +61,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -103,7 +88,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -145,6 +130,7 @@ def login(driver):
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[1]/div/div/div[2]/main/div/div/section[5]/div/div[1]/div")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)


+ 3
- 17
MarketPlaces/DigitalThriftShop/crawler_selenium.py

@ -32,7 +32,6 @@ baseURL = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -42,24 +41,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -76,7 +62,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -235,7 +221,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:


+ 19
- 6
MarketPlaces/DigitalThriftShop/parser.py

@ -34,7 +34,8 @@ def digitalThriftShop_description_parser(soup: Tag):
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
product_name = soup.find("h1", {"class": "product_title entry-title"}).text
@ -42,7 +43,11 @@ def digitalThriftShop_description_parser(soup: Tag):
product_description = soup.find("div", {"id": "tab-description"}).find("p").text
describe = cleanString(product_description.strip())
# Finding Product Image
image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
image = image.get('src').split('base64,')[-1]
product_category = soup.find("span", {"class": "posted_in"}).find("a").text
category = cleanString(product_category.strip())
@ -64,7 +69,7 @@ def digitalThriftShop_description_parser(soup: Tag):
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
@ -98,7 +103,9 @@ def digitalThriftShop_listing_parser(soup: Tag):
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
product_category = soup.find("h1", {"class": "woocommerce-products-header__title page-title"}).text
@ -108,12 +115,17 @@ def digitalThriftShop_listing_parser(soup: Tag):
for product in products_list:
nm += 1
vendor.append("-1")
vendor.append(mktName)
rating_vendor.append("-1")
success.append("-1")
product_name = product.find("h2", {"class": "woocommerce-loop-product__title"}).text
name.append(cleanString(product_name.strip()))
# Finding Product Image
product_image = product.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
product_image = product_image.get('src').split('base64,')[-1]
image.append(product_image)
CVE.append("-1")
MS.append("-1")
@ -121,6 +133,7 @@ def digitalThriftShop_listing_parser(soup: Tag):
describe.append("-1")
views.append("-1")
reviews.append("-1")
image_vendor.append("-1")
try:
product_rating = product.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
@ -146,7 +159,7 @@ def digitalThriftShop_listing_parser(soup: Tag):
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
#called by the crawler to get description links on a listing page


+ 7
- 21
MarketPlaces/HiddenMarket/crawler_selenium.py

@ -29,7 +29,6 @@ baseURL = 'http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
marketName = getMKTName()
driver = getAccess()
@ -39,24 +38,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(marketName, baseURL, True)
# Opens Tor Browser
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Login using premade account credentials and do login captcha manually
def login(driver):
# wait for login page
@ -118,7 +104,7 @@ def getFixedURL():
# Closes Tor Browser
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -145,10 +131,10 @@ def createFFDriver():
# ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
# ff_prof.set_preference("permissions.default.image", 3)
# ff_prof.set_preference("browser.download.folderList", 2)
# ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
# ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
@ -277,7 +263,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:


+ 21
- 7
MarketPlaces/HiddenMarket/parser.py

@ -30,6 +30,8 @@ def hiddenmarket_description_parser(soup):
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
bae = soup.find('div', {'class': "main"})
@ -84,6 +86,10 @@ def hiddenmarket_description_parser(soup):
describe = describe.replace("-", " ")
describe = describe.strip()
# Finding Product Image
image = soup.find('div', {"class": "thumbnails"}).find('img', {"class": "bigthumbnail"})
image = image.get('src').split('base64,')[-1]
# Finding the Product Category
category = mb[-4].text
category = category.replace("Category:", "")
@ -115,7 +121,7 @@ def hiddenmarket_description_parser(soup):
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
@ -145,7 +151,9 @@ def hiddenmarket_listing_parser(soup):
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
listing = soup.findAll('div', {"class": "item"})
@ -153,12 +161,13 @@ def hiddenmarket_listing_parser(soup):
nm = len(listing)
# Finding Category
# cat = soup.find("div", {'class': "heading"}).text
# cat = cat.replace(",", "")
# cat = cat.strip()
cat = soup.find("div", {'class': "heading"}).text
cat = cat.replace(",", "")
cat = cat.strip()
for card in listing:
# category.append(cat)
category.append(cat)
# Adding the url to the list of urls
@ -175,12 +184,17 @@ def hiddenmarket_listing_parser(soup):
product = product.strip()
name.append(product)
# Finding Product Image
image.append("-1")
# Finding Vendor
vendor_name = card.text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
image_vendor.append("-1")
# Finding USD
usd = card.next_sibling.find('div', {"class": "buttons"}).find('div', {'class': "price"}).text
usd = usd.replace("USD", "")
@ -262,7 +276,7 @@ def hiddenmarket_listing_parser(soup):
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
def hiddenmarket_links_parser(soup):


+ 4
- 0
MarketPlaces/Initialization/marketsList.txt

@ -1,8 +1,12 @@
Apocalypse
DarkBazar
DarkMatter
DigitalThriftShop
HiddenMarket
LionMarketplace
Nexus
Robinhood
ThiefWorld
TorBay
TorMarket
ViceCity

+ 6
- 0
MarketPlaces/Initialization/markets_mining.py

@ -24,6 +24,8 @@ from MarketPlaces.HiddenMarket.crawler_selenium import crawler as crawlerHiddenM
from MarketPlaces.RobinhoodMarket.crawler_selenium import crawler as crawlerRobinhoodMarket
from MarketPlaces.Nexus.crawler_selenium import crawler as crawlerNexus
from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
from MarketPlaces.DarkBazar.crawler_selenium import crawler as crawlerDarkBazar
from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar
import configparser
import os
@ -137,5 +139,9 @@ if __name__ == '__main__':
crawlerNexus()
elif mkt == "CypherMarketplace":
crawlerCypher()
elif mkt == "DarkBazar":
crawlerDarkBazar()
elif mkt == "PabloEscobarMarket":
crawlerPabloEscobar()
print("\nScraping process completed!")

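The two new elif branches plug DarkBazar and PabloEscobarMarket into the existing dispatch: each entry in MarketPlaces/Initialization/marketsList.txt is matched to its crawler and run in turn. A condensed sketch of that flow (the real loop also handles configuration and logging, elided here):
from MarketPlaces.DarkBazar.crawler_selenium import crawler as crawlerDarkBazar
from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar

with open("MarketPlaces/Initialization/marketsList.txt") as f:
    for mkt in (line.strip() for line in f if line.strip()):
        if mkt == "DarkBazar":
            crawlerDarkBazar()
        elif mkt == "PabloEscobarMarket":
            crawlerPabloEscobar()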
+ 16
- 2
MarketPlaces/Initialization/prepare_parser.py

@ -1,4 +1,4 @@
__author__ = 'Helium'
__author__ = 'DarkWeb'
import glob
import os
@ -21,6 +21,8 @@ from MarketPlaces.HiddenMarket.parser import *
from MarketPlaces.RobinhoodMarket.parser import *
from MarketPlaces.Nexus.parser import *
from MarketPlaces.MikesGrandStore.parser import *
from MarketPlaces.DarkBazar.parser import *
from MarketPlaces.PabloEscobarMarket.parser import *
from MarketPlaces.Classifier.classify_product import predict
@ -72,6 +74,10 @@ def mergePages(rmm, rec):
rec[18] = rmm[17]
if rec[19] == "-1": # shippedto_item
rec[19] = rmm[18]
if rmm[19] != "-1": # image
rec[20] = rmm[19]
if rmm[20] != "-1": # image_vendor
rec[21] = rmm[20]
return rec
@ -148,6 +154,10 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
rw = nexus_listing_parser(soup)
elif marketPlace == "MikesGrandStore":
rw = mikesGrandStore_listing_parser(soup)
elif marketPlace == "DarkBazar":
rw = darkbazar_listing_parser(soup)
elif marketPlace == "PabloEscobarMarket":
rw = pabloescobarmarket_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@ -199,6 +209,10 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
rmm = nexus_description_parser(soup)
elif marketPlace == "MikesGrandStore":
rmm = mikesGrandStore_description_parser(soup)
elif marketPlace == "DarkBazar":
rmm = darkbazar_description_parser(soup)
elif marketPlace == "PabloEscobarMarket":
rmm = pabloescobarmarket_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception
@ -318,7 +332,7 @@ def new_parse(marketPlace, url, createLog):
rec = rec.split(',')
descriptionPattern = cleanLink(rec[20]) + ".html"
descriptionPattern = cleanLink(rec[22]) + ".html"
# Reading the associated description Html Pages
descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))


+ 4
- 19
MarketPlaces/LionMarketplace/crawler_selenium.py

@ -31,7 +31,6 @@ baseURL = 'http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -41,25 +40,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -76,7 +61,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -103,7 +88,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -234,7 +219,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:


+ 120
- 153
MarketPlaces/LionMarketplace/parser.py

@ -12,37 +12,37 @@ from bs4 import BeautifulSoup
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def lionmarketplace_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
# vendor name
try:
temp = soup.find('div', {'class': 'btn-group'}).find('a').text
vendor = (cleanString(temp.strip()))
except:
print('vendor')
vendor = "-1"
temp = soup.find('div', {'class': 'btn-group'}).find('a').text
vendor = (cleanString(temp.strip()))
# table with info
table = soup.find('table', {'class', 'table border-0 text-left table-borderless'})
table = soup.find('table')
rows = table.findAll('tr')
# successful transaction
@ -51,37 +51,25 @@ def lionmarketplace_description_parser(soup):
# vendor rating 5
rating_vendor = '-1'
# product name
try:
temp = soup.find('div', {'class', 'row'}).find('h2').text
name = (cleanString(temp.strip()))
except:
name = '-1'
print('product name')
temp = soup.find('div', {'class', 'row'}).find('h2').text
name = (cleanString(temp.strip()))
# product description
try:
temp = soup.find('div', {'class': "mt-4"}).findAll('p')
temp = temp[1].text
if "\n" in temp:
temp = temp.replace("\n", " ")
temp = temp.replace("\r", " ")
describe = cleanString(temp.strip())
except:
describe="-1"
print('describe')
temp = soup.find('div', {'class': "mt-4"}).find(text=True, recursive=False)
describe = cleanString(temp.strip())
# Finding Product Image
image = soup.find('div', {'id': 'slide-1'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
# product category
try:
temp = rows[1].find('strong').text
category = cleanString(temp.strip())
except:
category = "-1"
print('category')
temp = rows[1].find('strong').text
category = cleanString(temp.strip())
# product number of views
views = "-1"
@ -92,54 +80,38 @@ def lionmarketplace_description_parser(soup):
BTC = "-1"
# USD selling price
try:
temp = rows[2].find('strong').text
if " $" in temp:
temp = temp.replace(" $", "")
elif "$" in temp:
temp = temp.replace("$", "")
USD = cleanString((temp.strip()))
except:
try:
temp = soup.find('li').find('strong').text
if " $" in temp:
temp = temp.replace(" $", "")
elif "$" in temp:
temp = temp.replace("$", "")
USD = cleanString((temp.strip()))
except:
print("USD")
temp = rows[2].find('strong').text
if " $" in temp:
temp = temp.replace(" $", "")
elif "$" in temp:
temp = temp.replace("$", "")
USD = cleanString((temp.strip()))
EURO = "-1" # 14 Product_EURO_SellingPrice
# product sold
try:
if (len(rows) <= 5):
temp = rows[4].find('td').text
string = cleanString(temp)
if (string == 'Left/Sold'):
temp = rows[4].findAll('td')
temp = temp[1].findAll('span')
# left
temp2 = temp[1].text
temp3 = temp[1].text
if(" items" in temp2):
temp2 = temp2.replace(" items", "")
if(" items" in temp3):
temp3 = temp3.replace(" items", "")
sold = (cleanString(temp2.strip()))
left = cleanString(temp3.strip())
else:
sold = '-1'
left = "-1"
if (len(rows) <= 5):
temp = rows[4].find('td').text
string = cleanString(temp)
if (string == 'Left/Sold'):
temp = rows[4].findAll('td')
temp = temp[1].findAll('span')
# left
temp2 = temp[1].text
temp3 = temp[1].text
if(" items" in temp2):
temp2 = temp2.replace(" items", "")
if(" items" in temp3):
temp3 = temp3.replace(" items", "")
sold = (cleanString(temp2.strip()))
left = cleanString(temp3.strip())
else:
sold = '-1'
left = "-1"
except:
print("success")
else:
sold = '-1'
left = "-1"
@ -148,7 +120,7 @@ def lionmarketplace_description_parser(soup):
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
@ -159,45 +131,49 @@ def lionmarketplace_description_parser(soup):
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def lionmarketplace_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "M00nkeyMarket" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
listing = soup.findAll('div', {"class": "card-body"})
nm = 0 # *Total_Products (Should be Integer)
mktName = "LionMarketplace" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
listings = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
# Populating the Number of Products
nm = len(listing)
nm = len(listings)
for a in listing:
for listing in listings:
a = listing.find('div', {"class": "card-body"})
row = a.findAll('p')
# vendor
try:
temp = row[3].text
vendor.append(cleanString(temp.strip()))
except:
vendor.append("-1")
print('vendor')
temp = row[3].text
temp = temp.replace("Vendor:", "")
vendor.append(cleanString(temp.strip()))
image_vendor.append("-1")
# vendor rating
rating_vendor.append("-1")
@ -206,25 +182,22 @@ def lionmarketplace_listing_parser(soup):
success.append("-1")
# product name
try:
temp = a.find('a').text
name.append(cleanString(temp.strip()))
except:
name.append("-1")
print('product name')
temp = a.find('a').text
name.append(cleanString(temp.strip()))
# Finding Product Image
product_image = listing.find('img', {'class': 'card-img-top rounded'})
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
CVE.append('-1')
MS.append('-1')
# product category
try:
temp = row[2].text
if "Category: " in temp:
temp = temp.replace("Category: ", "")
category.append(cleanString(temp.strip()))
except:
print("Error in product category")
temp = row[2].text
temp = temp.replace("Category: ", "")
category.append(cleanString(temp.strip()))
describe.append('-1')
@ -238,14 +211,10 @@ def lionmarketplace_listing_parser(soup):
BTC.append('-1')
# USD
try:
temp = row[0].find('strong').text
if ' $' in temp:
temp = temp.replace(" $", "")
USD.append(cleanString(temp.strip())) # 14 Product_USD_SellingPrice
except:
print("USD")
USD.append("-1")
temp = row[0].find('strong').text
if ' $' in temp:
temp = temp.replace(" $", "")
USD.append(cleanString(temp.strip())) # 14 Product_USD_SellingPrice
EURO.append("-1") # 15 Product_EURO_SellingPrice
@ -257,15 +226,12 @@ def lionmarketplace_listing_parser(soup):
shipTo.append('-1') # 19 Product_ShippedTo
# href
try:
temp = a.find('a').get('href')
href.append(temp)
except:
print('product name')
temp = a.find('a').get('href')
href.append(temp)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
#called by the crawler to get description links on a listing page
@ -276,9 +242,10 @@ def lionmarketplace_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"class": "container d-flex justify-content-center"})
listings = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
for a in listing:
for listing in listings:
a = listing.find('div', {"class": "card-body"})
bae = a.find('a', href=True)
link = bae['href']
href.append(link)
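For reference, here is a minimal sketch of the per-card extraction pattern the reworked LionMarketplace listing parser relies on. The HTML snippet is illustrative rather than captured from the live market, and the real parser additionally routes every string through cleanString from MarketPlaces.Utilities.utilities before appending it.

from bs4 import BeautifulSoup

# Illustrative markup mirroring the structure the parser expects: one wrapper
# per product, a card-body inside it, <p> tags holding price/category/vendor,
# and the product link in the first <a>.
sample = """
<div class="col-md-4 my-md-0 my-2 col-12">
  <div class="card-body">
    <a href="/product/123">Example listing</a>
    <p><strong>10 $</strong></p>
    <p>short blurb</p>
    <p>Category: Hacking</p>
    <p>Vendor: exampleVendor</p>
  </div>
</div>
"""

soup = BeautifulSoup(sample, "html.parser")
for wrapper in soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"}):
    card = wrapper.find('div', {"class": "card-body"})
    rows = card.findAll('p')
    name = card.find('a').text.strip()
    href = card.find('a').get('href')
    usd = rows[0].find('strong').text.replace(" $", "").strip()
    category = rows[2].text.replace("Category: ", "").strip()
    vendor = rows[3].text.replace("Vendor:", "").strip()
    print(name, href, usd, category, vendor)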


+ 4
- 19
MarketPlaces/M00nkeyMarket/crawler_selenium.py

@ -33,7 +33,6 @@ MARKET_NAME = 'M00nkeyMarket'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
driver = getAccess()
if driver != 'down':
@ -42,25 +41,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(MARKET_NAME, BASE_URL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
# def getMKTName():
@ -77,7 +62,7 @@ def opentor():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -104,7 +89,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -159,7 +144,7 @@ def login(driver):
# wait for the listing page to show up (this XPath may need to change based on the seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[1]/div/div/div[2]/div[3]/div")))
(By.XPATH, "/html/body/div/div[2]/div/div/div/div/div/div[1]/a/img")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):


+ 291
- 0
MarketPlaces/MetaVerseMarket/crawler_selenium.py

@ -0,0 +1,291 @@
__author__ = 'Helium'
'''
MetaVerseMarket Marketplace Crawler (Selenium)
not complete
need to go through multiple pages...
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.MetaVerseMarket.parser import metaversemarket_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/login'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'MetaVerseMarket'
return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/login'
return url
# Closes Tor Browser
#@param: current selenium driver
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Login using premade account credentials; the CAPTCHA and newsletter pop-up are handled manually in the browser,
# then the crawler waits for the listing page's search box to confirm the login went through
#@param: current selenium web driver
def login(driver):
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
# Username here
usernameBox.send_keys('metotomoto')
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
# Password here
passwordBox.send_keys('lionking_kumba1ya')
input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n")
# wait for the listing page to show up (this XPath may need to change based on the seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="searchq"]')))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url of the page being crawled
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from the passed URL; falls back to a numbered name when the cleaned URL is empty
#@param: raw url of the page being crawled
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
# returns the list of urls of interest; the crawler runs through this list
#for MetaVerseMarket these are product categories: hacking, hosting, and hacking guides and tutorials
def getInterestedLinks():
links = []
# hacking
links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hacking')
# hosting
links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hosting')
# hacking guides and tutorials
links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hacking-guides-and-tutorials')
return links
# iterates through the links of interest; each link is opened and crawled through
#both listing and description pages are saved here
#@param: selenium driver
def crawlForum(driver):
print("Crawling the MetaVerse market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back()
# comment out
break
# comment out
if count == 1:
break
try:
link = driver.find_element(by=By.XPATH, value='//a[@class="page-link-next"]').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the MetaVerse market done.")
# Returns 'True' if the link is a description link
#@param: any url crawled
#return: true if it is a description page, false if not
def isDescriptionLink(url):
if 'PR' in url:
return True
return False
# Returns True if the link is a listingPage link
#@param: any url crawled
#return: true if it is a listing page, false if not
def isListingLink(url):
if 'products' in url:
return True
return False
# calls the parser to collect description links; the html is the page source of a page from the interested-links list
#@param: html of a listing page reached via getInterestedLinks()
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return metaversemarket_links_parser(soup)
# Drop links that "signout"
# def isSignOut(url):
# #absURL = urlparse.urljoin(url.base_url, url.url)
# if 'signout' in url.lower() or 'logout' in url.lower():
# return True
#
# return False
def crawler():
startCrawling()
# print("Crawling and Parsing MetaVerseMarket .... DONE!")
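Pagination in crawlForum hinges on the 'page-link-next' anchor; below is a small sketch of that lookup done with BeautifulSoup on invented markup (the live crawler performs the equivalent lookup through Selenium's find_element and stops when the element is missing or its href is empty).

from bs4 import BeautifulSoup

page = '<a class="page-link-next" href="/products/hacking?page=2">Next</a>'
soup = BeautifulSoup(page, "html.parser")

nxt = soup.find('a', {'class': 'page-link-next'})
# crawlForum stops paginating when the anchor is absent or carries an empty href
has_next_page = bool(nxt and nxt.get('href'))
print(has_next_page, nxt.get('href') if nxt else None)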

+ 285
- 0
MarketPlaces/MetaVerseMarket/parser.py

@ -0,0 +1,285 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
# stores info it needs in different lists, these lists are returned after being organized
# @param: soup object looking at html page of description page
# return: 'row' that contains a variety of lists that each hold info on the description page
def darkfox_description_parser(soup):
# Fields to be parsed
name = "-1" # 0 Product_Name
describe = "-1" # 1 Product_Description
lastSeen = "-1" # 2 Product_LastViewDate
CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
review = "-1" # 6 Product_Number_Of_Reviews
category = "-1" # 7 Product_Category
shipFrom = "-1" # 8 Product_ShippedFrom
shipTo = "-1" # 9 Product_ShippedTo
left = "-1" # 10 Product_QuantityLeft
escrow = "-1" # 11 Vendor_Warranty
terms = "-1" # 12 Vendor_TermsAndConditions
vendor = "-1" # 13 Vendor_Name
sold = "-1" # 14 Product_QuantitySold
addDate = "-1" # 15 Product_AddedDate
BTC = "-1" # 18 Product_BTC_SellingPrice
USD = "-1" # 19 Product_USD_SellingPrice
rating = "-1" # 20 Vendor_Rating
success = "-1" # 21 Vendor_Successful_Transactions
EURO = "-1" # 22 Product_EURO_SellingPrice
# Finding Product Name
name = soup.find('h1').text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
# Finding Vendor
vendor = soup.find('h3').find('a').text.strip()
# Finding Vendor Rating
rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
# Finding Successful Transactions
success = soup.find('h3').text
success = success.replace("Vendor: ", "")
success = success.replace(vendor, "")
success = success.replace("(", "")
success = success.replace(")", "")
success = success.strip()
bae = soup.find('div', {'class': "box"}).find_all('ul')
# Finding Prices
USD = bae[1].find('strong').text.strip()
li = bae[2].find_all('li')
# Finding Escrow
escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Category
category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Quantity Available
left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
# Finding Number Sold
sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
li = bae[3].find_all('li')
# Finding Shipment Information (Origin)
if "Ships from:" in li[-2].text:
shipFrom = li[-2].text
shipFrom = shipFrom.replace("Ships from: ", "")
# shipFrom = shipFrom.replace(",", "")
shipFrom = shipFrom.strip()
# Finding Shipment Information (Destination)
shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
shipTo = shipTo.replace("Ships to: ", "")
shipTo = shipTo.strip()
if "certain countries" in shipTo:
countries = ""
tags = li[-1].find_all('span', {'class': "tag"})
for tag in tags:
country = tag.text.strip()
countries += country + ", "
shipTo = countries.strip(", ")
# Finding the Product description
describe = soup.find('div', {'class': "pre-line"}).text
describe = describe.replace("\n", " ")
describe = describe.strip()
'''# Finding the Number of Product Reviews
tag = soup.findAll(text=re.compile('Reviews'))
for index in tag:
reviews = index
par = reviews.find('(')
if par >=0:
reviews = reviews.replace("Reviews (","")
reviews = reviews.replace(")","")
reviews = reviews.split(",")
review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
else :
review = "-1"'''
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (name, describe, lastSeen, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
sold, addDate, BTC, USD, rating, success, EURO)
# Sending the results
return row
# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
# stores info it needs in different lists, these lists are returned after being organized
# @param: soup object looking at html page of listing page
# return: 'row' that contains a variety of lists that each hold info on the listing page
def darkfox_listing_parser(soup):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "DarkFox" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
category = [] # 4 Product_Category
describe = [] # 5 Product_Description
escrow = [] # 6 Vendor_Warranty
views = [] # 7 Product_Number_Of_Views
reviews = [] # 8 Product_Number_Of_Reviews
addDate = [] # 9 Product_AddDate
lastSeen = [] # 10 Product_LastViewDate
BTC = [] # 11 Product_BTC_SellingPrice
USD = [] # 12 Product_USD_SellingPrice
EURO = [] # 13 Product_EURO_SellingPrice
sold = [] # 14 Product_QuantitySold
qLeft = [] # 15 Product_QuantityLeft
shipFrom = [] # 16 Product_ShippedFrom
shipTo = [] # 17 Product_ShippedTo
vendor = [] # 18 Vendor
rating = [] # 19 Vendor_Rating
success = [] # 20 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
listing = soup.findAll('div', {"class": "card"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Product
product = bae[1].find('p').text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
if len(bae) >= 5:
# Finding Prices
price = bae[0].text
ud = price.replace(" USD", " ")
# u = ud.replace("$","")
u = ud.replace(",", "")
u = u.strip()
USD.append(u)
# bc = (prc[1]).strip(' BTC')
# BTC.append(bc)
# Finding the Vendor
vendor_name = bae[1].find('a').text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding the Category
cat = bae[2].find('small').text
cat = cat.replace("Category: ", "")
cat = cat.replace(",", "")
cat = cat.strip()
category.append(cat)
# Finding Number Sold and Quantity Left
num = bae[3].text
num = num.replace("Sold: ", "")
num = num.strip()
sold.append(num)
quant = bae[4].find('small').text
quant = quant.replace("In stock: ", "")
quant = quant.strip()
qLeft.append(quant)
# Finding Successful Transactions
freq = bae[1].text
freq = freq.replace(vendor_name, "")
freq = re.sub(r'Vendor Level \d+', "", freq)
freq = freq.replace("(", "")
freq = freq.replace(")", "")
freq = freq.strip()
success.append(freq)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue = "-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue = cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue = "-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue = me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
# called by the crawler to get description links on a listing page
# @param: beautifulsoup object that is using the correct html page (listing page)
# return: list of description links from a listing page
def metaversemarket_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"class": "col-12 p-0"})
for a in listing:
bae = a.find('a', href=True)
link = bae['href']
href.append(link)
return href
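Both parsers above harvest CVE and MS identifiers with the same findAll(text=re.compile(...)) pattern; the short sketch below shows that step on invented strings. Note that each hit is the full text node containing the identifier, which is exactly what the original parsers concatenate into the CVE and MS fields.

import re
from bs4 import BeautifulSoup

sample = "<div><p>PoC for CVE-2021-4034</p><p>also covers MS17-010</p></div>"
soup = BeautifulSoup(sample, "html.parser")

cve_hits = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
ms_hits = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))

# mirror the parsers: join every matching text node into one field, or -1 if none
CVE = " ".join(t.strip() for t in cve_hits) if cve_hits else "-1"
MS = " ".join(t.strip() for t in ms_hits) if ms_hits else "-1"
print(CVE)  # PoC for CVE-2021-4034
print(MS)   # also covers MS17-010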

+ 4
- 18
MarketPlaces/MikesGrandStore/crawler_selenium.py

@ -31,7 +31,6 @@ baseURL = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -41,25 +40,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -76,7 +61,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -103,7 +88,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -145,6 +130,7 @@ def login(driver):
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[1]/header/div/div[3]/div/div/ul/li[6]/a")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)


+ 23
- 27
MarketPlaces/Nexus/crawler_selenium.py

@ -31,7 +31,6 @@ baseURL = 'http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -40,22 +39,10 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
@ -73,7 +60,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -100,7 +87,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -133,6 +120,7 @@ def getAccess():
driver.close()
return 'down'
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
@ -173,16 +161,24 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
# Bot nets
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/botnets/')
# # Rats
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/rats/')
# # Ransomware
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/ransomware/')
# # Other Malware
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/outros-malware/')
# # Hacking Tools & Scripting
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/ferramentas-de-hacking-scripts/')
# malware
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/')
# # hacking-spam
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/')
# # hacking services
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/hacking/')
# # programming services
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/programacao/')
# # remote admin services
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/administracao-remota/')
# # hacking guides
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-hacking/')
# # malware guides
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-malware/')
# # fraud guides
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-fraudes/')
# # fraud software
# links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/fraudes/software-de-fraude/')
return links
@ -222,7 +218,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:


+ 66
- 46
MarketPlaces/Nexus/parser.py

@ -15,25 +15,28 @@ import re
def nexus_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
#finding the name of the product
@ -48,6 +51,10 @@ def nexus_description_parser(soup):
else:
describe = cleanString(description_div.text.strip())
# Finding Product Image
image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
#find the category of the product
name_of_category = soup.find("span", {"class": "posted_in"}).find("a").text
@ -64,7 +71,7 @@ def nexus_description_parser(soup):
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
@ -78,28 +85,31 @@ def nexus_description_parser(soup):
def nexus_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "Nexus" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
nm = 0 # *Total_Products (Should be Integer)
mktName = "Nexus" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
products_list = soup.find_all('li')
nm = 0
for product in products_list:
@ -117,10 +127,18 @@ def nexus_listing_parser(soup):
print("I can't find the link")
raise e
# Finding Product Image
product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img')
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
BTC.append("-1")
#everything else appends a -1
rating_vendor.append("-1")
USD.append("-1")
vendor.append("-1")
vendor.append(mktName)
success.append("-1")
CVE.append("-1")
MS.append("-1")
@ -129,12 +147,12 @@ def nexus_listing_parser(soup):
views.append("-1")
reviews.append("-1")
addDate.append("-1")
BTC.append("-1")
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
image_vendor.append("-1")
# print("Done! moving onto the next product!")
# print(len(shipTo))
nm += 1
@ -145,7 +163,7 @@ def nexus_listing_parser(soup):
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(
marketplace = "Nexus",
marketplace = mktName,
nm = nm,
vendor = vendor,
rating_vendor = rating_vendor,
@ -166,7 +184,9 @@ def nexus_listing_parser(soup):
qLeft = qLeft,
shipFrom = shipFrom,
shipTo = shipTo,
href = href
href = href,
image = image,
image_vendor = image_vendor
)
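The image handling added here (and in the other updated parsers) assumes the market serves thumbnails as inline data URIs; the sketch below shows what the split on 'base64,' keeps. The data URI is a stand-in value, not one taken from the site.

from bs4 import BeautifulSoup

sample = '<img class="thumb" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUg==">'
img = BeautifulSoup(sample, "html.parser").find('img')

src = img.get('src')
# keep only the base64 payload; if src is a plain URL the split is a no-op
payload = src.split('base64,')[-1]
print(payload)   # iVBORw0KGgoAAAANSUhEUg==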


+ 256
- 0
MarketPlaces/PabloEscobarMarket/crawler_selenium.py

@ -0,0 +1,256 @@
__author__ = 'DarkWeb'
'''
PabloEscobarMarket Marketplace Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
import subprocess
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.PabloEscobarMarket.parser import pabloescobarmarket_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://niejmptjzwhlfywruoab4pbuxg7kp2mtcr4c6mgpeykju5matewg36yd.onion/'
# Opens Tor Browser, crawls the website
def startCrawling():
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Login using premade account credentials and do login captcha manually
def login(driver):
input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
# Username here
usernameBox.send_keys('snorlaxrights')
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="inputPassword3"]')
# Password here
passwordBox.send_keys('$noringAllday')
input("Press ENTER when CAPTCHA is completed\n")
# wait for the listing page to show up (this XPath may need to change based on the seed url)
# wait up to 50 sec until the element with id="collapse3" is visible, then continue
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="collapse3"]')))
# Returns the name of the website
def getMKTName() -> str:
name = 'PabloEscobarMarket'
return name
# Return the link of the website
def getFixedURL():
url = 'http://niejmptjzwhlfywruoab4pbuxg7kp2mtcr4c6mgpeykju5matewg36yd.onion/'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close() #close tab
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if name == '':
name = str(counter)
counter = counter + 1
return name
# FIX
def getInterestedLinks():
links = []
# # hire hacker
# links.append('http://niejmptjzwhlfywruoab4pbuxg7kp2mtcr4c6mgpeykju5matewg36yd.onion/?sub_id=36')
# hacker
links.append('http://niejmptjzwhlfywruoab4pbuxg7kp2mtcr4c6mgpeykju5matewg36yd.onion/?sub_id=34')
return links
def crawlForum(driver):
print("Crawling the PabloEscobarMarket market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back()
# comment out
# break
# comment out
if count == 1:
break
try:
link = driver.find_element(by=By.XPATH, value='//a[@rel="next"]').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the PabloEscobarMarket market done.")
# Returns 'True' if the link is a description link; may need to change for every website
def isDescriptionLink(url):
if 'single_product' in url:
return True
return False
# Returns True if the link is a listing page link; may need to change for every website
def isListingLink(url):
if 'sub_id' in url:
return True
return False
# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return pabloescobarmarket_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing PabloEscobarMarket .... DONE!")
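A quick check of the URL classification this crawler uses to route saved pages: the listing URL below comes from getInterestedLinks, while the description URL is a hypothetical example of the 'single_product' pattern that isDescriptionLink tests for.

# standalone re-statement of the two helpers, for illustration only
def isDescriptionLink(url):
    return 'single_product' in url

def isListingLink(url):
    return 'sub_id' in url

listing = 'http://niejmptjzwhlfywruoab4pbuxg7kp2mtcr4c6mgpeykju5matewg36yd.onion/?sub_id=34'
product = 'http://niejmptjzwhlfywruoab4pbuxg7kp2mtcr4c6mgpeykju5matewg36yd.onion/single_product?id=1'  # hypothetical

print(isListingLink(listing), isDescriptionLink(listing))    # True False
print(isListingLink(product), isDescriptionLink(product))    # False True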

+ 241
- 0
MarketPlaces/PabloEscobarMarket/parser.py

@ -0,0 +1,241 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
# stores info it needs in different lists, these lists are returned after being organized
# @param: soup object looking at html page of description page
# return: 'row' that contains a variety of lists that each hold info on the description page
def pabloescobarmarket_description_parser(soup):
# Fields to be parsed
name = "-1" # 0 Product_Name
describe = "-1" # 1 Product_Description
lastSeen = "-1" # 2 Product_LastViewDate
CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
review = "-1" # 6 Product_Number_Of_Reviews
category = "-1" # 7 Product_Category
shipFrom = "-1" # 8 Product_ShippedFrom
shipTo = "-1" # 9 Product_ShippedTo
left = "-1" # 10 Product_QuantityLeft
escrow = "-1" # 11 Vendor_Warranty
terms = "-1" # 12 Vendor_TermsAndConditions
vendor = "-1" # 13 Vendor_Name
sold = "-1" # 14 Product_QuantitySold
addDate = "-1" # 15 Product_AddedDate
BTC = "-1" # 18 Product_BTC_SellingPrice
USD = "-1" # 19 Product_USD_SellingPrice
rating = "-1" # 20 Vendor_Rating
success = "-1" # 21 Vendor_Successful_Transactions
EURO = "-1" # 22 Product_EURO_SellingPrice
# Finding Product Name
# NA
divmd7 = soup.find('div', {'class': "col-md-7"})
ptag = soup.findAll('p')
# Finding Vendor
vendor = divmd7.find('a').text.strip()
# Finding Vendor Rating
# NA
# Finding Successful Transactions
success = soup.find('span', {'class': "badge-primary"}).text.strip()
# Finding Prices
USD = soup.find('span', {'class': "total"}).text.strip()
BTC = soup.find('div', {'class': "text-center"}).text.strip()
# Finding Escrow
escrow = ptag[-1].text.strip()
# Finding the Product Category
category = ptag[-2].text.strip()
# Finding the Product Quantity Available
# NA
# Finding Number Sold
# NA
# Finding Shipment Information (Origin)
# NA
# Finding Shipment Information (Destination)
# NA
# Finding the Product description
describe = soup.find('div', {'class': "text-white"}).text
describe = describe.replace("\n", " ")
describe = describe.strip()
'''# Finding the Number of Product Reviews
tag = soup.findAll(text=re.compile('Reviews'))
for index in tag:
reviews = index
par = reviews.find('(')
if par >=0:
reviews = reviews.replace("Reviews (","")
reviews = reviews.replace(")","")
reviews = reviews.split(",")
review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
else :
review = "-1"'''
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (name, describe, lastSeen, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
sold, addDate, BTC, USD, rating, success, EURO)
# Sending the results
return row
# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
# stores info it needs in different lists, these lists are returned after being organized
# @param: soup object looking at html page of listing page
# return: 'row' that contains a variety of lists that each hold info on the listing page
def pabloescobarmarket_listing_parser(soup):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "PabloEscobarMarket" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
category = [] # 4 Product_Category
describe = [] # 5 Product_Description
escrow = [] # 6 Vendor_Warranty
views = [] # 7 Product_Number_Of_Views
reviews = [] # 8 Product_Number_Of_Reviews
addDate = [] # 9 Product_AddDate
lastSeen = [] # 10 Product_LastViewDate
BTC = [] # 11 Product_BTC_SellingPrice
USD = [] # 12 Product_USD_SellingPrice
EURO = [] # 13 Product_EURO_SellingPrice
sold = [] # 14 Product_QuantitySold
qLeft = [] # 15 Product_QuantityLeft
shipFrom = [] # 16 Product_ShippedFrom
shipTo = [] # 17 Product_ShippedTo
vendor = [] # 18 Vendor
rating = [] # 19 Vendor_Rating
success = [] # 20 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
listing = soup.findAll('div', {"class": "p-4"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Product
product = a.find('h4').text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
# Finding Prices
price = a.find('div', {"class": "price"}).text
tempUSD = price.split("~")[0]
tempUSD = tempUSD.replace("$", "")
tempUSD = tempUSD.strip()
USD.append(tempUSD)
tempBTC = price.split("~")[1]
tempBTC = tempBTC.replace("BTC", "")
tempBTC = tempBTC.strip()
BTC.append(tempBTC)
# Finding the Vendor
#NA
# Finding the Category
# NA
# Finding Number Sold and Quantity Left
# NA
# Finding Successful Transactions
# NA
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue = "-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue = cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue = "-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue = me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
# called by the crawler to get description links on a listing page
# @param: beautifulsoup object that is using the correct html page (listing page)
# return: list of description links from a listing page FIX
def pabloescobarmarket_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"class": "p-4"})
for a in listing:
bae = a.find('a', href=True)
link = bae['href']
href.append(link)
return href
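The listing parser splits the combined price string on '~' to separate the fiat and BTC figures; a minimal sketch of that step, using an invented price string in the '$USD ~ BTC' format the code assumes:

price = "$120.00 ~ 0.0045 BTC"   # assumed format: USD first, BTC after '~'

tempUSD = price.split("~")[0].replace("$", "").strip()
tempBTC = price.split("~")[1].replace("BTC", "").strip()

print(tempUSD)  # 120.00
print(tempBTC)  # 0.0045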

+ 11
- 25
MarketPlaces/RobinhoodMarket/crawler_selenium.py

@ -29,9 +29,6 @@ baseURL = 'http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion
# Opens Tor Browser, crawls the website
def startCrawling():
# Opening Tor beforehand gives a "Tor exited during startup" error
# opentor()
marketName = getMKTName()
driver = getAccess()
@ -45,24 +42,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(marketName, baseURL, True)
# Opens Tor Browser
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Login is not needed in Robinhood
def login(driver):
pass
@ -82,7 +66,7 @@ def getFixedURL():
# Closes Tor Browser
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -178,8 +162,8 @@ def getInterestedLinks():
# Hacking
links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/hacking/')
# # Other Software
# links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/')
# Other Software
links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/')
return links
@ -207,7 +191,7 @@ def crawlForum(driver):
savePage(driver, html, link)
list = productPages(html)
for item in list:
for c, item in enumerate(list):
itemURL = urlparse.urljoin(baseURL, str(item))
try:
@ -218,11 +202,12 @@ def crawlForum(driver):
driver.back()
# comment out
break
# if c == 4:
# break
# comment out
if count == 1:
break
# if count == 1:
# break
# go to next page of market
try:
@ -266,5 +251,6 @@ def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")
if __name__ == '__main__':
startCrawling()
startCrawling()

+ 35
- 14
MarketPlaces/RobinhoodMarket/parser.py

@ -39,6 +39,8 @@ def Robinhood_description_parser(soup):
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
# Finding Product Name
name = soup.find('h1').text
@ -48,16 +50,17 @@ def Robinhood_description_parser(soup):
# Finding description
desc = ''
primary = soup.find('div', {'id': 'primary'})
product = primary.findAll('div')[1]
commerce = product.findAll('div', recursive=False)[2]
descDiv = commerce.findAll('div')[0]
# descDiv = soup.find('div', {'class': 'woocommerce-Tabs-panel woocommerce-Tabs-panel--description panel entry-content wc-tab'})
descText = descDiv.findAll('p')
for para in descText:
desc = desc + para.text
describe = desc
tab = soup.find('div', {"id": "tab-description"})
for p in tab.findAll('p'):
desc += p.text
if desc == '':
desc = soup.find('div', {"class": "woocommerce-product-details__short-description"}).text
describe = cleanString(desc.strip())
# Finding Product Image
image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
# Finding Vendor
vendor = soup.find('a', {'class': 'wcfm_dashboard_item_title'}).text
@ -65,6 +68,11 @@ def Robinhood_description_parser(soup):
vendor = vendor.replace("Sold by:", "")
vendor = vendor.strip()
# Finding Vendor Image
vendor_image = soup.find('div', {'class': 'wcfmmp_sold_by_container_left'}).find('img')
vendor_image = vendor_image.get('src')
vendor_image = vendor_image.split('base64,')[-1]
# Finding Category
catSpan = soup.find('span', {'class': 'posted_in'})
category = catSpan.find('a').text
@ -93,7 +101,7 @@ def Robinhood_description_parser(soup):
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
@ -124,7 +132,9 @@ def Robinhood_listing_parser(soup):
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
listing = soup.find('ul', {"class": "products columns-4"})
items = listing.findAll('li')
@ -153,6 +163,12 @@ def Robinhood_listing_parser(soup):
product = product.strip()
name.append(product)
# Finding Product Image
product_image = card.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
info = card.find('div', {'class': 'wcfmmp_sold_by_container'})
# Finding Vendor
@ -161,6 +177,12 @@ def Robinhood_listing_parser(soup):
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding Vendor Image
vendor_icon = info.find('img', {'class': 'wcfmmp_sold_by_logo'})
vendor_icon = vendor_icon.get('src')
vendor_icon = vendor_icon.split('base64,')[-1]
image_vendor.append(vendor_icon)
# Finding USD
span = card.find('span', {'class': 'price'})
if span is not None:
@ -198,13 +220,12 @@ def Robinhood_listing_parser(soup):
MSValue=me
MS.append(MSValue)
#print(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
# reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
def Robinhood_links_parser(soup):
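The reworked description lookup above falls back from the long 'tab-description' block to WooCommerce's short description when the former is empty; here is a self-contained sketch of that fallback on illustrative markup (the class names match the diff, the text is invented).

from bs4 import BeautifulSoup

sample = """
<div id="tab-description"></div>
<div class="woocommerce-product-details__short-description">Short pitch only.</div>
"""
soup = BeautifulSoup(sample, "html.parser")

desc = ''
tab = soup.find('div', {"id": "tab-description"})
for p in tab.findAll('p'):
    desc += p.text
# no <p> tags in the long tab, so fall back to the short description
if desc == '':
    desc = soup.find('div', {"class": "woocommerce-product-details__short-description"}).text
print(desc.strip())   # Short pitch only.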


+ 5
- 20
MarketPlaces/ThiefWorld/crawler_selenium.py

@ -32,7 +32,6 @@ baseURL = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -42,25 +41,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -77,7 +62,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -104,7 +89,7 @@ def createFFDriver():
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 1)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -144,7 +129,7 @@ def getAccess():
def login(driver):
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[1]/div/div[1]/div[1]/ul")))
(By.XPATH, "/html/body/div/header/div[2]/div/nav/div[2]/a[1]")))
temp = driver.find_element(By.XPATH, '/html/body/div/header/div[2]/div/nav/div[2]/a[1]').get_attribute(
'href') # /html/body/div/div[2]/div/div[2]/div
@ -242,7 +227,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:


+ 1
- 1
MarketPlaces/ThiefWorld/parser.py View File

@ -53,7 +53,7 @@ def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple:
USD = cleanString(usdText.replace("USD", "").strip())
ratingDiv = soup.find('div', {'class': 'rating_star'})
rating_vendor = ratingDiv.get('title').strip(' ')[1]
rating_vendor = ratingDiv.get('title').split(' ')[1]
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
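The one-word fix above matters: str.strip(' ') only trims surrounding spaces and still returns a string, so indexing [1] picked the second character of the title instead of the rating value, while split(' ')[1] takes the second whitespace-separated token. Illustrated with a made-up title (the real attribute text is not shown in the diff):

# Hypothetical title value; the real attribute text is not shown in the diff.
title = "Rating: 4.85"
title.strip(' ')[1]   # 'a'    -- strip() returns a string, [1] is its second character
title.split(' ')[1]   # '4.85' -- the second space-separated token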


+ 2
- 16
MarketPlaces/Tor2door/crawler_selenium.py View File

@ -29,7 +29,6 @@ baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
marketName = getMKTName()
driver = getAccess()
@ -39,24 +38,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(marketName, baseURL, True)
# Opens Tor Browser
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Login using premade account credentials and do login captcha manually
def login(driver):
#wait for login page
@ -118,7 +104,7 @@ def getFixedURL():
# Closes Tor Browser
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")


+ 4
- 18
MarketPlaces/TorBay/crawler_selenium.py View File

@ -32,7 +32,6 @@ baseURL = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -42,25 +41,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -77,7 +62,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -124,6 +109,7 @@ def createFFDriver():
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
@ -228,7 +214,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:


+ 10
- 27
MarketPlaces/TorMarket/crawler_selenium.py View File

@ -31,35 +31,19 @@ baseURL = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
# login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -76,7 +60,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -101,9 +85,9 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 1)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@ -186,12 +170,12 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
# # Hacking Tutorials
# links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/hacking/')
# # Tutorials
# links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/')
# Malware
links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/malware/')
# # Hacking Services
# links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/hacking-services/')
# # Services
# links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/')
return links
@ -238,8 +222,7 @@ def crawlForum(driver):
break
try:
link = driver.find_element(by=By.XPATH, value=
'/html/body/div[2]/div/div/div[1]/main/nav/ul/li[5]/a').get_attribute('href')
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
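Pagination in this crawler now follows the 'Next' anchor by its link text instead of a deep positional XPath, which keeps working across layout changes as long as the label stays 'Next'. A minimal sketch of that lookup with the same empty-href guard (driver is assumed to be the live Selenium session):

# Sketch of the 'Next'-link lookup used above; driver is the live Selenium session.
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

try:
    link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
    if link == "":
        raise NoSuchElementException
except NoSuchElementException:
    link = None   # no further pages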


+ 49
- 52
MarketPlaces/TorMarket/parser.py View File

@ -104,61 +104,58 @@ def tormarket_listing_parser(soup):
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
products_list = soup.find_all('li')
nm = 0
products_list = soup.find('ul', {"class": "products columns-3 tablet-columns-2 mobile-columns-1"}).find_all('li')
nm = len(products_list)
for product in products_list:
# Finding the name of the product
name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
name_of_product_cleaned = cleanString(name_of_product.strip())
# print(name_of_product_cleaned)
name.append(name_of_product_cleaned)
#finding the URL
try:
# Finding the name of the product
name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
name_of_product_cleaned = cleanString(name_of_product.strip())
print(name_of_product_cleaned)
name.append(name_of_product_cleaned)
#finding the URL
try:
url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href")
print(url)
href.append(url)
except AttributeError as e:
print("I can't find the link")
raise e
#finding the rating of the product
rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text
rating_item.append(cleanString(rating_score_of_product.strip()))
print("done")
#finding the rating of the vendors
rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"}).find("strong").text
rating_vendor.append(cleanString(rating_score_of_vendor.strip()))
print("done")
#finding the cost in USD
cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text
USD.append(cost)
print("done")
#finding the name of the vendor
vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"}).find("a").text
vendor.append(cleanString(vendor_name.strip()))
print("done")
#everything else appends a -1
success.append("-1")
CVE.append("-1")
MS.append("-1")
category.append("-1")
describe.append("-1")
views.append("-1")
reviews.append("-1")
addDate.append("-1")
BTC.append("-1")
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
print("Done! moving onto the next product!")
print(len(shipTo))
nm += 1
url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href")
# print(url)
href.append(url)
except AttributeError as e:
print("I'm somewhere I don't belong. I'm going to leave")
continue
print("I can't find the link")
raise e
#finding the rating of the product
rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text
rating_item.append(cleanString(rating_score_of_product.strip()))
# print("done")
#finding the rating of the vendors
rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"}).find("strong").text
rating_vendor.append(cleanString(rating_score_of_vendor.strip()))
# print("done")
#finding the cost in USD
cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text
USD.append(cost)
# print("done")
#finding the name of the vendor
vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"}).find("a").text
vendor.append(cleanString(vendor_name.strip()))
# print("done")
#everything else appends a -1
success.append("-1")
CVE.append("-1")
MS.append("-1")
category.append("-1")
describe.append("-1")
views.append("-1")
reviews.append("-1")
addDate.append("-1")
BTC.append("-1")
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
# print("Done! moving onto the next product!")
# print(len(shipTo))
# Populate the final variable (this should be a list with all fields scraped)
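After this rewrite the listing parser scopes its search to the WooCommerce products list and fixes nm to the list length up front, and a failed link lookup now raises instead of silently skipping the product, so every per-product list stays the same length. A small self-contained sketch of the selection on stub markup (class names come from the diff; the markup itself is invented):

# Stub markup; class names from the diff, content invented.
from bs4 import BeautifulSoup

html = """
<ul class="products columns-3 tablet-columns-2 mobile-columns-1">
  <li><h2 class="woocommerce-loop-product__title"><a href="/product/example">Example product</a></h2></li>
</ul>
"""
soup = BeautifulSoup(html, 'html.parser')
products_list = soup.find('ul', {"class": "products columns-3 tablet-columns-2 mobile-columns-1"}).find_all('li')
nm = len(products_list)   # count fixed before the loop rather than incremented per item
title = products_list[0].find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text   # "Example product"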


+ 19
- 21
MarketPlaces/Utilities/utilities.py View File

@ -242,7 +242,7 @@ def cleanLink(originalLink):
def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nombre, CVE, MS, category, describe,
views, reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href):
views, reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor):
rw = []
@ -291,9 +291,13 @@ def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nom
lne += ","
lne += "-1" if len(shipTo) == 0 else shipTo[n] # 19
lne += ","
lne += "-1" if len(href) == 0 else href[n] # 20
lne += "-1" if len(image) == 0 else image[n] # 20
lne += ","
lne += day + " " + ahora # 21
lne += "-1" if len(image_vendor) == 0 else image_vendor[n] # 21
lne += ","
lne += "-1" if len(href) == 0 else href[n] # 22
lne += ","
lne += day + " " + ahora # 23
rw.append(lne)
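With the two image columns inserted, the tail of every listing record shifts: image takes slot 20, image_vendor slot 21, href moves to 22 and the scrape timestamp to 23. A sketch of the resulting comma-joined tail with placeholder values (only the field order is taken from the diff; the timestamp format comes from day and ahora elsewhere in the function):

# Placeholder values; only the field order is taken from the diff.
tail = ",".join([
    "US",                                # 18 shipFrom
    "WW",                                # 19 shipTo
    "<base64 product image>",            # 20 image         (new)
    "<base64 vendor image>",             # 21 image_vendor  (new)
    "http://example.onion/listing/1",    # 22 href          (previously 20)
    "06/30/2023 12:00:00",               # 23 scrape date   (previously 21)
])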
@ -363,29 +367,24 @@ def encrypt_encode_image_to_base64(driver, xpath):
return None
def decode_decrypt_image_in_base64(html_content):
soup = BeautifulSoup(html_content, 'html.parser')
for img_tag in soup.find_all('img'):
def decode_decrypt_image_in_base64(string_image):
src_attr = img_tag.get('src')
try:
if src_attr and src_attr.startswith('data:image'):
base64_image = bytes(string_image, encoding='utf-8')
encrypted_image = base64.b64decode(base64_image)
decrypted_image = aes_decryption(encrypted_image)
try:
im = Image.open(io.BytesIO(decrypted_image))
im.show()
string_image = src_attr.split('base64,')[-1]
base64_image = bytes(string_image, encoding='utf-8')
encrypted_image = base64.b64decode(base64_image)
decrypted_image = aes_decryption(encrypted_image)
return decrypted_image
im = Image.open(io.BytesIO(decrypted_image))
im.show()
except Exception as e:
print(e)
pass
except Exception as e:
print(e)
pass
return None
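The removed and added bodies of decode_decrypt_image_in_base64 are interleaved above. Read together, the helper no longer walks an HTML document looking for data-URI img tags; it now takes the base64 payload directly (the same string the parsers store in image / image_vendor) and returns the decrypted bytes, or None on failure. A reconstruction of how the new body reads, assuming aes_decryption is the module's own AES helper:

# Reconstructed from the interleaved diff above; aes_decryption is assumed to be
# the AES helper defined elsewhere in utilities.py.
import base64

def decode_decrypt_image_in_base64(string_image):
    try:
        base64_image = bytes(string_image, encoding='utf-8')
        encrypted_image = base64.b64decode(base64_image)
        decrypted_image = aes_decryption(encrypted_image)
        return decrypted_image
    except Exception as e:
        print(e)
    return None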
def replace_image_sources(driver, html_content):
@ -415,7 +414,6 @@ def replace_image_sources(driver, html_content):
def cleanHTML(driver, html):
clean_html = replace_image_sources(driver, html)
# decode_decrypt_image_in_base64(clean_html)
formats = [
"jpg", "jpeg", "jfif", "pjpeg", "pjp",


+ 7
- 22
MarketPlaces/ViceCity/crawler_selenium.py View File

@ -32,7 +32,6 @@ baseURL = 'http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
# opentor()
mktName = getMKTName()
driver = getAccess()
@ -42,25 +41,11 @@ def startCrawling():
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Returns the name of the website
#return: name of site in string type
def getMKTName():
@ -77,7 +62,7 @@ def getFixedURL():
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -104,10 +89,10 @@ def createFFDriver():
# ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
# ff_prof.set_preference("permissions.default.image", 3)
# ff_prof.set_preference("browser.download.folderList", 2)
# ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
# ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
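The ViceCity profile previously had these preferences commented out; re-enabling them blocks third-party images and makes downloads silent. Assuming the standard Firefox meanings, folderList 2 sends downloads to the directory configured in browser.download.dir, and the neverAsk.saveToDisk MIME list saves matching responses without a prompt. A short annotated sketch (ff_prof stands for the FirefoxProfile built in createFFDriver):

# Annotated sketch; standard Firefox preference semantics assumed.
ff_prof.set_preference("permissions.default.image", 3)                          # block third-party images only
ff_prof.set_preference("browser.download.folderList", 2)                        # 2 = use browser.download.dir
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)      # no download-manager window
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")  # save text/plain without asking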
@ -271,7 +256,7 @@ def crawlForum(driver):
driver.back()
# comment out
break
# break
# comment out
if count == 1:

