
moved finished websites from test to production and unfinished websites from production to test

main
westernmeadow 1 year ago
commit 5a4f55a98d
61 changed files with 3640 additions and 10007 deletions
  1. +0 -29  .idea/DW_Pipeline_Test.iml
  2. +0 -166  Forums/AbyssForum/parser.py
  3. +0 -298  Forums/Altenens/crawler_selenium.py
  4. +0 -165  Forums/Altenens/parser.py
  5. +0 -303  Forums/Cardingleaks/crawler_selenium.py
  6. +0 -167  Forums/Cardingleaks/parser.py
  7. +0 -257  Forums/CryptBB/crawler_mechanize.py
  8. +0 -331  Forums/CryptBB/crawler_selenium.py
  9. +0 -282  Forums/CryptBB/parser.py
  10. +69 -55  Forums/DWForums/crawler_selenium.py
  11. +312 -0  Forums/DWForums/parser.py
  12. +69 -60  Forums/Dread/crawler_selenium.py
  13. +334 -0  Forums/Dread/parser.py
  14. +328 -0  Forums/Helium/crawler_selenium.py
  15. +248 -0  Forums/Helium/parser.py
  16. +0 -212  Forums/HiddenAnswers/parser.py
  17. +0 -24  Forums/Initialization/forums_mining.py
  18. +0 -40  Forums/Initialization/prepare_parser.py
  19. +0 -302  Forums/Libre/crawler_selenium.py
  20. +0 -249  Forums/Libre/parser.py
  21. +0 -310  Forums/OnniForums/crawler_selenium.py
  22. +0 -222  Forums/OnniForums/parser.py
  23. +0 -57  Forums/OnniForums/testing.py
  24. +0 -321  Forums/Procrax/crawler_selenium.py
  25. +0 -189  Forums/Procrax/parser.py
  26. +0 -293  MarketPlaces/AnonMarket/crawler_selenium.py
  27. +0 -195  MarketPlaces/AnonMarket/parser.py
  28. +0 -226  MarketPlaces/Apocalypse/parser.py
  29. +107 -92  MarketPlaces/Ares/crawler_selenium.py
  30. +227 -0  MarketPlaces/Ares/parser.py
  31. +173 -62  MarketPlaces/Bohemia/crawler_selenium.py
  32. +104 -104  MarketPlaces/Bohemia/parser.py
  33. +0 -262  MarketPlaces/DarkBazar/crawler_selenium.py
  34. +0 -284  MarketPlaces/DarkMatter/crawler_selenium.py
  35. +0 -261  MarketPlaces/DarkMatter/parser.py
  36. +0 -286  MarketPlaces/DigitalThriftShop/crawler_selenium.py
  37. +0 -173  MarketPlaces/DigitalThriftShop/parser.py
  38. +0 -288  MarketPlaces/HiddenMarket/parser.py
  39. +0 -42  MarketPlaces/Initialization/markets_mining.py
  40. +0 -70  MarketPlaces/Initialization/prepare_parser.py
  41. +325 -0  MarketPlaces/Kingdom/crawler_mechanize.py
  42. +342 -0  MarketPlaces/Kingdom/crawler_selenium.py
  43. +188 -0  MarketPlaces/Kingdom/parser.py
  44. +0 -235  MarketPlaces/LionMarketplace/parser.py
  45. +0 -291  MarketPlaces/MetaVerseMarket/crawler_selenium.py
  46. +0 -269  MarketPlaces/MetaVerseMarket/parser.py
  47. +0 -289  MarketPlaces/Nexus/crawler_selenium.py
  48. +0 -236  MarketPlaces/Nexus/parser.py
  49. +50 -48  MarketPlaces/Quest/crawler_selenium.py
  50. +232 -0  MarketPlaces/Quest/parser.py
  51. +0 -256  MarketPlaces/RobinhoodMarket/crawler_selenium.py
  52. +0 -334  MarketPlaces/RobinhoodMarket/parser.py
  53. +156 -106  MarketPlaces/Royal/crawler_selenium.py
  54. +73 -88  MarketPlaces/Royal/parser.py
  55. +0 -190  MarketPlaces/ThiefWorld/parser.py
  56. +0 -268  MarketPlaces/TorBay/crawler_selenium.py
  57. +0 -183  MarketPlaces/TorBay/parser.py
  58. +0 -277  MarketPlaces/TorMarket/crawler_selenium.py
  59. +0 -189  MarketPlaces/TorMarket/parser.py
  60. +55 -71  MarketPlaces/WeTheNorth/crawler_selenium.py
  61. +248 -0  MarketPlaces/WeTheNorth/parser.py

+0 -29  .idea/DW_Pipeline_Test.iml

@@ -5,33 +5,4 @@
<orderEntry type="jdk" jdkName="C:\Users\calsyslab\anaconda3" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">
<option name="namespacePackageFolders">
<list>
<option value="$MODULE_DIR$/Forums/BestCardingWorld" />
<option value="$MODULE_DIR$/Forums/CryptBB" />
<option value="$MODULE_DIR$/Forums/OnniForums" />
<option value="$MODULE_DIR$/MarketPlaces/ThiefWorld" />
<option value="$MODULE_DIR$/MarketPlaces/Apocalypse" />
<option value="$MODULE_DIR$/MarketPlaces/DarkMatter" />
<option value="$MODULE_DIR$/MarketPlaces/DigitalThriftShop" />
<option value="$MODULE_DIR$/MarketPlaces/HiddenMarket" />
<option value="$MODULE_DIR$/MarketPlaces/LionMarketplace" />
<option value="$MODULE_DIR$/MarketPlaces/Nexus" />
<option value="$MODULE_DIR$/MarketPlaces/RobinhoodMarket" />
<option value="$MODULE_DIR$/MarketPlaces/TorBay" />
<option value="$MODULE_DIR$/MarketPlaces/TorMarket" />
<option value="$MODULE_DIR$/MarketPlaces/ViceCity" />
<option value="$MODULE_DIR$/Forums/Altenens" />
<option value="$MODULE_DIR$/Forums/Cardingleaks" />
<option value="$MODULE_DIR$/Forums/HiddenAnswers" />
<option value="$MODULE_DIR$/Forums/Libre" />
<option value="$MODULE_DIR$/Forums/Procrax" />
<option value="$MODULE_DIR$/MarketPlaces/DarkBazar" />
<option value="$MODULE_DIR$/MarketPlaces/AnonMarket" />
<option value="$MODULE_DIR$/MarketPlaces/Tor2door" />
<option value="$MODULE_DIR$/MarketPlaces/MetaVerseMarket" />
</list>
</option>
</component>
</module>

+0 -166  Forums/AbyssForum/parser.py

@@ -1,166 +0,0 @@
__author__ = 'Helium'
# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def abyssForums_description_parser(soup):
# Fields to be parsed
topic = "-1" # 0 topic name
user = [] # 1 all users of each post
addDate = [] # 2 all dates of each post
feedback = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format)
status = [] # 4 all user's authority in each post such as (adm, member, dangerous)
reputation = [] # 5 all users' karma in each post (usually found as a number)
sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 7 all messages of each post
interest = [] # 8 all user's interest in each post
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("div", {"class": "page-body"}).find("h2", {"class": "topic-title"})
topic = li.text.replace(",","")
topic = topic.replace("\n","")
topic = cleanString(topic.strip())
regex = re.compile('post has-profile.*')
posts = soup.find_all('div', {"class": regex})
# print(len(posts))
# For each message (post), get all the fields we are interested in:
for ipost in posts:
# Finding the author (user) of the post
author = ipost.find('a', {"class": "username"}).text
user.append(cleanString(author)) # Remember to clean the problematic characters
status.append("-1")
reputation.append("-1")
interest.append("-1")
sign.append("-1")
feedback.append("-1")
image_post.append("-1")
img = ipost.find('dl', {"class": "postprofile"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
date_time_obj = ipost.find('time').attrs
date = date_time_obj['datetime'][0:10]
time = date_time_obj['datetime'][11:19]
date_time_obj = datetime.strptime(date + " " + time, '%Y-%m-%d %H:%M:%S')
addDate.append(date_time_obj)
# Finding the post
inner = ipost.find('div', {"class": "content"})
inner = inner.text.strip()
post.append(cleanString(inner))
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
return row
# This is the method to parse the Listing Pages (one page with many posts)
def abyssForums_listing_parser(soup: BeautifulSoup):
nm = 0 # this variable should receive the number of topics
forum = "AbyssForum" # 0 *forum name
board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 all authors of each topic
topic = [] # 3 all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
image_author = [] # 8 all author avatars used in each topic
#finding the board
board = soup.find("h2", {"class": "forum-title"}).text
board = cleanString(board.strip())
type_of_posts = soup.find_all("li", {"class": re.compile("row bg\d")} )
for literature in type_of_posts:
title_of_post = literature.find("a", {"class": "topictitle"}).text
title_of_post = cleanString(title_of_post)
topic.append(title_of_post)
user = literature.find("div", {"class": "topic-poster responsive-hide left-box"}).find("a", {"class": "username"}).text
author.append(user)
num_post = literature.find("dd", {"class": "posts"}).text.replace("Replies","").strip()
posts.append(num_post)
num_view = literature.find("dd", {"class": "views"}).text.replace("Views","").strip()
views.append(num_view)
#if int(num_post) != 0: join the last user who posted with the author?
# reply = literature.find("dd", {"class": "lastpost"}).find("a", {"class": "username"}).text
# user.append(reply)
date_time_obj = literature.find('time').attrs
date = date_time_obj['datetime'][0:10]
time = date_time_obj['datetime'][11:19]
date_added = datetime.strptime(date + " " + time, '%Y-%m-%d %H:%M:%S')
addDate.append(date_added)
listing_href = literature.find("a", {"class": "topictitle"}).get("href")
href.append(listing_href)
image_author.append("-1")
nm = len(topic)
return organizeTopics(
forum=forum,
nm=nm,
board=board,
author=author,
topic=topic,
views=views,
posts=posts,
href=href,
addDate=addDate,
image_author=image_author
)
def abyssForum_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
#print(soup.find('table', {"class": "tborder clear"}).find(
# 'tbody').find_all('tr', {"class": "inline_row"}))
listing = soup.find_all('dl', {"class": "row-item topic_read"})
for a in listing:
link = a.find('div', {"class": "list-inner"}).find('a').get('href')
href.append(link)
return href

+0 -298  Forums/Altenens/crawler_selenium.py

@@ -1,298 +0,0 @@
__author__ = 'Helium'
'''
Altenens Forum Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import configparser
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.Altenens.parser import altenens_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'https://altenens.is/'
# Opens Tor Browser, crawls the website
def startCrawling():
forumName = getForumName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(forumName, baseURL, True)
# Login using premade account credentials and do login captcha manually
def login(driver):
#click login button
login_link = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]').get_attribute('href')
driver.get(login_link) # open tab with url
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[1]/dd/input')
#Username here
usernameBox.send_keys('mylittlepony45')#sends string to the username box
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[2]/dd/div/div/input')
#Password here
passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
# wait for 50 sec until id = tab_content is found, then cont
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]')))
# Returns the name of the website
def getForumName():
name = 'Altenens'
return name
# Return the link of the website
def getFixedURL():
url = 'https://altenens.is/'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close() #close tab
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)# open url in browser
return driver
except:
driver.close()# close tab
return 'down'
# Saves the crawled html page
def savePage(driver, html, url):
cleanPage = cleanHTML(driver, html)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# Hacking
links.append('https://altenens.is/forums/hacking.469162/')
# Hacking showoff
links.append('https://altenens.is/forums/hacking-showoff.469232/')
# Remote administration
links.append('https://altenens.is/forums/remote-administration.469161/')
# Cracking tools
links.append('https://altenens.is/forums/cracking-tools.469204/')
# Cracking tutorials
links.append('https://altenens.is/forums/cracking-tutorials-other-methods.469205/')
# Combo lists and configs
links.append('https://altenens.is/forums/combolists-and-configs.469206/')
# Programming
links.append('https://altenens.is/forums/programming.469239/')
return links
# newest version of crawling
def crawlForum(driver):
print("Crawling the Altenens forum")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
topics = topicPages(html)
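# For every topic linked from this listing page, walk through each page of the
# thread via its 'Next' link and save the HTML; afterwards the outer loop
# follows the listing page's own 'Next' link.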
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
while has_next_topic_page:
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out
# if counter == 2:
# break
try:
page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
except NoSuchElementException:
has_next_topic_page = False
try:
driver.get(link)
except:
driver.refresh()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the Altenens forum done.")
# Returns 'True' if the link is Topic link, may need to change for every website
def isDescriptionLink(url):
if 'threads' in url:
return True
return False
# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
if '.is/forums' in url:
return True
return False
# calling the parser to define the links
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
return altenens_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+0 -165  Forums/Altenens/parser.py

@@ -1,165 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def altenens_description_parser(soup):
topic = "-1" # 0 *topic name
user = [] # 1 *all users of each post
status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
reputation = [] # 3 all user's karma in each post (usually found as a number)
interest = [] # 4 all user's interest in each post
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
etopic = soup.find("h1", {"class": "p-title-value"})
if etopic is not None:
topic = etopic.text
topic = cleanString(topic.strip())
body = soup.find('div', {"class": "block-container lbContainer"})
iposts = body.find_all('article', {"class": "message message--post js-post js-inlineModContainer"})
for ipost in iposts:
author = ipost.find('h4', {"class": "message-name"}).text
user.append(cleanString(author.strip()))
stat = ipost.find('h5', {"class": "userTitle message-userTitle"}).text
status.append(cleanString(stat.strip()))
bar = ipost.find('div', {"class": "xtr-progress-bar"})
if bar is not None:
rep = bar.find('p').get('data-value')
else:
rep = "-1"
reputation.append(cleanString(rep))
interest.append("-1")
signature = ipost.find('aside', {"class": "message-signature"})
if signature is not None:
signature = signature.text.strip()
else:
signature = "-1"
sign.append(cleanString(signature))
inner = ipost.find('div', {"class": "bbWrapper"}).find(text=True, recursive=False)
if inner is not None:
inner = inner.strip()
else:
inner = "" # cannot use -1 because the post is hidden unless you reply
post.append(cleanString(inner))
feedback.append("-1")
dt = ipost.find('time', {"class": "u-dt"}).get('datetime')
date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
addDate.append(date_time_obj)
img = ipost.find('div', {"class": "message-avatar-wrapper"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
image_post.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
return row
# This is the method to parse the Listing Pages (one page with many posts)
def altenens_listing_parser(soup):
nm = 0 # *this variable should receive the number of topics
forum = "Altenens" # 0 *forum name
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
image_author = [] # 8 all author avatars used in each topic
board = soup.find('h1', {"class": "p-title-value"}).text
board = cleanString(board.strip())
regex = re.compile('structItem structItem--thread.*')
itopics = soup.find_all('div', {"class": regex})
nm = len(itopics)
for itopic in itopics:
topics = itopic.find('div', {"class": "structItem-title"}).text
topic.append(cleanString(topics.strip()))
author_icon = itopic.find('a', {"class": "avatar avatar--s"})
if author_icon != None:
author_icon = author_icon.find('img')
author_icon = author_icon.get('src')
author_icon = author_icon.split('base64,')[-1]
else:
author_icon = "-1"
image_author.append(author_icon)
link = itopic.find('div', {"class": "structItem-title"}).find('a').get('href')
href.append(link)
user = itopic.find('ul', {"class": "structItem-parts"}).find('a').text
author.append(cleanString(user.strip()))
dt = itopic.find('time', {"class": "u-dt"}).get('datetime')
date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
addDate.append(date_time_obj)
nposts = itopic.find('dl', {"class": "pairs pairs--justified"}).text
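# Reply and view counts carry a 'Replies'/'Views' label and abbreviate thousands
# with a 'K' suffix; strip the label and expand the suffix before cleaning.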
nposts = nposts.replace('Replies', '')
nposts = nposts.replace('K', '000')
posts.append(cleanString(nposts))
nviews = itopic.find('dl', {"class": "pairs pairs--justified structItem-minor"}).text
nviews = nviews.replace('Views', '')
nviews = nviews.replace('K', '000')
views.append(cleanString(nviews))
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)
def altenens_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find_all('div', {"class": "structItem-cell structItem-cell--main"})
for a in listing:
link = a.find('a', {"class": ""}).get('href')
href.append(link)
return href

+0 -303  Forums/Cardingleaks/crawler_selenium.py

@@ -1,303 +0,0 @@
__author__ = 'DarkWeb'
'''
Cardingleaks Forum Crawler (Selenium)
Crawler updated and fixed
The site sometimes requires viewing a new post each day; make sure you log in
before crawling.
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.Cardingleaks.parser import cardingleaks_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'https://leaks.ws/'
# Opens Tor Browser, crawls the website
def startCrawling():
forumName = getForumName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(forumName, baseURL, True)
# Login using premade account credentials and do login captcha manually
def login(driver):
#click login button
login_link = driver.find_element(
by=By.XPATH, value='/html/body/div[2]/div[1]/nav/div/div[3]/div[1]/a[1]').\
get_attribute('href')
driver.get(login_link)# open tab with url
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.NAME, value='login')
#Username here
usernameBox.send_keys('somanyfrogs')#sends string to the username box
passwordBox = driver.find_element(by=By.NAME, value='password')
#Password here
passwordBox.send_keys('therearewaytoomanyherehowwhy')# sends string to passwordBox
login = driver.find_element(by=By.CLASS_NAME, value='block-container')
login_link = login.find_element(by=By.TAG_NAME, value='button')
login_link.click()
# input('input')
# wait for listing page show up (This Xpath may need to change based on different seed url)
# wait for 50 sec until id = tab_content is found, then cont
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.CLASS_NAME, 'p-body-pageContent')))
# Returns the name of the website
def getForumName() -> str:
name = 'Cardingleaks'
return name
# Return the link of the website
def getFixedURL():
url = 'https://leaks.ws/'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close() #close tab
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if name == '':
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# carding methods
links.append('https://leaks.ws/forums/carding-methods.82/')
# # carding schools
# links.append('https://leaks.ws/forums/help-desk-carding-school.35/')
# # carding discussion
# links.append('https://leaks.ws/forums/carding-discussion-desk.58/')
# # carding tutorials
# links.append('https://leaks.ws/forums/carding-tutorials.13/')
# # carding tools and software
# links.append('https://leaks.ws/forums/carding-tools-softwares.10/')
# # exploits and cracking tools
# links.append('https://leaks.ws/forums/exploits-cracking-tools.22/')
return links
def crawlForum(driver):
print("Crawling the Cardingleaks forum")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
while has_next_topic_page:
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out
# if counter == 2:
# break
try:
page = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the Cardingleaks forum done.")
# Returns 'True' if the link is Topic link, may need to change for every website
def isDescriptionLink(url):
if 'threads' in url:
return True
return False
# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
if '.ws/forums' in url:
return True
return False
# calling the parser to define the links
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
return cardingleaks_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+0 -167  Forums/Cardingleaks/parser.py

@@ -1,167 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def cardingleaks_description_parser(soup: Tag):
# Fields to be parsed
topic = "-1" # 0 *topic name
user = [] # 1 *all users of each post
status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
reputation = [] # 3 all user's karma in each post (usually found as a number)
interest = [] # 4 all user's interest in each post
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
li = soup.find("h1", {"class": "p-title-value"})
topic = cleanString(li.text.strip())
post_list: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True})
for ipost in post_list:
username = ipost.get('data-author')
user.append(username)
user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text
status.append(cleanString(user_status.strip()))
user_statistics: ResultSet[Tag] = ipost.find("div", {"class": "message-userExtras"}).find_all("dl", {"class": "pairs pairs--justified"})
user_reputation = "-1"
for stat in user_statistics:
data_type = stat.find("span").get("data-original-title")
if data_type == "Points":
user_reputation = stat.find("dd").text
break
reputation.append(cleanString(user_reputation.strip()))
interest.append("-1")
sign.append("-1")
user_post = ipost.find("div", {"class": "message-content js-messageContent"}).text
post.append(cleanString(user_post.strip()))
feedback.append("-1")
datetime_text = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z")
addDate.append(datetime_obj)
img = ipost.find('div', {"class": "message-content js-messageContent"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_post.append(img)
img = ipost.find('div', {"class": "message-avatar"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
return row
# This is the method to parse the Listing Pages (one page with many posts)
def cardingleaks_listing_parser(soup: Tag):
nm = 0 # *this variable should receive the number of topics
forum = "Cardingleaks" # 0 *forum name
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
image_user = [] # 8 all user avatars used in each topic
# Finding the board (should be just one)
li = soup.find("h1", {"class": "p-title-value"})
board = cleanString(li.text.strip())
thread_list = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
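# Sticky (pinned) threads live in a separate container; prepend them to the
# regular thread list so they are parsed as well.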
sticky = soup.find('div', {"class": "structItemContainer-group structItemContainer-group--sticky"})
if sticky is not None:
thread_list = sticky.find_all("div", {"data-author": True}) + thread_list
nm = len(thread_list)
for thread in thread_list:
thread_author = thread.get("data-author")
author.append(thread_author)
thread_topic = thread.find("div", {"class": "structItem-title"}).text
topic.append(cleanString(thread_topic.strip()))
author_icon = thread.find("a", {"class": "avatar avatar--s"})
if author_icon is not None:
author_icon = author_icon.find('img')
if author_icon is not None:
author_icon = author_icon.get('src').split('base64,')[-1]
image_user.append(author_icon)
else:
image_user.append('-1')
else:
image_user.append('-1')
thread_view = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find("dd").text
# Convert abbreviated view counts (e.g., 8.8K) to numerical form (e.g., 8800)
if thread_view.find("K") > 0:
thread_view = str(int(float(thread_view.replace("K", "")) * 1000))
views.append(thread_view)
thread_posts = thread.find("dl", {"class": "pairs pairs--justified"}).find("dd").text
posts.append(cleanString(thread_posts.strip()))
thread_href = thread.find("div", {"class": "structItem-title"}).find("a").get("href")
href.append(thread_href)
thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime")
datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z")
addDate.append(datetime_obj)
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_user)
def cardingleaks_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find_all('div', {"class": "structItem-title"})
for a in listing:
link = a.find('a').get('href')
href.append(link)
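# Note: only the last thread link collected from the listing is returned here.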
return [href[-1]]

+0 -257  Forums/CryptBB/crawler_mechanize.py

@@ -1,257 +0,0 @@
__author__ = '91Shadows'
'''
CryptBB Crawler (Mechanize)
'''
import codecs, os, re
import socks, socket, time
from datetime import date
import urllib.parse as urlparse
import http.client as httplib
import mechanize
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.BestCardingWorld.parser import bestcardingworld_links_parser
counter = 1
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)
# Opens Tor Browser, crawls the website
def startCrawling():
opentor()
getUrl()
forumName = getForumName()
br = getAccess()
if br != 'down':
crawlForum(br)
new_parse(forumName, False)
# new_parse(forumName, False)
closetor()
# Opens Tor Browser
def opentor():
global pid
print("Connecting Tor...")
path = open('../../path.txt').readline()
pro = subprocess.Popen(path)
pid = pro.pid
time.sleep(7.5)
input("Tor Connected. Press ENTER to continue\n")
return
# Creates a connection through Tor Port
def getUrl(timeout=None):
socket.socket = socks.socksocket
socket.create_connection = create_connection
return
# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
sock = socks.socksocket()
sock.connect(address)
return sock
# Returns the name of website
def getForumName():
name = 'CryptBB'
return name
# Return the link of website
def getFixedURL():
url = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'
return url
# Closes Tor Browser
def closetor():
global pid
os.system("taskkill /pid " + str(pid))
print('Closing Tor...')
time.sleep(3)
return
# Creates a Mechanize browser and initializes its options
def createBrowser():
br = mechanize.Browser()
cj = mechanize.CookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
('Accept', '*/*')]
return br
def getAccess():
url = getFixedURL()
br = createBrowser()
try:
br.open(url)
return br
except:
return 'down'
# Saves the crawled html page
def savePage(page, url):
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
a = page.read()
open(filePath, "wb").write(a)
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/ThiefWorld/HTML_Pages/' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + '/' + 'Description/' + fileName + '.html'
else:
fullPath = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/ThiefWorld/HTML_Pages/' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + '/' + 'Listing/' + fileName + '.html'
return fullPath
# Creates the name of the file based on URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
# Hacking and Markets related topics
def getInterestedLinks():
links = []
links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43&sid=e12864ffccc5df877b03b573534955be')
return links
# Start crawling Forum pages
def crawlForum(br):
print("Crawling CryptBB forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
page = br.open(link)
savePage(page, link)
res = br.response().read()
soup = BeautifulSoup(res, 'html.parser')
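# If the listing has a rel="next" pagination link, queue the next page right
# after the current one so it is crawled before moving on.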
next_link = soup.find("a", {"rel": "next"})
if next_link != None:
full_url = urlparse.urljoin(linksToCrawl[i], next_link['href'])
linksToCrawl.insert(i + 1, full_url)
listOfTopics = findDescriptionPages(link)
for topic in listOfTopics:
itemPage = br.open(str(topic))
savePage(itemPage, topic)
except Exception as e:
print('Error getting link: ', link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("CryptBB forum done sucessfully. Press ENTER to continue\n")
return
# Returns True if the link is 'Topic' Links, may need to change for diff websites
def isDescriptionLink(url):
if 'topic' in url:
return True
return False
# Returns True if the link is a listingPage link, may need to change for diff websites
def isListingLink(url):
'''
reg = 'board=[0-9]+.[0-9]+\Z'
if len(re.findall(reg, url)) == 0:
return False
return True
'''
if 'forum' in url:
return True
return False
# calling the parser to define the links
def findDescriptionPages(url):
soup = ""
error = False
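# Read the previously saved listing page from disk, trying UTF-8 first and the
# platform default encoding as a fallback; if both fail, report the problem and
# return no links.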
try:
html = codecs.open(
"C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + "\\Listing\\" + getNameFromURL(url) + ".html", encoding='utf8')
soup = BeautifulSoup(html, "html.parser")
except:
try:
html = open(
"C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + "\\Listing\\" + getNameFromURL(url) + ".html")
soup = BeautifulSoup(html, "html.parser")
except:
error = True
print("There was a problem to read the file " + getNameFromURL(url) + " in the listing section.")
if not error:
return bestcardingworld_links_parser(soup)
else:
return []
def crawler():
startCrawling()
print("Crawling and Parsing CryptBB .... DONE!")

+0 -331  Forums/CryptBB/crawler_selenium.py

@@ -1,331 +0,0 @@
__author__ = 'DarkWeb'
'''
CryptBB Forum Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.CryptBB.parser import cryptBB_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/'
# Opens Tor Browser, crawls the website
def startCrawling():
forumName = getForumName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(forumName, baseURL, True)
# Login using premade account credentials and do login captcha manually
def login(driver):
#click login button
login_link = driver.find_element(
by=By.XPATH, value='/html/body/div/div[2]/div/table/tbody/tr[2]/td/center/pre/strong/a[1]').\
get_attribute('href')
driver.get(login_link)# open tab with url
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input')
#Username here
usernameBox.send_keys('holyre')#sends string to the username box
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input')
#Password here
passwordBox.send_keys('PlatinumBorn2')# sends string to passwordBox
'''
# wait for captcha page show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[2]/div/form/div/input")))
# save captcha to local
driver.find_element(by=By.XPATH, value='//*[@id="captcha_img"]').screenshot(r'..\CryptBB\captcha.png')
# This method will show image in any image viewer
im = Image.open(r'..\CryptBB\captcha.png')
im.show()
# wait until input space show up
inputBox = driver.find_element(by=By.XPATH, value='//*[@id="imagestring"]')
# ask user input captcha solution in terminal
userIn = input("Enter solution: ")
# send user solution into the input space
inputBox.send_keys(userIn)
# click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div/form/div/input").click()
'''
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
# wait for 50 sec until id = tab_content is found, then cont
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="tab_content"]')))
# Returns the name of the website
def getForumName() -> str:
name = 'CryptBB'
return name
# Return the link of the website
def getFixedURL():
url = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close() #close tab
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if name == '':
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# Beginner Programming
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86')
# Beginner Carding and Fraud
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
# Beginner Hacking
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87')
# Newbie
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=84')
# Beginner Hardware
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=89')
# Training Challenges
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=96')
# Darknet Discussions
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88')
# Public Leaks and Warez
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=97')
# Sell
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=44')
return links
def crawlForum(driver):
print("Crawling the CryptBB forum")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
while has_next_topic_page:
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out
# if counter == 2:
# break
try:
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div')
page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div')
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the CrypttBB forum done.")
# Returns 'True' if the link is Topic link, may need to change for every website
def isDescriptionLink(url):
if 'thread' in url:
return True
return False
# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
if '.onion/forumdisplay' in url:
return True
return False
# calling the parser to define the links
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
return cryptBB_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+0 -282  Forums/CryptBB/parser.py

@@ -1,282 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def cryptBB_description_parser(soup):
# Fields to be parsed
topic = "-1" # 0 *topic name
user = [] # 1 *all users of each post
status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
reputation = [] # 3 all user's karma in each post (usually found as a number)
interest = [] # 4 all user's interest in each post
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("td", {"class": "thead"}).find('strong')
topic = li.text
topic = re.sub("\[\w*\]", '', topic)
topic = topic.replace(",","")
topic = topic.replace("\n","")
topic = cleanString(topic.strip())
# Finding the repeated tag that corresponds to the listing of posts
posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
'div', {"class": "post"})
# For each message (post), get all the fields we are interested in:
for ipost in posts:
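# Skip posts from deleted authors, which the page marks with a 'deleted_post_author' block.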
if ipost.find('div', {"class": "deleted_post_author"}):
continue
# Finding a first level of the HTML page
post_wrapper = ipost.find('span', {"class": "largetext"})
# Finding the author (user) of the post
author = post_wrapper.text.strip()
user.append(cleanString(author)) # Remember to clean the problematic characters
# Finding the status of the author
smalltext = ipost.find('div', {"class": "post_author"})
if smalltext is not None:
# CryptBB does have membergroup and postgroup
membergroup = smalltext.find('div', {"class": "profile-rank"})
postgroup = smalltext.find('div', {"class": "postgroup"})
if membergroup != None:
membergroup = membergroup.text.strip()
if postgroup != None:
postgroup = postgroup.text.strip()
membergroup = membergroup + " - " + postgroup
else:
if postgroup != None:
membergroup = postgroup.text.strip()
else:
membergroup = "-1"
status.append(cleanString(membergroup))
# Finding the interest of the author
# CryptBB does not have blurb
blurb = smalltext.find('li', {"class": "blurb"})
if blurb != None:
blurb = blurb.text.strip()
else:
blurb = "-1"
interest.append(cleanString(blurb))
# Finding the reputation of the user
# CryptBB does have reputation
author_stats = smalltext.find('div', {"class": "author_statistics"})
karma = author_stats.find('strong')
if karma != None:
karma = karma.text
karma = karma.replace("Community Rating: ", "")
karma = karma.replace("Karma: ", "")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
else:
status.append('-1')
interest.append('-1')
reputation.append('-1')
# Getting here another good tag to find the post date, post content and users' signature
postarea = ipost.find('div', {"class": "post_content"})
dt = postarea.find('span', {"class": "post_date"}).text
# dt = dt.strip().split()
dt = dt.strip()
day=date.today()
if "Today" in dt:
today = day.strftime('%m-%d-%Y')
stime = dt.replace('Today,','').strip()
date_time_obj = today + ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "Yesterday" in dt:
yesterday = day - timedelta(days=1)
yesterday = yesterday.strftime('%m-%d-%Y')
stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday + ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "ago" in dt:
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
else:
date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
addDate.append(date_time_obj)
# Finding the post
inner = postarea.find('div', {"class": "post_body scaleimages"})
quote = inner.find('blockquote')
if quote is not None:
quote.decompose()
inner = inner.text.strip()
post.append(cleanString(inner))
# Finding the user's signature
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
signature = ipost.find('div', {"class": "signature scaleimages"})
if signature != None:
signature = signature.text.strip()
# print(signature)
else:
signature = "-1"
sign.append(cleanString(signature))
# As no information about user's feedback was found, just assign "-1" to the variable
feedback.append("-1")
img = ipost.find('div', {"class": "post_body scaleimages"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_post.append(img)
avatar = ipost.find('div', {"class": "author_avatar"})
if avatar is not None:
img = avatar.find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
else:
img = "-1"
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
return row
# This is the method to parse the Listing Pages (one page with many posts)
def cryptBB_listing_parser(soup):
nm = 0 # *this variable should receive the number of topics
forum = "CryptBB" # 0 *forum name
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
image_author = [] # 8 all author avatars used in each topic
# Finding the board (should be just one)
board = soup.find('span', {"class": "active"}).text
board = cleanString(board.strip())
# Finding the repeated tag that corresponds to the listing of topics
itopics = soup.find_all('tr',{"class": "inline_row"})
# Counting how many topics
nm = len(itopics)
for itopic in itopics:
# For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
# so we don't miss any topic
# Adding the topic to the topic list
try:
topics = itopic.find('span', {"class": "subject_old"}).find('a').text
except:
topics = itopic.find('span', {"class": "subject_new"}).find('a').text
topics = re.sub("\[\w*\]", '', topics)
topic.append(cleanString(topics))
image_author.append(-1)
# Adding the url to the list of urls
try:
link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
except:
link = itopic.find('span',{"class": "subject_new"}).find('a').get('href')
href.append(link)
# Finding the author of the topic
ps = itopic.find('div', {"class":"author smalltext"}).text
user = ps.strip()
author.append(cleanString(user))
# Finding the number of replies
columns = itopic.findChildren('td',recursive=False)
replies = columns[3].text
if replies == '-':
posts.append('-1')
else:
posts.append(cleanString(replies))
# Finding the number of Views
tview = columns[4].text
if tview == '-':
views.append('-1')
else:
views.append(cleanString(tview))
# If no information about when the topic was added, just assign "-1" to the variable
addDate.append("-1")
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)
def cryptBB_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find('table', {"class": "tborder clear"}).find('tbody').find_all('tr', {"class": "inline_row"})
for a in listing:
try:
link = a.find('span', {"class": "subject_old"}).find('a').get('href')
except:
link = a.find('span', {"class": "subject_new"}).find('a').get('href')
href.append(link)
return href
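The Today/Yesterday handling inlined in cryptBB_description_parser can be captured in one helper; a sketch under the same format assumption ('%m-%d-%Y, %I:%M %p'), with a hypothetical name (the "ago" case still needs the span's title attribute):

from datetime import datetime, date, timedelta

def normalize_mybb_date(dt):
    # Hypothetical helper, not part of this commit: mirrors the branches above.
    dt = dt.strip()
    today = date.today()
    if "Today" in dt:
        stime = dt.replace("Today,", "").strip()
        return datetime.strptime(today.strftime('%m-%d-%Y') + ', ' + stime, '%m-%d-%Y, %I:%M %p')
    if "Yesterday" in dt:
        stime = dt.replace("Yesterday,", "").strip()
        yday = (today - timedelta(days=1)).strftime('%m-%d-%Y')
        return datetime.strptime(yday + ', ' + stime, '%m-%d-%Y, %I:%M %p')
    return datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')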

Forums/HiddenAnswers/crawler_selenium.py → Forums/DWForums/crawler_selenium.py View File

@ -1,7 +1,7 @@
__author__ = 'Helium'
__author__ = 'DarkWeb'
'''
HiddenAnswers Crawler (Selenium)
DWForums Forum Crawler (Selenium)
'''
from selenium import webdriver
@ -12,26 +12,24 @@ from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
import os, time
from datetime import date
import configparser
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.HiddenAnswers.parser import hiddenanswers_links_parser
from Forums.DWForums.parser import dwForums_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/'
baseURL = 'http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/'
# Opens Tor Browser, crawls the website
def startCrawling():
forumName = getForumName()
driver: webdriver.Firefox = getAccess()
driver = getAccess()
if driver != 'down':
try:
@ -41,25 +39,48 @@ def startCrawling():
print(driver.current_url, e)
closeDriver(driver)
new_parse(forumName, baseURL, True)
new_parse(forumName, baseURL, False)
# Login using premade account credentials and do login captcha manually
def login(driver):
#click login button
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.CSS_SELECTOR, ".button--icon--user")))
login_link = driver.find_element(by=By.CSS_SELECTOR, value=".button--icon--user")
login_link.click()
#entering username and password into input boxes
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[4]/div/div[2]/div/form/div[1]")))
container = driver.find_element(by=By.XPATH, value="/html/body/div[4]/div/div[2]/div/form/div[1]")
# print(container.get_attribute("outerHTML"))
boxes = container.find_elements(by=By.CLASS_NAME, value="input")
# print(len(boxes))
#Username here
boxes[0].send_keys('nice_reamer08')
#Password here
boxes[1].send_keys('tjpv$]Nc}XG@`%LM')
# no captcha on this site
# click the verify(submit) button
driver.find_element(by=By.CSS_SELECTOR, value=".button--icon--login").click()
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div[2]/div[2]/div/div[2]/div[4]/div/ul/li[14]/a')))
(By.CSS_SELECTOR, '.p-staffBar-inner > div:nth-child(4) > div:nth-child(1) > a:nth-child(1)')))
# Returns the name of the website
def getForumName():
name = 'HiddenAnswers'
name = 'DWForums'
return name
# Return the link of the website
def getFixedURL():
url = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/'
url = 'http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/'
return url
@ -106,11 +127,12 @@ def createFFDriver():
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
@ -157,26 +179,24 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
# hacking
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/hacking')
# darknet and tor
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/darknet-and-tor')
# internet
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/internet')
# links
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/questions/links')
# programming
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/programming')
# knowledge and information
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/knowledge-and-information')
# other
links.append('http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion/index.php/other')
# Hacking
links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/hacking-forum.33/')
# # Beginner Carding and Fraud
# links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/remote-administration.34/')
# # Cracking Tools
# links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/cracking-tools.35/')
# # Cracking Tutorials and Other Methods - error here about file not existing
# links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/cracking-tutorials-other-methods.36/')
# # Combolists and Configs
# links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/combolists-and-configs.58/')
# # Paid Software and Antivirus
# links.append('http://dwforumuugiyderhybcpfxmlmoawgq6z3w6hk45nrnem3p7kwszhybad.onion/forums/paid-softwares-and-antivirus.59/')
return links
def crawlForum(driver: webdriver.Firefox):
print("Crawling the HiddenAnswers forum")
def crawlForum(driver):
print("Crawling the DWForums forum")
linksToCrawl = getInterestedLinks()
@ -208,18 +228,14 @@ def crawlForum(driver: webdriver.Firefox):
driver.get(itemURL)
except:
driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}")
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out
# if counter == 2:
# break
# comment out
if counter == 2:
break
try:
page = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')
page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
@ -227,21 +243,19 @@ def crawlForum(driver: webdriver.Firefox):
except NoSuchElementException:
has_next_topic_page = False
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
for i in range(counter):
driver.back()
# comment out
break
# # comment out
# break
#
# # comment out
# if count == 1:
# break
# comment out
if count == 1:
break
try:
link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')
temp = driver.find_element(by=By.LINK_TEXT, value="Next")
link = temp.get_attribute('href')
if link == "":
raise NoSuchElementException
@ -254,19 +268,19 @@ def crawlForum(driver: webdriver.Firefox):
print(link, e)
i += 1
print("Crawling the HiddenAnswers forum done.")
input("Crawling DWForums forum done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link
def isDescriptionLink(url):
if 'http' not in url:
if '/threads/' in url:
return True
return False
# Returns True if the link is a listingPage link
def isListingLink(url):
if 'http' in url:
if '/forums/' in url:
return True
return False
@ -275,9 +289,9 @@ def isListingLink(url):
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
return hiddenanswers_links_parser(soup)
return dwForums_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing Abyss .... DONE!")
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 312
- 0
Forums/DWForums/parser.py View File

@ -0,0 +1,312 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def dwForums_description_parser(soup):
# Fields to be parsed
topic = "-1" # 0 *topic name
user = [] # 1 *all users of each post
status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
reputation = [] # 3 all user's karma in each post (usually found as a number)
interest = [] # 4 all user's interest in each post
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("h1", {"class": "p-title-value"})
topic = li.text
topic = topic.replace(u'\xa0', ' ')
topic = topic.replace(",","")
topic = topic.replace("\n","")
topic = cleanString(topic.strip())
# print(topic)
# Finding the repeated tag that corresponds to the listing of posts
# posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \
# soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"})
posts = soup.find('div', {"class": "js-replyNewMessageContainer"}).find_all(
'article', {"class": "js-post"}, recursive=False)
# print(len(posts))
# For each message (post), get all the fields we are interested to:
for ipost in posts:
# Finding a first level of the HTML page
# post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"})
post_wrapper = ipost.find('h4', {"class": "message-name"})
# Finding the author (user) of the post
# author = post_wrapper.find('h4')
author = post_wrapper.text.strip()
# print("author " + author)
user.append(cleanString(author)) # Remember to clean the problematic characters
# Finding the status of the author
# Testing here two possibilities to find this status and combine them
# if ipost.find('h5', {"class": "deleted_post_author"}):
# status.append(-1)
# interest.append(-1)
# reputation.append(-1)
# addDate.append(-1)
# post.append("THIS POST HAS BEEN REMOVED!")
# sign.append(-1)
# feedback.append(-1)
# continue
# DWForums shows a member title (userTitle), used here as the membergroup
membergroup = ipost.find('h5', {"class": "userTitle"})
# DWForums doesn't have postgroups
postgroup = None
if membergroup != None:
membergroup = membergroup.text.strip()
if postgroup != None:
postgroup = postgroup.text.strip()
membergroup = membergroup + " - " + postgroup
else:
if postgroup != None:
membergroup = postgroup.text.strip()
else:
membergroup = "-1"
status.append(cleanString(membergroup))
# print("status " + cleanString(membergroup))
# Finding the interest of the author
# DWForums does not have blurb
blurb = ipost.find('li', {"class": "blurb"})
if blurb != None:
blurb = blurb.text.strip()
else:
blurb = "-1"
interest.append(cleanString(blurb))
# Finding the reputation of the user
# DWForums shows a reaction score, used here as reputation
author_stats = ipost.find('div', {"class": "message-userExtras"})
if author_stats != None:
karma = author_stats.find_all('dl', {"class": "pairs"})[2]
else:
karma = None
if karma != None:
karma = karma.text
karma = karma.replace("Reaction score","")
karma = karma.replace(":", "")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
# print("karma " + cleanString(karma))
# Getting here another good tag to find the post date, post content and users' signature
postarea = ipost.find('div', {"class": "message-attribution-main"})
dt = postarea.find('time', {"class": "u-dt"})['datetime']
# dt = dt.strip().split()
dt = dt.strip()[:16]
dt = dt.replace("T",", ")
day=date.today()
if "Yesterday" in dt:
yesterday = day - timedelta(days=1)
yesterday = yesterday.strftime('%m-%d-%Y')
stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday+ ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %H:%M')
elif "hours ago" in dt:
day = day.strftime('%m-%d-%Y')
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %H:%M')
else:
date_time_obj = datetime.strptime(dt, '%Y-%m-%d, %H:%M')
stime = date_time_obj.strftime('%b %d, %Y')
sdate = date_time_obj.strftime('%I:%M %p')
addDate.append(date_time_obj)
# print("date " + str(date_time_obj))
# Finding the date of the post
# date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p')
# smalltext = postarea.find('div', {"class": "flow_hidden"}).find('div', {"class": "keyinfo"})\
# .find('div', {"class": "smalltext"})
# sdatetime = smalltext.text
# sdatetime = sdatetime.replace(u"\xab","") # Removing unnecessary characters
# sdatetime = sdatetime.replace(u"\xbb","") # Removing unnecessary characters
# sdatetime = sdatetime.split("on: ") # Removing unnecessary characters
# sdatetime = sdatetime[1].strip()
# stime = sdatetime[:-12:-1] # Finding the time of the post
# stime = stime[::-1]
# sdate = sdatetime.replace(stime,"") # Finding the date of the post
# sdate = sdate.replace(",","")
# sdate = sdate.strip()
# Convert the date of the post, which can be given as "12 February 2016", "today", or "yesterday". We need
# a date format here as "mm/dd/yyyy"
# addDate.append(convertDate(sdate,"english", crawlerDate) + " " + stime)
# Finding the post
inner = ipost.find('article', {"class": "message-body"})
inner = inner.text.strip()
# print(inner)
post.append(cleanString(inner))
# Finding the user's signature
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
signature = ipost.find('aside', {"class": "message-signature"})
if signature != None:
signature = signature.text.strip()
# print(signature)
else:
signature = "-1"
sign.append(cleanString(signature))
# As no information about users' feedback was found, just assign "-1" to the variable
feedback.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
# Sending the results
return row
# This is the method to parse the Listing Pages (one page with many posts)
def dwForums_listing_parser(soup):
nm = 0 # *this variable should receive the number of topics
forum = "DWForums" # 0 *forum name
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
# Finding the board (should be just one)
board = soup.find('h1', {"class": "p-title-value"}).text
board = cleanString(board.strip())
# Finding the repeated tag that corresponds to the listing of topics
regex = re.compile('.*structItem--thread.*')
itopics = soup.find_all("div", {"class": regex})
for itopic in itopics:
# For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
# so we don't miss any topic
# tds = itopic.findAll('td', {"class": "subject stickybg2"})
#
# if len(tds) > 0:
# tag.append("strong")
# tag.append("subject stickybg2")
# tag.append("stats stickybg")
# else:
# tds = itopic.findAll('td', {"class": "subject windowbg2"})
# if len(tds) > 0:
# tag.append("span")
# tag.append("subject windowbg2")
# tag.append("stats windowbg")
# Adding the topic to the topic list
topics = itopic.find("div", {"class": "structItem-title"}).text
topics = topics.replace(",", "")
topics = topics.replace("\n", "")
topic.append(cleanString(topics.strip()))
# Counting how many topics we have found so far
nm = len(topic)
# Adding the url to the list of urls
link = itopic.select_one('a[href^="/threads/"]')
link = link['href']
link = cleanLink(link)
href.append(link)
# Finding the author of the topic
minor = itopic.find('div', {"class": "structItem-minor"})
ps = minor.find('li').text
user = ps.strip()
author.append(cleanString(user))
# Finding the number of replies
meta = itopic.find("div", {"class": "structItem-cell--meta"})
meta = meta.find_all("dl")
post = meta[0].find("dd").text
post = post.replace("K", "000")
posts.append(cleanString(post))
# Finding the number of Views
tview = meta[1].find("dd").text
tview = tview.replace("K", "000")
views.append(cleanString(tview))
# If no information about when the topic was added, just assign "-1" to the variable
minor = itopic.find("div", {"class": "structItem-minor"})
dt = minor.find('time')['datetime']
dt = dt.strip()[:16]
dt = dt.replace("T", ", ")
day = date.today()
if "Yesterday" in dt:
yesterday = day - timedelta(days=1)
yesterday = yesterday.strftime('%m-%d-%Y')
stime = dt.replace('Yesterday,', '').strip()
date_time_obj = yesterday + ', ' + stime
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %H:%M')
else:
date_time_obj = datetime.strptime(dt, '%Y-%m-%d, %H:%M')
stime = date_time_obj.strftime('%b %d, %Y')
sdate = date_time_obj.strftime('%I:%M %p')
addDate.append(date_time_obj)
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
def dwForums_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
#print(soup.find('table', {"class": "tborder clear"}).find(
# 'tbody').find_all('tr', {"class": "inline_row"}))
regex = re.compile('.*structItem--thread.*')
listing = soup.find_all("div", {"class": regex})
for a in listing:
link = a.select_one('a[href^="/threads/"]')
link = link['href']
href.append(link)
return href
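The listing parser above expands abbreviated counts by substituting "000" for "K", which turns "1.2K" into "1.2000"; a hedged converter, assuming counts look like "57", "1,234" or "1.2K" (the helper name is hypothetical):

def parse_count(text):
    # Hypothetical helper, not part of this commit: converts XenForo-style
    # abbreviated counts to integers instead of a literal "K" -> "000" swap.
    text = text.strip().replace(",", "")
    if text.upper().endswith("K"):
        return int(float(text[:-1]) * 1000)
    return int(text) if text.isdigit() else -1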

Forums/AbyssForum/crawler_selenium.py → Forums/Dread/crawler_selenium.py View File

@ -1,7 +1,7 @@
__author__ = 'Helium'
__author__ = 'DarkWeb'
'''
AbyssForum Crawler (Selenium)
Dread Forum Crawler (Selenium)
'''
from selenium import webdriver
@ -12,20 +12,18 @@ from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import configparser
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.AbyssForum.parser import abyssForum_links_parser
from Forums.Dread.parser import dread_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/'
baseURL = 'http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/'
# Opens Tor Browser, crawls the website
@ -41,25 +39,45 @@ def startCrawling():
print(driver.current_url, e)
closeDriver(driver)
new_parse(forumName, baseURL, True)
new_parse(forumName, baseURL, False)
# Login using premade account credentials and do login captcha manually
def login(driver):
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="sn-category-3"]')))
'''
# code for captcha, for now, it runs too slow so captcha expires
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.CSS_SELECTOR, ".image")))
inputBoxes = driver.find_elements(by=By.TAG_NAME, value='input')
for index, inputBox in enumerate(inputBoxes):
driver.find_element(by=By.CSS_SELECTOR, value='.image').screenshot(r'..\Dread\captcha.png')
im = Image.open(r'..\Dread\captcha.png')
im.show()
userIn = input("Enter character: ")
inputBox.send_keys(userIn)
im.close()
if index != 5:
inputBoxes[index+1].click()
driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div/form/div/input").click()
'''
input("Press ENTER when CAPTCHA is completed\n")
#entering username and password into input boxes
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[2]")))
# Returns the name of the website
def getForumName():
name = 'AbyssForum'
name = 'Dread'
return name
# Return the link of the website
def getFixedURL():
url = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/'
url = 'http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/'
return url
@ -89,8 +107,8 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@ -101,16 +119,18 @@ def createFFDriver():
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.set_preference("xpinstall.signatures.required", False);
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
@ -157,26 +177,24 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
# Hacked Database
links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=26')
# Hire a Hacker
links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=27')
# Hacking Tools
links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=28')
# Carding Forums
links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=30')
# Social Media Hacking
links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=32')
# Hacking Tutorials
links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=12')
# Cracking Tutorials
links.append('http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion/viewforum.php?f=13')
# # OpSec
# links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/OpSec')
# Hacking 180
links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/hacking')
# # Jobs4Crypto
# links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/Jobs4Crypto')
# # Hacktown
# links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/HackTown')
# # Malware
# links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/malware')
# # Programming
# links.append('http://dreadytofatroptsdj6io7l3xptbet6onoyno2yv7jicoxknyazubrad.onion/d/programming')
return links
def crawlForum(driver):
print("Crawling the AbyssForum forum")
print("Crawling the Dread forum")
linksToCrawl = getInterestedLinks()
@ -208,20 +226,14 @@ def crawlForum(driver):
driver.get(itemURL)
except:
driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
savePage(driver, driver.page_source, topic + f"page{counter}")
# # comment out
# if counter == 2:
# break
# comment out
if counter == 2:
break
try:
temp = driver.find_element(By.CLASS_NAME, 'pagination')
temp = temp.find_element(by=By.CLASS_NAME, value='next')
page = temp.find_element(by=By.CLASS_NAME, value='button').get_attribute('href')
page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
@ -229,23 +241,20 @@ def crawlForum(driver):
except NoSuchElementException:
has_next_topic_page = False
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
for i in range(counter):
driver.back()
# comment out
break
# # comment out
# break
#
# # comment out
# if count == 1:
# break
# comment out
if count == 1:
break
try:
temp = driver.find_element(By.CLASS_NAME, 'pagination')
temp = temp.find_element(by=By.CLASS_NAME, value='next')
link = temp.find_element(by=By.CLASS_NAME, value='button').get_attribute('href')
temp = driver.find_element(by=By.CLASS_NAME, value="pagination")
link = temp.find_element(by=By.CLASS_NAME, value="next").get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
@ -257,19 +266,19 @@ def crawlForum(driver):
print(link, e)
i += 1
print("Crawling the AbyssForum forum done.")
input("Crawling Dread forum done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link
def isDescriptionLink(url):
if 'viewtopic' in url:
if '/post/' in url:
return True
return False
# Returns True if the link is a listingPage link
def isListingLink(url):
if '.onion/viewforum' in url:
if '/d/' in url:
return True
return False
@ -278,9 +287,9 @@ def isListingLink(url):
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
return abyssForum_links_parser(soup)
return dread_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing Abyss .... DONE!")
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 334
- 0
Forums/Dread/parser.py View File

@ -0,0 +1,334 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
import datetime
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
import re
import traceback
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def dread_description_parser(soup):
# Fields to be parsed
topic = "-1" # 0 *topic name
user = [] # 1 *all users of each post
status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
reputation = [] # 3 all user's karma in each post (usually found as a number)
interest = [] # 4 all user's interest in each post
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
# Finding the topic (should be just one coming from the Listing Page)
container = soup.find('div', {"class": "content"})
li = container.find("a", {"class": "title"})
if li == None:
return None
topic = li.text
topic = topic.replace(u'\xa0', ' ')
topic = topic.replace(",","")
topic = topic.replace("\n","")
topic = cleanString(topic.strip())
# print(topic)
# Finding the repeated tag that corresponds to the listing of posts
# posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \
# soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"})
# putting the initial post data since it is separated from comments
# author name
init_post = container.find('div', {"class": "item"})
author = init_post.find('div', {"class": "author"}).select_one('a[href^="/u/"]').text
flair = init_post.find('div', {"class": "author"}).find("span", {"class": "flair"})
try:
flair = flair.text.strip()
author = author.replace(flair, '')
except:
pass
author = author.strip()
user.append(cleanString(author))
# status
flair = init_post.find("span", {"class": "flair"})
if flair != None:
flair = flair.text.strip()
else:
flair = "-1"
status.append(cleanString(flair))
# no blurb
interest.append(-1)
# points for post
karma = init_post.find("div", {"class": "voteCount"})
if karma != None:
karma = karma.text
karma = karma.replace("points", "")
karma = karma.replace(":", "")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
# date
spans = init_post.find('div', {"class": "author"}).find('span', recursive=False)
dt = spans['title']
month = find_month(dt)
split_text = dt.split()
day = int(re.search(r'\d+', split_text[0]).group())
year = int(split_text[2])
hm = re.findall(r'\d+', split_text[-1])
hm[0] = int(hm[0])
hm[1] = int(hm[1])
date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1])
addDate.append(date_time_obj)
# content
inner = init_post.find("div", {"class": "postContent"})
inner = inner.text.strip()
post.append(cleanString(inner))
# no signature
sign.append(-1)
# no feedback
feedback.append(-1)
comments = soup.find('div', {"class": "postComments"})
if comments == None:
row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)
return row
else:
comments = soup.find('div', {"class": "postComments"}).find_all('div', "comment")
# print(len(posts))
# For each message (post), get all the fields we are interested to:
for ipost in comments:
# Finding a first level of the HTML page
# post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"})
cc = ipost.find('div', {"class": "commentContent"})
post_wrapper = cc.find('a', {"class": "username"}).text
flair = cc.find("span", {"class": "flair"})
try:
flair = flair.text.strip()
post_wrapper = post_wrapper.replace(flair, '')
except:
pass
author = post_wrapper.strip()
user.append(cleanString(author))
# Finding the status of the author
# Dread does not have membergroup and postgroup, but it has flair, similar enough
postgroup = None
if flair != None:
if postgroup != None:
postgroup = postgroup.text.strip()
flair = flair + " - " + postgroup
else:
if postgroup != None:
flair = postgroup.text.strip()
else:
flair = "-1"
status.append(cleanString(flair))
# print("status " + cleanString(membergroup))
# Finding the interest of the author
# Dread does not have blurb
interest.append(-1)
# Finding the reputation of the user
# Dread doesn't have reputation per user, but instead each post has its own point system
karma = cc.find('div', {"class": "votes"})
if karma != None:
karma = karma.text
karma = karma.replace("points","")
karma = karma.replace(":", "")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
# print("karma " + cleanString(karma))
# Getting here another good tag to find the post date, post content and users' signature
postarea = ipost.find('div', {"class": "timestamp"}).find('span', recursive=False)
dt = postarea['title']
month = find_month(dt)
split_text = dt.split()
day = int(re.search(r'\d+', split_text[0]).group())
year = int(split_text[2])
hm = re.findall(r'\d+', split_text[-1])
hm[0] = int(hm[0])
hm[1] = int(hm[1])
date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1])
addDate.append(date_time_obj)
# Finding the post
inner = ipost.find('div', {"class": "commentBody"})
inner = inner.text.strip()
# print(inner)
post.append(cleanString(inner))
# No signature for Dread
sign.append(-1)
# As no information about users' feedback was found, just assign "-1" to the variable
feedback.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
# Sending the results
return row
# This is the method to parse the Listing Pages (one page with many posts)
def dread_listing_parser(soup):
nm = 0 # *this variable should receive the number of topics
forum = "Dread" # 0 *forum name
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
# Finding the board (should be just one)
board = soup.find('a', {"class": "banner-top"}).text
board = cleanString(board.strip())
# Finding the repeated tag that corresponds to the listing of topics
itopics = soup.find("div", {"class": "postBoard"}).find_all("div", {"class": "item"}, recursive=False)
for itopic in itopics:
# For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
# so we don't miss any topic
# Adding the topic to the topic list
topic_title = itopic.find("a", {"class": "title"})
title_flair = topic_title.find('span', {"class": "flair"})
topics = topic_title.text
try:
title_flair = title_flair.text.strip()
topics = topics.replace(title_flair, '')
except:
pass
topics = topics.replace(u'\xa0', ' ')
topics = topics.replace(",", "")
topics = topics.replace("\n", "")
topic.append(cleanString(topics.strip()))
# Counting how many topics we have found so far
nm = len(topic)
# Adding the url to the list of urls
link = topic_title['href']
link = cleanLink(link)
href.append(link)
# Finding the author of the topic
ps = itopic.find('div', {"class": "author"})
post_wrapper = ps.select_one('a[href^="/u/"]').text
flair = ps.find("span", {"class": "flair"})
try:
flair = flair.text.strip()
post_wrapper = post_wrapper.replace(flair, '')
except:
pass
user = post_wrapper.strip()
author.append(cleanString(user))
# Finding the number of replies
meta = itopic.find("div", {"class": "postMain"})
post = meta.find("a").text
post = post.replace("comments", '').strip()
posts.append(cleanString(post))
# Finding the number of Views - not shown in Dread
views.append("-1")
# If no information about when the topic was added, just assign "-1" to the variable
spans = itopic.find('div', {"class": "author"}).find('span', recursive=False)
dt = spans['title']
month = find_month(dt)
split_text = dt.split()
day = int(re.search(r'\d+', split_text[0]).group())
year = int(split_text[2])
hm = re.findall(r'\d+', split_text[-1])
hm[0] = int(hm[0])
hm[1] = int(hm[1])
date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1])
addDate.append(date_time_obj)
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
def dread_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
#print(soup.find('table', {"class": "tborder clear"}).find(
# 'tbody').find_all('tr', {"class": "inline_row"}))
listing = soup.find("div", {"class": "postBoard"}).find_all("div",{"class": "item"}, recursive=False)
for a in listing:
link = a.find("a", {"class": "title"})
link = link['href']
href.append(link)
return href
def find_month(s):
if 'January' in s:
return 1
elif 'February' in s:
return 2
elif 'March' in s:
return 3
elif 'April' in s:
return 4
elif 'May' in s:
return 5
elif 'June' in s:
return 6
elif 'July' in s:
return 7
elif 'August' in s:
return 8
elif 'September' in s:
return 9
elif 'October' in s:
return 10
elif 'November' in s:
return 11
elif 'December' in s:
return 12
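find_month above spells out the twelve names; an equivalent sketch using strptime's %B parsing (assumption: the timestamp string contains the full English month name; returns None otherwise, like the chain above):

import re
from datetime import datetime

def find_month_strptime(s):
    # Hypothetical equivalent, not part of this commit.
    match = re.search(r'January|February|March|April|May|June|July|August|'
                      r'September|October|November|December', s)
    if match is None:
        return None
    return datetime.strptime(match.group(), '%B').month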

+ 328
- 0
Forums/Helium/crawler_selenium.py View File

@ -0,0 +1,328 @@
__author__ = 'DarkWeb'
'''
Helium Forum Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, time
from datetime import date
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.Helium.parser import helium_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/'
# Opens Tor Browser, crawls the website
def startCrawling():
# opentor()
# forumName = getForumName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# new_parse(forumName, False)
# Opens Tor Browser
def opentor():
global pid
print("Connecting Tor...")
path = open('../../path.txt').readline().strip()
pro = subprocess.Popen(path)
pid = pro.pid
time.sleep(7.5)
input('Tor Connected. Press ENTER to continue\n')
return
# Login using premade account credentials and do login captcha manually
def login(driver):
#wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[2]/div/div[1]/div/div/div[2]/form/div[5]/div/button")))
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
#Username here
usernameBox.send_keys('holyre')
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
#Password here
passwordBox.send_keys('PlatinumBorn2')
'''
# wait for captcha page show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="captcha_img"]')))
# save captcha to local
driver.find_element(by=By.XPATH, value='//*[@id="captcha_img"]').screenshot(r'..\Helium\captcha.png')
# This method will show image in any image viewer
im = Image.open(r'..\Helium\captcha.png')
im.show()
# wait until input space show up
inputBox = driver.find_element(by=By.XPATH, value='//*[@id="captcha"]')
# ask user input captcha solution in terminal
userIn = input("Enter solution: ")
# send user solution into the input space
inputBox.send_keys(userIn)
# click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div[1]/div/div/div[2]/form/div[5]/div/button").click()
'''
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div[2]/div/p')))
# Returns the name of the website
def getForumName():
name = 'Helium'
return name
# Return the link of the website
def getFixedURL():
url = 'http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/login'
return url
# Closes Tor Browser
def closetor(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
file = open('../../path.txt', 'r')
lines = file.readlines()
ff_binary = FirefoxBinary(lines[0].strip())
ff_prof = FirefoxProfile(lines[1].strip())
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
# ff_prof.set_preference("permissions.default.image", 2)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(lines[2].strip())
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Saves the crawled html page
def savePage(page, url):
cleanPage = cleanHTML(page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = r'..\Helium\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
else:
fullPath = r'..\Helium\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if name == '':
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# # General Discussion
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/6')
# # Anonymity and Security
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/8')
# # Programming
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/9')
# # Carding Discussions
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/10')
# # Hacked Database (free)
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/11')
# Hacking tools, exploits and POC
links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/17')
# # Hacked Database
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/12')
# # Hacking and other Services
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/13')
# # Selling/Buying Malware, Exploits etc
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/22')
# # General Tutorials
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/18')
# # Hacking Tutorials
# links.append('http://fahue6hb7odzns36vfoi2dqfvqvjq4btt7vo52a67jivmyz6a6h3vzqd.onion/board/19')
return links
def crawlForum(driver):
print("Crawling the Helium forum")
linksToCrawl = getInterestedLinks()
# visited = set(linksToCrawl)
# initialTime = time.time()
i = 0
count = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
has_next_page = True
while has_next_page:
list = topicPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver.page_source, item)
driver.back()
# comment out
break
# comment out
if count == 1:
count = 0
break
try:
bar = driver.find_element(by=By.XPATH, value=
'/html/body/div[2]/div/div[3]/ul')
li = bar.find_elements(By.TAG_NAME, 'li')[-1]
link = li.find_element(By.TAG_NAME, 'a').get_attribute('href')
if link == "":
raise NoSuchElementException
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(html, link)
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("Crawling Helium forum done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link
def isDescriptionLink(url):
if 'topic' in url:
return True
return False
# Returns True if the link is a listingPage link
def isListingLink(url):
if 'board' in url:
return True
return False
# calling the parser to define the links
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
return helium_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 248
- 0
Forums/Helium/parser.py View File

@ -0,0 +1,248 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def helium_description_parser(soup):
# Fields to be parsed
topic = "-1" # topic name
user = [] # all users of each post
addDate = [] # all dates of each post
feedback = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format)
status = [] # all user's authority in each post such as (adm, member, dangerous)
reputation = [] # all users' karma in each post (usually found as a number)
sign = [] # all user's signature in each post (usually a standard message after the content of the post)
post = [] # all messages of each post
interest = [] # all user's interest in each post
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("h4", {"class": "text-truncated"})
topic = li.text
topic = topic.replace("Topic:", "")
topic = topic.replace("Post Reply", "")
topic = topic.replace(",", "")
topic = topic.replace("\n", "")
topic = cleanString(topic.strip())
# Finding the repeated tag that corresponds to the listing of posts
posts = soup.findAll('div', {"id": "a9"})
# For each message (post), get all the fields we are interested to:
for ipost in posts:
# Finding a first level of the HTML page
# Finding the author (user) of the post
heading = ipost.find('div', {"class": "panel-heading"})
title = heading.find('div', {"class": "panel-title"}).text
author = title.replace("User:", "")
author = author.strip()
user.append(cleanString(author)) # Remember to clean the problematic characters
# Finding the status of the author
# Testing here two possibilities to find this status and combine them
# Helium does not have membergroup and postgroup
membergroup = heading.find('li', {"class": "membergroup"})
postgroup = heading.find('li', {"class": "postgroup"})
if membergroup != None:
membergroup = membergroup.text.strip()
if postgroup != None:
postgroup = postgroup.text.strip()
membergroup = membergroup + " - " + postgroup
else:
if postgroup != None:
membergroup = postgroup.text.strip()
else:
membergroup = "-1"
status.append(cleanString(membergroup))
# Finding the interest of the author
# Helium does not have blurb
blurb = heading.find('li', {"class": "blurb"})
if blurb != None:
blurb = blurb.text.strip()
else:
blurb = "-1"
interest.append(cleanString(blurb))
# Finding the reputation of the user
# Helium does not have karma
karma = heading.find('li', {"class": "karma"})
if karma != None:
karma = karma.text
karma = karma.replace("Community Rating: ","")
karma = karma.replace("Karma: ","")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
# Getting here another good tag to find the post date, post content and users' signature
postarea = ipost.find('div', {"class": "content_body"})
# Finding the date of the post
# Helium does not have date
addDate.append("-1")
# dt = ipost.find('p', {"class": "author"}).text.split('»')[1]
# # dt = dt.strip().split()
# dt = dt.strip()
# date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p')
# stime = date_time_obj.strftime('%a %b %d, %Y')
# sdate = date_time_obj.strftime('%I:%M %p')
# addDate.append(date_time_obj)
# date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p')
# smalltext = postarea.find('div', {"class": "flow_hidden"}).find('div', {"class": "keyinfo"})\
# .find('div', {"class": "smalltext"})
# sdatetime = smalltext.text
# sdatetime = sdatetime.replace(u"\xab","") # Removing unnecessary characters
# sdatetime = sdatetime.replace(u"\xbb","") # Removing unnecessary characters
# sdatetime = sdatetime.split("on: ") # Removing unnecessary characters
# sdatetime = sdatetime[1].strip()
# stime = sdatetime[:-12:-1] # Finding the time of the post
# stime = stime[::-1]
# sdate = sdatetime.replace(stime,"") # Finding the date of the post
# sdate = sdate.replace(",","")
# sdate = sdate.strip()
# Convert the date of the post, which can be given as "12 February 2016", "today", or "yesterday". We need
# a date format here as "mm/dd/yyyy"
#addDate.append(convertDate(sdate,"english", crawlerDate) + " " + stime)
# Finding the post
paragraphs = postarea.find_all('p')
p = ""
for paragraph in paragraphs:
p += paragraph.text.strip() + " "
quote = postarea.find('div', {"class": "standard_quote"})
if quote != None:
q = quote.text.strip()
p.replace(q, "")
post.append(cleanString(p.strip()))
# Finding the user's signature
# Helium does not have signature
#signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
signature = ipost.find('div', {"class": "post_wrapper"})
if signature != None:
signature = signature.text.strip()
else:
signature = "-1"
sign.append(cleanString(signature))
# As no information about users' feedback was found, just assign "-1" to the variable
feedback.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)
# Sending the results
return row
# This is the method to parse the Listing Pages (one page with many posts)
def helium_listing_parser(soup):
board = "-1" # board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
nm = 0 # this variable should receive the number of topics
topic = [] # all topics
user = [] # all users of each topic
post = [] # number of posts of each topic
view = [] # number of views of each topic
addDate = [] # when the topic was created (difficult to find)
href = [] # this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
# Finding the board (should be just one)
parents = soup.find('div', {"class": "col-md-12"}).findAll('li')
board = parents[1].text + u"->" + parents[2].get('title')
board = board.replace("\n", "")
board = cleanString(board.strip())
# Finding the repeated tag that corresponds to the listing of topics
itopics = soup.find('table', {"class": "table"}).find('tbody').findAll('td', {"class": "col-md-8"})
repliesViews = soup.find('table', {"class": "table"}).find('tbody').findAll('td', {"class": "col-md-2"})
# Counting how many topics we have found so far
nm = len(itopics)
index = 0
for itopic in itopics:
# Adding the topic to the topic list
topics = itopic.find('a').get('title')
topics = topics.replace(",", "")
topic.append(cleanString(topics.strip()))
# Adding the url to the list of urls
link = itopic.find('a').get('href')
link = cleanLink(link)
href.append(link)
# Finding the author of the topic
author = itopic.find('strong').text
user.append(cleanString(author.strip()))
rv = repliesViews[index].find('p').text.split()
# Finding the number of replies
posts = rv[0].replace("Replies", "")
post.append(cleanString(posts.strip()))
# Finding the number of Views
tview = rv[1].replace("Views", "")
view.append(cleanString(tview.strip()))
# If no information about when the topic was added, just assign "-1" to the variable
# dt = itopic.find('div', {"class": "responsive-hide"}).text.split('»')[1]
# dt = dt.strip()
# date_time_obj = datetime.strptime(dt,'%a %b %d, %Y %I:%M %p')
# addDate.append(date_time_obj)
addDate.append("-1")
index += 1
return organizeTopics("Helium", nm, topic, board, view, post, user, addDate, href)
def helium_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find('table', {"class": "table"}).find('tbody').findAll('td', {"class": "col-md-8"})
for a in listing:
bae = a.find('a', href=True)
link = bae['href']
href.append(link)
return href
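For reference, a minimal usage sketch of these Helium parsers against pages saved by the crawler; the file paths below are hypothetical, and the description parser is assumed to be named helium_description_parser, in line with the listing and links parsers above.
from bs4 import BeautifulSoup

# Hypothetical paths to pages saved by the Helium crawler
listing_path = "Forums/Helium/HTML_Pages/01012024/Listing/example.html"
description_path = "Forums/Helium/HTML_Pages/01012024/Description/example.html"

with open(listing_path, "r", encoding="utf-8") as f:
    listing_soup = BeautifulSoup(f.read(), "html.parser")
topic_links = helium_links_parser(listing_soup)    # URLs the crawler should visit next
listing_row = helium_listing_parser(listing_soup)  # organized listing data used in the merge step

with open(description_path, "r", encoding="utf-8") as f:
    description_soup = BeautifulSoup(f.read(), "html.parser")
description_row = helium_description_parser(description_soup)  # (topic, post, user, addDate, feedback, status, reputation, sign, interest)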

+ 0
- 212
Forums/HiddenAnswers/parser.py

@ -1,212 +0,0 @@
__author__ = 'Helium'
# Here, we are importing the auxiliary functions to clean or convert data
from typing import List
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def HiddenAnswers_description_parser(soup: BeautifulSoup):
# Fields to be parsed
topic: str = "-1" # 0 topic name
user: List[str] = [] # 1 all users of each post
addDate: List[datetime] = [] # 2 all dates of each post
feedback: List[str] = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format)
status: List[str] = [] # 4 all user's authority in each post such as (adm, member, dangerous)
reputation: List[str] = [] # 5 all user's karma in each post (usually found as a number)
sign: List[str] = [] # 6 all user's signature in each post (usually a standard message after the content of the post)
post: List[str] = [] # 7 all messages of each post
interest: List[str] = [] # 8 all user's interest in each post
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("h1").find("span", {"itemprop": "name"})
topic = li.text
question: Tag = soup.find("div", {"class": "qa-part-q-view"})
question_user = question.find("span", {"class": "qa-q-view-who-data"}).text
user.append(cleanString(question_user.strip()))
question_time = question.find("span", {"class": "qa-q-view-when-data"}).find("time").get("datetime")
datetime_string = question_time.split("+")[0]
datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S")
addDate.append(datetime_obj)
question_user_status = question.find("span", {"class": "qa-q-view-who-title"})
if question_user_status is not None:
question_user_status = question_user_status.text
status.append(cleanString(question_user_status.strip()))
else:
status.append('-1')
question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"})
if question_user_karma is not None:
question_user_karma = question_user_karma.text
# Convert karma to pure numerical string
if question_user_karma.find("k") > -1:
question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
reputation.append(cleanString(question_user_karma.strip()))
else:
reputation.append('-1')
question_content = question.find("div", {"class": "qa-q-view-content qa-post-content"}).text
post.append(cleanString(question_content.strip()))
feedback.append("-1")
sign.append("-1")
interest.append("-1")
img = question.find('div', {"class": "qa-q-view-content qa-post-content"}).find('img')
if img is not None:
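# Inline images are embedded as data URIs; keep only the base64 payload that follows "base64,"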
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_post.append(img)
img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
answer_list: ResultSet[Tag] = soup.find("div", {"class": "qa-a-list"}).find_all("div", {"class": "qa-a-list-item"})
for replies in answer_list:
user_name = replies.find("span", {"class", "qa-a-item-who-data"}).text
user.append(cleanString(user_name.strip()))
date_added = replies.find("span", {"class": "qa-a-item-when"}).find("time", {"itemprop": "dateCreated"}).get('datetime')
date_string = date_added.split("+")[0]
datetime_obj = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S")
addDate.append(datetime_obj)
post_data = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).text
post.append(cleanString(post_data.strip()))
user_reputations = replies.find("span", {"class", "qa-a-item-who-title"})
if user_reputations is not None:
user_reputations = user_reputations.text
status.append(cleanString(user_reputations.strip()))
else:
status.append('-1')
karma = replies.find("span", {"class": "qa-a-item-who-points-data"})
if karma is not None:
karma = karma.text
# Convert karma to pure numerical string
if karma.find("k") > -1:
karma = str(float(karma.replace("k", "")) * 1000)
reputation.append(cleanString(karma.strip()))
else:
reputation.append('-1')
feedback.append("-1")
sign.append("-1")
interest.append("-1")
img = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_post.append(img)
img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
return row
def HiddenAnswers_listing_parser(soup: BeautifulSoup):
nm: int = 0 # this variable should receive the number of topics
forum: str = "HiddenAnswers" # 0 *forum name
board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
user: List[str] = [] # 2 all users of each topic
topic: List[str] = [] # 3 all topics
view: List[int] = [] # 4 number of views of each topic
post: List[int] = [] # 5 number of posts of each topic
href: List[str] = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate: List[str] = [] # 7 when the topic was created (difficult to find)
image_user = [] # 8 all user avatars used in each topic
# Finding the board
board = soup.find("div", {"class": "qa-main-heading"}).find("h1").text
board = board.replace('Recent questions in', '')
board = cleanString(board.strip())
queries_by_user: ResultSet[Tag] = soup.find("div", {"class": "qa-q-list"}).find_all("div", {"class": "qa-q-list-item"})
for queries in queries_by_user:
topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text
topic.append(cleanString(topic_of_query.strip()))
image_user.append("-1") # qa-q-item-where
author = queries.find("span", {"class": "qa-q-item-who-data"}).text
user.append(cleanString(author.strip()))
num_answers = queries.find("span", {"class": "qa-a-count-data"}).text
post.append(cleanString(num_answers.strip()))
view.append("-1")
date_posted = queries.find("span", {"class": "qa-q-item-when-data"}).text
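# Date strings containing "day" (e.g. "yesterday", "2 days ago") are approximated as one day before now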
if date_posted.find("day") > 0:
datetime_obj = datetime.now() - timedelta(days=1)
else:
try:
datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
except ValueError:
datetime_obj = datetime.strptime(f"{date_posted}", "%b %d, %Y")
addDate.append(datetime_obj)
#this link will be cleaned
listing_href = queries.find("div", {"class": "qa-q-item-title"}).find("a").get("href")
href.append(listing_href)
nm = len(topic)
return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate, image_user)
#need to change this method
def hiddenanswers_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
#print(soup.find('table', {"class": "tborder clear"}).find(
# 'tbody').find_all('tr', {"class": "inline_row"}))
listing = soup.find_all('div', {"class": "qa-q-item-title"})
for a in listing:
link = a.find('a').get('href')
href.append(link)
return href
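A small stand-alone sketch of the karma normalization used twice above, which collapses shorthand like "1.2k" into a plain numeric string; the helper name is hypothetical.
def normalize_karma(raw: str) -> str:
    # Hypothetical helper mirroring the inline conversion above:
    # "1.2k" -> "1200.0"; values without a "k" suffix pass through unchanged.
    raw = raw.strip()
    if raw.find("k") > -1:
        return str(float(raw.replace("k", "")) * 1000)
    return raw

# normalize_karma("1.2k") -> "1200.0"
# normalize_karma("87")   -> "87"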

+ 0
- 24
Forums/Initialization/forums_mining.py

@ -6,14 +6,6 @@ Starting point of the Darkweb Forums Mining
from datetime import *
from Forums.BestCardingWorld.crawler_selenium import crawler as crawlerBestCardingWorld
from Forums.CryptBB.crawler_selenium import crawler as crawlerCryptBB
from Forums.OnniForums.crawler_selenium import crawler as crawlerOnniForums
from Forums.AbyssForum.crawler_selenium import crawler as crawlerAbyssForum
from Forums.Procrax.crawler_selenium import crawler as crawlerProcraxForum
from Forums.HiddenAnswers.crawler_selenium import crawler as crawlerHiddenAnswers
from Forums.Cardingleaks.crawler_selenium import crawler as crawlerCardingleaks
from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens
from Forums.Libre.crawler_selenium import crawler as crawlerLibre
import configparser
import os
@ -118,22 +110,6 @@ if __name__ == '__main__':
if forum == "BestCardingWorld":
crawlerBestCardingWorld()
elif forum == "CryptBB":
crawlerCryptBB()
elif forum == "OnniForums":
crawlerOnniForums()
elif forum == "AbyssForum":
crawlerAbyssForum()
elif forum == "HiddenAnswers":
crawlerHiddenAnswers()
elif forum == 'Procrax':
crawlerProcraxForum()
elif forum == 'Cardingleaks':
crawlerCardingleaks()
elif forum == 'Altenens':
crawlerAltenens()
elif forum == 'Libre':
crawlerLibre()
print("\nScraping process completed!")


+ 0
- 40
Forums/Initialization/prepare_parser.py

@ -8,14 +8,6 @@ from psycopg2.extras import RealDictCursor
from Forums.DB_Connection.db_connection import *
from Forums.BestCardingWorld.parser import *
from Forums.Cardingleaks.parser import *
from Forums.CryptBB.parser import *
from Forums.OnniForums.parser import *
from Forums.Altenens.parser import *
from Forums.Procrax.parser import *
from Forums.Libre.parser import *
from Forums.HiddenAnswers.parser import *
from Forums.AbyssForum.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@ -124,22 +116,6 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
if forum == "BestCardingWorld":
rw = bestcardingworld_listing_parser(soup)
elif forum == "Cardingleaks":
rw = cardingleaks_listing_parser(soup)
elif forum == "CryptBB":
rw = cryptBB_listing_parser(soup)
elif forum == "OnniForums":
rw = onniForums_listing_parser(soup)
elif forum == "Altenens":
rw = altenens_listing_parser(soup)
elif forum == "Procrax":
rw = procrax_listing_parser(soup)
elif forum == "Libre":
rw = libre_listing_parser(soup)
elif forum == "HiddenAnswers":
rw = HiddenAnswers_listing_parser(soup)
elif forum == "AbyssForum":
rw = abyssForums_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@ -163,22 +139,6 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
if forum == "BestCardingWorld":
rmm = bestcardingworld_description_parser(soup)
elif forum == "Cardingleaks":
rmm = cardingleaks_description_parser(soup)
elif forum == "CryptBB":
rmm = cryptBB_description_parser(soup)
elif forum == "OnniForums":
rmm = onniForums_description_parser(soup)
elif forum == "Altenens":
rmm = altenens_description_parser(soup)
elif forum == "Procrax":
rmm = procrax_description_parser(soup)
elif forum == "Libre":
rmm = libre_description_parser(soup)
elif forum == "HiddenAnswers":
rmm = HiddenAnswers_description_parser(soup)
elif forum == "AbyssForum":
rmm = abyssForums_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception


+ 0
- 302
Forums/Libre/crawler_selenium.py

@ -1,302 +0,0 @@
__author__ = 'DarkWeb'
'''
Libre Forum Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.Libre.parser import libre_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/'
# Opens Tor Browser, crawls the website
def startCrawling():
forumName = getForumName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(forumName, baseURL, True)
# Login using premade account credentials and do login captcha manually
def login(driver):
input('Press enter when CAPTCHA is completed, and you\'re at the login page')
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.NAME, value='username')
#Username here
usernameBox.send_keys('ct1234')#sends string to the username box
passwordBox = driver.find_element(by=By.NAME, value='password')
#Password here
passwordBox.send_keys('r5o0wqmw')# sends string to passwordBox
input("Press the login button and solve the CAPTCHA then press enter\n")
# input('input')
# wait for the listing page to show up (this XPath may need to change based on different seed URLs)
# wait up to 50 sec until the element below is visible, then continue
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.TAG_NAME, 'nav')))
# click link to correct forum board
login_link = driver.find_element(by=By.XPATH, value='/html/body/nav/div[1]/a[3]').get_attribute('href')
driver.get(login_link) # open tab with url
# wait for the listing page to show up (this XPath may need to change based on different seed URLs)
# wait up to 50 sec until the element below is visible, then continue
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div/div/div[3]/div[5]')))
# Returns the name of the website
def getForumName() -> str:
name = 'Libre'
return name
# Return the link of the website
def getFixedURL():
url = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close() #close tab
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if name == '':
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# cybersecurity
links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/CyberSecurity')
# services
links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/Services')
# programming
links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/Programming')
# jobs for crypto
links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/JobsforCypto')
# darknet markets
links.append('http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion/c/DarkNetMarkets')
return links
def crawlForum(driver):
print("Crawling the Libre forum")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
while has_next_topic_page:
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out
# if counter == 2:
# break
try:
page = "" # no next page so far may have some later on
if page == "":
raise NoSuchElementException
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
link = driver.find_element(by=By.LINK_TEXT, value='>').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the Libre forum done.")
# Returns 'True' if the link is a Topic link, may need to change for every website
def isDescriptionLink(url):
if '/p/' in url:
return True
return False
# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
if '.onion/c' in url:
return True
return False
# calling the parser to define the links
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
return libre_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 0
- 249
Forums/Libre/parser.py

@ -1,249 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def libre_description_parser(soup: Tag):
# Fields to be parsed
topic = "-1" # 0 *topic name
user = [] # 1 *all users of each post
status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
reputation = [] # 3 all user's karma in each post (usually found as a number)
interest = [] # 4 all user's interest in each post
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
# Finding the topic (should be just one coming from the Listing Page)
topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text
topic = cleanString(topic_found.strip())
original_post: Tag = soup.find("div", {"class": "flex items-start"})
original_user = original_post.find("div", {"class": "info-p"}).find("a", {"class": "link"}).text
user.append(cleanString(original_user.replace("/u/", "").strip()))
original_user_statistics: ResultSet[Tag] = original_post.find("div", {"class": "info-p"}).find_all("span")
original_time = original_user_statistics[0].text[2:]
datetime_append = datetime.strptime(original_time, "%Y-%m-%d %H:%M:%S GMT")
addDate.append(datetime_append)
original_karma = original_user_statistics[1].text[2]
reputation.append(cleanString(original_karma.strip()))
original_content = soup.find("div", {"class": "content-p"}).text
post.append(cleanString(original_content.strip()))
status.append("-1")
interest.append("-1")
sign.append("-1")
feedback.append("-1")
image_post.append("-1")
img = original_post.find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
# Finding the repeated tag that corresponds to the listing of posts
# try:
posts: ResultSet[Tag] = soup.find_all("div", {"class": "flex items-stretch"})
# For each message (post), get all the fields we are interested to:
for ipost in posts:
# Finding a first level of the HTML page
# Finding the author (user) of the post
user_name = ipost.find("p", {"class": "text-zinc-400 text-justify"}).find("a", {"class": "link"}).text
user.append(cleanString(user_name.replace("/u/", "").strip())) # Remember to clean the problematic characters
status.append("-1")
# Finding the interest of the author
# Libre does not have blurb
interest.append("-1")
# Finding the reputation of the user
# Libre does have reputation
karma = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
karma_cleaned = karma.split(" ")[6]
reputation.append(cleanString(karma_cleaned.strip()))
# Here we locate another useful tag that holds the post date, post content and the user's signature
date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
date_time_cleaned = date_posted.replace(user_name, "")[3:-12]
datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT")
addDate.append(datetime_append)
# Finding the post
user_post = ipost.find("div", {"class": "content-c"}).text
post.append(cleanString(user_post))
# Finding the user's signature
sign.append("-1")
# As no information about user's feedback was found, just assign "-1" to the variable
feedback.append("-1")
# As no information about post's image was found, just assign "-1" to the variable
image_post.append("-1")
# As no information about user's image was found, just assign "-1" to the variable
image_user.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
# print(topic)
# print(user)
# print(status)
# print(reputation)
# print(interest)
# print(sign)
# print(post)
# print(feedback)
# print(addDate)
# print(len(user))
# print(len(status))
# print(len(reputation))
# print(len(interest))
# print(len(sign))
# print(len(feedback))
# print(len(addDate))
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
return row
# This is the method to parse the Listing Pages (one page with many posts)
def libre_listing_parser(soup):
nm = 0 # *this variable should receive the number of topics
forum = "Libre" # 0 *forum name
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
image_author = [] # 8 all author avatars used in each topic
# Finding the board (should be just one)
board = soup.find('div', {"class": "title"}).find("h1").text
board = cleanString(board.strip())
# Finding the repeated tag that corresponds to the listing of topics
itopics = soup.find("div", {"class", "space-y-2 mt-4"}).find_all('div', {"class": "flex box"})
nm = 0
for itopic in itopics:
nm += 1
# For each topic found, the structure holding the rest of the information can be of two types. We test both of them
# so that we don't miss any topic
# Adding the topic to the topic list
topic_string = itopic.find("a", {"class": "link text-xl text-zinc-300"}).text
cleaned_topic_string = cleanString(topic_string.strip())
topic.append(cleaned_topic_string)
image_author.append("-1")
# Adding the url to the list of urls
link_to_clean = itopic.find('div', {'class': 'flex space-x-2 items-center'}).find('a').get('href')
href.append(link_to_clean)
# Finding the author of the topic
username_not_cleaned = itopic.find('div', {"class": "flex-grow p-2 text-justify"}).find('a').text
username_cleaned = username_not_cleaned.split("/")[-1]
author.append(cleanString(username_cleaned))
# Finding the number of views
num_views = itopic.find_all("div", {"class": "flex items-center"})[0].find("p").text
views.append(cleanString(num_views))
# Finding the number of replies
num_replies = itopic.find_all("div", {"class": "flex items-center"})[1].find("p").text
posts.append(cleanString(num_replies))
# If no information about when the topic was added, just assign "-1" to the variable
date_time_concatenated = itopic.find("p", {"class": "text-sm text-zinc-400 italic"}).text
date_time_cleaned = date_time_concatenated.replace(username_not_cleaned, "")
# creating the datetime object
date_time_array = date_time_cleaned[3:]
datetime_append = datetime.strptime(date_time_array, "%Y-%m-%d %H:%M:%S GMT")
addDate.append(datetime_append)
# print(forum)
# print(nm)
# print(board)
# print(author)
# print(topic)
# print(views)
# print(href)
# print(addDate)
# print(len(author))
# print(len(topic))
# print(len(views))
# print(len(href))
# print(len(addDate))
return organizeTopics(
forum=forum,
nm=nm,
board=board,
author=author,
topic=topic,
views=views,
posts=posts,
href=href,
addDate=addDate,
image_author=image_author
)
def libre_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find("div", {"class", "space-y-2 mt-4"}).find_all('div', {"class": "flex box"})
for a in listing:
link = a.find('div', {'class': 'flex space-x-2 items-center'}).find('a').get('href')
href.append(link)
return href

+ 0
- 310
Forums/OnniForums/crawler_selenium.py

@ -1,310 +0,0 @@
__author__ = 'Helium'
'''
OnniForums Crawler (Selenium)
Now goes through multiple topic pages.
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
import configparser
from datetime import date
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.OnniForums.parser import onniForums_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/'
# Opens Tor Browser, crawls the website
def startCrawling():
forumName = getForumName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(forum=forumName, url=baseURL, createLog=True)
# Login using premade account credentials and do login captcha manually
def login(driver):
#click login button
login_link = driver.find_element(
by=By.XPATH, value='/html/body/div/div[1]/div[2]/div[1]/div/span/a[1]').get_attribute('href')
driver.get(login_link)
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input')
#Username here
usernameBox.send_keys('cabbage_purely')
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input')
#Password here
passwordBox.send_keys('$ourP@tchK1ds')
clicker = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/div/input')
clicker.click()
# wait for the listing page to show up (this XPath may need to change based on different seed URLs)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="content"]')))
# Returns the name of the website
def getForumName():
name = 'OnniForums'
return name
# Return the link of the website
def getFixedURL():
url = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# Hacking & Cracking tutorials
links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-tutorials')
# # Hacking & Cracking questions
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Hacking-Cracking-questions')
# # Exploit PoCs
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Exploit-PoCs')
# # sellers
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Sellers')
# # buyers questions
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Buyers-Questions')
# # combo lists
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Combo-lists')
# # Malware-development
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Malware-development')
# # coding
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Coding')
# # Carding & Fraud
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-Carding-Fraud')
# # OPSEC
# links.append('http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion/Forum-OPSEC--13')
return links
def crawlForum(driver):
print("Crawling the OnniForums forum")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
while has_next_topic_page:
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out
# if counter == 2:
# break
try:
temp = driver.find_element(by=By.CLASS_NAME, value='float_left')
page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
temp = driver.find_element(by=By.CLASS_NAME, value='float_left')
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the OnniForums forum done.")
# Returns 'True' if the link is a Topic link
def isDescriptionLink(url):
if 'Thread' in url:
return True
return False
# Returns True if the link is a listingPage link
def isListingLink(url):
if '.onion/Forum' in url:
return True
return False
# calling the parser to define the links
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
return onniForums_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 0
- 222
Forums/OnniForums/parser.py

@ -1,222 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from typing import List
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
import re
import string
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def onniForums_description_parser(soup: BeautifulSoup) -> tuple:
topicName: str = "-1" # 0 *topic name
users : List[str] = [] # 1 *all users of each post
statuses : List[str] = [] # 2 all user's authority in each post such as (adm, member, dangerous)
reputations : List[str] = [] # 3 all user's karma in each post (usually found as a number)
interests : List[str] = [] # 4 all user's interest in each post
signs : List[str] = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
posts : List[str] = [] # 6 all messages of each post
feedbacks : List[str] = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDates : List[datetime] = [] # 8 all dates of each post
image_user : List[str] = [] # 9 all user avatars of each post
image_post : List[str] = [] # 10 all first images of each post
# Getting the topicName
topicName = soup.find("table", {"class": "tborder tfixed clear"}) \
.find("td", {"class": "thead"}) \
.find_all("div")[-1].text
topicName = cleanString(topicName.strip())
topics_array = soup.find_all("div", {"class": "post"})
for topic in topics_array:
# Extracting and cleaning author information
author_information: BeautifulSoup = topic.find("div", {"class": "author_information"})
username: str = author_information.find("span", {"class": "largetext"}).text
username_cleaned = cleanString(username.strip())
users.append(username_cleaned)
user_status: str = author_information.find("span", {"class": "smalltext"}).text
# Banned users often have weird text issues in HTML
# So we detect banned users and give them a unique string
if user_status.find("Banned") > 0: user_status_cleaned = "Banned"
elif user_status.find("Unregistered") > 0: user_status_cleaned = "Unregistered"
else: user_status_cleaned = cleanString(user_status.strip()) # Remove excessive spaces in string
# Add cleaned data into array
statuses.append(user_status_cleaned)
if user_status_cleaned in ['Unregistered', 'Banned']: reputations.append("-1")
else:
author_statistics: BeautifulSoup = topic.find("div", {"class": "author_statistics"})
reputation: str = author_statistics.find_all("div", {"class": "float_right"})[-1].text
reputation_cleaned = cleanString(reputation.strip())
reputations.append(reputation_cleaned)
# Append a "-1" to `interests` and `signs` array since they don't exist on this forum
interests.append("-1")
signs.append("-1")
post_content: str = topic.find("div", {"class": "post_body scaleimages"}).text
# Clean post content of excessive spaces and characters
post_content_cleaned = post_content.replace("[You must reply to view this hidden content]", "")
post_content_cleaned = cleanString(post_content_cleaned.strip())
posts.append(post_content_cleaned)
# Append a "-1" to `feedbacks` array since they don't exists on this forum
feedbacks.append("-1")
date_posted = topic.find("span", {"class": "post_date"}).text.strip()
if 'modified' in date_posted:
date_posted = date_posted.split('(')[0].strip()
if 'Today' in date_posted or 'Yesterday' in date_posted:
day = topic.find("span", {"class": "post_date"}).find('span').get('title').strip()
time = date_posted.split(',')[1].strip()
date_posted = day + ', ' + time
date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p")
elif 'hour' in date_posted or 'minute' in date_posted:
date_posted = topic.find("span", {"class": "post_date"}).find('span').get('title').strip()
date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p")
else:
date_object = datetime.strptime(date_posted, "%m-%d-%Y, %I:%M %p")
addDates.append(date_object)
image_post.append("-1")
avatar = topic.find('div', {"class": "author_avatar"})
if avatar is not None:
img = avatar.find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = '-1'
else:
img = "-1"
image_user.append(img)
# TESTING PURPOSES - DO NOT REMOVE
# Populate the final variable (this should be a list with all fields scraped)
row = (topicName, users, statuses, reputations, interests, signs, posts, feedbacks, addDates, image_user, image_post)
# Sending the results
return row
def onniForums_listing_parser(soup: BeautifulSoup):
nm = 0 # this variable should receive the number of topics
forum = "OnniForums" # 0 *forum name
boardName = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
user: List[str] = [] # 2 all users of each topic
topic : List[str] = [] # 3 all topics
view: List[int] = [] # 4 number of views of each topic
post : List[int] = [] # 5 number of posts of each topic
href: List[str] = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between Listing and Description pages)
addDate : List[str] = [] # 7 when the topic was created (difficult to find)
image_author : List[str] = [] # 8 all author avatars used in each topic
# Finding the board (should be just one)
board_metadata: BeautifulSoup = soup.find("table",{"class" : "tborder clear"})
boardName = board_metadata.find_all("div")[1].text
boardName = cleanString(boardName.strip())
thread_arrays = board_metadata.find_all("tr", {"class":"inline_row"}) # gets the information of posts
nm = len(thread_arrays)
for thread in thread_arrays: #getting the information from the posts and sorting them into the arrays defined above
body = thread.find("span",{"class": "subject_new"})
try:
post_subject: str = body.text #getting the topic
except:
body = thread.find("span",{"class": "subject_old"})
post_subject: str = body.text
post_subject_cleaned = cleanString(post_subject.strip())
topic.append(post_subject_cleaned)
author_icon = thread.find('div', {"class": "lavatar-old lavatar-old-f"})
if author_icon != None:
author_icon = author_icon.find('img')
author_icon = author_icon.get('src')
author_icon = author_icon.split('base64,')[-1]
else:
author_icon = "-1"
image_author.append(author_icon)
reply_count = thread.find_all("td", {"align": "center"})[2].text
post.append(cleanNumbers(reply_count))
views = thread.find_all("td", {"align": "center"})[3].text
view.append(cleanNumbers(views))
# dates_added: str = thread.find("span",{"class" : "thread_start_datetime smalltext"}).text
# dates_added_cleaned = dates_added.split(',')[0]
# addDate.append(dates_added_cleaned)
author = thread.find("span",{"class" : "author smalltext"}).text
author_cleaned = cleanString(author.strip())
user.append(author_cleaned)
thread_link = body.find('a').get('href')
href.append(thread_link)
return organizeTopics(
forum=forum,
nm=nm,
board=boardName,
author=user,
topic=topic,
views=view,
posts=post,
href=href,
addDate=addDate,
image_author=image_author
)
# This is the method that returns all topic links to be visited by the Crawler
def onniForums_links_parser(soup: BeautifulSoup):
href = []
listing = soup.find_all('tr', {'class': 'inline_row'})
for thread in listing:
try:
link = thread.find('span', {"class": "subject_old"}).find('a').get('href')
except:
link = thread.find('span', {"class": "subject_new"}).find('a').get('href')
href.append(link)
return href
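The post-date handling in onniForums_description_parser depends on MyBB exposing an absolute "%m-%d-%Y, %I:%M %p" timestamp in the nested span's title attribute whenever the visible text is relative ("Today", "Yesterday", "x hours/minutes ago"). A compact sketch of that fallback, with a hypothetical helper name:
from datetime import datetime

def resolve_post_date(post_date_span) -> datetime:
    # Hypothetical helper mirroring the logic above; post_date_span is the
    # <span class="post_date"> tag of a single post.
    text = post_date_span.text.strip()
    if 'modified' in text:
        text = text.split('(')[0].strip()  # drop the parenthesised "last modified" note
    if 'Today' in text or 'Yesterday' in text:
        day = post_date_span.find('span').get('title').strip()  # absolute date, e.g. "06-27-2023"
        text = day + ', ' + text.split(',')[1].strip()           # keep the visible time of day
    elif 'hour' in text or 'minute' in text:
        text = post_date_span.find('span').get('title').strip()  # title holds the full timestamp
    return datetime.strptime(text, "%m-%d-%Y, %I:%M %p")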

+ 0
- 57
Forums/OnniForums/testing.py

@ -1,57 +0,0 @@
import os
from Forums.OnniForums.parser import onniForums_description_parser
from Forums.OnniForums.parser import onniForums_listing_parser
from bs4 import BeautifulSoup
baseUrl = './HTML_Pages/06272023/Listing/httponnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qdonionForumCardingFraud.html'
with open(baseUrl, 'r') as file:
testHTML = file.read()
soup = BeautifulSoup(testHTML, 'html.parser')
output = onniForums_listing_parser(soup)
print(output)
all_descriptions = os.listdir("./HTML_Pages/06272023/Description/")[1:]
total = len(all_descriptions)
descriptions_with_unicode_error = 0
print("\nTESTING DESCRIPTION PARSER:\n")
for desc in all_descriptions:
print(f"\nTesting: ./HTML_Pages/06272023/Description/{desc} \n")
try:
with open(f"./HTML_Pages/06272023/Description/{desc}", "r") as file:
test_html = file.read()
soup = BeautifulSoup(test_html, features="html.parser")
description_output = onniForums_description_parser(soup)
print(f"\nTopic name : {description_output[0]}")
print(f"Contents : {description_output[1]}")
print(f"Users : {description_output[2]}")
print(f"Dates posted: {description_output[3]}")
print(f"Feedbacks : {description_output[4]}")
print(f"Statuses : {description_output[5]}")
print(f"Reputations : {description_output[6]}")
print(f"Signatures : {description_output[7]}")
print(f"Interests : {description_output[8]}\n")
except UnicodeDecodeError:
descriptions_with_unicode_error += 1
print(f"UnicodeDecodeError: the file `{desc}` cannot be decoded by Python!")
print("\nTESTING COMPLETE\n")
print(f"Number of descriptions : {total}")
print(f"Descriptions w/ errors : {descriptions_with_unicode_error}")
print(f"Failure percentage : {round(descriptions_with_unicode_error/total, 4) * 100}%\n")

+ 0
- 321
Forums/Procrax/crawler_selenium.py

@ -1,321 +0,0 @@
__author__ = 'Helium'
'''
Procrax Forum Crawler (Selenium)
rechecked and confirmed
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import configparser
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.Procrax.parser import procrax_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
BASE_URL = 'https://procrax.cx/'
FORUM_NAME = 'Procrax'
# Opens Tor Browser, crawls the website
def startCrawling():
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(forum=FORUM_NAME, url=BASE_URL, createLog=True)
# Login using premade account credentials and do login captcha manually
def login(driver):
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span')))
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.NAME, value='login')
#Username here
usernameBox.send_keys('cheese_pizza_man')#sends string to the username box
passwordBox = driver.find_element(by=By.NAME, value='password')
#Password here
passwordBox.send_keys('Gr33nSp@m&3ggs')# sends string to passwordBox
clicker = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span')
clicker.click()
# # wait for the listing page to show up (this XPath may need to change based on different seed URLs)
# # wait up to 50 sec until the element below is visible, then continue
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[1]/div/div[1]/div')))
# Returns the name of the website
def getForumName():
name = 'Procrax'
return name
# Return the link of the website
def getFixedURL():
url = 'https://procrax.cx/'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close() #close tab
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
driver = createFFDriver()
try:
driver.get(BASE_URL)# open url in browser
return driver
except:
driver.close()# close tab
return 'down'
# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + FORUM_NAME + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# verified sales
links.append('https://procrax.cx/forums/verified-sales-market.10/')
# unverified sales
links.append('https://procrax.cx/forums/unverified-sales-market.12/')
# combos
links.append('https://procrax.cx/forums/bases.79/')
# tools
links.append('https://procrax.cx/forums/tools.81/')
# configs
links.append('https://procrax.cx/forums/configs.82/')
# craxtube
links.append('https://procrax.cx/forums/craxtube.83/')
# general hacking
links.append('https://procrax.cx/forums/general-hacking.24/')
# hacking security tools
links.append('https://procrax.cx/forums/hacking-security-tools.20/')
# hacktube
links.append('https://procrax.cx/forums/hacktube.22/')
# cardingtube
links.append('https://procrax.cx/forums/cardingtube.26/')
# cardable
links.append('https://procrax.cx/forums/cardable-websites.28/')
# spam software
links.append('https://procrax.cx/forums/mailing.72/')
# spam tools
links.append('https://procrax.cx/forums/tools-bots-validators.73/')
# darknet news
links.append('https://procrax.cx/forums/darknet-news-articles.42/')
# links
links.append('https://procrax.cx/forums/darknet-markets-deep-onion-links.43/')
# courses
links.append('https://procrax.cx/forums/courses.59/')
# software
links.append('https://procrax.cx/forums/software.76/')
# general forum
links.append('https://procrax.cx/forums/forum-discussions-updates.7/')
return links
def crawlForum(driver):
print("Crawling the Procrax forum")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
while has_next_topic_page:
itemURL = urlparse.urljoin(BASE_URL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out
# if counter == 2:
# break
try:
page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the Procrax forum done.")
# Returns 'True' if the link is a Topic link, may need to change for every website
def isDescriptionLink(url):
if 'threads' in url:
return True
return False
# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
if '.cx/forums' in url:
return True
return False
# calling the parser to define the links
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
return procrax_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 0
- 189
Forums/Procrax/parser.py View File

@ -1,189 +0,0 @@
__author__ = 'Helium'
# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def procrax_description_parser(soup: Tag):
# Fields to be parsed
topic = "-1" # 0 topic name
user = [] # 1 all users of each post
addDate = [] # 2 all dates of each post
feedback = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format)
status = [] # 4 all user's authority in each post such as (adm, member, dangerous)
reputation = [] # 5 all user's karma in each post (usually found as a number)
sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 7 all messages of each post
interest = [] # 8 all user's interest in each post
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("h1", {"class": "p-title-value"})
topic = li.text
thread: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True})
for ipost in thread:
username = ipost.find("h4", {"class": "message-name"}).text
user.append(cleanString(username.strip()))
date_posted = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
datetime_obj = datetime.strptime(date_posted, "%Y-%m-%dT%H:%M:%S%z")
addDate.append(datetime_obj)
feedback.append("-1")
user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text
status.append(cleanString(user_status.strip()))
user_lvl = ipost.find("div", {"class": "afAwardLevel"})
if user_lvl is not None:
user_lvl = user_lvl.text
reputation.append(cleanString(user_lvl.strip()))
else:
reputation.append('-1')
sign.append("-1")
user_post = ipost.find("article", {"class": "message-body js-selectToQuote"}).text
post.append(cleanString(user_post.strip()))
interest.append("-1")
bbWrapper = ipost.find('div', {"class": "bbWrapper"})
if bbWrapper is not None:
img = bbWrapper.find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
else:
img = "-1"
image_post.append(img)
avatar = ipost.find("a", {"class": "avatar avatar--m"})
if avatar is not None:
img = avatar.find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
else:
img = "-1"
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
return row
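# A minimal standalone sketch, on hypothetical values, of the two per-post conversions above:
# the XenForo ISO-8601 timestamp is parsed with %z, and an inline image src is reduced to the
# payload after 'base64,' (falling back to "-1" when there is no inline image).
from datetime import datetime

sample_time = "2023-05-01T12:30:00+0000"            # hypothetical datetime attribute
parsed = datetime.strptime(sample_time, "%Y-%m-%dT%H:%M:%S%z")

sample_src = "data:image/png;base64,iVBORw0KGgo="   # hypothetical img src
payload = sample_src.split('base64,')[-1] if 'base64,' in sample_src else "-1"
print(parsed.isoformat(), payload)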
# This is the method to parse the Listing Pages (one page with many posts)
def procrax_listing_parser(soup: Tag):
nm = 0 # this variable should receive the number of topics
forum: str = "Procrax" # 0 *forum name
board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 all authors of each topic
topic = [] # 3 all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
image_author = [] # 8 all author avatars used in each topic
# Finding the board (should be just one)
li = soup.find("h1", {"class": "p-title-value"})
board = cleanString(li.text.strip())
threads_list = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
sticky = soup.find("div", {"class": "structItemContainer-group structItemContainer-group--sticky"})
if sticky is not None:
threads_list = sticky.find_all("div", {"data-author": True}) + threads_list
nm = len(threads_list)
for thread in threads_list:
thread_title = thread.find("div", {"class": "structItem-title"}).text
topic.append(cleanString(thread_title.strip()))
author_icon = thread.find('a', {"class": "avatar avatar--s"})
if author_icon != None:
author_icon = author_icon.find('img')
if author_icon != None:
author_icon = author_icon.get('src')
author_icon = author_icon.split('base64,')[-1]
else:
author_icon = "-1"
else:
author_icon = "-1"
image_author.append(author_icon)
thread_author = thread.get("data-author")
author.append(cleanString(thread_author))
thread_views = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find('dd').text
thread_views = thread_views.lower().replace("k", "000")
thread_views = thread_views.lower().replace("m", "000000")
views.append(thread_views.strip())
thread_replies = thread.find("dl", {"class": "pairs pairs--justified"}).find('dd').text
# All threads contain one topic post and reply posts
thread_total_posts = thread_replies.lower().replace("k", "000")
posts.append(thread_total_posts.strip())
thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime")
datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z")
addDate.append(datetime_obj)
thread_link: str = thread.find("div", {"class": "structItem-title"}).find('a', {'class': ''}).get('href')
href.append(thread_link)
return organizeTopics(
forum=forum,
nm=nm,
board=board,
author=author,
topic=topic,
views=views,
posts=posts,
addDate=addDate,
href=href,
image_author=image_author
)
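# A minimal standalone sketch of the view/reply count normalization above, on hypothetical
# inputs. The plain string replacement handles whole numbers ("5k" -> "5000") but mangles
# decimals ("1.2k" -> "1.2000"); a numeric variant is shown alongside for comparison only.
def normalize_by_replace(text):
    return text.lower().replace("k", "000").replace("m", "000000").strip()

def normalize_numeric(text):
    text = text.lower().strip()
    factor = 1000 if text.endswith("k") else 1000000 if text.endswith("m") else 1
    return text if factor == 1 else str(int(float(text.rstrip("km")) * factor))

for raw in ["27", "5K", "1.2k"]:
    print(raw, normalize_by_replace(raw), normalize_numeric(raw))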
def procrax_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find_all('div', {"class": "structItem-title"})
for a in listing:
link = a.find('a', {'class': ''}).get('href')
href.append(link)
return href

+ 0
- 293
MarketPlaces/AnonMarket/crawler_selenium.py View File

@ -1,293 +0,0 @@
__author__ = 'Helium'
'''
Anon Market Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.AnonMarket.parser import AnonMarket_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'AnonMarket'
return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion'
return url
# Closes Tor Browser
#@param: current selenium driver
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
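# A minimal standalone usage example of the naming scheme above, on a hypothetical URL:
# every non-alphanumeric character is stripped, and the global counter only supplies a
# name when nothing alphanumeric is left.
sample_url = "http://example.onion/category/malware"   # hypothetical
print(''.join(e for e in sample_url if e.isalnum()))    # -> httpexampleonioncategorymalware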
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
#in this example, there are a couple of categories some threads fall under such as
# Guides and Tutorials, Digital Products, and Software and Malware
#as you can see they are categories of products
def getInterestedLinks():
links = []
# Malware
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/malware')
# Bootkits
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/bootkits')
# Backdoors
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/backdoors')
# Keyloggers
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/keyloggers')
# Wireless Trackers
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wireless_trackers')
# Screen Scrapers
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/screen_scrapers')
# Mobile Forensic Tools
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/mobile_forensics_tools')
# Wifi Jammers
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wifi_jammers')
# Carding
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/carding')
# Worms
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/worms')
# Viruses
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/viruses')
# Trojans
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/trojans')
# Botnets
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/botnets')
# Security Technology
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security_technology')
# Hacks
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/hacks')
# Exploit kits
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/exploit_kit')
# Security
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security')
# Ransomware
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/ransomware')
return links
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
#topic and description pages are crawled through here, where both types of pages are saved
#@param: selenium driver
def crawlForum(driver):
print("Crawling the Anon Market")
linksToCrawl = getInterestedLinks()
for link in linksToCrawl:
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
# Get all product links on the current page
products_list = productPages(html)
for item in products_list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back() # Go back to listing after visiting each product
# # comment out
# break
#
# # comment out
# if count == 1:
# break
# Locate the next page link
try:
# Find the active page number
active_page_element = driver.find_element(By.XPATH, '//div[@class="page activepage"]')
# current_page = int(active_page_element.text)
next_page_element = active_page_element.find_element(By.XPATH, 'following-sibling::a[1]')
link = next_page_element.get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
print("Crawling the Anon Market done.")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
if 'product' in url:
return True
return False
# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'category' in url:
return True
return False
# calling the parser to define the links, the html is the url of a link from the list of interested link list
#@param: link from interested link list ie. getInterestingLinks()
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return AnonMarket_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing Nexus .... DONE!")

+ 0
- 195
MarketPlaces/AnonMarket/parser.py View File

@ -1,195 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
import re
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def AnonMarket_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
name_of_product = soup.find("div", {"class": "heading"}).text
name = cleanString(name_of_product.strip())
description_div = soup.find("div", {"class": "tab1"})
if description_div is None:
describe = "-1"
else:
describe = cleanString(description_div.text.strip())
info_div = soup.find('div', {'class': 'information'})
table = info_div.find('table') if info_div else None
# Find all table rows
rows = table.find_all('tr')
# Parse each row to get relevant data
data = {}
for row in rows:
columns = row.find_all('td')
if len(columns) == 3:
key = columns[0].text.strip()
value = columns[2].text.strip()
data[key] = value
# Extract specific data from the dictionary and assign them to individual variables
vendor = data.get('Vendor', '-1')
shipFrom = data.get('Location', '-1')
shipTo = data.get('Ships to', '-1')
category = data.get('Category', '-1')
USD = data.get('Price', '-1').split()[0]
left = data.get('Stock', '-1')
# image
image = soup.find('img', {"class": "bigthumbnail"})
image = image.get('src').split('base64,')[-1]
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
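# A minimal standalone sketch, on hypothetical table HTML, of the three-column row parsing
# above: column 0 holds the label, column 2 the value, and the middle separator column is skipped.
from bs4 import BeautifulSoup

info_html = """
<table>
  <tr><td>Vendor</td><td>:</td><td>exampleseller</td></tr>
  <tr><td>Price</td><td>:</td><td>25 USD</td></tr>
</table>
"""
data = {}
for tr in BeautifulSoup(info_html, "html.parser").find_all("tr"):
    columns = tr.find_all("td")
    if len(columns) == 3:
        data[columns[0].text.strip()] = columns[2].text.strip()
print(data.get("Vendor", "-1"), data.get("Price", "-1").split()[0])   # -> exampleseller 25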
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def AnonMarket_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "AnonMarket" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion"
cat = soup.find("div", {'class': 'heading'}).text
products_list = soup.find_all('div', {'class': 'item'})
nm = 0
for product in products_list:
name_of_product = product.find("div", {"class": "title"}).text.strip()
name.append(name_of_product)
name_of_vendor = product.find("a", {'class': 'seller'}).text.strip()
vendor.append(name_of_vendor)
category.append(cat)
tbody = product.find('div', {"class": "info"}).find('tbody')
# rating_item
width = tbody.find('div', {"class": "stars2"}).get('style')
rating_item.append(cleanNumbers(width.strip()))
tr = tbody.findAll('tr', recursive=False)
td = tr[2].findAll('td')
# sold
sold.append(td[0].text.strip())
# reviews
reviews.append(td[1].text.strip())
product_link_element = product.find("div", {"class": "title"}).find_parent('a')
link = product_link_element['href']
full_link = base_url + link
href.append(full_link)
# Append '-1' for unavailable data
rating_vendor.append("-1")
success.append("-1")
CVE.append("-1")
MS.append("-1")
describe.append("-1")
views.append("-1")
addDate.append("-1")
BTC.append("-1")
USD.append("-1")
EURO.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
nm += 1
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def AnonMarket_links_parser(soup):
# Base URL to prepend to each product link
base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion"
# Returning all links that should be visited by the Crawler
href = []
# Using a shorter, but still unique, class name
listing = soup.find('div', {'class': 'items'}).find_all('a', href=True, attrs={'href': lambda x: "/product/" in x})
for a in listing:
link = a.get('href')
if link: # Checks if 'href' attribute is not None
# Prepending the base URL to the scraped link
full_link = base_url + link
href.append(full_link)
# Filtering out any links that might not have '/product/' in them
product_links = [link for link in href if '/product/' in link]
return product_links
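# A minimal standalone sketch, on hypothetical listing HTML, of the link collection above:
# only anchors whose href contains '/product/' are kept and each is prefixed with the base URL.
from bs4 import BeautifulSoup

listing_html = """
<div class="items">
  <a href="/product/abc123">example keylogger</a>
  <a href="/vendor/someone">vendor profile</a>
</div>
"""
base = "http://example.onion"   # hypothetical base URL
anchors = BeautifulSoup(listing_html, "html.parser").find("div", {"class": "items"}).find_all(
    "a", href=lambda x: x is not None and "/product/" in x)
print([base + a["href"] for a in anchors])   # -> ['http://example.onion/product/abc123']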

+ 0
- 226
MarketPlaces/Apocalypse/parser.py View File

@ -1,226 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag
def apocalypse_description_parser(soup: Tag):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
content: Tag = soup.find("div", {'id': "article_page"})
product_name = content.find("p", {"class": "list-group-item text-center mb-0 box"}).text
name = cleanString(product_name.strip())
product_description = content.find("pre").text
describe = cleanString(product_description.strip())
# Finding Product Image
image = soup.find('div', {'class': 'col-md-7 text-center'}).find('img')
image = image.get('src').split('base64,')[-1]
product_reviews_list: Tag = content.find("table", {"class": "table product_reviews"}) \
.find_all("li")
reviews = str(len(product_reviews_list))
product_category = content.find("a", {"class": "badge badge-danger"}).text
category = cleanString(product_category.strip())
product_ships_from = content.find("span", {"class": "badge badge-info"}).text
shipFrom = cleanString(product_ships_from.strip())
product_success_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-success"})
product_ships_to = product_success_badge[1].text
shipTo = cleanString(product_ships_to.strip())
product_supply = content.find("span", {"class": "badge badge-warning"}).text
left = cleanString(product_supply.strip())
product_primary_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-primary"})
# Product vendor comes in the form of "@ vendor_name"
product_vendor = product_primary_badge[0].text.replace("@", "")
vendor = cleanString(product_vendor.strip())
sold = cleanString(product_primary_badge[1].text.strip())
product_prices: Tag = content.find("p", {"style": "border-bottom:1px solid GREY;"})
USD = product_prices.find("span", {"class": "pr"}).text
prices_array: ResultSet[Tag] = product_prices.find_all("span", {"class": "pr1"})
BTC = prices_array[1].text
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
def apocalypse_listing_parser(soup: Tag):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "Apocalypse" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
category = [] # 4 Product_Category
describe = [] # 5 Product_Description
escrow = [] # 6 Vendor_Warranty
views = [] # 7 Product_Number_Of_Views
reviews = [] # 8 Product_Number_Of_Reviews
addDate = [] # 9 Product_AddDate
lastSeen = [] # 10 Product_LastViewDate
BTC = [] # 11 Product_BTC_SellingPrice
USD = [] # 12 Product_USD_SellingPrice
EURO = [] # 13 Product_EURO_SellingPrice
sold = [] # 14 Product_QuantitySold
qLeft =[] # 15 Product_QuantityLeft
shipFrom = [] # 16 Product_ShippedFrom
shipTo = [] # 17 Product_ShippedTo
vendor = [] # 18 Vendor
rating = [] # 19 Vendor_Rating
success = [] # 20 Vendor_Successful_Transactions
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
table = soup.find("div", {"class": "col-lg-9 my-4"})
if table is None:
table = soup.find("div", {"class": "col-lg-9"})
listings: ResultSet[Tag] = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
for prod in listings:
product_name = prod.find('h5', {"class": "art_title"}).text
name.append(cleanString(product_name.strip()))
# Finding Product Image
product_image = prod.find('img', {'class': 'customHeight'})
product_image = product_image.get('src').split('base64,')[-1]
image.append(product_image)
CVE.append("-1")
MS.append("-1")
describe.append("-1")
escrow.append("-1")
reviews.append("-1")
addDate.append("-1")
lastSeen.append("-1")
BTC.append("-1")
EURO.append("-1")
shipTo.append("-1")
success.append("-1")
image_vendor.append("-1")
product_price = prod.find("span", {"class": "priceP"}).text
USD.append(cleanString(product_price.strip()))
product_sold = prod.find("span", {"class": "badge badge-success"}).text
sold.append(cleanString(product_sold.strip()))
product_statistics: ResultSet[Tag] = prod.find_all("p", {"class": "mb-0 card-text"})
product_category = product_statistics[0].find("a").text
category.append(cleanString(product_category.strip()))
product_sold = product_statistics[1].find("span").text
sold.append(cleanString(product_sold.strip()))
product_quantity_left = product_statistics[2].find("span", {"class": "badge bluebadge"}).text
qLeft.append(cleanString(product_quantity_left.strip()))
product_views = product_statistics[3].find("span").text
views.append(cleanString(product_views.strip()))
product_ships_from = product_statistics[4].find("span").text
shipFrom.append(cleanString(product_ships_from.strip()))
product_vendor_tag: Tag = product_statistics[5].find("a").find("span", {"class": "badge badge-primary"})
# Product vendors & ratings are displayed as "vender_name ★ 5.0"
# When split by the star (★), it should return a 2-value array
product_vendor, product_vendor_rating = product_vendor_tag.text.split("")
try:
vendor.append(cleanString(product_vendor.strip()))
rating.append(cleanString(product_vendor_rating.strip()))
except Exception as e:
raise e
product_href = prod.find('a').get('href')
href.append(product_href)
nm += 1
return organizeProducts(
marketplace=mktName,
nm=nm,
vendor=vendor,
rating_vendor=rating,
success_vendor=success,
nombre=name,
CVE=CVE,
MS=MS,
category=category,
describe=describe,
views=views,
reviews=reviews,
rating_item=["-1" for _ in range(nm)],
addDate=addDate,
BTC=BTC,
USD=USD,
EURO=EURO,
sold=sold,
qLeft=qLeft,
shipFrom=shipFrom,
shipTo=shipTo,
href=href,
image=image,
image_vendor=image_vendor
)
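# A minimal standalone sketch, on a hypothetical badge text, of the vendor/rating split used
# in the loop above: the badge reads "vendor_name ★ 5.0", so splitting on the star yields the
# name and the rating.
badge_text = "exampleseller ★ 4.5"   # hypothetical badge text
product_vendor, product_vendor_rating = badge_text.split("★")
print(product_vendor.strip(), product_vendor_rating.strip())   # -> exampleseller 4.5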
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def apocalypse_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"class": "col-lg-4 col-md-6 mb-1"})
for a in listing:
bae = a.find('a', href=True)
link = bae['href']
href.append(link)
return href

MarketPlaces/LionMarketplace/crawler_selenium.py → MarketPlaces/Ares/crawler_selenium.py View File

@ -1,7 +1,7 @@
__author__ = 'Helium'
__author__ = 'DarkWeb'
'''
LionMarketplace Marketplace Crawler (Selenium)
Ares Market Crawler (Selenium)
'''
from selenium import webdriver
@ -9,64 +9,107 @@ from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
import os, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.LionMarketplace.parser import lionmarketplace_links_parser
from MarketPlaces.Ares.parser import ares_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/'
baseURL = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
# Opens Tor Browser, crawls the website
def startCrawling():
mktName = getMKTName()
marketName = getMarketName()
driver = getAccess()
if driver != 'down':
try:
# login(driver)
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
new_parse(marketName, False)
# Login using premade account credentials and do login captcha manually
def login(driver):
#wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/center")))
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
#Username here
usernameBox.send_keys('blabri')
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
#Password here
passwordBox.send_keys('fishowal')
'''
# wait for captcha page show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img")))
# save captcha to local
driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img').screenshot(
r'..\Ares\captcha.png')
# This method will show image in any image viewer
im = Image.open(r'..\Ares\captcha.png')
im.show()
# wait until input space show up
inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/input')
# ask user input captcha solution in terminal
userIn = input("Enter solution: ")
# send user solution into the input space
inputBox.send_keys(userIn)
# click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[4]/div/div/button").click()
'''
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div[7]/div[3]/div[2]/div[1]/div[1]')))
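# A minimal sketch of the same wait -> fill credentials -> pause for manual CAPTCHA -> wait for
# listing pattern, with placeholder locators and credentials (not the real Ares markup); it only
# assumes a ready Selenium WebDriver.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def login_sketch(driver, username, password):
    wait = WebDriverWait(driver, 100)
    wait.until(EC.visibility_of_element_located((By.ID, "username")))        # placeholder locator
    driver.find_element(By.ID, "username").send_keys(username)
    driver.find_element(By.ID, "password").send_keys(password)
    input("Press ENTER when CAPTCHA is completed\n")                         # manual solve
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".listing")))  # placeholder locator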
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'LionMarketplace'
def getMarketName():
name = 'Ares'
return name
# Return the base link of the website
#return: url of base site in string type
# Return the link of the website
def getFixedURL():
url = 'http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/'
url = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion'
return url
# Closes Tor Browser
#@param: current selenium driver
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
driver.quit()
time.sleep(3)
return
@ -103,14 +146,12 @@ def createFFDriver():
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
@ -122,30 +163,7 @@ def getAccess():
return 'down'
# Manual captcha solver, waits for a specific element so that the whole page loads, finds the input box, gets screenshot of captcha
# then allows for manual solving of captcha in the terminal
#@param: current selenium web driver
def login(driver):
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="username"]')))
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
# Username here
usernameBox.send_keys('blabri')
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
# Password here
passwordBox.send_keys('fishowal')
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/nav/div/div/ul[2]/form/button")))
# Saves the crawled html page, makes the directory path for html pages if not made
# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
@ -155,7 +173,6 @@ def savePage(driver, page, url):
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
@ -168,37 +185,47 @@ def getFullPathName(url):
return fullPath
# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
#@param: raw url as crawler crawls through every site
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
if name == '':
name = str(counter)
counter = counter + 1
return name
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
#in this example, there are a couple of categories some threads fall under such as
# Guides and Tutorials, Digital Products, and Software and Malware
#as you can see they are categories of products
def getInterestedLinks():
links = []
# Hacking
links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/ba142ac0-c7e7-11ec-9bd1-fdd89c3d3f91')
# Digital
links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/12')
# # Digital - Other
# links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/91ecd5d0-002c-11ec-9b46-ede2378c5d3c')
# # Digital - VPN
# links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9431b830-002b-11ec-86d6-cdaf65cd97f1')
# # Digital - Coding
# links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/948b7400-a939-11ec-adc5-2f775203130c')
# Digital - Malware
links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2')
# # Digital - Guides
# links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662')
# # Digital - Hacking
# links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921')
# # Digital - Malware
# links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/b3258c50-002b-11ec-b658-876d3d651145')
# # Digital - Services
# links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/bae64840-002b-11ec-bbcc-a93431540099')
# # Digital - Software
# links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1')
# # Digital - Exploits
# links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6')
# # Digital - Tutorials
# links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8')
return links
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
#topic and description pages are crawled through here, where both types of pages are saved
#@param: selenium driver
def crawlForum(driver):
print("Crawling the LionMarketplace market")
print("Crawling the Ares market")
linksToCrawl = getInterestedLinks()
@ -228,16 +255,19 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
# comment out
break
# comment out
if count == 1:
break
try:
nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/div[2]/nav')
link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href')
nav = driver.find_element(by=By.XPATH, value=
'/html/body/div[7]/div[3]/div/div[2]/nav')
a = nav.find_element(by=By.LINK_TEXT, value="Next")
link = a.get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
@ -249,12 +279,10 @@ def crawlForum(driver):
print(link, e)
i += 1
print("Crawling the LionMarketplace market done.")
input("Crawling Ares market done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
# Returns 'True' if the link is Topic link
def isDescriptionLink(url):
if 'product' in url:
return True
@ -262,29 +290,16 @@ def isDescriptionLink(url):
# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'category' in url:
return True
return False
# calling the parser to define the links, the html is the url of a link from the list of interested link list
#@param: link from interested link list ie. getInterestingLinks()
#return: list of description links that should be crawled through
# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return lionmarketplace_links_parser(soup)
# Drop links that "signout"
# def isSignOut(url):
# #absURL = urlparse.urljoin(url.base_url, url.url)
# if 'signout' in url.lower() or 'logout' in url.lower():
# return True
#
# return False
return ares_links_parser(soup)
def crawler():

+ 227
- 0
MarketPlaces/Ares/parser.py View File

@ -0,0 +1,227 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
def ares_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
# Finding Product Name
name = soup.find('div', {'class': "col-md-12 my-2"}).text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
bae = soup.find('div', {'class': "col-md-7"}).find('span').find_all('span')
# Finding Vendor
vendor = bae[0].text
vendor = vendor.replace(",", "")
vendor = vendor.replace("...", "")
vendor = vendor.strip()
# Finding Vendor Rating
full_stars = bae[2].find_all('i', {'class': "fas fa-star"})
half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"})
rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0)
# Finding Successful Transactions
success = bae[4].text
success = success.replace("Sales ", "")
success = success.strip()
bae = soup.find('span', {'class': "text-left"}).find_all('span')
# Finding Prices
USD = bae[0].text
USD = USD.replace("\n$", "")
USD = USD.strip()
shipping_info = bae[4].text
if "Digital" not in shipping_info:
shipping_info = shipping_info.split(" ")
# Finding Shipment Information (Origin)
shipFrom = shipping_info[0].strip()
# Finding Shipment Information (Destination)
shipTo = shipping_info[1].strip()
bae = soup.find_all('textarea')
# Finding the Product description
describe = bae[0].text
describe = describe.replace("\n", " ")
describe = describe.replace("\r", " ")
describe = describe.strip()
# Finding the Terms and Conditions
terms = bae[1].text
terms = terms.replace("\n", " ")
terms = terms.strip()
'''
# Finding the Number of Product Reviews
tag = soup.findAll(text=re.compile('Reviews'))
for index in tag:
reviews = index
par = reviews.find('(')
if par >=0:
reviews = reviews.replace("Reviews (","")
reviews = reviews.replace(")","")
reviews = reviews.split(",")
review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
else :
review = "-1"
'''
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
# Sending the results
return row
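# A minimal standalone sketch, on hypothetical description text, of the CVE/MS pattern matching
# above; note that the CVE pattern only matches four-digit suffixes, so longer IDs such as
# CVE-2021-34527 would be missed.
import re

text = "Bundle covers CVE-2021-4444 and CVE-2022-1234, related to MS17-010."   # hypothetical
cve_hits = re.findall(r'CVE-\d{4}-\d{4}', text)
ms_hits = re.findall(r'MS\d{2}-\d{3}', text)
CVE = " ".join(cve_hits) if cve_hits else "-1"
MS = " ".join(ms_hits) if ms_hits else "-1"
print(CVE, "|", MS)   # -> CVE-2021-4444 CVE-2022-1234 | MS17-010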
# This is the method to parse the Listing Pages
def ares_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "Ares" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Vendor
vendor_name = bae[1].text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding the Product
product = bae[2].find('img').get('alt')
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.strip()
name.append(product)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue=cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue=me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
def ares_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('a', {"class": "btn btn-success w-100 my-1"})
for a in listing:
link = a['href']
href.append(link)
return href

MarketPlaces/ThiefWorld/crawler_selenium.py → MarketPlaces/Bohemia/crawler_selenium.py View File

@ -1,7 +1,7 @@
__author__ = 'Helium'
__author__ = 'DarkWeb'
'''
ThiefWorld Market Crawler (Selenium)
Bohemia Market Crawler (Selenium)
'''
from selenium import webdriver
@ -10,6 +10,7 @@ from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
@ -18,16 +19,13 @@ import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.ThiefWorld.parser import thiefworld_links_parser
from MarketPlaces.Bohemia.parser import bohemia_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/'
baseURL = 'http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
@ -37,26 +35,81 @@ def startCrawling():
if driver != 'down':
try:
captcha(driver)
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
new_parse(mktName, False)
def login(driver):
#wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[4]/div/div/form/input[1]")))
#click on login page confirmation
driver.find_element(by=By.XPATH, value="/html/body/div/div[4]/div/div/form/input[1]").click()
#wait until next page shows up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[1]/input")))
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[1]/input')
#username here
usernameBox.send_keys('ct-1234')
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[2]/input')
#password here
passwordBox.send_keys('DementedBed123-')
#session time
session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[3]/select'))
session_select.select_by_visible_text('300 Minutes')
'''
#wait for captcha page to show up
inputBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[4]/div/input')
#save captcha to local
driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Bohemia\captcha2.png')
im = Image.open(r'..\Bohemia\captcha2.png')
im.show()
#ask user input captcha solution in terminal
userIn = input("Enter Solution: ")
#send user solution into input field
inputBox.send_keys(userIn)
#click the submit button
driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[5]/button').click()
'''
input("Press ENTER when CAPTCHA is completed\n")
#wait for listing page to show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[2]/div[2]/div[1]/div")))
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'ThiefWorld'
name = 'Bohemia'
return name
# Returns credentials needed for the mkt
def getCredentials():
credentials = 'blank blank blank blank cap 0'
return credentials
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/'
url = 'http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/'
return url
@ -80,13 +133,13 @@ def createFFDriver():
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("places.history.enabled", False)
# ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
# ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
# ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
# ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
# ff_prof.set_preference("signon.rememberSignons", False)
# ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
@ -98,18 +151,17 @@ def createFFDriver():
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
@ -126,18 +178,77 @@ def getAccess():
# Manual captcha solver, waits for a specific element so that the whole page loads, finds the input box, gets screenshot of captcha
# then allows for manual solving of captcha in the terminal
#@param: current selenium web driver
def login(driver):
# wait for page to show up (This Xpath may need to change based on different seed url)
def captcha(driver):
'''
# wait for captcha page show up (for bohemia it takes A WHILE)
print("Connecting Bohemia...")
time.sleep(7.5)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div/div/form/div")))
input('Bohemia Connected. Press ENTER to continue\n')
# save captcha to local
driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div/div").screenshot(r'..\Bohemia\captcha.png')
# open method used to open different extension image file
im = Image.open(r'..\Bohemia\captcha.png')
# This method will show image in any image viewer
im.show()
# Prints link to console since captcha requires the link
print(getFixedURL())
# wait until input space show up
inputBox = driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div/div/input")
# ask user input captcha solution in terminal
userIn = input("Enter solution: ")
# send user solution into the input space
inputBox.send_keys(userIn)
# click the verify(submit) button
driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button[1]').click()
# im.close()
'''
input("Press ENTER when CAPTCHA is completed\n")
# wait for next captcha to show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/header/div[2]/div/nav/div[2]/a[1]")))
(By.XPATH, "/html/body/div/div/form")))
'''
for square in range(1,7):
inputBox = driver.find_element(by=By.XPATH, value=f"/html/body/div/div/form/div[1]/input[{square}]")
inputBox.click()
time.sleep(.5)
# userIn = input("Enter Solution: ")
# inputBox.send_keys(userIn)
# Takes screenshot every iteration because after input the captcha changes
driver.find_element(by=By.XPATH, value="/html/body/div/div/form").screenshot(r'..\Bohemia\captcha1.png')
# Opens and crops image
im = Image.open(r'..\Bohemia\captcha1.png')
im = im.crop(((im.width // 2 - 80), (im.height // 2 - 100), (im.width // 2 + 80), (im.height // 2 + 60)))
im.show()
# im.close()
temp = driver.find_element(By.XPATH, '/html/body/div/header/div[2]/div/nav/div[2]/a[1]').get_attribute(
'href') # /html/body/div/div[2]/div/div[2]/div
link = urlparse.urljoin(baseURL, str(temp))
driver.get(link) # open
# wait for listing page show up (This Xpath may need to change based on different seed url)
userIn = input("Enter Solution: ")
inputBox.send_keys(userIn)
#locate and press submit button
driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()
# driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button[2]')
'''
input("Press ENTER when CAPTCHA is completed\n")
#wait for next page to show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.ID, "side-bar")))
(By.XPATH, "/html/body/div/div[4]/div/div/form/input[1]")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
@@ -172,7 +283,6 @@ def getNameFromURL(url):
counter = counter + 1
return name
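# Quick illustration of what getNameFromURL produces (the URL is a made-up
# example of the pattern, not a real saved page): every non-alphanumeric
# character is dropped, and the global counter is used only when nothing is left.
example_name = ''.join(e for e in 'http://example.onion/listings?page=2' if e.isalnum())
assert example_name == 'httpexampleonionlistingspage2'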
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
# in this example, there are a couple of categories some products fall under, such as
# Guides and Tutorials, Digital Products, and Software and Malware
@@ -180,30 +290,31 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
# Hacking and DOSS
links.append(['Hacking and DOSS', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/35'])
# Carding Manuals
links.append(['Carding Manuals', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/20'])
# Software
links.append(['Software', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/37'])
# Database
links.append(['Database', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/38'])
# Malware and Botnets
links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?page=1&type=all&catid=95')
# #Exploits
# links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?page=1&type=all&catid=99')
# #Methods
# links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?catid=100')
# #Exploit kits
# links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?catid=101')
# #Hacking Software
# links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?catid=103')
return links
return links
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
#topic and description pages are crawled through here, where both types of pages are saved
#@param: selenium driver
def crawlForum(driver):
print("Crawling the ThiefWorld market")
print("Crawling the Bohemia Market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
cat = linksToCrawl[i][0]
link = linksToCrawl[i][1]
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
@@ -215,7 +326,6 @@ def crawlForum(driver):
except:
driver.refresh()
html = driver.page_source
html += f"<calsys-cat>{cat}</calsys-cat>"
savePage(driver, html, link)
list = productPages(html)
@@ -228,17 +338,18 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
# comment out
break
# comment out
if count == 1:
break
try:
nav = driver.find_element(by=By.XPATH, value='/html/body/div/div[1]/div/div/div[2]/div[3]')
right = nav.find_element(by=By.CLASS_NAME, value='pag_right')
link = right.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
nav = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[2]/ul')
a = nav.find_element(by=By.PARTIAL_LINK_TEXT, value="Next")
link = a.get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
@@ -250,14 +361,14 @@ def crawlForum(driver):
print(link, e)
i += 1
print("Crawling the ThiefWorld market done.")
input("Crawling Bohemia Market done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
if 'product' in url:
if bool(re.search(r'\blisting\b',url)): # accurate with bohemia
return True
return False
@@ -266,7 +377,7 @@ def isDescriptionLink(url):
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'catalog' in url:
if bool(re.search(r'\blistings\b',url)): # accurate with bohemia
return True
return False
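# Small sanity check of the word-boundary regexes above, using made-up
# Bohemia-style paths (the query strings are illustrative, not captured URLs):
import re
assert re.search(r'\blisting\b', '/listing/4f2a?guard=x') is not None       # description page
assert re.search(r'\blisting\b', '/listings?page=1&type=all') is None       # plural form does not match
assert re.search(r'\blistings\b', '/listings?page=1&type=all') is not None  # listing (category) page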
@@ -276,16 +387,16 @@ def isListingLink(url):
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return thiefworld_links_parser(soup)
return bohemia_links_parser(soup)
# Drop links that "signout"
# def isSignOut(url):
# #absURL = urlparse.urljoin(url.base_url, url.url)
# if 'signout' in url.lower() or 'logout' in url.lower():
# return True
#
# return False
def isSignOut(url):
#absURL = urlparse.urljoin(url.base_url, url.url)
if 'signout' in url.lower() or 'logout' in url.lower():
return True
return False
def crawler():

MarketPlaces/DarkBazar/parser.py → MarketPlaces/Bohemia/parser.py

@@ -1,5 +1,7 @@
__author__ = 'DarkWeb'
import re
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
@@ -11,7 +13,8 @@ from bs4 import BeautifulSoup
# stores info it needs in different lists, these lists are returned after being organized
# @param: soup object looking at html page of description page
# return: 'row' that contains a variety of lists that each hold info on the description page
def darkbazar_description_parser(soup):
def bohemia_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
@@ -33,75 +36,83 @@ def darkbazar_description_parser(soup):
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
# Finding Product Name
divmb = soup.findAll('div', {'class': "mb-1"})
name = divmb[0].text
name = soup.find('h1', {"style": "margin: 0; margin-bottom: 0.5em;"}).text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
# Finding Vendor
vendor = divmb[1].find('a').text.strip()
vendor = soup.find('div', {"class": "user-photo"}).find_next_sibling('a').text
vendor = vendor.strip()
# Finding Vendor Rating
temp = soup.find('div', {'class': ""}).text
temp = temp.split('(')
rating = temp[0].replace("Vendor's Review : ", "")
rating = rating.replace("%", "")
rating_vendor = rating.strip()
# Finding the Product Rating and Number of Product Reviews
reviews = temp[2].replace(" review)", "")
reviews = reviews.strip()
rating_vendor = soup.find('span', {'class': "user-percent"}).text.strip()
temp = temp[1].split(")")
rating = temp[1].replace("Product Review : ", "")
rating = rating.replace("%", "")
rating_item = rating.strip()
# Finding Users' Successful Transactions
temp = ''
success = soup.find('span', {'class': "smalltext shadow-text"}).text
temp = success.split("|")
success = str(temp[1])
success = success.strip()
# Finding Prices
USD = soup.find('div', {'class': "h3 text-primary"}).text.strip()
prices = soup.find('div', {'class': "col-md-3 sidebar-navigation user-details"}
).find('div', {'class': "container text-left"})
USD = prices.find('h1').text.strip()
BTC = prices.find('h1').find_next_sibling('h3').text
BTC = BTC.replace("BTC", "")
BTC = BTC.strip()
# Finding the Product Category
pmb = soup.findAll('p', {'class': "mb-1"})
detail_row = soup.find('div', {'class': "detail-container text-left"}).find_all('strong')
category = pmb[-1].text
category = category.replace("Category: ", "").strip()
# Finding the Product Category (there isn't a field for it on this page)
# category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
# Finding the Product Quantity Available
left = divmb[-1].text
left = left.split(",", 1)[1]
left = left.replace("in stock", "")
left = left.strip()
left = soup.find('div', {'class': "container detail-container text-left"})
left = left.find('div', {'class': "detail-row"}).text.replace('\n', '')
left = left.split("Available Stock:")
left = left[1].strip()
# Finding Number Sold
sold = divmb[-1].text
sold = sold.split(",", 1)[0]
sold = sold.replace("sold", "")
sold = detail_row[0].find_parent()
sold = sold.text
sold = sold.replace("Total Sold:", "")
sold = sold.strip()
# Finding Shipment Information (Origin)
pmb[0].text
shipFrom = shipFrom.replace("Ships from: ", "").strip()
# Finding Shipment Information (Destination)
pmb[1].text
shipTo = shipTo.replace("Ships to: ", "").strip()
# Finding Shipment Information (Origin) (There is no shipping information)
'''if "Ships from:" in li[-2].text:
shipFrom = li[-2].text
shipFrom = shipFrom.replace("Ships from: ", "")
# shipFrom = shipFrom.replace(",", "")
shipFrom = shipFrom.strip()'''
# Finding Shipment Information (Destination) (no shipping info on this page)
'''shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
shipTo = shipTo.replace("Ships to: ", "")
shipTo = shipTo.strip()
if "certain countries" in shipTo:
countries = ""
tags = li[-1].find_all('span', {'class': "tag"})
for tag in tags:
country = tag.text.strip()
countries += country + ", "
shipTo = countries.strip(", ")'''
# Finding the Product description
cardbody = soup.findAll('div', {'class': "card-body"})
describe = cardbody[1].text.strip()
describe = soup.find('div', {'class': "container feedback-container"})
describe = describe.find_next_sibling('div', {'class': "container"}).find('p').text
describe = describe.replace("\n", " ")
describe = describe.strip()
# Finding Product Image
image = soup.find('div', {'class': 'product-primary'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
# Finding the Number of Product Reviews
review = detail_row[2].find_parent().text
review = review.split("Based on")
review = review[1].replace("ratings)", "").strip()
# Searching for CVE and MS categories
# Searching for CVE and MS categories (can't find them on this page)
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
@@ -121,7 +132,7 @@ def darkbazar_description_parser(soup):
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
BTC, USD, EURO, sold, left, shipFrom, shipTo)
# Sending the results
return row
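# Minimal usage sketch for the description parser: feed it the soup of a saved
# description page. The file path is a hypothetical example of where the
# crawler stores pages, and only the first few row fields are unpacked here.
from bs4 import BeautifulSoup

with open('Bohemia/HTML_Pages/2023-09-25/Description/example.html', 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')
vendor, rating_vendor, success, name = bohemia_description_parser(soup)[:4]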
@@ -131,17 +142,17 @@ def darkbazar_description_parser(soup):
# stores info it needs in different lists, these lists are returned after being organized
# @param: soup object looking at html page of listing page
# return: 'row' that contains a variety of lists that each hold info on the listing page
def darkbazar_listing_parser(soup):
def bohemia_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "DarkBazar" # 0 *Marketplace_Name
nm = 0 # *Total_Products (Should be Integer)
mktName = "Bohemia" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
@@ -152,21 +163,18 @@ def darkbazar_listing_parser(soup):
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
href = [] # 20 Product_Links
listing = soup.findAll('div', {"id": "itembox"})
listing = soup.findAll('div', {"class": "product-link"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
lb = a.findAll('div', {"id": "littlebox"})
# Adding the url to the list of urls
link = bae[0].get('href')
@@ -174,64 +182,60 @@ def darkbazar_listing_parser(soup):
href.append(link)
# Finding the Product
product = lb[1].find('a').text
product = bae[0].text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
# Finding Product Image
product_image = a.find('img')
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
bae = a.find('div', {'class': "container"})
# Finding Prices
price = lb[-1].find('div', {"class": "mb-1"}).text
price = price.replace("$","")
price = price.strip()
USD.append(price)
price = bae.find('div', {'class': "product-price"}).find('h2').text
ud = price.replace("USD", " ")
# u = ud.replace("$","")
ud = ud.replace(",", "")
ud = ud.strip()
USD.append(ud)
bc = bae.find('div', {'class': "product-price"}).find('span', {'class': "shadow-text smalltext boldtext"}).text
bc = bc.replace("\n", "")
bc = bc.split()
bc = bc[0].replace("BTC", "").strip()
BTC.append(bc)
# Finding the Vendor
vendor_name = lb[-1].find("a").text
vendor_name = vendor_name.replace(",", "")
vendor_name = bae.find('b').find('a').text
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
image_vendor.append("-1")
# Finding the Category
cat = lb[-1].find("span").text
cat = cat.replace("class:", "")
cat = bae.find('span', {'class': "shadow-text smalltext"}).find('strong').text
cat = cat.strip()
category.append(cat)
span = lb[0].findAll("span")
# Finding Number of Views
num = span[0].text
num = num.replace("views:", "")
# Finding Number Sold and Quantity Left
num = bae.find('div', {'class': "product-details-bottom"}).find('span').text
num = num.replace("Sold", "")
num = num.replace("times in total", "")
num = num.strip()
sold.append(num)
# Finding Number Sold
num = span[2].text
num = num.replace("Sold:", "")
num = num.strip()
sold.append(num)
# Finding Quantity Left
quant = span[1].text
quant = quant.replace("stock:", "")
quant = quant.strip()
quant = bae.find('div', {'class': "product-price"}).text
quant = quant.replace("\n", "")
quant = quant.split("Available")
quant = quant[0].replace("Autoship", "").strip()
qLeft.append(quant)
# add shipping information
ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
shipFrom.append(ship[0].replace("Ship from ", "").strip())
shipTo.append(ship[1].replace("to ", "").strip())
# Finding Successful Transactions
freq = bae.find('div', {'title': "Total Sales"}).find_parent().text.replace("\n", "")
freq = freq.strip().split()
freq = freq[-1].strip()
success.append(freq)
# find vendor rating
rate = bae.find('b').find('strong').text.strip()
rating_vendor.append(rate)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
@@ -262,28 +266,24 @@ def darkbazar_listing_parser(soup):
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
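# Each per-product value is accumulated into one of the parallel lists above,
# so every list must end up with exactly nm entries before organizeProducts is
# called. A small helper sketch that captures that invariant (a suggested
# guard, not something the original parser performs):
def check_parallel_fields(nm, **fields):
    bad = {key: len(values) for key, values in fields.items() if len(values) != nm}
    if bad:
        raise ValueError("listing fields out of sync with nm=%d: %s" % (nm, bad))

# e.g. check_parallel_fields(nm, vendor=vendor, name=name, USD=USD, href=href)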
# called by the crawler to get description links on a listing page
# @param: beautifulsoup object that is using the correct html page (listing page)
# return: list of description links from a listing page
def darkbazar_links_parser(soup):
def bohemia_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"id": "itembox"})
# for a in listing:
# bae = a.find('a', {"class": "text-info"}, href=True)
# link = bae['href']
# href.append(link)
temp = soup.find('div', {"class": "col-md-9 sidebar-content-right listing-content"})
temp = temp.find('div', {"class": "product-listing"})
listing = temp.findAll('div', {"class": "product-heading"})
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
bae = a.find('a', href=True)
link = bae['href']
href.append(link)
return href
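# Usage sketch: the crawler joins every href returned here against the market's
# base .onion URL before requesting it. The `page` variable is assumed to hold
# the HTML of a saved listing page.
import urllib.parse as urlparse
from bs4 import BeautifulSoup

soup = BeautifulSoup(page, 'html.parser')
for href in bohemia_links_parser(soup):
    full_url = urlparse.urljoin('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/', href)
    print(full_url)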

+ 0
- 262
MarketPlaces/DarkBazar/crawler_selenium.py

@@ -1,262 +0,0 @@
__author__ = 'DarkWeb'
'''
DarkBazar Marketplace Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkBazar.parser import darkbazar_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/'
def startCrawling():
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Returns the name of the website
def getMKTName():
name = 'DarkBazar'
return name
# Return the base link of the website
def getFixedURL():
url = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
def login(driver):
input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]')
# Username here
usernameBox.send_keys('aliciamykeys')
passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]')
# Password here
passwordBox.send_keys('aliciawherearemykey$')
# session time
session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select'))
session_select.select_by_visible_text('Session 60min')
input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n")
# wait for the listing page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="submit"]')))
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
def getMKTName() -> str:
name = 'DarkBazar'
return name
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if name == '':
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# Digital Goods
links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=3')
# Services
links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=5')
return links
def crawlForum(driver):
print("Crawling the DarkBazar market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the DarkBazar market done.")
# Returns 'True' if the link is a description (item) link; may need to change for every website
def isDescriptionLink(url):
if 'item' in url:
return True
return False
# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
if 'category=' in url:
return True
return False
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return darkbazar_links_parser(soup)
def crawler():
startCrawling()

+ 0
- 284
MarketPlaces/DarkMatter/crawler_selenium.py

@@ -1,284 +0,0 @@
__author__ = 'Helium'
'''
DarkMatter Marketplace Crawler (Selenium)
Crawler works, but it is slow since there is a speed check for clicking
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkMatter.parser import darkmatter_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'DarkMatter'
return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/'
return url
# Closes Tor Browser
#@param: current selenium driver
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
#ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue
#ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Manual captcha solver: waits for a specific element so that the whole page loads, finds the input box, gets a screenshot of the captcha,
# then allows for manual solving of the captcha in the terminal
#@param: current selenium web driver
def login(driver):
input("Press ENTER when CAPTCHA is completed and page is loaded\n")
# wait for page to show up (This Xpath may need to change based on different seed url)
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
# in this example, there are a couple of categories some products fall under, such as
# Guides and Tutorials, Digital Products, and Software and Malware
#as you can see they are categories of products
def getInterestedLinks():
links = []
# digital fraud software
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=76')
# legit
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=78')
# hack guides
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=94')
# services
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=117')
# software/malware
links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=121')
return links
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
#topic and description pages are crawled through here, where both types of pages are saved
#@param: selenium driver
def crawlForum(driver):
print("Crawling the DarkMatter market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
time.sleep(3) # to keep the site from detecting click speed
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
time.sleep(3) # to keep the site from detecting click speed
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
link = driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the DarkMatter market done.")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
if 'products/' in url and '/products/?category' not in url:
return True
return False
# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if '?category' in url:
return True
return False
# calling the parser to define the links, the html is the url of a link from the list of interested link list
#@param: link from interested link list ie. getInterestingLinks()
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return darkmatter_links_parser(soup)
# Drop links that "signout"
# def isSignOut(url):
# #absURL = urlparse.urljoin(url.base_url, url.url)
# if 'signout' in url.lower() or 'logout' in url.lower():
# return True
#
# return False
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 0
- 261
MarketPlaces/DarkMatter/parser.py

@@ -1,261 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def darkmatter_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
# 0 *Vendor_Name
try:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[3].find('a').text
vendor = cleanString(temp2.strip())
except:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[4].find('a').text
vendor = cleanString(temp2.strip())
# product name
name = soup.find('div', {'class', 'title-h2'}).text
name = cleanString(name.strip())
#product description
temp = soup.find('pre', {'class', 'description'}).text
temp = temp.replace('\n', ' ')
describe = cleanString(temp.strip())
#product category
try:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[4].find('th').text
temp2 = cleanString(temp2)
if (temp2 == "Category"):
temp2 = temp[4].find('a').text
category = cleanString(temp2.strip())
except:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[5].find('th').text
temp2 = cleanString(temp2.strip())
if (temp2 == "Category"):
temp2 = temp[5].find('a').text
category = cleanString(temp2.strip())
# usd
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[1].find('td').text
temp2 = temp2.replace(' USD', '')
USD = cleanString(temp2)
# 15 Product_QuantitySold
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[5].find('th').text
temp2 = cleanString(temp2)
temp3 = temp[6].find('th').text
temp3 = cleanString(temp3)
if (temp2 == "Sold"):
temp2 = temp[5].find('td').text
sold = cleanString(temp2.strip())
elif (temp3 == "Sold"):
temp2 = temp[6].find('td').text
sold = cleanString(temp2.strip())
# Finding Product Image
image = soup.find('td', {"class": "vtop"}).find('img')
if image is not None:
image = image.get('src').split('base64,')[-1]
else:
image = '-1'
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
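# The vendor/category/sold lookups above probe fixed row positions inside the
# 'vtable' and fall back in except-blocks when the layout shifts. A generic
# sketch of the same idea that matches on the row header text instead of a
# fixed index (the helper name is illustrative, not from the original module):
def vtable_value(soup, header_text):
    table = soup.find('table', {'class': 'vtable'})
    if table is None:
        return "-1"
    for tr in table.findAll('tr'):
        th, td = tr.find('th'), tr.find('td')
        if th is not None and td is not None and th.text.strip().startswith(header_text):
            return td.text.strip()
    return "-1"

# e.g. sold = vtable_value(soup, "Sold")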
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def darkmatter_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "DarkMatter" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
names = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", "colspan": "3"})
left = soup.find('div', {"class": "content"}).findAll('table', {"class": "vtable"})
right = soup.find('div', {"class": "content"}).findAll('td', {"class": "vtop centered"})
images = soup.find('div', {"class": "content"}).findAll('td', {"class": "vcentered"})
# vtop centered
count = 0
# Populating the Number of Products
nm = len(names)
for a in names:
# product name
temp = a.find('a').text
if ("pcs x " in temp):
index = temp.index("pcs x ")
result = temp[index + len("pcs x "):]
name.append(cleanString(result))
elif("pks x " in temp):
index = temp.index("pks x ")
result = temp[index + len("pks x "):]
name.append(cleanString(result))
elif ("job x " in temp):
index = temp.index("job x ")
result = temp[index + len("job x "):]
name.append(cleanString(result))
CVE.append("-1")
MS.append("-1")
temp2 = left[count].findAll('tr')
length_2 = len(temp2) - 1
# category
temp = temp2[1].find('td').text
category.append(cleanString(temp.strip()))
describe.append("-1")
#escrow.append("-1")
views.append("-1")
reviews.append("-1")
addDate.append("-1")
#lastSeen.append("-1")
BTC.append("-1")
image_vendor.append("-1")
# usd
temp3 = right[count*2].find('span').text
temp = temp3.replace(' USD', '')
USD.append(cleanString(temp))
EURO.append("-1")
# 14 Product_QuantitySold
temp3 = temp2[length_2].find('th').text
temp3 = cleanString(temp3)
if (temp3 == "Sold:"):
temp = temp2[length_2].find('td').text
sold.append(cleanString(temp.strip()))
else:
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
# ship to
temp3 = temp2[length_2].find('th').text
temp3 = cleanString(temp3)
if (temp3 == "Ship To:"):
temp = temp2[length_2].find('td').text
shipTo.append(cleanString(temp.strip()))
else:
shipTo.append("-1")
# vendor
temp = temp2[0].find('a').text
vendor.append(cleanString(temp.strip()))
# add product rating (stars)
rating.append("-1")
success.append("-1")
temp = a.find('a').get('href')
href.append(temp)
# Finding Product Image
product_image = images[count*2].find('img').get('src')
image.append(product_image.split('base64,')[-1])
count += 1
rating_item.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def darkmatter_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", 'colspan': '3'})
for a in listing:
bae = a.find('a', href=True)
link = bae['href']
href.append(link)
return href

+ 0
- 286
MarketPlaces/DigitalThriftShop/crawler_selenium.py

@@ -1,286 +0,0 @@
__author__ = 'Helium'
'''
DigitalThriftShop Marketplace Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DigitalThriftShop.parser import digitalthriftshop_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'DigitalThriftShop'
return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/'
return url
# Closes Tor Browser
#@param: current selenium driver
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Manual captcha solver: waits for a specific element so that the whole page loads, finds the input box, gets a screenshot of the captcha,
# then allows for manual solving of the captcha in the terminal
#@param: current selenium web driver
def login(driver):
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.ID, "woocommerce_product_categories-2")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
# in this example, there are a couple of categories some products fall under, such as
# Guides and Tutorials, Digital Products, and Software and Malware
#as you can see they are categories of products
def getInterestedLinks():
links = []
# Apps
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/apps/')
# Books
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/books/')
# Bot nets
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/botnets/')
# ransomware
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/ransomware/')
# rats
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/rats/')
# scripts
links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/scripts/')
return links
# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
#topic and description pages are crawled through here, where both types of pages are saved
#@param: selenium driver
def crawlForum(driver):
print("Crawling the DigitalThriftShop market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/div[2]/main/div[1]/nav')
link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the DigitalThriftShop market done.")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
if 'product/' in url:
return True
return False
# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'product-' in url:
return True
return False
# calling the parser to define the links, the html is the url of a link from the list of interested link list
#@param: link from interested link list ie. getInterestingLinks()
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return digitalthriftshop_links_parser(soup)
# Drop links that "signout"
# def isSignOut(url):
# #absURL = urlparse.urljoin(url.base_url, url.url)
# if 'signout' in url.lower() or 'logout' in url.lower():
# return True
#
# return False
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 0
- 173
MarketPlaces/DigitalThriftShop/parser.py

@@ -1,173 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def digitalThriftShop_description_parser(soup: Tag):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
product_name = soup.find("h1", {"class": "product_title entry-title"}).text
name = cleanString(product_name.strip())
product_description = soup.find("div", {"id": "tab-description"}).find("p").text
describe = cleanString(product_description.strip())
# Finding Product Image
image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
image = image.get('src').split('base64,')[-1]
product_category = soup.find("span", {"class": "posted_in"}).find("a").text
category = cleanString(product_category.strip())
product_rating: Tag = soup.find("div", {"class": "woocommerce-product-rating"})
if product_rating is not None:
rating_item = product_rating.find("strong", {"class": "rating"}).text
reviews = product_rating.find("span", {"class": "rating"}).text
product_BTC = soup.find("div", {"id": "price-BTC"}).find("span", {"class": "priceinfo cw-noselect"}).text
BTC = cleanString(product_BTC.strip())
product_USD = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text
USD = cleanString(product_USD.replace("$", "").strip())
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
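# The image fields above keep only the payload of an inline data URI, i.e.
# "data:image/png;base64,<payload>" -> "<payload>". A tiny sketch of that split
# and of turning the payload back into bytes (the data URI below is a dummy):
import base64

src = "data:image/png;base64,iVBORw0KGgo="
payload = src.split('base64,')[-1]
raw_bytes = base64.b64decode(payload)  # b'\x89PNG\r\n\x1a\n'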
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def digitalThriftShop_listing_parser(soup: Tag):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "DigitalThriftShop" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
product_category = soup.find("h1", {"class": "woocommerce-products-header__title page-title"}).text
products_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-5"}).find_all("li")
for product in products_list:
nm += 1
vendor.append(mktName)
rating_vendor.append("-1")
success.append("-1")
product_name = product.find("h2", {"class": "woocommerce-loop-product__title"}).text
name.append(cleanString(product_name.strip()))
# Finding Product Image
product_image = product.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
product_image = product_image.get('src').split('base64,')[-1]
image.append(product_image)
CVE.append("-1")
MS.append("-1")
category.append(cleanString(product_category.strip()))
describe.append("-1")
views.append("-1")
reviews.append("-1")
image_vendor.append("-1")
try:
product_rating = product.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
rating_item.append(cleanString(product_rating.strip()))
except:
rating_item.append("-1")
addDate.append("-1")
BTC.append("-1")
product_USD = product.find("span", {"class": "price"}).text
USD.append(product_USD.replace("$", "").strip())
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
product_href = product.find("a", {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}).get("href")
href.append(cleanString(product_href.strip()))
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def digitalthriftshop_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find('ul', {"class": "products columns-5"}).findAll('li')
for a in listing:
bae = a.find('a', href=True)
link = bae['href']
href.append(link)
return href

+ 0
- 288
MarketPlaces/HiddenMarket/parser.py

@@ -1,288 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
def hiddenmarket_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
bae = soup.find('div', {'class': "main"})
# Finding Product Name
name = bae.find('div', {'class': "heading"}).text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
mb = bae.find('div', {'class': "information"}).findAll('tr')
# Finding Vendor
vendor = mb[1].find('a').text
vendor = vendor.replace(",", "")
vendor = vendor.strip()
# # Finding Vendor Rating
# full_stars = bae[2].find_all('i', {'class': "fas fa-star"})
# half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"})
# rating = len(full_stars) + (0.5 if half_star is not None else 0)
# Finding Quantity Left
temp = mb[-3].text
left = temp.replace("Quantity in stock:", "")
left = left.strip()
# Finding USD
USD = mb[0].text
USD = USD.replace("Price:", "")
USD = USD.replace("USD", "")
USD = USD.strip()
# Finding BTC
# temp = bae.find('div', {"class": "small"}).text.split("BTC")
# BTC = temp[0].strip()
# Finding Shipment Information (Origin)
shipFrom = mb[2].text
shipFrom = shipFrom.replace("Seller location:", "")
shipFrom = shipFrom.strip()
# Finding Shipment Information (Destination)
shipTo = mb[3].text
shipTo = shipTo.replace("Ships to (seller):", "")
shipTo = shipTo.strip()
# Finding the Product description
describe = bae.find('div', {"class": "twotabs"}).find('div', {'class': "tab1"}).text
describe = cleanString(describe.strip())
# Finding Product Image
image = soup.find('div', {"class": "thumbnails"}).find('img', {"class": "bigthumbnail"})
image = image.get('src').split('base64,')[-1]
# Finding the Product Category
category = mb[-4].text
category = category.replace("Category:", "")
category = category.strip()
#Finding the number of reviews
reviews = bae.find_all('div', {'class': "heading"})
reviews = reviews[-2].text
reviews = reviews.replace("Comments (", "")
reviews = reviews.replace(")", "")
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
# This is the method to parse the Listing Pages
def hiddenmarket_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "HiddenMarket" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
listing = soup.findAll('div', {"class": "item"})
# Populating the Number of Products
nm = len(listing)
# Finding Category
cat = soup.find("div", {'class': "heading"}).text
cat = cat.replace(",", "")
cat = cat.strip()
for card in listing:
category.append(cat)
# Adding the url to the list of urls
link = card.find_all('a')
link = link[1].get('href')
href.append(link)
# Finding Product Name
product = card.find('div', {'class': "title"})
product = product.text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.strip()
name.append(product)
# Finding Product Image
image.append("-1")
# Finding Vendor
vendor_name = card.find('div', {"class": "seller"}).text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
image_vendor.append("-1")
# Finding USD
usd = card.find('div', {"class": "buttons"}).find('div', {'class': "price"}).text
usd = usd.replace("USD", "")
usd = usd.strip()
USD.append(usd)
tb = card.find("div", {"class": "stats"})
tb = tb.find_all('td')
# Finding Reviews
num = tb[-1].text
num = num.strip()
reviews.append(num)
# Finding Views
view = tb[-3].text.strip()
views.append(view)
# Finding Num of Sales
sale = tb[-2].text.strip()
sold.append(sale)
# Finding Item Rating
if num == '0':
item_rating = '-1'
else:
item_rating = card.find('div', {'class': 'stats'}).find('div', {'class': "stars2"})
item_rating = item_rating.get('style')
item_rating = item_rating.replace("width:", "")
item_rating = item_rating.replace("%", "")
rating_item.append(item_rating)
# Finding shipping info
shipping = card.find('div', {'class': "shipping"}).text.split('>')
# Ship from
origin = shipping[0].strip()
shipFrom.append(origin)
#Ship to
destination = shipping[1].strip()
shipTo.append(destination)
# Finding description (site only shows partial description on listing pages)
# description = card.next_sibling.find('div', {'class': "description"}).text
# description = description.replace("\n", " ")
# description = description.replace("\r", " ")
# description = description.replace("-", " ")
# description = description.strip()
# describe.append(description)
# Searching for CVE and MS categories
cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue = "-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue = cee
CVE.append(cveValue)
ms = card.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue = "-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue = me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
def hiddenmarket_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"class": "item"})
for div in listing:
link = div.findAll('a')
link = link[1]
link = link['href']
href.append(link)
return href
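# A small isolated sketch of the CVE/MS tagging pattern both parsers above rely on
# (the HTML fragment is invented purely for illustration):
if __name__ == '__main__':
    import re
    from bs4 import BeautifulSoup

    fragment = BeautifulSoup("<div class='item'>Exploit pack for CVE-2017-0144 / MS17-010</div>",
                             'html.parser')

    cve_hits = fragment.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    ms_hits = fragment.findAll(text=re.compile(r'MS\d{2}-\d{3}'))

    # like the parsers, join the matching text nodes or fall back to "-1"
    print(' '.join(cve_hits) if cve_hits else '-1')
    print(' '.join(ms_hits) if ms_hits else '-1')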

+ 0
- 42
MarketPlaces/Initialization/markets_mining.py

@@ -6,28 +6,14 @@ Starting point of the Darkweb Markets Mining
from datetime import *
from MarketPlaces.DarkFox.crawler_selenium import crawler as crawlerDarkFox
from MarketPlaces.Tor2door.crawler_selenium import crawler as crawlerTor2door
from MarketPlaces.ThiefWorld.crawler_selenium import crawler as crawlerThiefWorld
from MarketPlaces.TorBay.crawler_selenium import crawler as crawlerTorBay
from MarketPlaces.LionMarketplace.crawler_selenium import crawler as crawlerLionMarketplace
from MarketPlaces.TorMarket.crawler_selenium import crawler as crawlerTorMarket
from MarketPlaces.MikesGrandStore.crawler_selenium import crawler as crawlerMikesGrandStore
from MarketPlaces.DarkTor.crawler_selenium import crawler as crawlerDarkTor
from MarketPlaces.DigitalThriftShop.crawler_selenium import crawler as crawlerDigitalThriftShop
from MarketPlaces.AnonymousMarketplace.crawler_selenium import crawler as crawlerAnonymousMarketplace
from MarketPlaces.Apocalypse.crawler_selenium import crawler as crawlerApocalypseMarketplace
from MarketPlaces.CityMarket.crawler_selenium import crawler as crawlerCityMarket
from MarketPlaces.DarkMatter.crawler_selenium import crawler as crawlerDarkMatter
from MarketPlaces.M00nkeyMarket.crawler_selenium import crawler as crawlerM00nkeyMarket
from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity
from MarketPlaces.HiddenMarket.crawler_selenium import crawler as crawlerHiddenMarket
from MarketPlaces.RobinhoodMarket.crawler_selenium import crawler as crawlerRobinhoodMarket
from MarketPlaces.Nexus.crawler_selenium import crawler as crawlerNexus
from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
from MarketPlaces.DarkBazar.crawler_selenium import crawler as crawlerDarkBazar
from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar
from MarketPlaces.AnonMarket.crawler_selenium import crawler as crawlerAnonMarket
from MarketPlaces.MetaVerseMarket.crawler_selenium import crawler as crawlerMetaVerse
import configparser
import os
@@ -105,49 +91,21 @@ if __name__ == '__main__':
# if crawlerDarkFox(base["url"], base["categories"]):
# break
crawlerDarkFox()
elif mkt == 'Tor2door':
crawlerTor2door()
elif mkt == "ThiefWorld":
crawlerThiefWorld()
elif mkt == "TorBay":
crawlerTorBay()
elif mkt == "LionMarketplace":
crawlerLionMarketplace()
elif mkt == "TorMarket":
crawlerTorMarket()
elif mkt == "MikesGrandStore":
crawlerMikesGrandStore()
elif mkt == "DarkTor":
crawlerDarkTor()
elif mkt == "DigitalThriftShop":
crawlerDigitalThriftShop()
elif mkt == "AnonymousMarketplace":
crawlerAnonymousMarketplace()
elif mkt == "Apocalypse":
crawlerApocalypseMarketplace()
elif mkt == "CityMarket":
crawlerCityMarket()
elif mkt == "DarkMatter":
crawlerDarkMatter()
elif mkt == "M00nkeyMarket":
crawlerM00nkeyMarket()
elif mkt == "ViceCity":
crawlerViceCity()
elif mkt == "HiddenMarket":
crawlerHiddenMarket()
elif mkt == "RobinhoodMarket":
crawlerRobinhoodMarket()
elif mkt == "Nexus":
crawlerNexus()
elif mkt == "CypherMarketplace":
crawlerCypher()
elif mkt == "DarkBazar":
crawlerDarkBazar()
elif mkt == "PabloEscobarMarket":
crawlerPabloEscobar()
elif mkt == "AnonMarket":
crawlerAnonMarket()
elif mkt == "MetaVerseMarket":
crawlerMetaVerse()
print("\nScraping process completed!")

+ 0
- 70
MarketPlaces/Initialization/prepare_parser.py

@@ -9,26 +9,12 @@ from psycopg2.extras import RealDictCursor
from MarketPlaces.DB_Connection.db_connection import *
from MarketPlaces.DarkFox.parser import *
from MarketPlaces.Tor2door.parser import *
from MarketPlaces.Apocalypse.parser import *
from MarketPlaces.ThiefWorld.parser import *
from MarketPlaces.AnonymousMarketplace.parser import *
from MarketPlaces.ViceCity.parser import *
from MarketPlaces.TorBay.parser import *
from MarketPlaces.M00nkeyMarket.parser import *
from MarketPlaces.DarkMatter.parser import *
from MarketPlaces.DigitalThriftShop.parser import *
from MarketPlaces.LionMarketplace.parser import *
from MarketPlaces.TorMarket.parser import *
from MarketPlaces.HiddenMarket.parser import *
from MarketPlaces.RobinhoodMarket.parser import *
from MarketPlaces.Nexus.parser import *
from MarketPlaces.MikesGrandStore.parser import *
from MarketPlaces.DarkBazar.parser import *
from MarketPlaces.PabloEscobarMarket.parser import *
from MarketPlaces.AnonMarket.parser import *
from MarketPlaces.CityMarket.parser import *
from MarketPlaces.MetaVerseMarket.parser import *
from MarketPlaces.Classifier.classify_product import predict
@@ -132,46 +118,18 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
if marketPlace == "DarkFox":
rw = darkfox_listing_parser(soup)
elif marketPlace == "Tor2door":
rw = tor2door_listing_parser(soup)
elif marketPlace == "Apocalypse":
rw = apocalypse_listing_parser(soup)
elif marketPlace == "ThiefWorld":
rw = thiefWorld_listing_parser(soup)
elif marketPlace == "AnonymousMarketplace":
rw = anonymousMarketplace_listing_parser(soup)
elif marketPlace == "ViceCity":
rw = vicecity_listing_parser(soup)
elif marketPlace == "TorBay":
rw = torbay_listing_parser(soup)
elif marketPlace == "M00nkeyMarket":
rw = m00nkey_listing_parser(soup)
elif marketPlace == "HiddenMarket":
rw = hiddenmarket_listing_parser(soup)
elif marketPlace == "DarkMatter":
rw = darkmatter_listing_parser(soup)
elif marketPlace == "DigitalThriftShop":
rw = digitalThriftShop_listing_parser(soup)
elif marketPlace == "LionMarketplace":
rw = lionmarketplace_listing_parser(soup)
elif marketPlace == "TorMarket":
rw = tormarket_listing_parser(soup)
elif marketPlace == "RobinhoodMarket":
rw = Robinhood_listing_parser(soup)
elif marketPlace == "Nexus":
rw = nexus_listing_parser(soup)
elif marketPlace == "MikesGrandStore":
rw = mikesGrandStore_listing_parser(soup)
elif marketPlace == "DarkBazar":
rw = darkbazar_listing_parser(soup)
elif marketPlace == "PabloEscobarMarket":
rw = pabloescobarmarket_listing_parser(soup)
elif marketPlace == "AnonMarket":
rw = AnonMarket_listing_parser(soup)
elif marketPlace == "CityMarket":
rw = city_listing_parser(soup)
elif marketPlace == "MetaVerseMarket":
rw = metaversemarket_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@@ -194,46 +152,18 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
if marketPlace == "DarkFox":
rmm = darkfox_description_parser(soup)
elif marketPlace == "Tor2door":
rmm = tor2door_description_parser(soup)
elif marketPlace == "Apocalypse":
rmm = apocalypse_description_parser(soup)
elif marketPlace == "ThiefWorld":
rmm = thiefWorld_description_parser(soup)
elif marketPlace == "AnonymousMarketplace":
rmm = anonymousMarketplace_description_parser(soup)
elif marketPlace == "ViceCity":
rmm = vicecity_description_parser(soup)
elif marketPlace == "TorBay":
rmm = torbay_description_parser(soup)
elif marketPlace == "M00nkeyMarket":
rmm = m00nkey_description_parser(soup)
elif marketPlace == "HiddenMarket":
rmm = hiddenmarket_description_parser(soup)
elif marketPlace == "DarkMatter":
rmm = darkmatter_description_parser(soup)
elif marketPlace == "DigitalThriftShop":
rmm = digitalThriftShop_description_parser(soup)
elif marketPlace == "LionMarketplace":
rmm = lionmarketplace_description_parser(soup)
elif marketPlace == "TorMarket":
rmm = tormarket_description_parser(soup)
elif marketPlace == "RobinhoodMarket":
rmm = Robinhood_description_parser(soup)
elif marketPlace == "Nexus":
rmm = nexus_description_parser(soup)
elif marketPlace == "MikesGrandStore":
rmm = mikesGrandStore_description_parser(soup)
elif marketPlace == "DarkBazar":
rmm = darkbazar_description_parser(soup)
elif marketPlace == "PabloEscobarMarket":
rmm = pabloescobarmarket_description_parser(soup)
elif marketPlace == "AnonMarket":
rmm = AnonMarket_description_parser(soup)
elif marketPlace == "CityMarket":
rmm = city_description_parser(soup)
elif marketPlace == "MetaVerseMarket":
rmm = metaversemarket_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception
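# The Kingdom parsers added in this commit presumably get one elif branch in each of
# the dispatch tables above; a minimal sketch of that wiring (the helper name and the
# listing/description switch are assumptions for illustration):
from MarketPlaces.Kingdom.parser import kingdom_listing_parser, kingdom_description_parser

def parse_kingdom(soup, listing=True):
    # returns the parsed row(s) for a Kingdom page saved by the crawler
    return kingdom_listing_parser(soup) if listing else kingdom_description_parser(soup)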


+ 325
- 0
MarketPlaces/Kingdom/crawler_mechanize.py

@@ -0,0 +1,325 @@
__author__ = '91Shadows'
'''
DarkFox marketplace Crawler
'''
import codecs
import socks, socket, time
from datetime import date
import urllib.parse as urlparse
import http.client as httplib
import mechanize
import os
import subprocess
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkFox.parser import darkfox_links_parser
counter = 1
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
baseURL = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)
# Opens Tor Browser, crawls the mkt
def startCrawling():
opentor()
getUrl()
url = getFixedURL()
mktName = getMKTName()
credentials = getCredentials()
br = getAccess(url, credentials)
if br != 'down':
crawlMkt(url, br)
#new_parse(mktName, False)
#new_parse(mktName, False)
closetor()
#Opens Tor Browser
def opentor():
global pid
print("Connecting Tor...")
path = open('../../path.txt').readline()
pro = subprocess.Popen(path)
pid = pro.pid
time.sleep(5)
input("Tor Connected. Press ENTER to continue\n")
return
# Creates a connection through Tor Port
def getUrl(timeout=None):
socket.socket = socks.socksocket
socket.create_connection = create_connection
return
# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
sock = socks.socksocket()
sock.connect(address)
return sock
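# A minimal sketch of what the two helpers above enable: once socks.setdefaultproxy(...)
# and getUrl() have monkey-patched the socket module, ordinary http.client/mechanize
# requests are tunnelled through the local Tor SOCKS proxy on 127.0.0.1:9150, so .onion
# hosts become reachable (the host and path arguments below are placeholders):
def fetch_through_tor(onion_host, path='/'):
    getUrl()                                   # install the SOCKS-wrapped socket
    conn = httplib.HTTPConnection(onion_host)  # plain HTTP, now routed through Tor
    conn.request('GET', path)
    return conn.getresponse().read()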
# Returns the name of the mkt (DarkFox)
def getMKTName():
name = 'DarkFox'
return name
# Returns credentials needed for the mkt
def getCredentials():
credentials = 'blank blank blank blank cap 0'
return credentials
# Return the link of the mkt (DarkFox Link)
def getFixedURL():
url = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
return url
# Closes Tor Browser
def closetor():
global pid
os.system("taskkill /pid " + str(pid))
print('Closing Tor...')
time.sleep(3)
return
# Creates a Mechanize browser and initializes its options
def createBrowser():
br = mechanize.Browser()
cj = mechanize.CookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv( True )
br.set_handle_redirect( True )
br.set_handle_referer( True )
br.set_handle_robots(False)
br.set_handle_refresh( mechanize._http.HTTPRefreshProcessor(), max_time = 1 )
br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
('Accept', '*/*')]
return br
def getAccess(loginPage, credentials):
logInName = credentials.split()[0]
userName = credentials.split()[1]
logInPass = credentials.split()[2]
password = credentials.split()[3]
captchaName = credentials.split()[4]
formId = credentials.split()[5]
br = createBrowser()
try:
keepTrying = True
while (keepTrying):
br.open(loginPage)
time.sleep(7)
html = br.response()
soup = BeautifulSoup(html)
image_tags = soup.findAll('div', {"class": "imgWrap"})
captchaLink = image_tags[0]
imagelink = captchaLink['style'].split('url(')[1][:-1]
data = br.open(imagelink).read()
br.back()
open('captcha.png', "wb").write(data)
'''
subprocess.Popen("python capt.py", shell=False)
time.sleep(61)
captchaAnswerFile = open("answer.txt", "r")
captchaAnswer = captchaAnswerFile.read().__str__()
'''
captchaAnswer = input('Please provide me with captcha : ')
formIndex = int(formId)
br.select_form(nr=formIndex)
#br[logInName] = userName
#br[logInPass] = password
br[captchaName] = captchaAnswer.__str__()
br.submit()
if br.geturl() != 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/':
keepTrying = False
return br
except:
return 'down'
# Saves the crawled html page
def savePage(page, url):
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
a = page.read()
open(filePath, "wb").write(a)
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
else:
fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
return fullPath
# Creates the name of the file based on URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
# Hacking and Markets related topics
def getInterestedLinks():
links = []
# Guides and Tutorials
links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
# Digital Products
links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
# Software and Malware
links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
# Services
links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
# Miscellaneous
links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
# Hosting and Security
links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')
# links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing1.html')
# links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing2.html')
return links
def crawlMkt(url, br):
print("Crawling the DarkFox marketplace")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try :
page = br.open(link)
savePage(page, link)
for l in br.links():
absURL = urlparse.urljoin(l.base_url, l.url)
if absURL not in visited and not isSignOut(absURL) and isListingLink(absURL):
visited.add(absURL)
#disabling the process of finding other links
#linksToCrawl.append(absURL)
# crawler asks parser to get links of ALL products on ALL listing pages
list = productPages(link)
j = 0
for item in list:
if j == 2:
break
#itemURL = baseURL + str(item)
try:
#itemPage = br.open(itemURL)
itemPage = br.open(item)
savePage(itemPage, item)
except:
#print 'Error in page: ', itemURL
print('Error in page: ', item)
j+=1
except Exception as e:
print(link, e)
i += 1
#finalTime = time.time()
#print finalTime - initialTime
input("Crawling DarkFox marketplace done sucessfully. Press ENTER to continue\n")
return
def isDescriptionLink(url):
if 'product' in url:
return True
return False
# Returns True if the link is a listingPage link
def isListingLink(url):
if 'category' in url:
return True
return False
# calling the parser to define the links
def productPages(url):
soup = ""
error = False
try:
html = codecs.open(
r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html', encoding='utf8')
soup = BeautifulSoup(html, "html.parser")
except:
try:
html = open(
r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html')
soup = BeautifulSoup(html, "html.parser")
except:
error = True
print("There was a problem to read the file " + getNameFromURL(url) + " in the listing section.")
if error:
return []
else:
return darkfox_links_parser(soup)
# Drop links that "signout"
def isSignOut(url):
#absURL = urlparse.urljoin(url.base_url, url.url)
if 'signout' in url.lower() or 'logout' in url.lower():
return True
return False
def crawler():
startCrawling()
#print "Crawling and Parsing Crypto .... DONE!"

+ 342
- 0
MarketPlaces/Kingdom/crawler_selenium.py

@@ -0,0 +1,342 @@
__author__ = 'DarkWeb'
'''
Kingdom Market Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from PIL import Image
import base64
from io import BytesIO
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.Kingdom.parser import kingdom_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion'
# Opens Tor Browser, crawls the website
def startCrawling():
# marketName = getMarketName()
driver = getAccess()
if driver != 'down':
try:
captcha(driver)
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
# new_parse(marketName, False)
def captcha(driver):
'''
# wait for captcha page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[1]")))
# save captcha to local
driver.find_element(by=By.XPATH, value='/html/body/div/div[2]').screenshot(
r'..\Kingdom\captcha1.png')
# This method will show image in any image viewer
im = Image.open(r'..\Kingdom\captcha1.png')
im.show()
iframes = driver.find_elements(by=By.TAG_NAME, value='iframe')
# ask user input captcha solution in terminal
print("Enter squares from smallest to largest (squares are numbered 1-9 left to right)")
for order in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']:
id = input(f"{order}: ")
iframes[int(id)-1].click()
'''
input("Press ENTER when CAPTCHA is completed\n")
# wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button")))
# Login using premade account credentials and do login captcha manually
def login(driver):
# wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button")))
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-name"]')
# Username here
usernameBox.send_keys('blabri')
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-passwd"]')
# Password here
passwordBox.send_keys('fishowal')
select = Select(driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]'))
select.select_by_visible_text('24 hours')
'''
# wait for captcha page show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="captcha"]')))
# save captcha to local
driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Kingdom\captcha2.png')
# This method will show image in any image viewer
im = Image.open(r'..\Kingdom\captcha2.png')
im.show()
# wait until input space show up
inputBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-captcha"]')
# ask user input captcha solution in terminal
userIn = input("Enter solution: ")
# send user solution into the input space
inputBox.send_keys(userIn)
# click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button").click()
'''
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div/div/div[3]/div[2]')))
# Returns the name of the website
def getMarketName():
name = 'Kingdom'
return name
# Return the link of the website
def getFixedURL():
url = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMarketName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# Software and Malware
links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=c298a77d9e93ad32')
# # Services
# links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=c298a77d9e93ad32')
# # Exploits
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45')
# # Tools
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46')
# # Malware
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47')
# # Cryptography
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48')
# # Others
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49')
# # Hacking Tutorials
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50')
# # Hacked Accounts and Database Dumps
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30')
# # Android Modded apk
# links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53')
return links
def crawlForum(driver):
print("Crawling the Kingdom market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back()
# comment out
break
# comment out
if count == 1:
break
try:
temp = driver.find_element(by=By.XPATH, value=
'/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul')
next = temp.find_element(by=By.CLASS_NAME, value="next")
link = next.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
input("Crawling Kingdom Market done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is a description (product) page link
def isDescriptionLink(url):
if 'view' in url:
return True
return False
# Returns True if the link is a listingPage link
def isListingLink(url):
if 'category' in url:
return True
return False
# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
#print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text)
return kingdom_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 188
- 0
MarketPlaces/Kingdom/parser.py

@@ -0,0 +1,188 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
def kingdom_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
# Finding Product Name
tag = soup.find('div', {"class": "col-md-9"})
desc = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-cont"})
name = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-head"}).text
name = name.replace('\n', ' ')
name = name.replace(',', ' ')
name = name.strip()
# Finding Prices
# Kingdom prices can be shown in a variety of currencies, not all in USD, so keeping currency
rows = desc.find_all('div', {"class", "row"}, recursive=False)
price = rows[-1].find('div', {"class": "row"}).find('h3').text
price = price.replace(',', '')
price = price.strip()
# USD = price.replace("USD",'')
BTC = rows[-1].find('div', {"class": "row"}).find_next_sibling('div').find('span').text
# Finding Vendor
vendor = rows[0].select_one('a[href^="/user"]').text
vendor = vendor.replace(",", " ")
vendor = vendor.strip()
# Finding Shipment Information (Origin)
descs = rows[0].find_all('div', {"class": "col-md-3 text-right"})
shipFrom = descs[2].text
shipFrom = shipFrom.replace(",", "")
shipFrom = shipFrom.strip()
# Finding Shipment Information (Destination)
shipTo = rows[-1].find('div', {"class": "col-md-6"}).text
shipTo = shipTo.replace("Ship to:","")
shipTo = shipTo.replace(",","").strip()
if(shipTo == ''):
shipTo = "-1"
# Finding the Product Category
category = descs[0].text
category = category.replace(",", "")
category = category.strip()
# Finding the Product Quantity Available
left = descs[1].text
left = left.replace(",", "")
left = left.strip()
# Finding when the Product was Added
dt = descs[-1].text.strip()
addDate = datetime.strptime(dt, '%d.%m.%Y')
# Finding the Product description
describe = cleanString(soup.find('div', {"id": "descriptionContent"}).text)
# Finding the Number of Product Reviews
reviews = len(soup.find('div', {"id": "feedbackContent"}).find_all(recursive=False))
# Searching for CVE and MS categories
# no cve or ms in Kingdom
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
# Sending the results
return row
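# As noted above, Kingdom prices keep their currency label rather than being normalized
# to USD. If a later stage needs the amount and the label separately, a split along these
# lines would work (the "<amount> <currency>" format is an assumption based on the
# parsing above):
def split_price(price_text):
    parts = price_text.strip().split()
    amount = parts[0].replace(',', '')            # numeric part, commas removed
    currency = parts[1] if len(parts) > 1 else 'USD'
    return amount, currency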
def kingdom_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "Kingdom" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
listing = soup.find('div', {"id": "p0"}).find('div').find_all('div', {"class": "row"}, recursive=False)
# Populating the Number of Products
nm = len(listing)
for a in listing:
# Finding Prices
# the USD list may contain prices in other currencies, so the currency label is kept as well
prices = a.find('div', {"class": "col-md-3"})
u = prices.find('h3').text
u = u.strip()
u = u.replace(',', '')
u = u.strip()
USD.append(u)
bc = prices.find('div').find('span').text
BTC.append(bc)
# Finding the Product
product = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]').text
product = product.replace('\n', ' ')
product = product.replace(","," ")
product = product.strip()
name.append(product)
# Finding the Vendor
vendor_name = a.select_one('a[href^="/user"]').text
vendor_name = vendor_name.replace(",", " ").replace('/', '')
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Adding the url to the list of urls
link = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]')['href']
link = cleanLink(link)
href.append(link)
# Searching for CVE and MS categories
# cve and ms not in kingdom
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
def kingdom_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"class": "col-md-7"})
for a in listing:
link = a.select_one('a[href^="/offer/view?"]')
link = link['href']
href.append(link)
return href
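# The hrefs returned above are site-relative ("/offer/view?..."); the Selenium crawler
# joins them onto the market's base onion URL before visiting each product page,
# roughly as sketched here (the helper name is illustrative):
def kingdom_absolute_links(soup, base='http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion'):
    import urllib.parse as urlparse
    return [urlparse.urljoin(base, link) for link in kingdom_links_parser(soup)]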

+ 0
- 235
MarketPlaces/LionMarketplace/parser.py

@@ -1,235 +0,0 @@
__author__ = 'Helium'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def lionmarketplace_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
# vendor name
temp = soup.find('div', {'class': 'btn-group'}).find('a').text
vendor = (cleanString(temp.strip()))
# table with info
table = soup.find('table')
rows = table.findAll('tr')
# vendor rating
pos = soup.find('span', {"class": "fas fa-plus-circle text-success"}).parent.text
pos = int(pos.strip())
neu = soup.find('span', {"class": "fas fa-stop-circle text-secondary"}).parent.text
neu = int(neu.strip())
neg = soup.find('span', {"class": "fas fa-minus-circle text-danger"}).parent.text
neg = int(neg.strip())
total = pos + neu + neg
if total > 0:
rating_vendor = str((pos + 0.5*neu) / total)
# product name
temp = soup.find('div', {'class', 'row'}).find('h2').text
name = (cleanString(temp.strip()))
# product description
temp = soup.find('div', {'class': "mt-4"}).contents[-1]
describe = cleanString(temp.strip())
# Finding Product Image
image = soup.find('div', {'id': 'slide-1'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
full = rows[0].findAll('i', {"class": "fas fa-star"})
half = rows[0].find('i', {"class": "fas fa-star-half-alt"})
rating_item = len(full)
if half is not None:
rating_item += 0.5
rating_item = str(rating_item)
# USD selling price
temp = rows[2].find('strong').text
if " $" in temp:
temp = temp.replace(" $", "")
elif "$" in temp:
temp = temp.replace("$", "")
USD = cleanString((temp.strip()))
# product sold
temp = rows[4].find('td')
if temp is not None and cleanString(temp.text.strip()) == 'Left/Sold':
temp = rows[4].findAll('td')
temp = temp[1].findAll('span')
# left
sold = temp[1].text
left = temp[0].text
sold = cleanNumbers(sold.strip())
left = cleanNumbers(left.strip())
else:
sold = '-1'
left = "-1"
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def lionmarketplace_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "LionMarketplace" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
listings = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
# Populating the Number of Products
nm = len(listings)
for listing in listings:
a = listing.find('div', {"class": "card-body"})
row = a.findAll('p')
# vendor
temp = row[3].text
temp = temp.replace("Vendor:", "")
vendor.append(cleanString(temp.strip()))
image_vendor.append("-1")
# vendor rating
rating_vendor.append("-1")
# successful transactions CHECK AGAIN HERE
success.append("-1")
# product name
temp = a.find('a').text
name.append(cleanString(temp.strip()))
# Finding Product Image
product_image = listing.find('img', {'class': 'card-img-top rounded'})
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
CVE.append('-1')
MS.append('-1')
# product category
temp = row[1].text
temp = temp.replace("Category: ", "")
category.append(cleanString(temp.strip()))
describe.append('-1')
# product views
vnum = listing.find('p', {"class": "position-absolute bg-primary opacity-60 text-white mt-4 mr-5 pr-1"}).text
views.append(cleanNumbers(vnum.strip()))
reviews.append('-1') # 10 Product_Number_Of_Reviews
rating_item.append('-1') # 11 Product_Rating
addDate.append('-1') # 12 Product_AddDate
# BTC
BTC.append('-1')
# USD
temp = row[0].find('strong').text
USD.append(cleanNumbers(temp.strip())) # 14 Product_USD_SellingPrice
EURO.append("-1") # 15 Product_EURO_SellingPrice
# product sold
sold.append("-1")
qLeft.append('-1') # 17 Product_QuantityLeft
shipFrom.append('-1') # 18 Product_ShippedFrom
shipTo.append('-1') # 19 Product_ShippedTo
# href
temp = a.find('a').get('href')
href.append(temp)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def lionmarketplace_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listings = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
for listing in listings:
a = listing.find('div', {"class": "card-body"})
bae = a.find('a', href=True)
link = bae['href']
href.append(link)
return href

+ 0
- 291
MarketPlaces/MetaVerseMarket/crawler_selenium.py

@@ -1,291 +0,0 @@
__author__ = 'Helium'
'''
MetaVerseMarket Marketplace Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.MetaVerseMarket.parser import metaversemarket_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/login'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'MetaVerseMarket'
return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/login'
return url
# Closes Tor Browser
#@param: current selenium driver
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Logs in with the premade account: waits for the login form to load, enters the username and password,
# then pauses until the CAPTCHA (and the newsletter pop-up) has been completed manually in the browser
#@param: current selenium web driver
def login(driver):
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="username"]')))
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
# Username here
usernameBox.send_keys('metotomoto')
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
# Password here
passwordBox.send_keys('lionking_kumba1ya')
input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="searchq"]')))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
# returns the list of URLs of interest; the crawler iterates through this list
# each entry is a product-category listing page, in this case
# Software and Malware, Guides and Tutorials, and Services
def getInterestedLinks():
links = []
# software and malware
links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/softwares-and-malwares')
# guides and tutorials
links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/guides-and-tutorials')
# services
links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/services')
return links
# iterates through the links of interest; each listing page is loaded and saved,
# along with every product description page found on it
#@param: selenium driver
def crawlForum(driver):
print("Crawling the MetaVerse market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
link = driver.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href')
if link.endswith('#') or link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the MetaVerse market done.")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
if 'PR' in url:
return True
return False
# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'products' in url:
return True
return False
# calling the parser to define the links, the html is the url of a link from the list of interested link list
#@param: link from interested link list ie. getInterestingLinks()
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return metaversemarket_links_parser(soup)
# Drop links that "signout"
# def isSignOut(url):
# #absURL = urlparse.urljoin(url.base_url, url.url)
# if 'signout' in url.lower() or 'logout' in url.lower():
# return True
#
# return False
def crawler():
startCrawling()
# print("Crawling and Parsing MetaVerseMarket .... DONE!")

+ 0
- 269
MarketPlaces/MetaVerseMarket/parser.py

@ -1,269 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# Parses a description page (passed in as a soup object) and extracts the fields of interest
# The extracted values are organized into a single 'row' and returned
# @param: soup object looking at html page of description page
# return: 'row' tuple holding every field scraped from the description page
def metaversemarket_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
# Finding Product Name
name = soup.find('div', {'class': "panel-heading"}).text
name = cleanString(name.strip())
temp = soup.findAll('div', {'class': "col-xs-12 col-sm-6 mt-5"})
# Finding Product Image
image = temp[0].find('img')
image = image.get('src')
image = image.split('base64,')[-1]
# Finding Vendor
temp = temp[1].findAll('span')
vendor = temp[1].find('b').text
vendor = cleanString(vendor.strip())
# Finding Vendor Rating
pos = soup.find('span', {'class': "badge bg-success fs-12px"}).text
pos = int(cleanNumbers(pos).strip())
neg = soup.find('span', {'class': "badge bg-danger fs-12px"}).text
neg = int(cleanNumbers(neg).strip())
total = pos + neg
if total > 0:
rating_vendor = str(pos / total)
# Finding Prices
USD = soup.find('h3', {'class': "mb-2"}).text
USD = cleanNumbers(USD).strip()
# Finding the Product Category
temp = soup.select('div[class="mt-2"]')[1].text
temp = temp.replace("Category:", "")
category = temp.strip()
# Finding Number of Views
views = soup.find('button', {"class": "btn btn-secondary text-center w-33 fw-bold"}).text
views = views.strip()
# Finding the Product Quantity Available
temp = soup.find('button', {"class": "btn btn-success text-center w-33 fw-bold"}).text
temp = temp.split("/")
left = temp[1].strip()
# Finding Number Sold
sold = temp[0].strip()
# Finding Shipment Information (Origin)
temp = soup.find('div', {'class': "alert alert-info"}).text
temp = temp.split("to")
shipFrom = temp[0].replace("Shipping from ", "").strip()
# Finding Shipment Information (Destination)
shipTo = temp[1].split("for")
shipTo = shipTo[0].strip()
# Finding the Product description
describe = soup.find('p', {'class': "card-text"}).text
describe = cleanString(describe.strip())
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
# Parses a listing page (passed in as a soup object) and collects every field of interest for each product on it
# Each field is accumulated in its own list; the lists are organized and returned together
# @param: soup object looking at html page of listing page
# return: organized collection of lists, one per field, covering every product on the listing page
def metaversemarket_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "MetaVerseMarket" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Product
product = bae[1].find('span', {"class": "text-primary"}).text
name.append(cleanString(product.strip()))
# Finding Prices
price = a.find('strong').text
USD.append(cleanNumbers(price).strip())
# Finding the Vendor
temp = a.find('div', {'class': "mt-1 fs-12px"})
temp = temp.findAll('span')
vendor_name = temp[1].find('b').text
vendor.append(cleanString(vendor_name.strip()))
# Finding the Category
cat = a.select_one('div[class="fs-12px"]')
cat = cat.findAll('span')[1].text
cat = cat.strip()
category.append(cat)
ul = a.find('ul', {"class": "product-actions"})
# Finding Number Sold and Quantity Left
temp = ul.find('span', {'class': "badge bg-success"}).text
temp = temp.split("/")
num = temp[0]
num = num.replace('k', '000')
sold.append(cleanNumbers(num).strip())
quant = temp[1]
quant = quant.replace('k', '000')
qLeft.append(cleanNumbers(quant).strip())
# Finding Description
# description = a.find('p', {'class': "alert alert-light text-ssbold p-1"}).text
# description = description.replace("\n", " ")
# description = description.strip()
# describe.append(cleanString(description))
# Finding Number of Views
view = ul.find('span', {'class': "badge bg-primary"}).text
view = view.replace('.', '')
view = view.replace('K', '000')
views.append(view.strip())
# Find where ships from
ships = a.find('div', {'class': "alert alert-info item_alert fs-12px p-1"})
ships = ships.findAll('b')
sFrom = ships[0].text.strip()
shipFrom.append(sFrom)
# Find where it ships to
sTo = ships[1].text.strip()
shipTo.append(sTo)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue = "-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue = cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue = "-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue = me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
# called by the crawler to get description links on a listing page
# @param: beautifulsoup object that is using the correct html page (listing page)
# return: list of description links from a listing page
def metaversemarket_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"})
for a in listing:
bae = a.find('a', href=True)
link = bae['href']
href.append(link)
return href
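As a worked example of the vendor-rating arithmetic in metaversemarket_description_parser above (positive feedback divided by total feedback), here is a tiny standalone helper; the badge counts are invented.

def vendor_rating(positive, negative):
    # rating is the share of positive feedback; '-1' signals that no feedback was found
    total = positive + negative
    return str(positive / total) if total > 0 else '-1'

print(vendor_rating(47, 3))  # 0.94
print(vendor_rating(0, 0))   # -1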

+ 0
- 289
MarketPlaces/Nexus/crawler_selenium.py

@ -1,289 +0,0 @@
__author__ = 'Helium'
'''
Nexus Market Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.Nexus.parser import nexus_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion'
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
input("Press ENTER when page loads after DDOS protection")
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'Nexus'
return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion'
return url
# Closes Tor Browser
#@param: current selenium driver
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isListingLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
return fullPath
# Creates the file name from the passed URL; falls back to a counter-based name if nothing usable remains after cleaning
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
# Returns the list of category URLs of interest; the crawler iterates through this list
# For Nexus these cover malware, hacking and spam, hacking/programming/remote-administration services,
# hacking/malware/fraud guides, and fraud software
def getInterestedLinks():
links = []
# malware
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/')
# hacking-spam
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/')
# hacking services
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/hacking/')
# programming services
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/programacao/')
# remote admin services
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/administracao-remota/')
# hacking guides
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-hacking/')
# malware guides
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-malware/')
# fraud guides
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-fraudes/')
# fraud software
links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/fraudes/software-de-fraude/')
return links
# Iterates through the links of interest; each listing page, and every description page it references,
# is visited and saved
#@param: selenium driver
def crawlForum(driver):
print("Crawling the Nexus market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
# waiting for btc price to load
try:
WebDriverWait(driver, 1).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[1]/div[2]/div/div/main/ul/li[1]/div/span/span[3]")))
time.sleep(5)
except:
pass
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
# waiting for btc price to load
try:
WebDriverWait(driver, 1).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[1]/div[2]/div/div/main/div[3]/div[2]/p/span[3]")))
except:
pass
savePage(driver, driver.page_source, item)
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
link = driver.find_element(by=By.LINK_TEXT, value='').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the Nexus market done.")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
if 'produto' in url:
return True
return False
# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'categoria-produto' in url:
return True
return False
# Calls the links parser on the HTML of a listing page to collect the description links to crawl
#@param: html source of a listing page reached via getInterestedLinks()
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return nexus_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing Nexus .... DONE!")

+ 0
- 236
MarketPlaces/Nexus/parser.py

@ -1,236 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
import re
usd_to_brl_r = None
#parses a description page (passed in as a soup object) and extracts the fields of interest
#@param: soup object looking at html page of description page
#return: 'row' tuple holding every field scraped from the description page
def nexus_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
#finding the name of the product
name_of_product = soup.find("h1", {"class": "product_title entry-title"}).text
name = cleanString(name_of_product.strip())
# Finding USD Price
real = soup.find('span', {"class": "price"}).find('bdi').text
real = real.split(',')
whole = cleanNumbers(real[0]).replace('.', '')
real = whole + '.' + real[1]
usd = float(real) / usd_to_brl_r
USD = str(round(usd, 2))
# Find the BTC Price
prices = soup.find('p', {"class": "price"}).findAll('span', {"class": "cs"})
if len(prices) > 0:
BTC = prices[0].text
BTC = cleanNumbers(BTC.strip())
# finding the description of the product
description_div = soup.find("div", {"class": "woocommerce-product-details__short-description"})
if description_div is None:
describe = "-1"
else:
describe = cleanString(description_div.text.strip())
# Finding Product Image
image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
#find the category of the product
name_of_category = soup.find("span", {"class": "posted_in"}).find("a").text
category = cleanString(name_of_category.strip())
#finding the name of the vendor
name_of_vendor = soup.find("div", {"class": "dokan-vendor-name"}).find("h5").text
vendor = cleanString(name_of_vendor)
#finding the vendor's rating
vendorRating = soup.find("div", {"class": "dokan-vendor-rating"}).find("p").text
rating_vendor = cleanString(vendorRating)
#everything else gets a -1 because they are not found
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
#parses a listing page (passed in as a soup object) and collects every field of interest for each product on it,
#accumulating each field in its own list before the lists are organized and returned together
#@param: soup object looking at html page of listing page
#return: organized collection of lists, one per field, covering every product on the listing page
def nexus_listing_parser(soup):
global usd_to_brl_r
while usd_to_brl_r is None:
try:
usd_to_brl_r = float(input("1 US Dollar = (Brazilian Real) "))
except ValueError:
pass
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "Nexus" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
main = soup.find('main', {'id': 'main'})
products_list = main.find('ul', recursive=False).find_all('li', recursive=False)
nm = len(products_list)
for product in products_list:
# Finding the name of the product
name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
name_of_product_cleaned = cleanString(name_of_product.strip())
# print(name_of_product_cleaned)
name.append(name_of_product_cleaned)
#finding the URL
try:
url = product.find("a", class_="woocommerce-loop-product__link").get('href')
href.append(url)
except AttributeError as e:
print("I can't find the link")
raise e
# Finding Product Image
product_image = product.find('a', {'class': 'woocommerce-loop-image-link woocommerce-LoopProduct-link woocommerce-loop-product__link'}).find('img')
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
# Finding USD Price
real = product.find('span', {"class": "price"}).find('bdi').text
real = real.split(',')
whole = cleanNumbers(real[0]).replace('.', '')
real = whole + '.' + real[1]
usd = float(real) / usd_to_brl_r
USD.append(str(round(usd, 2)))
# Finding BTC Price
prices = product.find('span', {"class": "price"}).findAll('span', {"class": "cs"})
if len(prices) > 0:
price = prices[0].text
BTC.append(cleanNumbers(price.strip()))
#everything else appends a -1
rating_vendor.append("-1")
vendor.append('-1')
success.append("-1")
CVE.append("-1")
MS.append("-1")
category.append("-1")
describe.append("-1")
views.append("-1")
reviews.append("-1")
addDate.append("-1")
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
image_vendor.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(
marketplace = mktName,
nm = nm,
vendor = vendor,
rating_vendor = rating_vendor,
success_vendor = success,
nombre = name,
CVE = CVE,
MS = MS,
category = category,
describe = describe,
views = views,
reviews = reviews,
rating_item = rating_item,
addDate = addDate,
BTC = BTC,
USD = USD,
EURO = EURO,
sold = sold,
qLeft = qLeft,
shipFrom = shipFrom,
shipTo = shipTo,
href = href,
image = image,
image_vendor = image_vendor
)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def nexus_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
# Using a shorter, but still unique, class name
listing = soup.find_all("a", class_="woocommerce-loop-product__link")
for a in listing:
link = a.get('href')
if link: # Checks if 'href' attribute is not None
href.append(link)
return href
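Nexus lists prices in Brazilian Real with '.' as the thousands separator and ',' as the decimal separator, and the parsers above divide by the USD-to-BRL rate typed in by the operator. A small standalone version of that conversion, with an invented price string and rate; like the parser, it assumes the price contains a comma.

def brl_to_usd(price_text, usd_to_brl_rate):
    # '1.234,56' -> whole part '1234', decimal part '56' -> 1234.56 BRL
    whole, decimals = price_text.split(',')
    whole = ''.join(c for c in whole if c.isdigit())
    return round(float(whole + '.' + decimals) / usd_to_brl_rate, 2)

print(brl_to_usd('1.234,56', 5.0))  # 246.91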

MarketPlaces/Tor2door/crawler_selenium.py → MarketPlaces/Quest/crawler_selenium.py

@ -1,7 +1,7 @@
__author__ = 'DarkWeb'
'''
Tor2door Market Crawler (Selenium)
Quest Market Crawler (Selenium)
'''
from selenium import webdriver
@ -16,22 +16,22 @@ from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.Tor2door.parser import tor2door_links_parser
from MarketPlaces.Quest.parser import quest_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion'
baseURL = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion'
# Opens Tor Browser, crawls the website
def startCrawling():
marketName = getMKTName()
marketName = getMarketName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
@ -39,15 +39,15 @@ def startCrawling():
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(marketName, baseURL, True)
new_parse(marketName, False)
# Login using premade account credentials and do login captcha manually
def login(driver):
#wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="username"]')))
(By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button")))
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
@ -60,19 +60,19 @@ def login(driver):
'''
# wait for captcha page show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/main/div/div/div/div/div/form/div[3]/div/div[1]/label/img")))
(By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img")))
# save captcha to local
driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[3]/div/div[1]/label/img').screenshot(
r'..\Tor2door\captcha.png')
driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img').screenshot(
r'..\Quest\captcha.png')
# This method will show image in any image viewer
im = Image.open(r'..\Tor2door\captcha.png')
im = Image.open(r'..\Quest\captcha.png')
im.show()
# wait until input space show up
inputBox = driver.find_element(by=By.XPATH, value='//*[@id="captcha"]')
inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/input')
# ask user input captcha solution in terminal
userIn = input("Enter solution: ")
@ -81,24 +81,24 @@ def login(driver):
inputBox.send_keys(userIn)
# click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/main/div/div/div/div/div/form/div[4]/button").click()
driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button").click()
'''
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/main/div/div/div[1]/div/div[1]/div/h5')))
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div[5]/div/div/div/span')))
# Returns the name of the website
def getMKTName():
name = 'Tor2door'
def getMarketName():
name = 'Quest'
return name
# Return the link of the website
def getFixedURL():
url = 'http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/login'
url = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion'
return url
@ -109,7 +109,7 @@ def closeDriver(driver):
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.quit()
driver.close()
time.sleep(3)
return
@ -129,8 +129,8 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@ -146,7 +146,7 @@ def createFFDriver():
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
@ -198,22 +198,24 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
# Digital - Guides - Hacking
links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=55')
# Digital - Guides - Others
links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=57')
# Digital - Software
links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=60')
# Software - Malware
links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=69')
# Software - Others
links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=78')
# # Digital - Services
# links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/8ae67900-22ed-11ec-a710-31f963ce8d35')
# # Digital - Software
# links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77')
# # Digital - Tutorials
# links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/9d1592b0-22ed-11ec-b82d-c3d2878a8716')
# # Digital - Malware
# links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/a35bae90-22ed-11ec-ad2e-410f5a5339b5')
# # Digital - Hacking
# links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/b4252cf0-22ed-11ec-8032-751549438ed5')
# Digital - Exploits
links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/c0c3ac60-22ed-11ec-9e97-41cd1912fdee')
return links
def crawlForum(driver):
print("Crawling the Tor2door market")
print("Crawling the Quest market")
linksToCrawl = getInterestedLinks()
@ -243,17 +245,17 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
# comment out
break
# comment out
if count == 1:
break
try:
nav = driver.find_element(by=By.XPATH, value=
'/html/body/main/div/div/div[2]/div[11]/div/nav')
a = nav.find_element(by=By.LINK_TEXT, value="")
nav = driver.find_element(by=By.XPATH, value='/html/body/div[6]/nav')
li = nav.find_elements(By.TAG_NAME, 'li')
a = li[-1].find_element(By.TAG_NAME, 'a')
link = a.get_attribute('href')
if link == "":
raise NoSuchElementException
@ -266,19 +268,19 @@ def crawlForum(driver):
print(link, e)
i += 1
print("Crawling the Tor2door market done.")
input("Crawling Quest market done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link
def isDescriptionLink(url):
if 'products/' in url:
if 'product' in url:
return True
return False
# Returns True if the link is a listingPage link
def isListingLink(url):
if 'category=' in url:
if 'category' in url:
return True
return False
@ -286,7 +288,7 @@ def isListingLink(url):
# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return tor2door_links_parser(soup)
return quest_links_parser(soup)
def crawler():
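The pagination change above stops looking the next-page link up by link text and instead takes the anchor inside the last <li> of Quest's pagination bar. A minimal Selenium sketch of that pattern; the XPath below is a placeholder, not Quest's real locator.

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def next_page_url(driver, nav_xpath='//nav'):
    # placeholder XPath; the crawler above uses the concrete path to Quest's pagination <nav>
    try:
        nav = driver.find_element(By.XPATH, nav_xpath)
        last_item = nav.find_elements(By.TAG_NAME, 'li')[-1]
        href = last_item.find_element(By.TAG_NAME, 'a').get_attribute('href')
        return href or None
    except (NoSuchElementException, IndexError):
        return None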

+ 232
- 0
MarketPlaces/Quest/parser.py

@ -0,0 +1,232 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
def quest_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
row = soup.find_all('div', {'class': "row"})
# Finding Product Name
name = row[1].text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
small = row[3].find_all('small')
# Finding Vendor
vendor = small[0].text
vendor = vendor.replace("Vendor:", "")
vendor = vendor.replace(",", "")
vendor = vendor.strip()
# Finding Vendor Rating
full_stars = small[2].find_all('i', {'class': "fas fa-star"})
half_star = small[2].find('i', {'class': "fas fa-star-half-alt"})
rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0)
# Finding Successful Transactions
success = small[4].text
success = success.replace("Total Sales:", "")
success = success.strip()
small = row[2].find('p', {'class': "text-left"}).find_all('small')
# Finding Prices
USD = small[1].text
USD = USD.replace("$", "")
USD = USD.strip()
shipping_info = row[2].find('p', {'class': "text-left"}).find('span').text.strip()
if "Digital" not in shipping_info:
shipping_info = shipping_info.split(" ")
# Finding Shipment Information (Origin)
shipFrom = shipping_info[0].strip()
# Finding Shipment Information (Destination)
shipTo = shipping_info[1].strip()
textarea = row[2].find_all('textarea')
# Finding the Product description
describe = textarea[0].text
describe = describe.replace("\n", " ")
describe = describe.replace("\r", " ")
describe = describe.strip()
'''
# Finding the Number of Product Reviews
tag = soup.findAll(text=re.compile('Reviews'))
for index in tag:
reviews = index
par = reviews.find('(')
if par >=0:
reviews = reviews.replace("Reviews (","")
reviews = reviews.replace(")","")
reviews = reviews.split(",")
review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
else :
review = "-1"
'''
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
# Sending the results
return row
# This is the method to parse the Listing Pages
def quest_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "Quest" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
# Finding category of listing page
cat = soup.find('span', {'class': "btn btn-sm btn-outline-mgray active border-info"}).text
cat = cat.replace("Digital -", "")
cat = cat.strip()
listing = soup.find_all('div', {"class": "col-md-2 my-md-0 col-12"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.find_all('a', href=True)
# Adding the category
category.append(cat)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Vendor
vendor_name = bae[2].text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding the Product
product = bae[1].find('img').get('alt')
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.strip()
name.append(product)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue=cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue=me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
def quest_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"class": "col-md-2 my-md-0 col-12"})
for div in listing:
link = div.find('a')["href"]
href.append(link)
return href
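quest_description_parser above turns the vendor's star icons into a numeric rating: every 'fas fa-star' icon counts as one point and a 'fas fa-star-half-alt' icon adds 0.5. A standalone BeautifulSoup sketch of that computation over an invented markup snippet:

from bs4 import BeautifulSoup

html = ('<small><i class="fas fa-star"></i><i class="fas fa-star"></i>'
        '<i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i></small>')
small = BeautifulSoup(html, 'html.parser').find('small')

full_stars = small.find_all('i', {'class': 'fas fa-star'})
half_star = small.find('i', {'class': 'fas fa-star-half-alt'})
print(len(full_stars) + (0.5 if half_star is not None else 0))  # 3.5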

+ 0
- 256
MarketPlaces/RobinhoodMarket/crawler_selenium.py

@ -1,256 +0,0 @@
__author__ = 'chris'
'''
RobinhoodMarket Market Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.RobinhoodMarket.parser import Robinhood_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/'
# Opens Tor Browser, crawls the website
def startCrawling():
marketName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
# Captcha
input("Press ENTER when website has loaded")
# Robinhood doesn't need login
# login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(marketName, baseURL, True)
# Login is not needed in Robinhood
def login(driver):
pass
# Returns the name of the website
def getMKTName():
name = 'RobinhoodMarket'
return name
# Return the link of the website
def getFixedURL():
url = 'http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.quit()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if name == '':
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# Hacking
links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/hacking/')
# Other Software
links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/')
return links
def crawlForum(driver):
print("Crawling the Robinhood market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for c, item in enumerate(list):
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back()
# comment out
# if c == 4:
# break
# comment out
# if count == 1:
# break
# go to next page of market
try:
nav = driver.find_element(by=By.XPATH, value="//a[@class='next page-numbers']")
link = nav.get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the Robinhood market done.")
# Returns 'True' if the link is Topic link
def isDescriptionLink(url):
if 'product' in url and 'category' not in url:
return True
return False
# Returns True if the link is a listingPage link
def isListingLink(url):
if 'product-category' in url:
return True
return False
# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return Robinhood_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")
if __name__ == '__main__':
startCrawling()
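The crawler above advances through the WooCommerce listings by following the anchor matched by //a[@class='next page-numbers'] until none is left. The same link can be pulled out of already-saved HTML with BeautifulSoup, as in this small sketch with invented markup:

from bs4 import BeautifulSoup

def next_listing_url(html):
    # WordPress/WooCommerce marks the next-page link with the classes "next page-numbers"
    nxt = BeautifulSoup(html, 'html.parser').find('a', {'class': 'next page-numbers'})
    return nxt['href'] if nxt else None

print(next_listing_url('<a class="next page-numbers" href="/page/2/">Next</a>'))  # /page/2/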

+ 0
- 334
MarketPlaces/RobinhoodMarket/parser.py

@ -1,334 +0,0 @@
__author__ = 'chris'
import re
import traceback
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# Import for test run
import glob
import os
import codecs
import shutil
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
def Robinhood_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
# Finding Product Name
name = soup.find('h1').text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
# Finding description
desc = ''
tab = soup.find('div', {"id": "tab-description"})
if tab is not None:
for p in tab.findAll('p'):
desc += p.text
if desc == '':
short = soup.find('div', {"class": "woocommerce-product-details__short-description"})
if short is not None:
desc = short.text
describe = cleanString(desc.strip())
# Finding Product Image
image = soup.find('div', {'class': 'woocommerce-product-gallery__wrapper'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
# Finding Vendor
vendor = soup.find('a', {'class': 'wcfm_dashboard_item_title'}).text
vendor = vendor.replace(",", "")
vendor = vendor.replace("Sold by:", "")
vendor = vendor.strip()
# Finding Vendor Image
vendor_image = soup.find('div', {'class': 'wcfmmp_sold_by_container_left'}).find('img')
vendor_image = vendor_image.get('src')
vendor_image = vendor_image.split('base64,')[-1]
# Finding Category
catSpan = soup.find('span', {'class': 'posted_in'})
category = catSpan.find('a').text
# Finding USD
priceText = soup.find('p', {'class': 'price'}).text
USD = str(priceText).strip()
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
# This is the method to parse the Listing Pages
def Robinhood_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "RobinhoodMarket" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
listing = soup.find('ul', {"class": "products columns-4"})
items = listing.findAll('li')
# Populating the Number of Products
nm = len(items)
for card in items:
# Finding Category
cat = soup.find("h1").text
cat = cat.replace('\n', ' ')
cat = cat.replace(",", "")
cat = cat.strip()
category.append(cat)
bae = card.findAll('a')
# Adding the url to the list of urls
link = card.find('a').get('href')
href.append(link)
# Finding Product Name
product = card.find("h2").text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.strip()
name.append(product)
# Finding Product Image
product_image = card.find('a').find('img')
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
info = card.find('div', {'class': 'wcfmmp_sold_by_container'})
# Finding Vendor
vendor_name = info.find('a', {'class', 'wcfm_dashboard_item_title'}).text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding Vendor Image
vendor_icon = info.find('img', {'class', 'wcfmmp_sold_by_logo'})
vendor_icon = vendor_icon.get('src')
vendor_icon = vendor_icon.split('base64,')[-1]
image_vendor.append(vendor_icon)
# Finding USD
span = card.find('span', {'class': 'price'})
if span is not None:
bdi = span.find('bdi')
usdText = bdi.find('span').next_sibling
usdVal = usdText.text
else:
usdVal = "0"
USD.append(usdVal)
# Searching for CVE and MS categories
cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue=cee
CVE.append(cveValue)
ms = card.findAll(text=re.compile('MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue=me
MS.append(MSValue)
#print(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
# reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
def Robinhood_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
#list = soup.findAll('div', {"class": "woocommerce columns-4"})
listing = soup.find('ul', {"class": "products columns-4"}).findAll('li')
for item in listing:
link = item.find('a')['href']
href.append(link)
return href
if __name__ == '__main__':
nError = 0
marketPlace = 'RobinhoodMarket'
lines = [] # listing pages
lns = [] # description pages
detPage = {}
'''
# reading description pages
count = 0
for fileDescription in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\08082023\\Description", '*.html')):
count += 1
lns.append(fileDescription)
# if count > 5:
# break
for index, line2 in enumerate(lns):
print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns)))
try:
html = codecs.open(line2.strip('\n'), encoding='utf8')
soup = BeautifulSoup(html, "html.parser")
html.close()
except:
try:
html = open(line2.strip('\n'))
soup = BeautifulSoup(html, "html.parser")
html.close()
except:
nError += 1
print("There was a problem to read the file " + line2 + " in the Description section!")
# if createLog:
# logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section.\n")
continue
try:
print(Robinhood_description_parser(soup))
except:
traceback.print_exc()
print("There was a problem to parse the file " + line2 + " in the Description section!")
'''
# reading listing pages
count = 0
for fileListing in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\08082023\\Listing", '*.html')):
count += 1
lines.append(fileListing)
#if count > 1:
# break
for index, line1 in enumerate(lines):
print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(line1) + "', index= " + str(index + 1) + " ... " + str(len(lines)))
readError = False
try:
html = codecs.open(line1.strip('\n'), encoding='utf8')
soup = BeautifulSoup(html, "html.parser")
html.close()
except:
try:
html = open(line1.strip('\n'))
soup = BeautifulSoup(html, "html.parser")
html.close()
except:
print("There was a problem to read the file " + line1 + " in the Listing section!")
readError = True
if not readError:
parseError = False
try:
test = Robinhood_listing_parser(soup)
print(Robinhood_listing_parser(soup))
except:
traceback.print_exc()
print("There was a problem to parse the file " + line1 + " in the listing section!")
parseError = True
print("DONE")

MarketPlaces/Apocalypse/crawler_selenium.py → MarketPlaces/Royal/crawler_selenium.py

@ -1,68 +1,171 @@
__author__ = 'Helium'
__author__ = 'DarkWeb'
'''
Apocalypse Forum Crawler (Selenium)
two captchas. if you get a captcha wrong you have to reload program.
Royal Marketplace Crawler (Selenium)
'''
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.Apocalypse.parser import apocalypse_links_parser
from MarketPlaces.Royal.parser import royal_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/'
baseURL = 'http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion'
# Opens Tor Browser, crawls the website, then parses, then closes tor
# Acts as the main method for the crawler; the crawler() function at the end of this file calls it
# Opens Tor Browser, crawls the website
def startCrawling():
mktName = getMKTName()
marketName = getMarketName()
driver = getAccess()
if driver != 'down':
try:
captcha(driver)
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
closeDriver(driver)
new_parse(marketName, False)
def captcha(driver):
'''
# wait for captcha page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[2]/div/div/div/div/form/div/div[2]/button")))
inputChars = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[2]/div[1]/input')
inputNum = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[2]/div[2]/input')
driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[1]/div/div').screenshot(
r'..\Royal\captcha1.png')
im = Image.open(r'..\Royal\captcha1.png')
im.show()
chars = input("Enter characters: ")
inputChars.send_keys(chars)
num = input("Enter number of wrong puzzle pieces: ")
inputNum.send_keys(num)
# click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div/div/div/form/div/div[2]/button").click()
'''
input("Press ENTER when CAPTCHA is completed\n")
new_parse(mktName, baseURL, True)
# wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[2]/div/div/div[2]/h1")))
'''
temp = driver.find_element(by=By.XPATH, value='/html/body/div/div/form/div[1]')
boxes = temp.find_elements(by=By.TAG_NAME, value='input')
for box in boxes:
# click box to update captcha image
box.click()
# save clock captcha to local
time.sleep(1)
driver.find_element(by=By.XPATH, value='/html/body/div/div/form/div[1]/div').screenshot(
r'..\Royal\captcha1.png')
im = Image.open(r'..\Royal\captcha1.png')
im.show()
letter = input("Enter letter: ")
box.send_keys(letter)
# click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()
# wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[1]/div/div/div[2]/form/input[3]")))
'''
# Login using premade account credentials and do login captcha manually
def login(driver):
# wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[2]/div/div/div[2]/form/div[4]")))
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
# Username here
usernameBox.send_keys('blabri')
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
# Password here
passwordBox.send_keys('fishowal')
# click "Login"
driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[2]/form/div[4]').click()
'''
# wait for captcha page show up
time.sleep(3)
# save captcha to local
driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[2]/form/div[4]/label/div/div').screenshot(
r'..\Royal\captcha2.png')
# This method will show image in any image viewer
im = Image.open(r'..\Royal\captcha2.png')
im.show()
# ask user input captcha solution in terminal
userIn = input("Enter location of wrong pieces (squares are numbered 1-24 left to right, # # #): ")
squares = userIn.split()
# send user solution into the input space
for id in squares:
driver.find_element(by=By.XPATH, value='//*[@id="cl[' + str((int(id)-1)) + ']"]').click()
# click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div/div[2]/form/div[4]/label/div/div/div/button").click()
'''
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '/html/body/div[3]/div/div[5]/div[1]')))
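Both the captcha() and login() steps above end the same way: pause so the operator can solve the CAPTCHA in the browser, then wait for an element that only exists once the step has gone through. A small sketch of that gate as a reusable helper (manual_captcha_gate and ready_xpath are illustrative names, not part of this codebase):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def manual_captcha_gate(driver, ready_xpath, timeout=50):
    # Let the operator finish the CAPTCHA in the browser window, then block
    # until an element that only appears afterwards becomes visible.
    input("Press ENTER when CAPTCHA is completed\n")
    WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.XPATH, ready_xpath)))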
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'Apocalypse'
def getMarketName():
name = 'Royal'
return name
# Return the base link of the website
#return: url of base site in string type
# Return the link of the website
def getFixedURL():
url = 'http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/'
url = 'http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion'
return url
# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
@ -87,8 +190,8 @@ def createFFDriver():
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@ -104,14 +207,12 @@ def createFFDriver():
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
@ -123,32 +224,7 @@ def getAccess():
return 'down'
# Manual CAPTCHA solver: waits for a specific element so that the whole page loads, finds the input box, takes a screenshot of the CAPTCHA,
# then allows the CAPTCHA to be solved manually in the terminal
#@param: current selenium web driver
def login(driver):
input("Press ENTER when CAPTCHA is completed\n")
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="name"]')))
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="name"]')
# Username here
usernameBox.send_keys('shooby')
passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
# Password here
passwordBox.send_keys('perry_1the2_platypu$')
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[1]/div[2]/div[1]/div[1]/a[13]")))
# Saves the crawled html page, makes the directory path for html pages if not made
# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
@ -158,7 +234,6 @@ def savePage(driver, page, url):
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
@ -171,41 +246,33 @@ def getFullPathName(url):
return fullPath
# Creates the file name from the passed URL; falls back to a distinct counter-based name if nothing alphanumeric is left after cleaning
#@param: raw url as crawler crawls through every site
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
if name == '':
name = str(counter)
counter = counter + 1
return name
# Returns the list of URLs of interest; the crawler iterates through this list.
# In this example, the links are product categories such as
# Guides and Tutorials, Digital Products, and Software and Malware.
def getInterestedLinks():
links = []
# Digital Goods
links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/74')
# Fraud
links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/75')
# Services
links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/category/76')
# software and malware
links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/30')
# Digital - Fraud Software
links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Fraud%20Software')
# # Digital - Guides and Tutorials
# links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Guides%20&%20Tutorials')
# # Digital - Legitimate Software
# links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Legitimiate%20Software')
# # Services - Carding
# links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Carding')
return links
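The category links above hard-code the percent-escapes ('Fraud%20Software', 'Guides%20&%20Tutorials'). A sketch of building them from the human-readable names instead, assuming the market expects a literal '&' as in the commented links (royal_category_url is an illustrative name):

import urllib.parse

ROYAL_BASE = 'http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion'

def royal_category_url(category_name):
    # Percent-encode spaces and other specials but keep '&', so
    # 'Guides & Tutorials' becomes '.../category/Guides%20&%20Tutorials'.
    return ROYAL_BASE + '/category/' + urllib.parse.quote(category_name, safe='&')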
# Iterates through the links of interest; each link is opened and crawled,
# and both listing and description pages are saved along the way
#@param: selenium driver
def crawlForum(driver):
print("Crawling the Apocalypse market")
print("Crawling the Royal market")
linksToCrawl = getInterestedLinks()
@ -233,22 +300,20 @@ def crawlForum(driver):
except:
driver.refresh()
savePage(driver, driver.page_source, item)
# driver.back()
try:
driver.get(link)
except:
driver.refresh()
driver.back()
# comment out
break
# # comment out
# break
#
# # comment out
# if count == 1:
# break
# comment out
if count == 1:
break
try:
nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/div[2]/nav')
link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='»').get_attribute('href')
nav = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[1]/div[2]/nav')
li = nav.find_elements(by=By.TAG_NAME, value='li')
a = li[-1].find_element(by=By.TAG_NAME, value='a')
link = a.get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
@ -260,42 +325,27 @@ def crawlForum(driver):
print(link, e)
i += 1
print("Crawling the Apocalypse market done.")
input("Crawling Royal forum done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
# Returns 'True' if the link is a product description link
def isDescriptionLink(url):
if 'article' in url:
if 'product' in url:
return True
return False
# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'category' in url:
return True
return False
# Calls the parser to extract the description links; html is the page source of a page from the interested-links list
#@param: html of a page from the interested-links list, i.e. getInterestedLinks()
#return: list of description links that should be crawled through
# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return apocalypse_links_parser(soup)
# Drop links that "signout"
# def isSignOut(url):
# #absURL = urlparse.urljoin(url.base_url, url.url)
# if 'signout' in url.lower() or 'logout' in url.lower():
# return True
#
# return False
return royal_links_parser(soup)
def crawler():

MarketPlaces/Tor2door/parser.py → MarketPlaces/Royal/parser.py View File

@ -8,7 +8,7 @@ from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
def tor2door_description_parser(soup):
def royal_description_parser(soup):
# Fields to be parsed
@ -31,23 +31,18 @@ def tor2door_description_parser(soup):
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
bae = soup.find('div', {'class': "col-9"})
# Finding Product Name
name = bae.find('h2').text
name = soup.find('h5', {'class': "bold"}).text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
mb = bae.findAll('div', {"class": "mb-1"})
bae = soup.find('div', {'class': "card-header bg-light"})
# Finding Vendor
vendor = mb[0].text
vendor = bae.find('a').text
vendor = vendor.replace(",", "")
vendor = vendor.replace("Sold by:", "")
vendor = vendor.strip()
# # Finding Vendor Rating
@ -55,24 +50,45 @@ def tor2door_description_parser(soup):
# half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"})
# rating = len(full_stars) + (0.5 if half_star is not None else 0)
# Finding Quantity Sold and Left
temp = mb[4].text.split(',')
sold = temp[0].replace("sold", "")
sold = sold.strip()
left = temp[1].replace("in stock", "")
left = left.strip()
# Finding USD
USD = bae.find('div', {"class": "h3 text-secondary"}).text
USD = USD.replace("$", "")
USD = USD.strip()
# Finding BTC
temp = bae.find('div', {"class": "small"}).text.split("BTC")
BTC = temp[0].strip()
# Finding Successful Transactions
success = bae.find('b').text
success = success.replace("(", "")
success = success.replace(")", "")
success = success.strip()
form = soup.find_all('form', {'method': "POST"})
bae = form[1].find_all('div', {'class': "row"})
# Finding Quantity Sold
div = bae[2].find_all('div', {'class': "col"})
temp = div[1].text
temp = temp.split()
if len(temp) > 0:
sold = temp[0].strip()
sold = re.sub(r'[^0-9.]', "", sold)
if sold == "":
sold = -1
else:
sold = -1
# Finding Quantity Left
div = bae[3].find_all('div', {'class': "col"})
temp = div[1].text
temp = temp.split()
if len(temp) > 0:
left = temp[0].strip()
left = re.sub(r'[^0-9.]', "", left)
if left == "":
left = -1
else:
left = -1
# Finding Prices
temp = bae[-2].find('strong').text
temp = temp.replace("Price:", "")
temp = temp.split()
USD = temp[0].strip()
USD = re.sub(r'[^0-9.]', "", USD)
# shipping_info = bae[4].text
# if "Digital" not in shipping_info:
@ -85,7 +101,7 @@ def tor2door_description_parser(soup):
# shipTo = shipping_info[1].strip()
# Finding the Product description
describe = bae.find('div', {"class": "card border-top-0"}).text
describe = soup.find('xmp').text
describe = describe.replace("\n", " ")
describe = describe.replace("\r", " ")
describe = describe.strip()
@ -108,23 +124,20 @@ def tor2door_description_parser(soup):
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
image = bae.find('div', {"class": "product-primary"}).find('img')
image = image.get('src').split('base64,')[-1]
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
BTC, USD, EURO, sold, left, shipFrom, shipTo)
# Sending the results
return row
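The sold, left and USD blocks above repeat the same cleanup: take the first whitespace-separated token, strip everything that is not a digit or a dot, fall back to -1. A sketch of that pattern as one helper (clean_numeric_field is a name introduced here for illustration):

import re

def clean_numeric_field(text, default=-1):
    # First token only, digits and dots only, default fallback -- the same
    # treatment the description parser above gives sold/left/USD.
    parts = text.split()
    if not parts:
        return default
    value = re.sub(r'[^0-9.]', "", parts[0].strip())
    return value if value else default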
# This is the method to parse the Listing Pages
def tor2door_listing_parser(soup):
def royal_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "Tor2door" # 0 *Marketplace_Name
mktName = "Royal" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
@ -144,69 +157,43 @@ def tor2door_listing_parser(soup):
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
href = [] # 20 Product_Links
# Finding category of listing page
cat = soup.find('li', {'class': "breadcrumb-item active"}).text
cat = cat.strip()
listing = soup.findAll('div', {"class": "card product-card mb-3"})
listing = soup.findAll('div', {'class': "card search border shadow-sm mb-3"})
# Populating the Number of Products
nm = len(listing)
# Finding Category
cat = soup.find("div", {"class": "col-9"})
cat = cat.find("h2").text
cat = cat.replace("Category: ", "")
cat = cat.replace(",", "")
cat = cat.strip()
for a in listing:
bae = a.findAll('a', href=True)
for card in listing:
# Adding the category
category.append(cat)
bae = card.findAll('a')
# Adding the url to the list of urls
link = bae[0].get('href')
link = bae[1].get('href')
link = cleanLink(link)
href.append(link)
# Finding Product Name
product = bae[1].text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.strip()
name.append(product)
# Finding Vendor
vendor_name = bae[2].text
# Finding the Vendor
vendor_name = bae[0].text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding USD
usd = card.find('div', {"class": "mb-1"}).text
usd = usd.replace("$", "")
usd = usd.strip()
USD.append(usd)
# Finding Rating
stars = card.find("ul", {"class": "star-list"})
full = stars.findAll('i', {"class": "fas fa-star star star-active"})
half = stars.find('i', {"class": "fas fa-star-half star star-active"})
rating = len(full)
if half is not None:
rating += 0.5
rating_item.append(str(rating))
# Finding Reviews
num = card.find("span", {"class": "rate-count"}).text
num = num.replace("(", "")
num = num.replace("review)", "")
num = num.replace("reviews)", "")
num = num.strip()
reviews.append(num)
# Finding the Product
product = bae[2].get('title')
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.strip()
name.append(product)
# Searching for CVE and MS categories
cve = card.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
@ -219,7 +206,7 @@ def tor2door_listing_parser(soup):
cveValue=cee
CVE.append(cveValue)
ms = card.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
@ -232,24 +219,22 @@ def tor2door_listing_parser(soup):
MSValue=me
MS.append(MSValue)
image = bae[0].find('img')
image = image.get('src').split('base64,')[-1]
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
def tor2door_links_parser(soup):
def royal_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"class": "card product-card mb-3"})
listing = soup.findAll('div', {"class": "card search border shadow-sm mb-3"})
for div in listing:
link = div.find('a')['href']
a = div.find_all('a')
link = a[1].get('href')
href.append(link)
return href
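The star-rating block in royal_listing_parser counts full-star icons and adds 0.5 when a half-star icon is present. Pulled out as a helper it would look roughly like this (star_rating is an illustrative name; card is one listing-card Tag from the soup above):

def star_rating(card):
    # Count filled stars, add 0.5 for a half star, return "-1" when the card
    # has no star list -- matching the conventions of the parsers above.
    stars = card.find("ul", {"class": "star-list"})
    if stars is None:
        return "-1"
    full = stars.find_all('i', {"class": "fas fa-star star star-active"})
    half = stars.find('i', {"class": "fas fa-star-half star star-active"})
    return str(len(full) + (0.5 if half is not None else 0))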

+ 0
- 190
MarketPlaces/ThiefWorld/parser.py View File

@ -1,190 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from typing import List, Tuple
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag
def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple:
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
name = soup.find("h1", {'class': 'title'}).text
name = cleanString(name.strip())
describe = soup.find('div', {'id': 'descriptionContent'}).text
describe = cleanString(describe.strip())
# Finding Product Image
image = soup.find('div', {'class': 'product_img_big'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
commentListTag: Tag = soup.find('ul', {'class': 'comment_list scrollbar'})
commentList = commentListTag.find_all('li')
reviews = str(len(commentList))
citySelection: str = soup.find('ul', {'class': 'meta text-muted i_location'}).text
shipFrom = cleanString(citySelection.strip())
vendor = soup.find('h1', {'class': 'title over'}).text
vendor = cleanString(vendor.strip())
usdTag: Tag = soup.find('div', {'class': 'product_price__big'}).find('span')
usdText = usdTag.text.split('/')[0]
# usdText format: "<value> USD " (i.e., "70 000 USD ")
USD = cleanString(usdText.replace("USD", "").strip())
ratingDiv = soup.find('div', {'class': 'rating_star'})
rating_vendor = ratingDiv.get('title').split(' ')[1]
rating_item = soup.find('div', {'class': 'product_rate'}).text
rating_item = rating_item.replace("rating", "")
rating_item = cleanString(rating_item.strip())
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
def thiefWorld_listing_parser(soup: BeautifulSoup):
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "ThiefWorld" # 0 Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
productList: ResultSet[Tag] = soup.find_all('div', {'class': 'catalog_item'})
nm = len(productList)
for product in productList:
productTitle: Tag = product.find('div', {'class': 'title'}).find('a')
productName = cleanString(productTitle.text.strip())
name.append(productName)
# Finding Product Image
product_image = product.find('noscript').find('img')
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
productHref = productTitle.get('href')
href.append(productHref)
CVE.append('-1')
MS.append('-1')
cat = soup.find('calsys-cat').text
category.append(cat.strip())
productDescription = product.find('div', {'class': 'text'}).text
productDescription = cleanString(productDescription.strip())
describe.append(productDescription)
views.append('-1')
reviews.append('-1')
addDate.append('-1')
BTC.append('-1')
priceText = product.find('span', {'class': 'price'}).find('span').text
priceText = priceText.split('USD')[0]
priceText = cleanString(priceText.strip())
USD.append(priceText)
EURO.append('-1')
sold.append('-1')
qLeft.append('-1')
shipFrom.append('-1')
shipTo.append('-1')
productVendor = product.find('div', {'class': 'market over'}).find('a').text
productVendor = cleanString(productVendor.strip())
vendor.append(productVendor)
image_vendor.append('-1')
rating_vendor.append('-1')
#rating_item.append('-1')
rating = product.find('div', {'class': 'rating_star_yellow'}).attrs.get('style')
rating = rating.replace("width: ", "")
rating_item.append(cleanString(rating))
success.append('-1')
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def thiefworld_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find('div', {"class": "row tile__list tileitems_filter pad15 tileproduct__list"}).findAll('div', {"class": "desc"})
for a in listing:
bae = a.find('div', {"class": "title"}).find('a', href=True)
link = bae['href']
href.append(link)
return href
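thiefWorld_listing_parser stores the raw CSS width of the rating bar (e.g. "width: 90%") as rating_item. If a 0-5 scale were wanted instead, a conversion could look like this (width_to_stars is an illustrative name, not something the repository defines):

def width_to_stars(style_value, max_stars=5):
    # "width: 90%" -> 4.5 on a five-star scale; -1 when the style is malformed.
    percent = style_value.replace("width:", "").replace("%", "").strip()
    try:
        return round(float(percent) / 100 * max_stars, 1)
    except ValueError:
        return -1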

+ 0
- 268
MarketPlaces/TorBay/crawler_selenium.py View File

@ -1,268 +0,0 @@
__author__ = 'Helium'
'''
TorBay Market Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, time
from datetime import date
import subprocess
import configparser
import subprocess
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.TorBay.parser import torbay_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/'
# Opens Tor Browser, crawls the website, then parses, then closes tor
# Acts as the main method for the crawler; the crawler() function at the end of this file calls it
def startCrawling():
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'TorBay'
return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/'
return url
# Closes Tor Browser
#@param: current selenium driver
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates the Firefox 'driver' and configures its 'Profile'
# to use the Tor proxy and SOCKS port
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
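createFFDriver above builds the Tor-bound Firefox through FirefoxProfile and FirefoxBinary, keyword arguments that newer Selenium 4 releases deprecate in favour of Options. A sketch of the same SOCKS configuration with that API, still taking the paths from the existing config file (create_tor_driver is an illustrative name):

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

def create_tor_driver(firefox_binary_path, geckodriver_path):
    # Same Tor proxy preferences as createFFDriver above, expressed through
    # the Options API instead of FirefoxProfile/FirefoxBinary.
    opts = Options()
    opts.binary_location = firefox_binary_path
    opts.set_preference('network.proxy.type', 1)
    opts.set_preference('network.proxy.socks_version', 5)
    opts.set_preference('network.proxy.socks', '127.0.0.1')
    opts.set_preference('network.proxy.socks_port', 9150)
    opts.set_preference('network.proxy.socks_remote_dns', True)
    opts.set_preference('permissions.default.image', 3)  # skip images, as above
    opts.set_preference('places.history.enabled', False)
    return webdriver.Firefox(service=Service(geckodriver_path), options=opts)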
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Manual CAPTCHA solver: waits for a specific element so that the whole page loads, finds the input box, takes a screenshot of the CAPTCHA,
# then allows the CAPTCHA to be solved manually in the terminal
#@param: current selenium web driver
def login(driver):
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[2]/div/div/div/ul/li[6]/a")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from the passed URL; falls back to a distinct counter-based name if nothing alphanumeric is left after cleaning
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
# Returns the list of URLs of interest; the crawler iterates through this list.
# In this example, the links are product categories such as
# Guides and Tutorials, Digital Products, and Software and Malware.
def getInterestedLinks():
links = []
# Hacking
links.append('http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/category/hacking')
return links
# Iterates through the links of interest; each link is opened and crawled,
# and both listing and description pages are saved along the way
#@param: selenium driver
def crawlForum(driver):
print("Crawling the TorBay Market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
nav = driver.find_element(by=By.XPATH, value='/html/body/section/div/div/div[2]/div/div[2]/ul')
link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the TorBay market done.")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
if 'product' in url:
return True
return False
# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'category' in url:
return True
return False
# Calls the parser to extract the description links; html is the page source of a page from the interested-links list
#@param: html of a page from the interested-links list, i.e. getInterestedLinks()
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return torbay_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 0
- 183
MarketPlaces/TorBay/parser.py View File

@ -1,183 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def torbay_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
# Finding Product Name
try:
product_name = soup.find('div', {'class': 'product-information'}).find('h1').text
name = cleanString(product_name.strip())
except:
product_name = soup.find('div', {'class': 'profile-info'}).find('h2').text
name = cleanString(product_name.strip())
# Finding Vendor
vendor_name = soup.find('div', {"class": "profile-info"}).find('h2').text
vendor = cleanString(vendor_name.strip())
# Finding Vendor Image
vendor_image = soup.find('div', {'class': 'avatar'}).find('img')
vendor_image = vendor_image.get('src')
vendor_image = vendor_image.split('base64,')[-1]
# Finding Prices
USD = soup.find('div', {'class': "total-price"}).find('span').text.strip()
# Finding the Product Category
cat = soup.find('div', {'class': "profile-info"}).find('p').text
category = cleanString(cat.strip())
# Finding the Product description
try:
describe = soup.find('div', {'class': "info"}).find('p').text
if "\n" in describe:
describe = describe.replace("\n", " ")
describe = describe.replace("\r", " ")
describe = cleanString(describe.strip())
except:
# print("product desc")
describe = soup.find('div', {'class': 'info'}).text
describe = cleanString(describe.strip())
# Finding Product Image
image = soup.find('div', {'class': 'image text-center'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def torbay_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "TorBay" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
listing = soup.findAll('div', {"class": "product-card"})
# Populating the Number of Products
nm = len(listing)
for a in listing:
product_name = a.find('p', {'class': 'name'}).text
name.append(cleanString(product_name.strip()))
# Finding Product Image
image.append("-1")
prod = a.find('p', {'class': 'price'}).text # price
USD.append(cleanString(prod.strip()))
ven = a.find('div', {'class': 'pc-footer'}).find('div').find('a').text # pc-footer
vendor.append(cleanString(ven.strip()))
# print(ven)
# Finding Vendor Image
image_vendor.append("-1")
h = a.find('p', {'class': 'name'}).find('a').get('href')
href.append(h)
CVE.append("-1")
MS.append("-1")
rating_vendor.append("-1")
success.append("-1")
describe.append("-1")
views.append("-1")
reviews.append("-1")
rating_item.append("-1")
addDate.append("-1")
BTC.append("-1")
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
category.append("Hacking")
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def torbay_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find('section', {"id": "content"}).findAll('div', {"class": "product-card"})
for a in listing:
bae = a.find('div', {"class": "pc-footer"}).find('a', {"class": "btn btn-primary"}, href=True)
link = bae['href']
href.append(link)
return href
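torbay_listing_parser, like the other listing parsers, hands organizeProducts a set of parallel lists that presumably need one entry per product. A sketch of a consistency check that could run right before that call (check_parallel_lists is a name introduced here; it is not part of the utilities module):

def check_parallel_lists(nm, **fields):
    # Flag any per-product list that fell out of step with the product count.
    for field_name, values in fields.items():
        if len(values) != nm:
            raise ValueError(field_name + ": expected " + str(nm) +
                             " entries, got " + str(len(values)))

# e.g. check_parallel_lists(nm, name=name, vendor=vendor, USD=USD, href=href)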

+ 0
- 277
MarketPlaces/TorMarket/crawler_selenium.py View File

@ -1,277 +0,0 @@
__author__ = 'Helium'
'''
TorMarket Marketplace Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.TorMarket.parser import tormarket_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/'
# Opens Tor Browser, crawls the website, then parses, then closes tor
# Acts as the main method for the crawler; the crawler() function at the end of this file calls it
def startCrawling():
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Returns the name of the website
#return: name of site in string type
def getMKTName():
name = 'TorMarket'
return name
# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
url = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/'
return url
# Closes Tor Browser
#@param: current selenium driver
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates the Firefox 'driver' and configures its 'Profile'
# to use the Tor proxy and SOCKS port
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Manual CAPTCHA solver: waits for a specific element so that the whole page loads, finds the input box, takes a screenshot of the CAPTCHA,
# then allows the CAPTCHA to be solved manually in the terminal
#@param: current selenium web driver
def login(driver):
# wait for page to show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[2]/div/div/div/main/article/div/section[4]/div/div[1]/div/div/div/div/ul/li[15]/ul/li[3]/a")))
# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
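getFullPathName above mixes os.path.join with embedded Windows-style '\\Description\\' separators, which only works on Windows. A cross-platform sketch of the same layout (full_path_name is an illustrative name; main_dir and current_date stand in for the config values used above):

import os

def full_path_name(main_dir, current_date, file_name, is_description):
    # <mainDir>/<CURRENT_DATE>/<Description|Listing>/<fileName>.html
    sub_dir = 'Description' if is_description else 'Listing'
    return os.path.join(main_dir, current_date, sub_dir, file_name + '.html')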
# Creates the file name from the passed URL; falls back to a distinct counter-based name if nothing alphanumeric is left after cleaning
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
# Returns the list of URLs of interest; the crawler iterates through this list.
# In this example, the links are product categories such as
# Guides and Tutorials, Digital Products, and Software and Malware.
def getInterestedLinks():
links = []
# Tutorials
links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/')
# Malware
links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/malware/')
# Services
links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/')
return links
# Iterates through the links of interest; each link is opened and crawled,
# and both listing and description pages are saved along the way
#@param: selenium driver
def crawlForum(driver):
print("Crawling the TorMarket market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
list = productPages(html)
for item in list:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
link = driver.find_element(by=By.LINK_TEXT, value='NEXT').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the TorMarket market done.")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
if 'shop' in url:
return True
return False
# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
if 'product-category' in url:
return True
return False
# Calls the parser to extract the description links; html is the page source of a page from the interested-links list
#@param: html of a page from the interested-links list, i.e. getInterestedLinks()
#return: list of description links that should be crawled through
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return tormarket_links_parser(soup)
# Drop links that "signout"
# def isSignOut(url):
# #absURL = urlparse.urljoin(url.base_url, url.url)
# if 'signout' in url.lower() or 'logout' in url.lower():
# return True
#
# return False
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

+ 0
- 189
MarketPlaces/TorMarket/parser.py View File

@ -1,189 +0,0 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
import re
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def tormarket_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
#finding the name of the product
name_of_product = soup.find("h1", {"class": "product_title entry-title"}).find("a").text
name = cleanString(name_of_product.strip())
#finding the description of the product
description_of_product = soup.find("div", {"class": "woocommerce-product-details__short-description"}).text
describe = cleanString(description_of_product.strip())
#finding the name of the vendor
name_of_vendor = soup.find("div", {"class": "wcfmmp_sold_by_store"})
if name_of_vendor is not None:
name_of_vendor = name_of_vendor.find("a").text
vendor = cleanString(name_of_vendor.strip())
else:
vendor = "TorMarket"
#finding the price of the item
price = soup.find("p", {"class": "price"}).find("bdi").text
price_cleaned = price[1:]
USD = price_cleaned.strip()
category = soup.find('span', {"class": "posted_in"}).text
category = category.split(':')[-1]
category = category.replace(',', '/')
category = cleanString(category.strip())
#everything else gets a -1 because they are not found
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def tormarket_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "TorMarket" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
products_list = soup.find('ul', {"class": "products columns-3 tablet-columns-2 mobile-columns-1"}).find_all('li')
nm = len(products_list)
for product in products_list:
# Finding the name of the product
name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
name_of_product_cleaned = cleanString(name_of_product.strip())
# print(name_of_product_cleaned)
name.append(name_of_product_cleaned)
#finding the URL
try:
url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href")
# print(url)
href.append(url)
except AttributeError as e:
print("I can't find the link")
raise e
#finding the rating of the product
rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text
rating_item.append(cleanString(rating_score_of_product.strip()))
# print("done")
#finding the rating of the vendors
rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"})
if rating_score_of_vendor is not None:
rating_score_of_vendor = rating_score_of_vendor.find("strong").text
rating_vendor.append(cleanString(rating_score_of_vendor.strip()))
else:
rating_vendor.append('-1')
# print("done")
#finding the cost in USD
cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text
USD.append(cost)
# print("done")
#finding the name of the vendor
vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"})
if vendor_name is not None:
vendor_name = vendor_name.find("a").text
vendor.append(cleanString(vendor_name.strip()))
else:
vendor.append(mktName)
# print("done")
#everything else appends a -1
success.append("-1")
CVE.append("-1")
MS.append("-1")
category.append("-1")
describe.append("-1")
views.append("-1")
reviews.append("-1")
addDate.append("-1")
BTC.append("-1")
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
# print("Done! moving onto the next product!")
# print(len(shipTo))
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def tormarket_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"class": "product-loop-content text-center"})
for a in listing:
bae = a.find('h2', {"class": "woocommerce-loop-product__title"}).find('a', href=True)
link = bae['href']
href.append(link)
return href
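A quick sanity check for tormarket_links_parser, built only from the selectors it uses above; the snippet and the .onion URL in it are made up for the example:

from bs4 import BeautifulSoup

SAMPLE = '''
<div class="product-loop-content text-center">
  <h2 class="woocommerce-loop-product__title">
    <a href="http://example.onion/shop/item-1">Item 1</a>
  </h2>
</div>
'''

soup = BeautifulSoup(SAMPLE, "html.parser")
# tormarket_links_parser(soup) should return ['http://example.onion/shop/item-1'].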

MarketPlaces/HiddenMarket/crawler_selenium.py → MarketPlaces/WeTheNorth/crawler_selenium.py View File

@ -1,7 +1,7 @@
__author__ = 'DarkWeb'
'''
HiddenMarket Market Crawler (Selenium)
WeTheNorth Market Crawler (Selenium)
'''
from selenium import webdriver
@ -16,20 +16,20 @@ from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.HiddenMarket.parser import hiddenmarket_links_parser
from MarketPlaces.WeTheNorth.parser import wethenorth_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/'
baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion'
# Opens Tor Browser, crawls the website
def startCrawling():
marketName = getMKTName()
marketName = getMarketName()
driver = getAccess()
if driver != 'down':
@ -40,39 +40,40 @@ def startCrawling():
print(driver.current_url, e)
closeDriver(driver)
new_parse(marketName, baseURL, True)
new_parse(marketName, False)
# Login using premade account credentials and do login captcha manually
def login(driver):
# wait for login page
time.sleep(3)
#wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div[3]/div[3]")))
(By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[1]/input")))
# entering username and password into input boxes
# usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
# Username here
# usernameBox.send_keys('ct1234')
# passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
# Password here
# passwordBox.send_keys('DementedBed1230')
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[1]/input')
#Username here
usernameBox.send_keys('blabri')
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[2]/input')
#Password here
passwordBox.send_keys('fishowal')
'''
# wait for captcha page show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/main/div/div/div/div/div/form/div[3]/div/div[1]/label/img")))
(By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img")))
# save captcha to local
driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[3]/div/div[1]/label/img').screenshot(
r'..\captcha.png')
driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img').screenshot(
r'..\WeTheNorth\captcha.png')
# This method will show image in any image viewer
im = Image.open(r'..\captcha.png')
im = Image.open(r'..\WeTheNorth\captcha.png')
im.show()
# wait until input space show up
inputBox = driver.find_element(by=By.XPATH, value='//*[@id="captcha"]')
inputBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[4]/input')
# ask user input captcha solution in terminal
userIn = input("Enter solution: ")
@ -81,24 +82,24 @@ def login(driver):
inputBox.send_keys(userIn)
# click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/main/div/div/div/div/div/form/div[4]/button").click()
driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div[2]/div/div[3]/form/div[5]/input").click()
'''
# input("Press ENTER when CAPTCHA is completed\n")
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
# WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
# (By.XPATH, '/html/body/main/div/div/div[1]/div/div[1]/div/h5')))
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="information"]')))
# Returns the name of the website
def getMKTName():
name = 'HiddenMarket'
def getMarketName():
name = 'WeTheNorth'
return name
# Return the link of the website
def getFixedURL():
url = 'http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/'
url = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion'
return url
@ -109,7 +110,7 @@ def closeDriver(driver):
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.quit()
driver.close()
time.sleep(3)
return
@ -140,7 +141,7 @@ def createFFDriver():
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
@ -198,34 +199,19 @@ def getNameFromURL(url):
def getInterestedLinks():
links = []
# Civil Software
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/civil_softwares')
# Tutorials - Carding
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/carding')
# Digital - Hacks
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/hacks')
# Digital - Exploit Kit
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/exploit_kit')
# 0Day
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/0day')
# Digital Forensics
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/digital_forensics')
# Tutorials - Mining
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/mining')
# Tutorials - Worms
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/worms')
# Tutorials - Viruses
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/viruses')
# Tutorials - Trojans
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/trojans')
# Tutorials - Botnets
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/botnets')
# # Fraud Software
# links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=5&podcategory=3')
# # Guides and Tutorials - Hacking
# links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=3&podcategory=3')
# Software and Malware
links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=10')
return links
def crawlForum(driver):
print("Crawling the HiddenMarket market")
print("Crawling the WeTheNorth market")
linksToCrawl = getInterestedLinks()
@ -233,20 +219,15 @@ def crawlForum(driver):
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
categoryLink = link
maxNumPages = 0 # temp value.
try:
pg_counter = 1
has_next_page = True
count = 0
pageCount = 1
while has_next_page:
try:
driver.get(link)
if pageCount == 1:
maxNumPages = int(driver.find_element(by=By.CLASS_NAME, value='main')
.find_element(by=By.CLASS_NAME, value='pages')
.find_elements(By.CLASS_NAME, value='page')[-1].text)
except:
driver.refresh()
html = driver.page_source
@ -262,19 +243,22 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
# comment out
break
# comment out
if count == 1:
break
try:
pageCount += 1
if pageCount > maxNumPages:
nav = driver.find_element(by=By.XPATH, value=
'/html/body/div[2]/div[3]/div[3]/div[2]/div[7]')
pg_counter += 1
pg_counter_str = "p=" + str(pg_counter) + "&"
a = nav.find_element(by=By.XPATH, value = '//a[contains(@href,"'+pg_counter_str+'")]')
link = a.get_attribute('href')
if link == "":
raise NoSuchElementException
pageLink = "/" + str(pageCount) + "/"
link = categoryLink + pageLink
count += 1
except NoSuchElementException:
@ -284,7 +268,7 @@ def crawlForum(driver):
print(link, e)
i += 1
print("Crawling the HiddenMarket market done.")
input("Crawling WeTheNorth market done sucessfully. Press ENTER to continue\n")
# Returns 'True' if the link is Topic link
@ -304,7 +288,7 @@ def isListingLink(url):
# calling the parser to define the links
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return hiddenmarket_links_parser(soup)
return wethenorth_links_parser(soup)
def crawler():

+ 248
- 0
MarketPlaces/WeTheNorth/parser.py View File

@ -0,0 +1,248 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
import re
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
def wethenorth_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
# Finding Product Name
listDes = soup.find('div', {'class': "listDes"})
name = listDes.find('h2').text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
# Finding Vendor
vendor = listDes.find('b').text
vendor = vendor.replace(",", "")
vendor = vendor.replace("...", "")
vendor = vendor.replace("-", "")
vendor = vendor.strip()
# Finding Vendor Rating
# rating = listDes.find('span',{'class':'levelSet'})
# rating = rating.text
# rating = rating.replace('\n', ' ')
# rating = rating.replace(",", "")
# rating = rating.strip()
# Finding Successful Transactions
success = listDes.find_all('p')[1]
success = success.find('span').text
success = success.split()
success = success[0].strip()
# Finding Prices - all prices on We The North are listed in CAD; the CAD label is kept in the resulting string so the currency stays explicit
padp = listDes.find('p',{'class':'padp'})
USD = padp.find('span').text
USD = USD.strip()
# No escrow field on the WTN market
shipping_info = listDes.find('tbody')
if "Digital" not in shipping_info.text:
shipping_info = shipping_info.find_all('tr')
row1 = shipping_info[0].find_all('td')
# Finding Shipment Information (Origin)
shipFrom = row1[-1].text
shipFrom=shipFrom.strip()
if shipFrom=="":
shipFrom="-1"
row2 = shipping_info[1].find_all('td')
# Finding Shipment Information (Destination)
shipTo = row2[-1].text
shipTo= shipTo.strip()
if shipTo == "":
shipTo = "-1"
# Finding the Product description
describe = soup.find("div",{'class':'tabcontent'})
describe = describe.find('p').text
describe = describe.replace("\n", " ")
describe = describe.replace("\r", " ")
describe = describe.strip()
'''
# Finding the Number of Product Reviews
tag = soup.findAll(text=re.compile('Reviews'))
for index in tag:
reviews = index
par = reviews.find('(')
if par >=0:
reviews = reviews.replace("Reviews (","")
reviews = reviews.replace(")","")
reviews = reviews.split(",")
review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
else :
review = "-1"
'''
# Searching for CVE and MS categories
# no CVE or MS for WTN market
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo)
# Sending the results
return row
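As a reference for downstream code, the 19-field tuple returned above can be unpacked positionally. A minimal sketch (the saved-page path is hypothetical; in the full pipeline, prepare_parser.new_parse presumably reads the saved description pages and calls this function):
from bs4 import BeautifulSoup

with open("wethenorth_description_sample.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

(vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews,
 rating_item, addDate, BTC, USD, EURO, sold, left, shipFrom, shipTo) = wethenorth_description_parser(soup)
print(name, vendor, USD)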
# This is the method to parse the Listing Pages
def wethenorth_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "WeTheNorth" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 6 Product_MS_Classification (Microsoft Security)
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft =[] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
right_content = soup.find('div', {"class": "right-content"})
listing = right_content.findAll('div', {"class": "col-1search"})
listing = listing[3:]
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Vendor
vendor_name = a.find('p', {'class': 'padp'})
vendor_name = vendor_name.find('a').text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
# Finding the Product
product = bae[0].text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.strip()
name.append(product)
# Finding the Category (the text between the first two dashes of the padp line)
category_name = a.find('p', {'class': 'padp'}).text
first_dash = category_name.find('-')
second_dash = category_name.find('-', first_dash + 1)
category_name = category_name[first_dash + 1:second_dash]
category_name = category_name.strip()
category.append(category_name)
# Finding Views
view_count = a.text
view_count = view_count[view_count.find('Views:'): view_count.find('Sales:')]
view_count = view_count.replace('Views:', ' ')
view_count = view_count.replace('/', ' ')
view_count = view_count.strip()
views.append(view_count)
# Finding success sales
sold_count = a.text
sold_count = sold_count[sold_count.find('Sales:'): sold_count.find('Short')]
sold_count = sold_count.replace('Sales:', ' ')
sold_count = sold_count.replace('/', ' ')
sold_count = sold_count.strip()
success.append(sold_count)
# Searching for CVE and MS categories
# no CVE or MS in WTN market
cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if not cve:
cveValue="-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue=cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if not ms:
MSValue="-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue=me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
def wethenorth_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
right_content = soup.find('div',{"class": "right-content"})
listing = right_content.findAll('div', {"class": "col-1search"})
# Skip the first three products of each page; these are the featured items shown in blue and are usually unrelated to the category
listing = listing[3:]
for a in listing:
link = a.find('a')
link = link['href']
href.append(link)
return href
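For completeness, a minimal offline sketch that runs both WeTheNorth parsers against one saved listing page (the file path and html.parser backend are assumptions; in the full pipeline these calls are presumably driven by prepare_parser.py and the crawler's productPages step):
from bs4 import BeautifulSoup

with open("wethenorth_listing_sample.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

description_links = wethenorth_links_parser(soup)   # hrefs the crawler would visit next
products_row = wethenorth_listing_parser(soup)      # organized rows for the later database stage
print(len(description_links), "product links found")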
