
added forum and market templates

Branch: main
westernmeadow committed 1 year ago, commit a567086dd6
5 changed files with 1421 additions and 0 deletions
  1. Forums/CryptBB/crawler_mechanize.py         +257  -0
  2. Forums/CryptBB/crawler_selenium.py          +331  -0
  3. Forums/CryptBB/parser.py                    +282  -0
  4. MarketPlaces/DarkBazar/crawler_selenium.py  +262  -0
  5. MarketPlaces/DarkBazar/parser.py            +289  -0

Forums/CryptBB/crawler_mechanize.py  (+257, -0)
@@ -0,0 +1,257 @@
__author__ = '91Shadows'
'''
CryptBB Crawler (Mechanize)
'''
import codecs, os, re
import socks, socket, time
from datetime import date
import urllib.parse as urlparse
import http.client as httplib
import mechanize
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.BestCardingWorld.parser import bestcardingworld_links_parser
counter = 1
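# Force plain HTTP/1.0 for mechanize/httplib requests (no keep-alive, no chunked transfer encoding)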
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'
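# Send every socket through the local Tor SOCKS5 proxy (Tor Browser listens on 127.0.0.1:9150)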
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)
# Opens Tor Browser, crawls the website
def startCrawling():
opentor()
getUrl()
forumName = getForumName()
br = getAccess()
if br != 'down':
crawlForum(br)
new_parse(forumName, False)
# new_parse(forumName, False)
closetor()
# Opens Tor Browser
def opentor():
global pid
print("Connecting Tor...")
path = open('../../path.txt').readline()
pro = subprocess.Popen(path)
pid = pro.pid
time.sleep(7.5)
input("Tor Connected. Press ENTER to continue\n")
return
# Creates a connection through Tor Port
def getUrl(timeout=None):
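# Monkey-patch the socket module so any connection mechanize opens goes through the SOCKS proxy configured above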
socket.socket = socks.socksocket
socket.create_connection = create_connection
return
# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
sock = socks.socksocket()
sock.connect(address)
return sock
# Returns the name of website
def getForumName():
name = 'CryptBB'
return name
# Return the link of website
def getFixedURL():
url = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'
return url
# Closes Tor Browser
def closetor():
global pid
os.system("taskkill /pid " + str(pid))
print('Closing Tor...')
time.sleep(3)
return
# Creates a Mechanize browser and initializes its options
def createBrowser():
br = mechanize.Browser()
cj = mechanize.CookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
('Accept', '*/*')]
return br
def getAccess():
url = getFixedURL()
br = createBrowser()
try:
br.open(url)
return br
except:
return 'down'
# Saves the crawled html page
def savePage(page, url):
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
a = page.read()
open(filePath, "wb").write(a)
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/ThiefWorld/HTML_Pages/' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + '/' + 'Description/' + fileName + '.html'
else:
fullPath = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/ThiefWorld/HTML_Pages/' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + '/' + 'Listing/' + fileName + '.html'
return fullPath
# Creates the name of the file based on URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if (name == ''):
name = str(counter)
counter = counter + 1
return name
# Hacking and Markets related topics
def getInterestedLinks():
links = []
links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43&sid=e12864ffccc5df877b03b573534955be')
return links
# Start crawling Forum pages
def crawlForum(br):
print("Crawling CryptBB forum")
linksToCrawl = getInterestedLinks()
visited = set(linksToCrawl)
initialTime = time.time()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
page = br.open(link)
savePage(page, link)
res = br.response().read()
soup = BeautifulSoup(res, 'html.parser')
next_link = soup.find("a", {"rel": "next"})
if next_link != None:
full_url = urlparse.urljoin(linksToCrawl[i], next_link['href'])
linksToCrawl.insert(i + 1, full_url)
listOfTopics = findDescriptionPages(link)
for topic in listOfTopics:
itemPage = br.open(str(topic))
savePage(itemPage, topic)
except Exception as e:
print('Error getting link: ', link, e)
i += 1
# finalTime = time.time()
# print finalTime - initialTime
input("CryptBB forum done successfully. Press ENTER to continue\n")
return
# Returns True if the link is 'Topic' Links, may need to change for diff websites
def isDescriptionLink(url):
if 'topic' in url:
return True
return False
# Returns True if the link is a listingPage link, may need to change for diff websites
def isListingLink(url):
'''
reg = 'board=[0-9]+.[0-9]+\Z'
if len(re.findall(reg, url)) == 0:
return False
return True
'''
if 'forum' in url:
return True
return False
# calling the parser to define the links
def findDescriptionPages(url):
soup = ""
error = False
try:
html = codecs.open(
"C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + "\\Listing\\" + getNameFromURL(url) + ".html", encoding='utf8')
soup = BeautifulSoup(html, "html.parser")
except:
try:
html = open(
"C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + "\\Listing\\" + getNameFromURL(url) + ".html")
soup = BeautifulSoup(html, "html.parser")
except:
error = True
print("There was a problem to read the file " + getNameFromURL(url) + " in the listing section.")
if not error:
return bestcardingworld_links_parser(soup)
else:
return []
def crawler():
startCrawling()
print("Crawling and Parsing CryptBB .... DONE!")

Forums/CryptBB/crawler_selenium.py  (+331, -0)
@@ -0,0 +1,331 @@
__author__ = 'DarkWeb'
'''
CryptBB Forum Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.CryptBB.parser import cryptBB_links_parser
from Forums.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/'
# Opens Tor Browser, crawls the website
def startCrawling():
forumName = getForumName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(forumName, baseURL, True)
# Login using premade account credentials and do login captcha manually
def login(driver):
#click login button
login_link = driver.find_element(
by=By.XPATH, value='/html/body/div/div[2]/div/table/tbody/tr[2]/td/center/pre/strong/a[1]').\
get_attribute('href')
driver.get(login_link)# open tab with url
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[2]/td[2]/input')
#Username here
usernameBox.send_keys('holyre')#sends string to the username box
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/form/table/tbody/tr[3]/td[2]/input')
#Password here
passwordBox.send_keys('PlatinumBorn2')# sends string to passwordBox
'''
# wait for captcha page show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[2]/div/form/div/input")))
# save captcha to local
driver.find_element(by=By.XPATH, value='//*[@id="captcha_img"]').screenshot(r'..\CryptBB\captcha.png')
# This method will show image in any image viewer
im = Image.open(r'..\CryptBB\captcha.png')
im.show()
# wait until input space show up
inputBox = driver.find_element(by=By.XPATH, value='//*[@id="imagestring"]')
# ask user input captcha solution in terminal
userIn = input("Enter solution: ")
# send user solution into the input space
inputBox.send_keys(userIn)
# click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div/form/div/input").click()
'''
input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
# wait for 50 sec until id = tab_content is found, then cont
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="tab_content"]')))
# Returns the name of the website
def getForumName() -> str:
name = 'CryptBB'
return name
# Return the link of the website
def getFixedURL():
url = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close() #close tab
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from Forums.Initialization.forums_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
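# Privacy hardening: no browsing history, wipe site data and passwords on shutdown, never remember logins, session-only cookies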
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("network.dns.disablePrefetch", True)
ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
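# Route all browser traffic through the local Tor SOCKS5 proxy (127.0.0.1:9150) and resolve DNS through it so lookups never leave Tor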
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", True)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
# Saves the crawled html page
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
from Forums.Initialization.forums_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if not isListingLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if name == '':
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# Beginner Programming
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86')
# Beginner Carding and Fraud
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=91')
# Beginner Hacking
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=87')
# Newbie
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=84')
# Beginner Hardware
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=89')
# Training Challenges
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=96')
# Darknet Discussions
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88')
# Public Leaks and Warez
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=97')
# Sell
links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=44')
return links
def crawlForum(driver):
print("Crawling the CryptBB forum")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
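# topicPages() runs the CryptBB links parser over the saved listing page and returns its topic (description) links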
topics = topicPages(html)
for topic in topics:
has_next_topic_page = True
counter = 1
page = topic
while has_next_topic_page:
itemURL = urlparse.urljoin(baseURL, str(page))
try:
driver.get(itemURL)
except:
driver.refresh()
if isListingLink(driver.current_url):
break
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# # comment out
# if counter == 2:
# break
try:
temp = driver.find_element(By.XPATH, '/html/body/div/div[2]/div/div[2]/div')
page = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if page == "":
raise NoSuchElementException
counter += 1
except NoSuchElementException:
has_next_topic_page = False
# making sure we go back to the listing page (browser back button simulation)
try:
driver.get(link)
except:
driver.refresh()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
temp = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div')
link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the CryptBB forum done.")
# Returns 'True' if the link is Topic link, may need to change for every website
def isDescriptionLink(url):
if 'thread' in url:
return True
return False
# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
if '.onion/forumdisplay' in url:
return True
return False
# calling the parser to define the links
def topicPages(html):
soup = BeautifulSoup(html, "html.parser")
return cryptBB_links_parser(soup)
def crawler():
startCrawling()
# print("Crawling and Parsing BestCardingWorld .... DONE!")

Forums/CryptBB/parser.py  (+282, -0)
@@ -0,0 +1,282 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import datetime
from datetime import date
from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def cryptBB_description_parser(soup):
# Fields to be parsed
topic = "-1" # 0 *topic name
user = [] # 1 *all users of each post
status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
reputation = [] # 3 all user's karma in each post (usually found as a number)
interest = [] # 4 all user's interest in each post
sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("td", {"class": "thead"}).find('strong')
topic = li.text
topic = re.sub(r"\[\w*\]", '', topic)
topic = topic.replace(",","")
topic = topic.replace("\n","")
topic = cleanString(topic.strip())
# Finding the repeated tag that corresponds to the listing of posts
posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
'div', {"class": "post"})
# For each message (post), get all the fields we are interested to:
for ipost in posts:
if ipost.find('div', {"class": "deleted_post_author"}):
continue
# Finding a first level of the HTML page
post_wrapper = ipost.find('span', {"class": "largetext"})
# Finding the author (user) of the post
author = post_wrapper.text.strip()
user.append(cleanString(author)) # Remember to clean the problematic characters
# Finding the status of the author
smalltext = ipost.find('div', {"class": "post_author"})
if smalltext is not None:
# CryptBB does have membergroup and postgroup
membergroup = smalltext.find('div', {"class": "profile-rank"})
postgroup = smalltext.find('div', {"class": "postgroup"})
if membergroup != None:
membergroup = membergroup.text.strip()
if postgroup != None:
postgroup = postgroup.text.strip()
membergroup = membergroup + " - " + postgroup
else:
if postgroup != None:
membergroup = postgroup.text.strip()
else:
membergroup = "-1"
status.append(cleanString(membergroup))
# Finding the interest of the author
# CryptBB does not have blurb
blurb = smalltext.find('li', {"class": "blurb"})
if blurb != None:
blurb = blurb.text.strip()
else:
blurb = "-1"
interest.append(cleanString(blurb))
# Finding the reputation of the user
# CryptBB does have reputation
author_stats = smalltext.find('div', {"class": "author_statistics"})
karma = author_stats.find('strong')
if karma != None:
karma = karma.text
karma = karma.replace("Community Rating: ", "")
karma = karma.replace("Karma: ", "")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
else:
status.append('-1')
interest.append('-1')
reputation.append('-1')
# Getting here another good tag to find the post date, post content and users' signature
postarea = ipost.find('div', {"class": "post_content"})
dt = postarea.find('span', {"class": "post_date"}).text
# dt = dt.strip().split()
dt = dt.strip()
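# Post dates appear as "Today", "Yesterday", a relative "... ago" span whose title holds the absolute date, or a plain "mm-dd-YYYY, HH:MM AM/PM" string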
day=date.today()
if "Today" in dt:
today = day.strftime('%m-%d-%Y')
stime = dt.replace('Today,','').strip()
date_time_obj = today + ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "Yesterday" in dt:
yesterday = day - timedelta(days=1)
yesterday = yesterday.strftime('%m-%d-%Y')
stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday + ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "ago" in dt:
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
else:
date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
addDate.append(date_time_obj)
# Finding the post
inner = postarea.find('div', {"class": "post_body scaleimages"})
quote = inner.find('blockquote')
if quote is not None:
quote.decompose()
inner = inner.text.strip()
post.append(cleanString(inner))
# Finding the user's signature
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
signature = ipost.find('div', {"class": "signature scaleimages"})
if signature != None:
signature = signature.text.strip()
# print(signature)
else:
signature = "-1"
sign.append(cleanString(signature))
# As no information about user's feedback was found, just assign "-1" to the variable
feedback.append("-1")
img = ipost.find('div', {"class": "post_body scaleimages"}).find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_post.append(img)
avatar = ipost.find('div', {"class": "author_avatar"})
if avatar is not None:
img = avatar.find('img')
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
else:
img = "-1"
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
return row
# This is the method to parse the Listing Pages (one page with many posts)
def cryptBB_listing_parser(soup):
nm = 0 # *this variable should receive the number of topics
forum = "CryptBB" # 0 *forum name
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
image_author = [] # 8 all author avatars used in each topic
# Finding the board (should be just one)
board = soup.find('span', {"class": "active"}).text
board = cleanString(board.strip())
# Finding the repeated tag that corresponds to the listing of topics
itopics = soup.find_all('tr',{"class": "inline_row"})
# Counting how many topics
nm = len(itopics)
for itopic in itopics:
# For each topic found, the structure holding the rest of the information can be of two types. Test both
# so we don't miss any topic
# Adding the topic to the topic list
try:
topics = itopic.find('span', {"class": "subject_old"}).find('a').text
except:
topics = itopic.find('span', {"class": "subject_new"}).find('a').text
topics = re.sub(r"\[\w*\]", '', topics)
topic.append(cleanString(topics))
image_author.append(-1)
# Adding the url to the list of urls
try:
link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
except:
link = itopic.find('span',{"class": "subject_new"}).find('a').get('href')
href.append(link)
# Finding the author of the topic
ps = itopic.find('div', {"class":"author smalltext"}).text
user = ps.strip()
author.append(cleanString(user))
# Finding the number of replies
columns = itopic.findChildren('td',recursive=False)
replies = columns[3].text
if replies == '-':
posts.append('-1')
else:
posts.append(cleanString(replies))
# Finding the number of Views
tview = columns[4].text
if tview == '-':
views.append('-1')
else:
views.append(cleanString(tview))
# If no information about when the topic was added, just assign "-1" to the variable
addDate.append("-1")
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)
def cryptBB_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find('table', {"class": "tborder clear"}).find('tbody').find_all('tr', {"class": "inline_row"})
for a in listing:
try:
link = a.find('span', {"class": "subject_old"}).find('a').get('href')
except:
link = a.find('span', {"class": "subject_new"}).find('a').get('href')
href.append(link)
return href

MarketPlaces/DarkBazar/crawler_selenium.py  (+262, -0)
@@ -0,0 +1,262 @@
__author__ = 'DarkWeb'
'''
DarkBazar Marketplace Crawler (Selenium)
'''
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkBazar.parser import darkbazar_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/'
def startCrawling():
mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
new_parse(mktName, baseURL, True)
# Returns the name of the website
def getMKTName():
name = 'DarkBazar'
return name
# Return the base link of the website
def getFixedURL():
url = 'http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/'
return url
# Closes Tor Browser
def closeDriver(driver):
# global pid
# os.system("taskkill /pid " + str(pro.pid))
# os.system("taskkill /t /f /im tor.exe")
print('Closing Tor...')
driver.close()
time.sleep(3)
return
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2)
ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
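# Same Tor proxying as the forum crawlers: SOCKS5 on 127.0.0.1:9150 with remote DNS resolution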
ff_prof.set_preference('network.proxy.type', 1)
ff_prof.set_preference("network.proxy.socks_version", 5)
ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
ff_prof.set_preference('network.proxy.socks_port', 9150)
ff_prof.set_preference('network.proxy.socks_remote_dns', True)
ff_prof.set_preference("javascript.enabled", False)
ff_prof.update_preferences()
service = Service(config.get('TOR', 'geckodriver_path'))
driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
driver.maximize_window()
return driver
#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
def getAccess():
url = getFixedURL()
driver = createFFDriver()
try:
driver.get(url)
return driver
except:
driver.close()
return 'down'
def login(driver):
input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
# entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]')
# Username here
usernameBox.send_keys('aliciamykeys')
passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]')
# Password here
passwordBox.send_keys('aliciawherearemykey$')
# session time
session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select'))
session_select.select_by_visible_text('Session 60min')
input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n")
# wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="submit"]')))
def savePage(driver, page, url):
cleanPage = cleanHTML(driver, page)
filePath = getFullPathName(url)
os.makedirs(os.path.dirname(filePath), exist_ok=True)
open(filePath, 'wb').write(cleanPage.encode('utf-8'))
return
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
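# Saved pages land under <shared_folder>/MarketPlaces/DarkBazar/HTML_Pages/<current date>/Description or /Listing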
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
def getMKTName() -> str:
name = 'DarkBazar'
return name
def getNameFromURL(url):
global counter
name = ''.join(e for e in url if e.isalnum())
if name == '':
name = str(counter)
counter = counter + 1
return name
def getInterestedLinks():
links = []
# Digital Goods
links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=3')
# Services
links.append('http://jw5e5sdywqupaqgtt43uq5ysfqpd2vzefl65s2fcjlj4qfxivynv6bqd.onion/cat.php?category=5')
return links
def crawlForum(driver):
print("Crawling the DarkBazar market")
linksToCrawl = getInterestedLinks()
i = 0
while i < len(linksToCrawl):
link = linksToCrawl[i]
print('Crawling :', link)
try:
has_next_page = True
count = 0
while has_next_page:
try:
driver.get(link)
except:
driver.refresh()
html = driver.page_source
savePage(driver, html, link)
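# productPages() runs the DarkBazar links parser over the listing page and returns each product's description link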
items = productPages(html)
for item in items:
itemURL = urlparse.urljoin(baseURL, str(item))
try:
driver.get(itemURL)
except:
driver.refresh()
savePage(driver, driver.page_source, item)
driver.back()
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
if link == "":
raise NoSuchElementException
count += 1
except NoSuchElementException:
has_next_page = False
except Exception as e:
print(link, e)
i += 1
print("Crawling the DarkBazar market done.")
# Returns 'True' if the link is Topic link, may need to change for every website
def isDescriptionLink(url):
if 'item' in url:
return True
return False
# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
if 'category=' in url:
return True
return False
def productPages(html):
soup = BeautifulSoup(html, "html.parser")
return darkbazar_links_parser(soup)
def crawler():
startCrawling()

MarketPlaces/DarkBazar/parser.py  (+289, -0)
@@ -0,0 +1,289 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
import re
# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
# stores info it needs in different lists, these lists are returned after being organized
# @param: soup object looking at html page of description page
# return: 'row' that contains a variety of lists that each hold info on the description page
def darkbazar_description_parser(soup):
# Fields to be parsed
vendor = "-1" # 0 *Vendor_Name
success = "-1" # 1 Vendor_Successful_Transactions
rating_vendor = "-1" # 2 Vendor_Rating
name = "-1" # 3 *Product_Name
describe = "-1" # 4 Product_Description
CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
category = "-1" # 7 Product_Category
views = "-1" # 8 Product_Number_Of_Views
reviews = "-1" # 9 Product_Number_Of_Reviews
rating_item = "-1" # 10 Product_Rating
addDate = "-1" # 11 Product_AddedDate
BTC = "-1" # 12 Product_BTC_SellingPrice
USD = "-1" # 13 Product_USD_SellingPrice
EURO = "-1" # 14 Product_EURO_SellingPrice
sold = "-1" # 15 Product_QuantitySold
left = "-1" # 16 Product_QuantityLeft
shipFrom = "-1" # 17 Product_ShippedFrom
shipTo = "-1" # 18 Product_ShippedTo
image = "-1" # 19 Product_Image
vendor_image = "-1" # 20 Vendor_Image
# Finding Product Name
divmb = soup.findAll('div', {'class': "mb-1"})
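# On DarkBazar description pages the first "mb-1" div holds the product name, the second holds the vendor link, and the last one carries the sold/in-stock counts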
name = divmb[0].text
name = name.replace('\n', ' ')
name = name.replace(",", "")
name = name.strip()
# Finding Vendor
vendor = divmb[1].find('a').text.strip()
# Finding Vendor Rating
temp = soup.find('div', {'class': ""}).text
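# The ratings block reads roughly "Vendor's Review : N% (x) Product Review : M% (y review)"; splitting on parentheses separates the pieces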
temp = temp.split('(')
rating = temp[0].replace("Vendor's Review : ", "")
rating = rating.replace("%", "")
rating_vendor = rating.strip()
# Finding the Product Rating and Number of Product Reviews
reviews = temp[2].replace(" review)", "")
reviews = reviews.strip()
temp = temp[1].split(")")
rating = temp[1].replace("Product Review : ", "")
rating = rating.replace("%", "")
rating_item = rating.strip()
# Finding Prices
USD = soup.find('div', {'class': "h3 text-primary"}).text.strip()
# Finding the Product Category
pmb = soup.findAll('p', {'class': "mb-1"})
category = pmb[-1].text
category = category.replace("Category: ", "").strip()
# Finding the Product Quantity Available
left = divmb[-1].text
left = left.split(",", 1)[1]
left = left.replace("in stock", "")
left = left.strip()
# Finding Number Sold
sold = divmb[-1].text
sold = sold.split(",", 1)[0]
sold = sold.replace("sold", "")
sold = sold.strip()
# Finding Shipment Information (Origin)
shipFrom = pmb[0].text
shipFrom = shipFrom.replace("Ships from: ", "").strip()
# Finding Shipment Information (Destination)
shipTo = pmb[1].text
shipTo = shipTo.replace("Ships to: ", "").strip()
# Finding the Product description
cardbody = soup.findAll('div', {'class': "card-body"})
describe = cardbody[1].text.strip()
# Finding Product Image
image = soup.find('div', {'class': 'product-primary'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if cve:
CVE = " "
for idx in cve:
CVE += (idx)
CVE += " "
CVE = CVE.replace(',', ' ')
CVE = CVE.replace('\n', '')
ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if ms:
MS = " "
for im in ms:
MS += (im)
MS += " "
MS = MS.replace(',', ' ')
MS = MS.replace('\n', '')
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
# Sending the results
return row
# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
# stores info it needs in different lists, these lists are returned after being organized
# @param: soup object looking at html page of listing page
# return: 'row' that contains a variety of lists that each hold info on the listing page
def darkbazar_listing_parser(soup):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "DarkBazar" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
rating_vendor = [] # 2 Vendor_Rating
success = [] # 3 Vendor_Successful_Transactions
name = [] # 4 *Product_Name y
CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
category = [] # 7 Product_Category y
describe = [] # 8 Product_Description
views = [] # 9 Product_Number_Of_Views
reviews = [] # 10 Product_Number_Of_Reviews
rating_item = [] # 11 Product_Rating
addDate = [] # 12 Product_AddDate
BTC = [] # 13 Product_BTC_SellingPrice
USD = [] # 14 Product_USD_SellingPrice y
EURO = [] # 15 Product_EURO_SellingPrice
sold = [] # 16 Product_QuantitySold
qLeft = [] # 17 Product_QuantityLeft
shipFrom = [] # 18 Product_ShippedFrom
shipTo = [] # 19 Product_ShippedTo
image = [] # 20 Product_Image
image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links
listing = soup.findAll('div', {"id": "itembox"})
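# Each product card on a DarkBazar listing page is wrapped in a div with id "itembox"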
# Populating the Number of Products
nm = len(listing)
for a in listing:
bae = a.findAll('a', href=True)
lb = a.findAll('div', {"id": "littlebox"})
# Adding the url to the list of urls
link = bae[0].get('href')
link = cleanLink(link)
href.append(link)
# Finding the Product
product = lb[1].find('a').text
product = product.replace('\n', ' ')
product = product.replace(",", "")
product = product.replace("...", "")
product = product.strip()
name.append(product)
# Finding Product Image
product_image = a.find('img')
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
# Finding Prices
price = lb[-1].find('div', {"class": "mb-1"}).text
price = price.replace("$","")
price = price.strip()
USD.append(price)
# Finding the Vendor
vendor_name = lb[-1].find("a").text
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
image_vendor.append("-1")
# Finding the Category
cat = lb[-1].find("span").text
cat = cat.replace("class:", "")
cat = cat.strip()
category.append(cat)
span = lb[0].findAll("span")
# Finding Number of Views
num = span[0].text
num = num.replace("views:", "")
num = num.strip()
views.append(num)
# Finding Number Sold
num = span[2].text
num = num.replace("Sold:", "")
num = num.strip()
sold.append(num)
# Finding Quantity Left
quant = span[1].text
quant = quant.replace("stock:", "")
quant = quant.strip()
qLeft.append(quant)
# add shipping information
ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
shipFrom.append(ship[0].replace("Ship from ", "").strip())
shipTo.append(ship[1].replace("to ", "").strip())
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
if not cve:
cveValue = "-1"
else:
cee = " "
for idx in cve:
cee += (idx)
cee += " "
cee = cee.replace(',', ' ')
cee = cee.replace('\n', '')
cveValue = cee
CVE.append(cveValue)
ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
if not ms:
MSValue = "-1"
else:
me = " "
for im in ms:
me += (im)
me += " "
me = me.replace(',', ' ')
me = me.replace('\n', '')
MSValue = me
MS.append(MSValue)
# Populate the final variable (this should be a list with all fields scraped)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
# called by the crawler to get description links on a listing page
# @param: beautifulsoup object that is using the correct html page (listing page)
# return: list of description links from a listing page
def darkbazar_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.findAll('div', {"id": "itembox"})
# for a in listing:
# bae = a.find('a', {"class": "text-info"}, href=True)
# link = bae['href']
# href.append(link)
for a in listing:
bae = a.findAll('a', href=True)
# Adding the url to the list of urls
link = bae[0].get('href')
href.append(link)
return href
