
Added DarkDock Marketplace Parser and Crawler

main
Matthew Kwong 6 months ago
parent commit 438ef428a6
5 changed files with 597 additions and 1 deletion
  1. +356  -0  MarketPlaces/DarkDock/crawler_selenium.py
  2. +232  -0  MarketPlaces/DarkDock/parser.py
  3. +1    -1  MarketPlaces/Initialization/marketsList.txt
  4. +3    -0  MarketPlaces/Initialization/markets_mining.py
  5. +5    -0  MarketPlaces/Initialization/prepare_parser.py

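As a quick orientation (not part of the commit itself): once the pieces below are in place, the new market can be exercised on its own with something like the following sketch, assuming Tor Browser, geckodriver, and the [TOR]/[Project] config entries already used by markets_mining.py are configured locally.

# Hedged sketch, not from the commit: run only the new DarkDock crawler.
# Assumes the Tor Browser binary/profile and geckodriver paths in the project config are valid.
from MarketPlaces.DarkDock.crawler_selenium import crawler as crawlerDarkDock

crawlerDarkDock()  # opens Tor Browser, crawls the DarkDock categories, then calls new_parse()
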
+356  -0  MarketPlaces/DarkDock/crawler_selenium.py

@@ -0,0 +1,356 @@
__author__ = 'Helium'

"""
DarkDock Marketplace Crawler (Selenium)
"""

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import urllib.parse as urlparse
import os
import time
from bs4 import BeautifulSoup

from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkDock.parser import darkdock_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'


def startCrawling():
    """Main method for the crawler.

    Opens Tor Browser, crawls the website, parses, then closes Tor.
    """
    mktName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            crawlMarket(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(mktName, baseURL, True)


def getMKTName():
    """Returns the name of the website.
    """
    name = 'DarkDock'
    return name


def getFixedURL():
    """Returns the base link of the site.
    """
    url = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'
    return url


def closeDriver(driver):
    """Closes Tor Browser.

    Args:
        driver: The selected Selenium driver.
    """
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return


def createFFDriver():
    """Creates a Firefox driver and configures its profile to use the Tor proxy and socket.
    """
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

    driver.maximize_window()

    return driver


def getAccess():
    """The driver 'gets' the url and attempts to access the site.

    Return:
        A Selenium driver currently on the site, or the string 'down' if it can't access the site.
    """
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'


def savePage(driver, page, url):
    """Saves the crawled html page.

    Cleans the html of the current page the driver is on. Then saves the current
    crawled html page with its full path name without special characters into the
    marketplace's directory. If the directory path doesn't exist, it will create it.
    Args:
        driver: The Selenium driver accessing the page.
        page: The html of the saved page.
        url: The URL of the saved page.
    """
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    open(filePath, 'wb').write(cleanPage.encode('utf-8'))
    return


def getFullPathName(url):
    """Gets the full path name.

    Gets the full path of the page to be saved along with its appropriate file name.
    Determines which subdirectory to save the page in, based on whether it is a
    description or listing page.
    Args:
        url: The URL of the page.
    """
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
    return fullPath


def getNameFromURL(url):
    """Creates the file name from the passed URL.

    Generates a file name with only its alphanumeric characters.
    If the name isn't unique, it will be given a unique name.
    Args:
        url: The URL of the selected page from the crawler as it crawls through the site.
    """
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


def getInterestedLinks():
    """Returns the list of urls the crawler runs through.

    Returns a list of the different urls of interest that the crawler runs through.
    An example of this can be different categories of a market related to hacking,
    such as Software and Malware, Guides and Tutorials, or Digital Products.
    """
    links = []
    categories = [
        'civil_softwares',
        'carding',
        'theft',
        'mining',
        'worms',
        'dump',
        'viruses',
        'trojans',
        'botnets',
        'security_technology',
        'computers',
        'confidential_info',
        'network_services',
        'database',
        'surveillance',
        'digital_forensics',
        '0day',
        'intelligence',
        'private_security'
    ]
    for category in categories:
        links.append(baseURL + "category/" + category)
    return links


def crawlMarket(driver):
    """Crawls and saves each page of a link of interest.

    Accesses, saves, and crawls through each link of interest. For DarkDock, each
    link of interest is a category, so we crawl through all numbered pages of the
    category. We find the URLs of all descriptions/products on the category page and
    save each individual description/product page.
    Args:
        driver: The Selenium driver accessing the site.
    """
    print("Crawling the DarkDock market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        baseCategoryLink = linksToCrawl[i]
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 1       # Number of pages traversed
            maxPages = ''

            while has_next_page:
                # Try to access the current link and reload if it fails
                try:
                    driver.get(link)
                except:
                    driver.refresh()

                # Save the html page
                html = driver.page_source
                savePage(driver, html, linksToCrawl[i] + f"page{count}")

                # Get the number of maxPages if maxPages isn't fetched yet
                if maxPages == '':
                    try:
                        # Wait 30 seconds or until the element loads
                        WebDriverWait(driver, 30).until(
                            EC.presence_of_element_located((By.XPATH, '//div[@class="pages"]//a[last()]'))
                        )
                        # Fetch the element that gives the total number of pages in a category
                        maxPages = int(driver.find_element(By.XPATH, '//div[@class="pages"]//a[last()]').text)
                        print(f"Total number of Pages: {maxPages}")
                    except Exception as e:
                        print(f"Element not found: {str(e)}")

                # Parse the product/description pages
                links = descriptionPages(html)
                for item in links:
                    # Fetch the item URL by joining the base url with the item sub url
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    # Go back to the previous category page
                    driver.back()

                    # # Add a break for testing if we are checking only the first description/product page
                    # break

                # # Add a break for testing based on how many pages to test
                # if count == 3:
                #     break

                # Try to find the next page
                try:
                    link = f"{baseCategoryLink}/{count}/"
                    print("\tCurrent Page :", f"{link}")
                    if link == "":
                        raise NoSuchElementException
                    count += 1
                except NoSuchElementException:
                    has_next_page = False

                # Stop crawling the current category once maxPages is reached
                # (or if the page total could not be determined)
                if maxPages == '' or count > maxPages:
                    print("Max Pages reached")
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the DarkDock market done.")


def isDescriptionLink(url):
    """Returns whether the url is for a description page.

    Args:
        url: The url of a crawled page.
    Returns:
        Returns 'True' if the url is for a description page. Returns 'False' if the
        url is not for a description page.
    """
    if 'product' in url:
        return True
    return False


def isListingLink(url):
    """Returns whether the url is for a listing page.

    Args:
        url: The url of a crawled page.
    Returns:
        Returns 'True' if the url is for a listing page. Returns 'False' if the
        url is not for a listing page.
    """
    if 'category' in url:
        return True
    return False


def descriptionPages(html):
    """Returns all product/description links on the current page.

    Passes the html of the category/listing page and parses it for
    any description/product links.
    Args:
        html: The html of the selected category/listing page.
    """
    soup = BeautifulSoup(html, "html.parser")
    return darkdock_links_parser(soup)


def crawler():
    """Starts the crawler.
    """
    startCrawling()

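A note on the pagination scheme crawlMarket() assumes: page 1 of a category is the bare category URL, and later requests append the running page counter. The snippet below only illustrates the URLs that construction produces; the 'carding' category and the page total of 3 are assumed values.

# Illustration of crawlMarket()'s link construction; 'carding' and maxPages=3 are assumed.
baseURL = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'
baseCategoryLink = baseURL + "category/" + "carding"
maxPages = 3

pages = [baseCategoryLink] + [f"{baseCategoryLink}/{n}/" for n in range(1, maxPages)]
for url in pages:
    print(url)  # requested in order; each listing is saved under the category link + f"page{count}"
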
+232  -0  MarketPlaces/DarkDock/parser.py

@@ -0,0 +1,232 @@
__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *


def darkdock_description_parser(soup):
    """Parses the description pages of a DarkDock marketplace.

    It takes a BeautifulSoup object that represents the HTML page of a description page, and
    extracts various information such as vendor name, product name, etc.
    Args:
        soup: A BeautifulSoup object that represents the HTML page of a description page.
    Returns:
        The row of a description item as a tuple containing the information fields extracted from the description page.
    """

    vendor = "-1"               # 0 Vendor_Name
    success = "-1"              # 1 Vendor_Successful_Transactions
    rating_vendor = "-1"        # 2 Vendor_Rating
    name = "-1"                 # 3 Product_Name
    describe = "-1"             # 4 Product_Description
    CVE = "-1"                  # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"                   # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"             # 7 Product_Category
    views = "-1"                # 8 Product_Number_Of_Views
    reviews = "-1"              # 9 Product_Number_Of_Reviews
    rating_item = "-1"          # 10 Product_Rating
    addDate = "-1"              # 11 Product_AddedDate
    BTC = "-1"                  # 12 Product_BTC_SellingPrice
    USD = "-1"                  # 13 Product_USD_SellingPrice
    EURO = "-1"                 # 14 Product_EURO_SellingPrice
    sold = "-1"                 # 15 Product_QuantitySold
    left = "-1"                 # 16 Product_QuantityLeft
    shipFrom = "-1"             # 17 Product_ShippedFrom
    shipTo = "-1"               # 18 Product_ShippedTo
    image = "-1"                # 19 Product_Image
    vendor_image = "-1"         # 20 Vendor_Image

    # Finding Vendor
    vendor = soup.select_one('table tr:nth-of-type(2) td:nth-of-type(3) a u').text
    vendor = cleanString(vendor)
    vendor = vendor.strip()

    # Finding Product Name
    headings = soup.find('div', {'class': 'main'}).find_all('div', {'class': 'heading'})
    name = headings[0].text
    name = cleanString(name)
    name = name.strip()

    # Finding the Product description
    describe = soup.find('div', {'class': 'tab1'}).text
    describe = cleanString(describe)
    describe = describe.strip()

    # Finding the Product category
    category = soup.select_one('table tr:nth-of-type(6) td:nth-of-type(3)').text
    category = cleanString(category)
    category = category.strip()

    # Finding Number of Product Reviews
    reviews = headings[1].text
    match = re.search(r'\((\d+)\)', reviews).group(1)
    reviews = cleanNumbers(reviews)
    reviews = reviews.strip()

    # Finding Prices
    USD = soup.select_one('table tr:nth-of-type(1) td:nth-of-type(3)').text
    USD = cleanNumbers(USD)
    USD = USD.strip()

    # Finding the Product Quantity Available
    left = soup.select_one('table tr:nth-of-type(7) td:nth-of-type(3)').text
    left = cleanNumbers(left)
    left = left.strip()

    # Finding Product Shipped From
    shipFrom = soup.select_one('table tr:nth-of-type(3) td:nth-of-type(3)').text
    shipFrom = cleanString(shipFrom)
    shipFrom = shipFrom.strip()

    # Finding Product Shipped To
    shipTo = soup.select_one('table tr:nth-of-type(5) td:nth-of-type(3)').text
    shipTo = cleanString(shipTo)
    shipTo = shipTo.strip()

    # Finding Product Image
    image = soup.find('img', {'class': 'bigthumbnail'}).get('src')
    image = image.split('base64,')[-1]

    # Populating the final variable (this should be a list with all fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)

    # Sending the results
    return row


def darkdock_listing_parser(soup):
    """Parses the listing pages of a DarkDock marketplace.

    It takes a BeautifulSoup object that represents the HTML page of a listing page,
    and extracts various information such as vendor name, product name, etc. It then
    removes and cleans the extracted information by passing it to the organizeProducts
    function.
    Args:
        soup: A BeautifulSoup object that represents the HTML page of a listing page.
    Returns:
        The row of a listing item as a tuple containing the information fields extracted from the listing page.
    """

    # Fields to be parsed
    nm = 0                      # Total_Products (Should be Integer)
    mktName = "DarkDock"        # 0 Marketplace_Name
    vendor = []                 # 1 Vendor
    rating_vendor = []          # 2 Vendor_Rating
    success = []                # 3 Vendor_Successful_Transactions
    name = []                   # 4 Product_Name
    CVE = []                    # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
    MS = []                     # 6 Product_MS_Classification (Microsoft Security) dont worry about this
    category = []               # 7 Product_Category
    describe = []               # 8 Product_Description
    views = []                  # 9 Product_Number_Of_Views
    reviews = []                # 10 Product_Number_Of_Reviews
    rating_item = []            # 11 Product_Rating
    addDate = []                # 12 Product_AddDate
    BTC = []                    # 13 Product_BTC_SellingPrice
    USD = []                    # 14 Product_USD_SellingPrice
    EURO = []                   # 15 Product_EURO_SellingPrice
    sold = []                   # 16 Product_QuantitySold
    qLeft = []                  # 17 Product_QuantityLeft
    shipFrom = []               # 18 Product_ShippedFrom
    shipTo = []                 # 19 Product_ShippedTo
    image = []                  # 20 Product_Image
    image_vendor = []           # 21 Vendor_Image
    href = []                   # 22 Product_Links

    listings = soup.findAll('div', {'class': 'item'})

    # Populating the Number of Products
    nm = len(listings)

    cat = soup.find('div', {'class': 'heading'}).text
    cat = cleanString(cat)
    cat = cat.strip()

    for listing in listings:
        # Finding the Vendor
        vendor_name = listing.find('div', {'class': 'seller'}).text
        vendor.append(vendor_name)

        # Finding the Product
        product = listing.find('div', {'class': 'title'}).text
        product = cleanString(product)
        product = product.strip()
        name.append(product)

        # Finding the Category
        category.append(cat)

        # Finding the description
        description = listing.find('div', {'class': 'description'}).text
        description = cleanString(description)
        description = description.strip()
        describe.append(description)

        # Finding product views
        num_view = listing.select_one('.stats table tr:nth-of-type(3) td:nth-of-type(1)').text
        num_view = cleanNumbers(num_view)
        num_view = num_view.strip()
        views.append(num_view)

        # Finding product reviews
        num_reviews = listing.select_one('.stats table tr:nth-of-type(3) td:nth-of-type(3)').text
        num_reviews = cleanNumbers(num_reviews)
        num_reviews = num_reviews.strip()
        reviews.append(num_reviews)

        # Finding the product rating based on the width style
        rating = listing.find('div', {'class': 'stars2'}).get('style')
        rating = re.findall(r"\d+\.\d+|\d+", rating)[0]
        rating = cleanNumbers(rating)
        rating = rating.strip()
        rating_item.append(rating)

        # Finding Prices
        price = listing.find('div', {'class': 'price'}).text
        price = price.strip()
        USD.append(price)

        # Finding the number of times the product was sold
        num_sold = listing.select_one('.stats table tr:nth-of-type(3) td:nth-of-type(2)').text
        num_sold = cleanNumbers(num_sold)
        num_sold = num_sold.strip()
        sold.append(num_sold)

        # Finding shipping locations
        shipping = listing.find('div', {'class': 'shipping'}).text
        shippedFrom, shippedTo = cleanString(shipping).split(' > ')
        shipTo.append(shippedTo)
        shipFrom.append(shippedFrom)

        # Adding the url to the list of urls
        link = listing.find('a', recursive=False).get('href')
        href.append(link)

        image_vendor.append("-1")

    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image,
                            image_vendor)


def darkdock_links_parser(soup):
    """Returns a list of description links from a listing page.

    It takes a BeautifulSoup object that represents the HTML page of a listing page, and
    extracts all the description links from the page.
    Args:
        soup: A BeautifulSoup object that represents the HTML page of a listing page.
    Returns:
        A list of description links from a listing page.
    """
    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find_all('a', href=lambda href: href and '/product/' in href)

    for a in listing:
        href.append(a['href'])

    return href

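For what it's worth, darkdock_links_parser() can be smoke-tested in isolation on a hand-written fragment shaped like the anchors it looks for; the hrefs below are invented, and the urljoin step mirrors what crawlMarket() does with the returned sub-urls.

# Hypothetical smoke test; the HTML fragment and product hrefs are made up.
import urllib.parse as urlparse
from bs4 import BeautifulSoup
from MarketPlaces.DarkDock.parser import darkdock_links_parser

baseURL = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'
html = ('<div class="item"><a href="/product/example-1">A</a></div>'
        '<div class="item"><a href="/product/example-2">B</a></div>')

soup = BeautifulSoup(html, "html.parser")
for sub in darkdock_links_parser(soup):    # ['/product/example-1', '/product/example-2']
    print(urlparse.urljoin(baseURL, sub))  # absolute .onion URLs, as built in crawlMarket()
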
+1  -1  MarketPlaces/Initialization/marketsList.txt

@@ -1 +1 @@
DarkMarket
DarkDock

+3  -0  MarketPlaces/Initialization/markets_mining.py

@@ -26,6 +26,7 @@ from MarketPlaces.GoFish.crawler_selenium import crawler as crawlerGoFish
from MarketPlaces.ZeroDay.crawler_selenium import crawler as crawlerZeroDay
from MarketPlaces.Torzon.crawler_selenium import crawler as crawlerTorzon
from MarketPlaces.DarkMarket.crawler_selenium import crawler as crawlerDarkMarket
from MarketPlaces.DarkDock.crawler_selenium import crawler as crawlerDarkDock

import configparser
import os
@@ -141,5 +142,7 @@ if __name__ == '__main__':
        crawlerTorzon()
    elif mkt == "DarkMarket":
        crawlerDarkMarket()
    elif mkt == "DarkDock":
        crawlerDarkDock()

    print("\nScraping process completed!")

+5  -0  MarketPlaces/Initialization/prepare_parser.py

@@ -28,6 +28,7 @@ from MarketPlaces.Torzon.parser import *
from MarketPlaces.GoFish.parser import *
from MarketPlaces.ZeroDay.parser import *
from MarketPlaces.DarkMarket.parser import *
from MarketPlaces.DarkDock.parser import *
from MarketPlaces.Classifier.classify_product import predict
from Translator.translate import translate
@@ -170,6 +171,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
        rw = torzon_listing_parser(soup)
    elif marketPlace == "DarkMarket":
        rw = darkmarket_listing_parser(soup)
    elif marketPlace == "DarkDock":
        rw = darkdock_listing_parser(soup)
    else:
        print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
        raise Exception
@@ -230,6 +233,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
        rmm = torzon_description_parser(soup)
    elif marketPlace == "DarkMarket":
        rmm = darkmarket_description_parser(soup)
    elif marketPlace == "DarkDock":
        rmm = darkdock_description_parser(soup)
    else:
        print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
        raise Exception

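Finally, a hypothetical check of the new description parser against a single saved page, mirroring the branch prepare_parser.py now takes when marketPlace == "DarkDock"; the file path is only an example of the Description/ layout that getFullPathName() writes to.

# Hypothetical: parse one saved DarkDock description page; the path and date are examples only.
from bs4 import BeautifulSoup
from MarketPlaces.DarkDock.parser import darkdock_description_parser

with open("DarkDock/HTML_Pages/2024-01-01/Description/example.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

row = darkdock_description_parser(soup)
print(row[0], row[3], row[13])  # Vendor_Name, Product_Name, Product_USD_SellingPrice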
