
Finish Crawler & Parser for TheDarkMarket

main
Khoi, 1 year ago
commit ca2e8aedfd
7 changed files with 555 additions and 11 deletions
  1. MarketPlaces/Bohemia/crawler_selenium.py            +1    -1
  2. MarketPlaces/Initialization/marketsList.txt         +1    -1
  3. MarketPlaces/Initialization/markets_mining.py       +5    -1
  4. MarketPlaces/Initialization/prepare_parser.py       +10   -4
  5. MarketPlaces/TheDarkMarket/crawler_selenium.py      +352  -0
  6. MarketPlaces/TheDarkMarket/parser.py                +182  -0
  7. setup.ini                                           +4    -4

MarketPlaces/Bohemia/crawler_selenium.py  (+1, -1)

@@ -42,7 +42,7 @@ def startCrawling():
             print(driver.current_url, e)
         closeDriver(driver)

-    new_parse(mktName, False)
+    new_parse(marketPlace=mktName, url=baseURL, createLog=False)


 def login(driver):


MarketPlaces/Initialization/marketsList.txt  (+1, -1)

@@ -1 +1 @@
-ThiefWorld
+TheDarkMarket

MarketPlaces/Initialization/markets_mining.py  (+5, -1)

@@ -14,6 +14,8 @@ from MarketPlaces.M00nkeyMarket.crawler_selenium import crawler as crawlerM00nke
 from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity
 from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
 from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar
+from MarketPlaces.Bohemia.crawler_selenium import crawler as crawlerBohemia
+from MarketPlaces.TheDarkMarket.crawler_selenium import crawler as crawlerTheDarkMarket

 import configparser
 import os
@@ -75,7 +77,7 @@ def opentor():
 if __name__ == '__main__':
-    opentor()
+    # opentor()
     mktsList = getMarkets()
@@ -107,5 +109,7 @@ if __name__ == '__main__':
             crawlerCypher()
         elif mkt == "PabloEscobarMarket":
             crawlerPabloEscobar()
+        elif mkt == "TheDarkMarket":
+            crawlerTheDarkMarket()

     print("\nScraping process completed!")

MarketPlaces/Initialization/prepare_parser.py  (+10, -4)

@@ -10,6 +10,7 @@ from psycopg2.extras import RealDictCursor
 from MarketPlaces.DB_Connection.db_connection import *
 from MarketPlaces.DarkFox.parser import *
 from MarketPlaces.AnonymousMarketplace.parser import *
+from MarketPlaces.TheDarkMarket.parser import *
 from MarketPlaces.ViceCity.parser import *
 from MarketPlaces.M00nkeyMarket.parser import *
 from MarketPlaces.MikesGrandStore.parser import *
@@ -130,6 +131,9 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         rw = pabloescobarmarket_listing_parser(soup)
     elif marketPlace == "CityMarket":
         rw = city_listing_parser(soup)
+    elif marketPlace == "TheDarkMarket":
+        rw = darkmarket_listing_parser(soup)
     else:
         print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -164,6 +168,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         rmm = pabloescobarmarket_description_parser(soup)
     elif marketPlace == "CityMarket":
         rmm = city_description_parser(soup)
+    elif marketPlace == "TheDarkMarket":
+        rmm = darkmarket_description_parser(soup)
     else:
         print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -272,12 +278,12 @@ def new_parse(marketPlace, url, createLog):
         moveDescriptionError = False
         findDescriptionError = False

-        rw = []
+        rw = []

         if doParseListing:
             rw = parse_listing(marketPlace, listingFile, listingSoup, createLog, logFile)
             doDescription = rw is not None

         if doDescription:
@@ -287,12 +293,12 @@ def new_parse(marketPlace, url, createLog):
             for rec in rw:
                 rec = rec.split(',')

                 descriptionPattern = cleanLink(rec[22]) + ".html"

                 # Reading the associated description Html Pages
                 descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))

                 nFound += len(descriptions)

                 for descriptionIndex, descriptionFile in enumerate(descriptions):
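
For TheDarkMarket, parse_listing() and parse_description() simply hand the already-loaded BeautifulSoup object to the two new parser functions. A stand-alone sketch of that hand-off on one saved listing page; the file path below is illustrative, the real paths are built from setup.ini's shared_folder and CURRENT_DATE:

# Sketch: run one saved TheDarkMarket listing page through the new parser,
# the same way parse_listing() does it. The path is a made-up example.
from bs4 import BeautifulSoup
from MarketPlaces.TheDarkMarket.parser import darkmarket_listing_parser

listingFile = r"\\VBoxSvr\Shared\MarketPlaces\TheDarkMarket\HTML_Pages\2023-08-01\Listing\example.html"
with open(listingFile, encoding="utf-8") as f:
    listingSoup = BeautifulSoup(f.read(), "html.parser")

rw = darkmarket_listing_parser(listingSoup)
for rec in rw:
    print(rec.split(','))   # prepare_parser splits each record on commas the same way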


MarketPlaces/TheDarkMarket/crawler_selenium.py  (new file, +352)

@@ -0,0 +1,352 @@
__author__ = 'DarkWeb'

'''
TheDarkMarket Crawler (Selenium)
'''
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.TheDarkMarket.parser import darkmarket_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://dark3xolguutzr2cn5twjyu6c3db2z3ai3aqyqascml5cdrleh3s2hqd.onion/'

# Opens Tor Browser, crawls the website
def startCrawling():
    marketName = getMarketName()
    driver = getAccess()

    if driver != 'down':
        try:
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(marketPlace=marketName, url=baseURL, createLog=True)

def captcha(driver):
    '''
    # wait for captcha page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div[2]/div/div/div/div/form/div/div[2]/button")))

    inputChars = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[2]/div[1]/input')
    inputNum = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[2]/div[2]/input')

    driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[1]/div/div').screenshot(
        r'..\Royal\captcha1.png')

    im = Image.open(r'..\Royal\captcha1.png')
    im.show()

    chars = input("Enter characters: ")
    inputChars.send_keys(chars)

    num = input("Enter number of wrong puzzle pieces: ")
    inputNum.send_keys(num)

    # click the verify(submit) button
    driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div/div/div/form/div/div[2]/button").click()
    '''
    input("Press ENTER when CAPTCHA is completed\n")

    # wait for login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div[2]/div/div/div[2]/h1")))

    '''
    temp = driver.find_element(by=By.XPATH, value='/html/body/div/div/form/div[1]')
    boxes = temp.find_elements(by=By.TAG_NAME, value='input')

    for box in boxes:
        # click box to update captcha image
        box.click()
        # save clock captcha to local
        time.sleep(1)
        driver.find_element(by=By.XPATH, value='/html/body/div/div/form/div[1]/div').screenshot(
            r'..\Royal\captcha1.png')

        im = Image.open(r'..\Royal\captcha1.png')
        im.show()

        letter = input("Enter letter: ")
        box.send_keys(letter)

    # click the verify(submit) button
    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()

    # wait for login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div[1]/div/div/div[2]/form/input[3]")))
    '''

# Login using premade account credentials and do login captcha manually
def login(driver):
    # wait for login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div[2]/div/div/div[2]/form/div[4]")))

    # entering username and password into input boxes
    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
    # Username here
    usernameBox.send_keys('blabri')
    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
    # Password here
    passwordBox.send_keys('fishowal')

    # click "Login"
    driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[2]/form/div[4]').click()

    '''
    # wait for captcha page show up
    time.sleep(3)

    # save captcha to local
    driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[2]/form/div[4]/label/div/div').screenshot(
        r'..\Royal\captcha2.png')

    # This method will show image in any image viewer
    im = Image.open(r'..\Royal\captcha2.png')
    im.show()

    # ask user input captcha solution in terminal
    userIn = input("Enter location of wrong pieces (squares are numbered 1-24 left to right, # # #): ")
    squares = userIn.split()

    # send user solution into the input space
    for id in squares:
        driver.find_element(by=By.XPATH, value='//*[@id="cl[' + str((int(id)-1)) + ']"]').click()

    # click the verify(submit) button
    driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div/div[2]/form/div[4]/label/div/div/div/button").click()
    '''
    input("Press ENTER when CAPTCHA is completed\n")

    # wait for listing page show up (This Xpath may need to change based on different seed url)
    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div[3]/div/div[5]/div[1]')))

# Returns the name of the website
def getMarketName():
    name = 'TheDarkMarket'
    return name


# Return the link of the website
def getFixedURL():
    url = 'http://dark3xolguutzr2cn5twjyu6c3db2z3ai3aqyqascml5cdrleh3s2hqd.onion/'
    return url


# Closes Tor Browser
def closeDriver(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return

# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
    driver.maximize_window()

    return driver

def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'


# Saves the crawled html page
def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    open(filePath, 'wb').write(cleanPage.encode('utf-8'))
    return

# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMarketName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if not isListingLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
    return fullPath


# Creates the file name from passed URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name

def getInterestedLinks():
    links = []

    # Digital - Fraud Software
    links.append(baseURL + 'product-category/hacking/')
    # # Digital - Guides and Tutorials
    # links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Guides%20&%20Tutorials')
    # # Digital - Legitimate Software
    # links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Legitimiate%20Software')
    # # Services - Carding
    # links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Carding')

    return links

def crawlForum(driver):
    print("Crawling The Dark Market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    driver.get(link)
                except:
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                list = productPages(html)
                for item in list:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    driver.back()

                    # comment out
                    # break

                # comment out
                # if count == 1:
                #     break

                # Try finding next page
                try:
                    nav = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[1]/div[2]/nav')
                    li = nav.find_elements(by=By.TAG_NAME, value='li')
                    a = li[-1].find_element(by=By.TAG_NAME, value='a')
                    link = a.get_attribute('href')
                    if link == "":
                        raise NoSuchElementException
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1
input("Crawling Royal forum done sucessfully. Press ENTER to continue\n")

# Returns 'True' if the link is a product description link
def isDescriptionLink(url):
    if '/product/' in url:
        return True
    return False


# Returns True if the link is a listingPage link
def isListingLink(url):
    if 'category' in url:
        return True
    return False


# calling the parser to define the links
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return darkmarket_links_parser(soup)


def crawler():
    startCrawling()
    # print("Crawling and Parsing BestCardingWorld .... DONE!")

MarketPlaces/TheDarkMarket/parser.py  (new file, +182)

@@ -0,0 +1,182 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag

# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
def darkmarket_description_parser(soup: BeautifulSoup):
    # Fields to be parsed

    vendor = "-1"            # 0 *Vendor_Name
    success = "-1"           # 1 Vendor_Successful_Transactions
    rating_vendor = "-1"     # 2 Vendor_Rating
    name = "-1"              # 3 *Product_Name
    describe = "-1"          # 4 Product_Description
    CVE = "-1"               # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"                # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"          # 7 Product_Category
    views = "-1"             # 8 Product_Number_Of_Views
    reviews = "-1"           # 9 Product_Number_Of_Reviews
    rating_item = "-1"       # 10 Product_Rating
    addDate = "-1"           # 11 Product_AddedDate
    BTC = "-1"               # 12 Product_BTC_SellingPrice
    USD = "-1"               # 13 Product_USD_SellingPrice
    EURO = "-1"              # 14 Product_EURO_SellingPrice
    sold = "-1"              # 15 Product_QuantitySold
    left = "-1"              # 16 Product_QuantityLeft
    shipFrom = "-1"          # 17 Product_ShippedFrom
    shipTo = "-1"            # 18 Product_ShippedTo
    image = "-1"
    image_vendor = "-1"

    details: Tag = soup.find("div", {"class": "wc-content"})

    vendor = details.find("div", {"class": "product_meta"}).find("a", {"class": "wcvendors_cart_sold_by_meta"}).text

    name = details.find("h1", {"class": "product_title entry-title"}).text

    describe_list = [
        elem.text for elem in
        details.find("div", {"id": "tab-description"}).find_all()
        if elem.name != "h2"
    ]
    describe = " ".join(describe_list)

    categories_list: ResultSet[Tag] = details.find("span", {"class": "posted_in"}).find_all("a")
    category = "Hacking"

    reviews = details.find("div", {"class": "review-link"}).get("title")
    rating_item = details.find("div", {"class": "star-rating"}).get('title')

    price_container = details.find("p", {"class": "price"})
    if not price_container.find("ins"):
        USD = price_container.find("span", {"class": "woocommerce-Price-amount amount"}).text.replace("$", "")
    else:
        USD = price_container.find("ins").find("span", {"class": "woocommerce-Price-amount amount"}).text.replace("$", "")

    # print(f"\n[desc] Product: {name}")
    # print(f"[desc] Price: ${USD}\n")

    # Populating the final variable (this should be a list with all fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, image_vendor)

    # Sending the results
    return row

# This is the method to parse the Listing Pages
def darkmarket_listing_parser(soup: BeautifulSoup):
    # Fields to be parsed
    nm = 0                      # *Total_Products (Should be Integer)
    mktName = "TheDarkMarket"   # 0 *Marketplace_Name
    vendor = []                 # 1 *Vendor y
    rating_vendor = []          # 2 Vendor_Rating
    success = []                # 3 Vendor_Successful_Transactions
    name = []                   # 4 *Product_Name y
    CVE = []                    # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                     # 6 Product_MS_Classification (Microsoft Security)
    category = []               # 7 Product_Category y
    describe = []               # 8 Product_Description
    views = []                  # 9 Product_Number_Of_Views
    reviews = []                # 10 Product_Number_Of_Reviews
    rating_item = []            # 11 Product_Rating
    addDate = []                # 12 Product_AddDate
    BTC = []                    # 13 Product_BTC_SellingPrice
    USD = []                    # 14 Product_USD_SellingPrice y
    EURO = []                   # 15 Product_EURO_SellingPrice
    sold = []                   # 16 Product_QuantitySold
    qLeft = []                  # 17 Product_QuantityLeft
    shipFrom = []               # 18 Product_ShippedFrom
    shipTo = []                 # 19 Product_ShippedTo
    image = []
    image_vendor = []
    href = []                   # 20 Product_Links

    products_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-3"}).find_all("li")

    for product in products_list:
        nm += 1

        product_vendor = product.find("small", {"class": "wcvendors_sold_by_in_loop"}).find("a").text
        vendor.append(cleanString(product_vendor))

        # rating_vendor.append("-1")
        # success.append("-1")

        product_name = product.find("h2", {"class": "woocommerce-loop-product__title"}).text
        name.append(cleanString(product_name))

        # CVE.append("-1")
        # MS.append("-1")

        product_category = product.find("div", {"class": 'product-categories'}).text
        category.append(cleanString(product_category))

        # describe.append("-1")
        # views.append("-1")
        # reviews.append("-1")

        product_rating = product.find("div", {"class": "star-rating"}).get("title")
        rating_item.append(cleanString(product_rating))

        # addDate.append(datetime.now().strftime("%m/%d/%Y "))
        # BTC.append("-1")

        price_container = product.find("span", {"class": "price"})
        if not price_container.find("ins"):
            product_price = price_container.find("span", {"class": "woocommerce-Price-amount amount"}).text.replace("$", "")
        else:
            product_price = price_container.find("ins").find("span", {"class": "woocommerce-Price-amount amount"}).text.replace("$", "")
        USD.append(cleanNumbers(product_price))

        # EURO.append("-1")
        # sold.append("-1")
        # qLeft.append("-1")
        # shipTo.append("-1")
        # shipFrom.append("-1")

        product_href = product.find("a", {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}).get("href")
        href.append(product_href)

        # print(f"\n[list] Product: {product_name}")
        # print(f"[list] Links: ${product_href}\n")

        product_images_list = product.find("a", {"class": "tf-loop-product-thumbs-link"}).find("img").get("data-srcset").split(" ")
        product_image = product_images_list[0]
        image.append(product_image)

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image,
                            image_vendor)

def darkmarket_links_parser(soup: BeautifulSoup):
    # Returning all links that should be visited by the Crawler
    href = []

    listing: ResultSet[Tag] = soup.find("ul", {"class": "products columns-3"}).find_all("li")

    for li in listing:
        a = li.find('a', {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"})
        link = a.get('href')
        href.append(link)

    print(f"Links: {href}")

    return href
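
darkmarket_links_parser only depends on the WooCommerce product grid: each li under ul.products carries one a.woocommerce-LoopProduct-link whose href is the product page that crawlForum() visits next. A quick check of that selector against a made-up fragment of the listing markup (not captured from the live site):

# Quick selector check for darkmarket_links_parser on an illustrative,
# hand-written fragment of WooCommerce-style listing markup.
from bs4 import BeautifulSoup

html = """
<ul class="products columns-3">
  <li><a class="woocommerce-LoopProduct-link woocommerce-loop-product__link"
         href="http://example.onion/product/item-one/">Item one</a></li>
  <li><a class="woocommerce-LoopProduct-link woocommerce-loop-product__link"
         href="http://example.onion/product/item-two/">Item two</a></li>
</ul>
"""

soup = BeautifulSoup(html, "html.parser")
listing = soup.find("ul", {"class": "products columns-3"}).find_all("li")
links = [li.find("a", {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}).get("href")
         for li in listing]
print(links)   # two /product/ URLs, which is exactly what crawlForum() iterates over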

setup.ini  (+4, -4)

@@ -1,11 +1,11 @@
 [TOR]
-firefox_binary_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\firefox.exe
-firefox_profile_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
-geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.exe
+firefox_binary_path = C:\Users\minhkhoitran\Desktop\Tor Browser\Browser\firefox.exe
+firefox_profile_path = C:\Users\minhkhoitran\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
+geckodriver_path = C:\nsf-reu\dw_pipeline_test\selenium\geckodriver.exe

 [Project]
-project_directory = C:\calsyslab\Project\dw_pipeline_test
+project_directory = C:\nsf-reu\dw_pipeline_test
 shared_folder = \\VBoxSvr\Shared

 [PostgreSQL]
