
Finish Crawler & Parser for TheDarkMarket

main
Khoi, 1 year ago
commit ca2e8aedfd
7 changed files with 555 additions and 11 deletions
  1. MarketPlaces/Bohemia/crawler_selenium.py            +1    -1
  2. MarketPlaces/Initialization/marketsList.txt         +1    -1
  3. MarketPlaces/Initialization/markets_mining.py       +5    -1
  4. MarketPlaces/Initialization/prepare_parser.py       +10   -4
  5. MarketPlaces/TheDarkMarket/crawler_selenium.py      +352  -0
  6. MarketPlaces/TheDarkMarket/parser.py                +182  -0
  7. setup.ini                                           +4    -4

MarketPlaces/Bohemia/crawler_selenium.py  (+1, -1)

@@ -42,7 +42,7 @@ def startCrawling():
             print(driver.current_url, e)
         closeDriver(driver)

-    new_parse(mktName, False)
+    new_parse(marketPlace=mktName, url=baseURL, createLog=False)


 def login(driver):


MarketPlaces/Initialization/marketsList.txt  (+1, -1)

@@ -1 +1 @@
-ThiefWorld
+TheDarkMarket

MarketPlaces/Initialization/markets_mining.py  (+5, -1)

@@ -14,6 +14,8 @@ from MarketPlaces.M00nkeyMarket.crawler_selenium import crawler as crawlerM00nke
 from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity
 from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
 from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar
+from MarketPlaces.Bohemia.crawler_selenium import crawler as crawlerBohemia
+from MarketPlaces.TheDarkMarket.crawler_selenium import crawler as crawlerTheDarkMarket

 import configparser
 import os
@@ -75,7 +77,7 @@ def opentor():
 if __name__ == '__main__':
-    opentor()
+    # opentor()
     mktsList = getMarkets()
@@ -107,5 +109,7 @@ if __name__ == '__main__':
             crawlerCypher()
         elif mkt == "PabloEscobarMarket":
             crawlerPabloEscobar()
+        elif mkt == "TheDarkMarket":
+            crawlerTheDarkMarket()

     print("\nScraping process completed!")

MarketPlaces/Initialization/prepare_parser.py  (+10, -4)

@@ -10,6 +10,7 @@ from psycopg2.extras import RealDictCursor
 from MarketPlaces.DB_Connection.db_connection import *
 from MarketPlaces.DarkFox.parser import *
 from MarketPlaces.AnonymousMarketplace.parser import *
+from MarketPlaces.TheDarkMarket.parser import *
 from MarketPlaces.ViceCity.parser import *
 from MarketPlaces.M00nkeyMarket.parser import *
 from MarketPlaces.MikesGrandStore.parser import *
@@ -130,6 +131,9 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         rw = pabloescobarmarket_listing_parser(soup)
     elif marketPlace == "CityMarket":
         rw = city_listing_parser(soup)
+    elif marketPlace == "TheDarkMarket":
+        rw = darkmarket_listing_parser(soup)
     else:
         print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -164,6 +168,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         rmm = pabloescobarmarket_description_parser(soup)
     elif marketPlace == "CityMarket":
         rmm = city_description_parser(soup)
+    elif marketPlace == "TheDarkMarket":
+        rmm = darkmarket_description_parser(soup)
     else:
         print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -272,12 +278,12 @@ def new_parse(marketPlace, url, createLog):
         moveDescriptionError = False
         findDescriptionError = False

-        rw = []
+        rw = []

         if doParseListing:
             rw = parse_listing(marketPlace, listingFile, listingSoup, createLog, logFile)
             doDescription = rw is not None

         if doDescription:
@@ -287,12 +293,12 @@ def new_parse(marketPlace, url, createLog):
             for rec in rw:
                 rec = rec.split(',')

                 descriptionPattern = cleanLink(rec[22]) + ".html"

                 # Reading the associated description Html Pages
                 descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))

                 nFound += len(descriptions)

                 for descriptionIndex, descriptionFile in enumerate(descriptions):
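
For TheDarkMarket, parse_listing() and parse_description() simply hand the already-loaded BeautifulSoup object to the two new parser functions. A stand-alone sketch of that hand-off on one saved listing page; the file path below is illustrative, the real paths are built from setup.ini's shared_folder and CURRENT_DATE:

# Sketch: run one saved TheDarkMarket listing page through the new parser,
# the same way parse_listing() does it. The path is a made-up example.
from bs4 import BeautifulSoup
from MarketPlaces.TheDarkMarket.parser import darkmarket_listing_parser

listingFile = r"\\VBoxSvr\Shared\MarketPlaces\TheDarkMarket\HTML_Pages\2023-08-01\Listing\example.html"
with open(listingFile, encoding="utf-8") as f:
    listingSoup = BeautifulSoup(f.read(), "html.parser")

rw = darkmarket_listing_parser(listingSoup)
for rec in rw:
    print(rec.split(','))   # prepare_parser splits each record on commas the same way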


MarketPlaces/TheDarkMarket/crawler_selenium.py  (new file, +352)

@@ -0,0 +1,352 @@
__author__ = 'DarkWeb'

'''
TheDarkMarket Crawler (Selenium)
'''
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.TheDarkMarket.parser import darkmarket_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
counter = 1
baseURL = 'http://dark3xolguutzr2cn5twjyu6c3db2z3ai3aqyqascml5cdrleh3s2hqd.onion/'

# Opens Tor Browser, crawls the website
def startCrawling():
    marketName = getMarketName()
    driver = getAccess()

    if driver != 'down':
        try:
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(marketPlace=marketName, url=baseURL, createLog=True)

def captcha(driver):
    '''
    # wait for captcha page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div[2]/div/div/div/div/form/div/div[2]/button")))

    inputChars = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[2]/div[1]/input')
    inputNum = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[2]/div[2]/input')

    driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div/div/form/div/div[1]/div/div').screenshot(
        r'..\Royal\captcha1.png')

    im = Image.open(r'..\Royal\captcha1.png')
    im.show()

    chars = input("Enter characters: ")
    inputChars.send_keys(chars)

    num = input("Enter number of wrong puzzle pieces: ")
    inputNum.send_keys(num)

    # click the verify(submit) button
    driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div/div/div/form/div/div[2]/button").click()
    '''
    input("Press ENTER when CAPTCHA is completed\n")

    # wait for login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div[2]/div/div/div[2]/h1")))

    '''
    temp = driver.find_element(by=By.XPATH, value='/html/body/div/div/form/div[1]')
    boxes = temp.find_elements(by=By.TAG_NAME, value='input')

    for box in boxes:
        # click box to update captcha image
        box.click()
        # save clock captcha to local
        time.sleep(1)
        driver.find_element(by=By.XPATH, value='/html/body/div/div/form/div[1]/div').screenshot(
            r'..\Royal\captcha1.png')

        im = Image.open(r'..\Royal\captcha1.png')
        im.show()

        letter = input("Enter letter: ")
        box.send_keys(letter)

    # click the verify(submit) button
    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()

    # wait for login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div[1]/div/div/div[2]/form/input[3]")))
    '''

# Login using premade account credentials and do login captcha manually
def login(driver):
    # wait for login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div[2]/div/div/div[2]/form/div[4]")))

    # entering username and password into input boxes
    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
    # Username here
    usernameBox.send_keys('blabri')
    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
    # Password here
    passwordBox.send_keys('fishowal')

    # click "Login"
    driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[2]/form/div[4]').click()

    '''
    # wait for captcha page show up
    time.sleep(3)

    # save captcha to local
    driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[2]/form/div[4]/label/div/div').screenshot(
        r'..\Royal\captcha2.png')

    # This method will show image in any image viewer
    im = Image.open(r'..\Royal\captcha2.png')
    im.show()

    # ask user input captcha solution in terminal
    userIn = input("Enter location of wrong pieces (squares are numbered 1-24 left to right, # # #): ")
    squares = userIn.split()

    # send user solution into the input space
    for id in squares:
        driver.find_element(by=By.XPATH, value='//*[@id="cl[' + str((int(id)-1)) + ']"]').click()

    # click the verify(submit) button
    driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div/div[2]/form/div[4]/label/div/div/div/button").click()
    '''
    input("Press ENTER when CAPTCHA is completed\n")

    # wait for listing page show up (This Xpath may need to change based on different seed url)
    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div[3]/div/div[5]/div[1]')))

# Returns the name of the website
def getMarketName():
    name = 'TheDarkMarket'
    return name


# Return the link of the website
def getFixedURL():
    url = 'http://dark3xolguutzr2cn5twjyu6c3db2z3ai3aqyqascml5cdrleh3s2hqd.onion/'
    return url


# Closes Tor Browser
def closeDriver(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return

# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
    driver.maximize_window()

    return driver

def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'


# Saves the crawled html page
def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    open(filePath, 'wb').write(cleanPage.encode('utf-8'))
    return

# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMarketName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if not isListingLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
    return fullPath


# Creates the file name from passed URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name

def getInterestedLinks():
    links = []

    # Digital - Fraud Software
    links.append(baseURL + 'product-category/hacking/')
    # # Digital - Guides and Tutorials
    # links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Guides%20&%20Tutorials')
    # # Digital - Legitimate Software
    # links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Legitimiate%20Software')
    # # Services - Carding
    # links.append('http://royalrnpvfbodtt5altnnzano6hquvn2d5qy55oofc2zyqciogcevrad.onion/category/Carding')

    return links

def crawlForum(driver):
    print("Crawling The Dark Market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    driver.get(link)
                except:
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                list = productPages(html)
                for item in list:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    driver.back()

                    # comment out
                    # break

                # comment out
                # if count == 1:
                #     break

                # Try finding next page
                try:
                    nav = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div/div[1]/div[2]/nav')
                    li = nav.find_elements(by=By.TAG_NAME, value='li')
                    a = li[-1].find_element(by=By.TAG_NAME, value='a')
                    link = a.get_attribute('href')
                    if link == "":
                        raise NoSuchElementException
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1
input("Crawling Royal forum done sucessfully. Press ENTER to continue\n")

# Returns 'True' if the link is a product description link
def isDescriptionLink(url):
    if '/product/' in url:
        return True
    return False


# Returns True if the link is a listingPage link
def isListingLink(url):
    if 'category' in url:
        return True
    return False


# calling the parser to define the links
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return darkmarket_links_parser(soup)


def crawler():
    startCrawling()
    # print("Crawling and Parsing BestCardingWorld .... DONE!")

MarketPlaces/TheDarkMarket/parser.py  (new file, +182)

@@ -0,0 +1,182 @@
__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag

# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
def darkmarket_description_parser(soup: BeautifulSoup):
    # Fields to be parsed

    vendor = "-1"            # 0 *Vendor_Name
    success = "-1"           # 1 Vendor_Successful_Transactions
    rating_vendor = "-1"     # 2 Vendor_Rating
    name = "-1"              # 3 *Product_Name
    describe = "-1"          # 4 Product_Description
    CVE = "-1"               # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"                # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"          # 7 Product_Category
    views = "-1"             # 8 Product_Number_Of_Views
    reviews = "-1"           # 9 Product_Number_Of_Reviews
    rating_item = "-1"       # 10 Product_Rating
    addDate = "-1"           # 11 Product_AddedDate
    BTC = "-1"               # 12 Product_BTC_SellingPrice
    USD = "-1"               # 13 Product_USD_SellingPrice
    EURO = "-1"              # 14 Product_EURO_SellingPrice
    sold = "-1"              # 15 Product_QuantitySold
    left = "-1"              # 16 Product_QuantityLeft
    shipFrom = "-1"          # 17 Product_ShippedFrom
    shipTo = "-1"            # 18 Product_ShippedTo
    image = "-1"
    image_vendor = "-1"

    details: Tag = soup.find("div", {"class": "wc-content"})

    vendor = details.find("div", {"class": "product_meta"}).find("a", {"class": "wcvendors_cart_sold_by_meta"}).text

    name = details.find("h1", {"class": "product_title entry-title"}).text

    describe_list = [
        elem.text for elem in
        details.find("div", {"id": "tab-description"}).find_all()
        if elem.name != "h2"
    ]
    describe = " ".join(describe_list)

    categories_list: ResultSet[Tag] = details.find("span", {"class": "posted_in"}).find_all("a")
    category = "Hacking"

    reviews = details.find("div", {"class": "review-link"}).get("title")
    rating_item = details.find("div", {"class": "star-rating"}).get('title')

    price_container = details.find("p", {"class": "price"})
    if not price_container.find("ins"):
        USD = price_container.find("span", {"class": "woocommerce-Price-amount amount"}).text.replace("$", "")
    else:
        USD = price_container.find("ins").find("span", {"class": "woocommerce-Price-amount amount"}).text.replace("$", "")

    # print(f"\n[desc] Product: {name}")
    # print(f"[desc] Price: ${USD}\n")

    # Populating the final variable (this should be a list with all fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, image_vendor)

    # Sending the results
    return row

# This is the method to parse the Listing Pages
def darkmarket_listing_parser(soup: BeautifulSoup):
    # Fields to be parsed
    nm = 0                      # *Total_Products (Should be Integer)
    mktName = "TheDarkMarket"   # 0 *Marketplace_Name
    vendor = []                 # 1 *Vendor y
    rating_vendor = []          # 2 Vendor_Rating
    success = []                # 3 Vendor_Successful_Transactions
    name = []                   # 4 *Product_Name y
    CVE = []                    # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                     # 6 Product_MS_Classification (Microsoft Security)
    category = []               # 7 Product_Category y
    describe = []               # 8 Product_Description
    views = []                  # 9 Product_Number_Of_Views
    reviews = []                # 10 Product_Number_Of_Reviews
    rating_item = []            # 11 Product_Rating
    addDate = []                # 12 Product_AddDate
    BTC = []                    # 13 Product_BTC_SellingPrice
    USD = []                    # 14 Product_USD_SellingPrice y
    EURO = []                   # 15 Product_EURO_SellingPrice
    sold = []                   # 16 Product_QuantitySold
    qLeft = []                  # 17 Product_QuantityLeft
    shipFrom = []               # 18 Product_ShippedFrom
    shipTo = []                 # 19 Product_ShippedTo
    image = []
    image_vendor = []
    href = []                   # 20 Product_Links

    products_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-3"}).find_all("li")

    for product in products_list:
        nm += 1

        product_vendor = product.find("small", {"class": "wcvendors_sold_by_in_loop"}).find("a").text
        vendor.append(cleanString(product_vendor))

        # rating_vendor.append("-1")
        # success.append("-1")

        product_name = product.find("h2", {"class": "woocommerce-loop-product__title"}).text
        name.append(cleanString(product_name))

        # CVE.append("-1")
        # MS.append("-1")

        product_category = product.find("div", {"class": 'product-categories'}).text
        category.append(cleanString(product_category))

        # describe.append("-1")
        # views.append("-1")
        # reviews.append("-1")

        product_rating = product.find("div", {"class": "star-rating"}).get("title")
        rating_item.append(cleanString(product_rating))

        # addDate.append(datetime.now().strftime("%m/%d/%Y "))
        # BTC.append("-1")

        price_container = product.find("span", {"class": "price"})
        if not price_container.find("ins"):
            product_price = price_container.find("span", {"class": "woocommerce-Price-amount amount"}).text.replace("$", "")
        else:
            product_price = price_container.find("ins").find("span", {"class": "woocommerce-Price-amount amount"}).text.replace("$", "")
        USD.append(cleanNumbers(product_price))

        # EURO.append("-1")
        # sold.append("-1")
        # qLeft.append("-1")
        # shipTo.append("-1")
        # shipFrom.append("-1")

        product_href = product.find("a", {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}).get("href")
        href.append(product_href)

        # print(f"\n[list] Product: {product_name}")
        # print(f"[list] Links: ${product_href}\n")

        product_images_list = product.find("a", {"class": "tf-loop-product-thumbs-link"}).find("img").get("data-srcset").split(" ")
        product_image = product_images_list[0]
        image.append(product_image)

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image,
                            image_vendor)

def darkmarket_links_parser(soup: BeautifulSoup):
    # Returning all links that should be visited by the Crawler
    href = []

    listing: ResultSet[Tag] = soup.find("ul", {"class": "products columns-3"}).find_all("li")

    for li in listing:
        a = li.find('a', {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"})
        link = a.get('href')
        href.append(link)

    print(f"Links: {href}")

    return href
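
darkmarket_links_parser only depends on the WooCommerce product grid: each li under ul.products carries one a.woocommerce-LoopProduct-link whose href is the product page that crawlForum() visits next. A quick check of that selector against a made-up fragment of the listing markup (not captured from the live site):

# Quick selector check for darkmarket_links_parser on an illustrative,
# hand-written fragment of WooCommerce-style listing markup.
from bs4 import BeautifulSoup

html = """
<ul class="products columns-3">
  <li><a class="woocommerce-LoopProduct-link woocommerce-loop-product__link"
         href="http://example.onion/product/item-one/">Item one</a></li>
  <li><a class="woocommerce-LoopProduct-link woocommerce-loop-product__link"
         href="http://example.onion/product/item-two/">Item two</a></li>
</ul>
"""

soup = BeautifulSoup(html, "html.parser")
listing = soup.find("ul", {"class": "products columns-3"}).find_all("li")
links = [li.find("a", {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}).get("href")
         for li in listing]
print(links)   # two /product/ URLs, which is exactly what crawlForum() iterates over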

setup.ini  (+4, -4)

@@ -1,11 +1,11 @@
 [TOR]
-firefox_binary_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\firefox.exe
-firefox_profile_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
-geckodriver_path = C:\calsyslab\Project\dw_pipeline_test\selenium\geckodriver.exe
+firefox_binary_path = C:\Users\minhkhoitran\Desktop\Tor Browser\Browser\firefox.exe
+firefox_profile_path = C:\Users\minhkhoitran\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
+geckodriver_path = C:\nsf-reu\dw_pipeline_test\selenium\geckodriver.exe

 [Project]
-project_directory = C:\calsyslab\Project\dw_pipeline_test
+project_directory = C:\nsf-reu\dw_pipeline_test
 shared_folder = \\VBoxSvr\Shared

 [PostgreSQL]
