
Added DarkDock Marketplace Parser and Crawler

main
Matthew Kwong 6 months ago
parent commit 438ef428a6
5 changed files with 597 additions and 1 deletion
  1. +356  -0  MarketPlaces/DarkDock/crawler_selenium.py
  2. +232  -0  MarketPlaces/DarkDock/parser.py
  3. +1    -1  MarketPlaces/Initialization/marketsList.txt
  4. +3    -0  MarketPlaces/Initialization/markets_mining.py
  5. +5    -0  MarketPlaces/Initialization/prepare_parser.py

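As a quick orientation (not part of the commit itself): once the pieces below are in place, the new market can be exercised on its own with something like the following sketch, assuming Tor Browser, geckodriver, and the [TOR]/[Project] config entries already used by markets_mining.py are configured locally.

# Hedged sketch, not from the commit: run only the new DarkDock crawler.
# Assumes the Tor Browser binary/profile and geckodriver paths in the project config are valid.
from MarketPlaces.DarkDock.crawler_selenium import crawler as crawlerDarkDock

crawlerDarkDock()  # opens Tor Browser, crawls the DarkDock categories, then calls new_parse()
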
+356  -0  MarketPlaces/DarkDock/crawler_selenium.py

@@ -0,0 +1,356 @@
__author__ = 'Helium'

"""
DarkDock Marketplace Crawler (Selenium)
"""

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import urllib.parse as urlparse
import os
import time
from bs4 import BeautifulSoup

from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkDock.parser import darkdock_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'


def startCrawling():
    """Main method for the crawler.

    Opens Tor Browser, crawls the website, parses, then closes Tor.
    """
    mktName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            crawlMarket(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(mktName, baseURL, True)


def getMKTName():
    """Returns the name of the website.
    """
    name = 'DarkDock'
    return name


def getFixedURL():
    """Returns the base link of the site.
    """
    url = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'
    return url


def closeDriver(driver):
    """Closes Tor Browser.

    Args:
        driver: The selected Selenium driver.
    """
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return


def createFFDriver():
    """Creates a Firefox driver and configures its profile to use the Tor proxy and socket.
    """
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

    driver.maximize_window()

    return driver


def getAccess():
    """The driver 'gets' the url and attempts to access the site.

    Return:
        A Selenium driver currently on the site, or the string 'down' if it can't access the site.
    """
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'


def savePage(driver, page, url):
    """Saves the crawled html page.

    Cleans the html of the current page the driver is on. Then saves the current
    crawled html page with its full path name without special characters into the
    marketplace's directory. If the directory path doesn't exist, it will create it.
    Args:
        driver: The Selenium driver accessing the page.
        page: The html of the saved page.
        url: The URL of the saved page.
    """
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    open(filePath, 'wb').write(cleanPage.encode('utf-8'))
    return


def getFullPathName(url):
    """Gets the full path name.

    Gets the full path of the page to be saved along with its appropriate file name.
    Determines which subdirectory to save the page in, based on whether it is a
    description or listing page.
    Args:
        url: The URL of the page.
    """
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
    return fullPath


def getNameFromURL(url):
    """Creates the file name from the passed URL.

    Generates a file name with only its alphanumeric characters.
    If the name isn't unique, it will be given a unique name.
    Args:
        url: The URL of the selected page from the crawler as it crawls through the site.
    """
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


def getInterestedLinks():
    """Returns the list of urls the crawler runs through.

    Returns a list of the different urls of interest that the crawler runs through.
    An example of this can be different categories of a market related to hacking,
    such as Software and Malware, Guides and Tutorials, or Digital Products.
    """
    links = []
    categories = [
        'civil_softwares',
        'carding',
        'theft',
        'mining',
        'worms',
        'dump',
        'viruses',
        'trojans',
        'botnets',
        'security_technology',
        'computers',
        'confidential_info',
        'network_services',
        'database',
        'surveillance',
        'digital_forensics',
        '0day',
        'intelligence',
        'private_security'
    ]
    for category in categories:
        links.append(baseURL + "category/" + category)
    return links


def crawlMarket(driver):
    """Crawls and saves each page of a link of interest.

    Accesses, saves, and crawls through each link of interest. For DarkDock, each
    link of interest is a category, so we crawl through all numbered pages of the
    category. We find the URLs of all descriptions/products on the category page and
    save each individual description/product page.
    Args:
        driver: The Selenium driver accessing the site.
    """
    print("Crawling the DarkDock market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        baseCategoryLink = linksToCrawl[i]
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 1       # Number of pages traversed
            maxPages = ''

            while has_next_page:
                # Try to access the current link and reload if it fails
                try:
                    driver.get(link)
                except:
                    driver.refresh()

                # Save the html page
                html = driver.page_source
                savePage(driver, html, linksToCrawl[i] + f"page{count}")

                # Get the number of maxPages if maxPages isn't fetched yet
                if maxPages == '':
                    try:
                        # Wait 30 seconds or until the element loads
                        WebDriverWait(driver, 30).until(
                            EC.presence_of_element_located((By.XPATH, '//div[@class="pages"]//a[last()]'))
                        )
                        # Fetch the element that gives the total number of pages in a category
                        maxPages = int(driver.find_element(By.XPATH, '//div[@class="pages"]//a[last()]').text)
                        print(f"Total number of Pages: {maxPages}")
                    except Exception as e:
                        print(f"Element not found: {str(e)}")

                # Parse the product/description pages
                links = descriptionPages(html)
                for item in links:
                    # Fetch the item URL by joining the base url with the item sub url
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    # Go back to the previous category page
                    driver.back()

                    # # Add a break for testing if we are checking only the first description/product page
                    # break

                # # Add a break for testing based on how many pages to test
                # if count == 3:
                #     break

                # Try to find the next page
                try:
                    link = f"{baseCategoryLink}/{count}/"
                    print("\tCurrent Page :", f"{link}")
                    if link == "":
                        raise NoSuchElementException
                    count += 1
                except NoSuchElementException:
                    has_next_page = False

                # Stop crawling the current category once maxPages is reached
                # (or if the page total could not be determined)
                if maxPages == '' or count > maxPages:
                    print("Max Pages reached")
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the DarkDock market done.")


def isDescriptionLink(url):
    """Returns whether the url is for a description page.

    Args:
        url: The url of a crawled page.
    Returns:
        Returns 'True' if the url is for a description page. Returns 'False' if the
        url is not for a description page.
    """
    if 'product' in url:
        return True
    return False


def isListingLink(url):
    """Returns whether the url is for a listing page.

    Args:
        url: The url of a crawled page.
    Returns:
        Returns 'True' if the url is for a listing page. Returns 'False' if the
        url is not for a listing page.
    """
    if 'category' in url:
        return True
    return False


def descriptionPages(html):
    """Returns all product/description links on the current page.

    Passes the html of the category/listing page and parses it for
    any description/product links.
    Args:
        html: The html of the selected category/listing page.
    """
    soup = BeautifulSoup(html, "html.parser")
    return darkdock_links_parser(soup)


def crawler():
    """Starts the crawler.
    """
    startCrawling()

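A note on the pagination scheme crawlMarket() assumes: page 1 of a category is the bare category URL, and later requests append the running page counter. The snippet below only illustrates the URLs that construction produces; the 'carding' category and the page total of 3 are assumed values.

# Illustration of crawlMarket()'s link construction; 'carding' and maxPages=3 are assumed.
baseURL = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'
baseCategoryLink = baseURL + "category/" + "carding"
maxPages = 3

pages = [baseCategoryLink] + [f"{baseCategoryLink}/{n}/" for n in range(1, maxPages)]
for url in pages:
    print(url)  # requested in order; each listing is saved under the category link + f"page{count}"
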
+232  -0  MarketPlaces/DarkDock/parser.py

@@ -0,0 +1,232 @@
__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *


def darkdock_description_parser(soup):
    """Parses the description pages of a DarkDock marketplace.

    It takes a BeautifulSoup object that represents the HTML page of a description page, and
    extracts various information such as vendor name, product name, etc.
    Args:
        soup: A BeautifulSoup object that represents the HTML page of a description page.
    Returns:
        The row of a description item as a tuple containing the information fields extracted from the description page.
    """

    vendor = "-1"               # 0 Vendor_Name
    success = "-1"              # 1 Vendor_Successful_Transactions
    rating_vendor = "-1"        # 2 Vendor_Rating
    name = "-1"                 # 3 Product_Name
    describe = "-1"             # 4 Product_Description
    CVE = "-1"                  # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"                   # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"             # 7 Product_Category
    views = "-1"                # 8 Product_Number_Of_Views
    reviews = "-1"              # 9 Product_Number_Of_Reviews
    rating_item = "-1"          # 10 Product_Rating
    addDate = "-1"              # 11 Product_AddedDate
    BTC = "-1"                  # 12 Product_BTC_SellingPrice
    USD = "-1"                  # 13 Product_USD_SellingPrice
    EURO = "-1"                 # 14 Product_EURO_SellingPrice
    sold = "-1"                 # 15 Product_QuantitySold
    left = "-1"                 # 16 Product_QuantityLeft
    shipFrom = "-1"             # 17 Product_ShippedFrom
    shipTo = "-1"               # 18 Product_ShippedTo
    image = "-1"                # 19 Product_Image
    vendor_image = "-1"         # 20 Vendor_Image

    # Finding Vendor
    vendor = soup.select_one('table tr:nth-of-type(2) td:nth-of-type(3) a u').text
    vendor = cleanString(vendor)
    vendor = vendor.strip()

    # Finding Product Name
    headings = soup.find('div', {'class': 'main'}).find_all('div', {'class': 'heading'})
    name = headings[0].text
    name = cleanString(name)
    name = name.strip()

    # Finding the Product description
    describe = soup.find('div', {'class': 'tab1'}).text
    describe = cleanString(describe)
    describe = describe.strip()

    # Finding the Product category
    category = soup.select_one('table tr:nth-of-type(6) td:nth-of-type(3)').text
    category = cleanString(category)
    category = category.strip()

    # Finding Number of Product Reviews
    reviews = headings[1].text
    match = re.search(r'\((\d+)\)', reviews).group(1)
    reviews = cleanNumbers(reviews)
    reviews = reviews.strip()

    # Finding Prices
    USD = soup.select_one('table tr:nth-of-type(1) td:nth-of-type(3)').text
    USD = cleanNumbers(USD)
    USD = USD.strip()

    # Finding the Product Quantity Available
    left = soup.select_one('table tr:nth-of-type(7) td:nth-of-type(3)').text
    left = cleanNumbers(left)
    left = left.strip()

    # Finding Product Shipped From
    shipFrom = soup.select_one('table tr:nth-of-type(3) td:nth-of-type(3)').text
    shipFrom = cleanString(shipFrom)
    shipFrom = shipFrom.strip()

    # Finding Product Shipped To
    shipTo = soup.select_one('table tr:nth-of-type(5) td:nth-of-type(3)').text
    shipTo = cleanString(shipTo)
    shipTo = shipTo.strip()

    # Finding Product Image
    image = soup.find('img', {'class': 'bigthumbnail'}).get('src')
    image = image.split('base64,')[-1]

    # Populating the final variable (this should be a list with all fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)

    # Sending the results
    return row


def darkdock_listing_parser(soup):
    """Parses the listing pages of a DarkDock marketplace.

    It takes a BeautifulSoup object that represents the HTML page of a listing page,
    and extracts various information such as vendor name, product name, etc. It then
    removes and cleans the extracted information by passing it to the organizeProducts
    function.
    Args:
        soup: A BeautifulSoup object that represents the HTML page of a listing page.
    Returns:
        The row of a listing item as a tuple containing the information fields extracted from the listing page.
    """

    # Fields to be parsed
    nm = 0                      # Total_Products (Should be Integer)
    mktName = "DarkDock"        # 0 Marketplace_Name
    vendor = []                 # 1 Vendor
    rating_vendor = []          # 2 Vendor_Rating
    success = []                # 3 Vendor_Successful_Transactions
    name = []                   # 4 Product_Name
    CVE = []                    # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
    MS = []                     # 6 Product_MS_Classification (Microsoft Security) dont worry about this
    category = []               # 7 Product_Category
    describe = []               # 8 Product_Description
    views = []                  # 9 Product_Number_Of_Views
    reviews = []                # 10 Product_Number_Of_Reviews
    rating_item = []            # 11 Product_Rating
    addDate = []                # 12 Product_AddDate
    BTC = []                    # 13 Product_BTC_SellingPrice
    USD = []                    # 14 Product_USD_SellingPrice
    EURO = []                   # 15 Product_EURO_SellingPrice
    sold = []                   # 16 Product_QuantitySold
    qLeft = []                  # 17 Product_QuantityLeft
    shipFrom = []               # 18 Product_ShippedFrom
    shipTo = []                 # 19 Product_ShippedTo
    image = []                  # 20 Product_Image
    image_vendor = []           # 21 Vendor_Image
    href = []                   # 22 Product_Links

    listings = soup.findAll('div', {'class': 'item'})

    # Populating the Number of Products
    nm = len(listings)

    cat = soup.find('div', {'class': 'heading'}).text
    cat = cleanString(cat)
    cat = cat.strip()

    for listing in listings:
        # Finding the Vendor
        vendor_name = listing.find('div', {'class': 'seller'}).text
        vendor.append(vendor_name)

        # Finding the Product
        product = listing.find('div', {'class': 'title'}).text
        product = cleanString(product)
        product = product.strip()
        name.append(product)

        # Finding the Category
        category.append(cat)

        # Finding the description
        description = listing.find('div', {'class': 'description'}).text
        description = cleanString(description)
        description = description.strip()
        describe.append(description)

        # Finding product views
        num_view = listing.select_one('.stats table tr:nth-of-type(3) td:nth-of-type(1)').text
        num_view = cleanNumbers(num_view)
        num_view = num_view.strip()
        views.append(num_view)

        # Finding product reviews
        num_reviews = listing.select_one('.stats table tr:nth-of-type(3) td:nth-of-type(3)').text
        num_reviews = cleanNumbers(num_reviews)
        num_reviews = num_reviews.strip()
        reviews.append(num_reviews)

        # Finding the product rating based on the width style
        rating = listing.find('div', {'class': 'stars2'}).get('style')
        rating = re.findall(r"\d+\.\d+|\d+", rating)[0]
        rating = cleanNumbers(rating)
        rating = rating.strip()
        rating_item.append(rating)

        # Finding Prices
        price = listing.find('div', {'class': 'price'}).text
        price = price.strip()
        USD.append(price)

        # Finding the number of times the product was sold
        num_sold = listing.select_one('.stats table tr:nth-of-type(3) td:nth-of-type(2)').text
        num_sold = cleanNumbers(num_sold)
        num_sold = num_sold.strip()
        sold.append(num_sold)

        # Finding shipping locations
        shipping = listing.find('div', {'class': 'shipping'}).text
        shippedFrom, shippedTo = cleanString(shipping).split(' > ')
        shipTo.append(shippedTo)
        shipFrom.append(shippedFrom)

        # Adding the url to the list of urls
        link = listing.find('a', recursive=False).get('href')
        href.append(link)

        image_vendor.append("-1")

    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image,
                            image_vendor)


def darkdock_links_parser(soup):
    """Returns a list of description links from a listing page.

    It takes a BeautifulSoup object that represents the HTML page of a listing page, and
    extracts all the description links from the page.
    Args:
        soup: A BeautifulSoup object that represents the HTML page of a listing page.
    Returns:
        A list of description links from a listing page.
    """
    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find_all('a', href=lambda href: href and '/product/' in href)

    for a in listing:
        href.append(a['href'])

    return href

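For what it's worth, darkdock_links_parser() can be smoke-tested in isolation on a hand-written fragment shaped like the anchors it looks for; the hrefs below are invented, and the urljoin step mirrors what crawlMarket() does with the returned sub-urls.

# Hypothetical smoke test; the HTML fragment and product hrefs are made up.
import urllib.parse as urlparse
from bs4 import BeautifulSoup
from MarketPlaces.DarkDock.parser import darkdock_links_parser

baseURL = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'
html = ('<div class="item"><a href="/product/example-1">A</a></div>'
        '<div class="item"><a href="/product/example-2">B</a></div>')

soup = BeautifulSoup(html, "html.parser")
for sub in darkdock_links_parser(soup):    # ['/product/example-1', '/product/example-2']
    print(urlparse.urljoin(baseURL, sub))  # absolute .onion URLs, as built in crawlMarket()
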
+1  -1  MarketPlaces/Initialization/marketsList.txt

@@ -1 +1 @@
DarkMarket
DarkDock

+3  -0  MarketPlaces/Initialization/markets_mining.py

@@ -26,6 +26,7 @@ from MarketPlaces.GoFish.crawler_selenium import crawler as crawlerGoFish
from MarketPlaces.ZeroDay.crawler_selenium import crawler as crawlerZeroDay
from MarketPlaces.Torzon.crawler_selenium import crawler as crawlerTorzon
from MarketPlaces.DarkMarket.crawler_selenium import crawler as crawlerDarkMarket
from MarketPlaces.DarkDock.crawler_selenium import crawler as crawlerDarkDock

import configparser
import os
@@ -141,5 +142,7 @@ if __name__ == '__main__':
        crawlerTorzon()
    elif mkt == "DarkMarket":
        crawlerDarkMarket()
    elif mkt == "DarkDock":
        crawlerDarkDock()

    print("\nScraping process completed!")

+5  -0  MarketPlaces/Initialization/prepare_parser.py

@@ -28,6 +28,7 @@ from MarketPlaces.Torzon.parser import *
from MarketPlaces.GoFish.parser import *
from MarketPlaces.ZeroDay.parser import *
from MarketPlaces.DarkMarket.parser import *
from MarketPlaces.DarkDock.parser import *
from MarketPlaces.Classifier.classify_product import predict
from Translator.translate import translate
@@ -170,6 +171,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
        rw = torzon_listing_parser(soup)
    elif marketPlace == "DarkMarket":
        rw = darkmarket_listing_parser(soup)
    elif marketPlace == "DarkDock":
        rw = darkdock_listing_parser(soup)
    else:
        print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
        raise Exception
@@ -230,6 +233,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
        rmm = torzon_description_parser(soup)
    elif marketPlace == "DarkMarket":
        rmm = darkmarket_description_parser(soup)
    elif marketPlace == "DarkDock":
        rmm = darkdock_description_parser(soup)
    else:
        print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
        raise Exception

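Finally, a hypothetical check of the new description parser against a single saved page, mirroring the branch prepare_parser.py now takes when marketPlace == "DarkDock"; the file path is only an example of the Description/ layout that getFullPathName() writes to.

# Hypothetical: parse one saved DarkDock description page; the path and date are examples only.
from bs4 import BeautifulSoup
from MarketPlaces.DarkDock.parser import darkdock_description_parser

with open("DarkDock/HTML_Pages/2024-01-01/Description/example.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

row = darkdock_description_parser(soup)
print(row[0], row[3], row[13])  # Vendor_Name, Product_Name, Product_USD_SellingPrice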
