__author__ = 'Helium'

"""
DarkDock Marketplace Crawler (Selenium)
"""

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import urllib.parse as urlparse
import os, time
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkDock.parser import darkdock_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

# Counter used by getNameFromURL to generate unique file names for URLs
# that contain no alphanumeric characters.
counter = 1
baseURL = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'


def startCrawling():
    """Main method for the crawler.

    Opens Tor Browser, crawls the website, parses the results, then closes Tor.
    """
    mktName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            crawlMarket(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(mktName, baseURL, True)


def getMKTName():
    """Returns the name of the website.
    """
    name = 'DarkDock'
    return name


def getFixedURL():
    """Returns the base link of the site.
    """
    url = baseURL
    return url


def closeDriver(driver):
    """Closes Tor Browser.

    Args:
        driver: The selected Selenium driver.
    """
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return


def createFFDriver():
    """Creates a Firefox driver and configures its profile to use the Tor proxy and socket.
    """
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    # Disable history and saved logins, and clear site data on shutdown
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    # Session-only cookies, no DNS prefetch, no Referer header
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    # Block third-party images to speed up page loads
    ff_prof.set_preference("permissions.default.image", 3)
    # Download settings: custom folder, no prompt for plain-text files
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    # Route all traffic through the local Tor SOCKS5 proxy and resolve DNS remotely
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    # Disable JavaScript
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

    driver.maximize_window()

    return driver
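
# Note: recent Selenium releases (4.10+) removed the firefox_binary and
# firefox_profile keyword arguments used above. A minimal sketch of the
# equivalent Options-based construction, assuming such a Selenium version
# (untested against this project's pinned dependencies):
#
#   from selenium.webdriver.firefox.options import Options
#   options = Options()
#   options.binary_location = config.get('TOR', 'firefox_binary_path')
#   options.profile = ff_prof
#   driver = webdriver.Firefox(service=service, options=options)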


def getAccess():
    """The driver 'gets' the url and attempts to access the site.

    Returns:
        A Selenium driver currently on the site, or the string 'down' if it can't access the site.
    """
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except Exception:
        driver.close()
        return 'down'


def savePage(driver, page, url):
    """Saves the crawled html page.

    Cleans the html of the current page the driver is on, then saves the
    crawled html page under its full path name (special characters removed)
    in the marketplace's directory. If the directory path doesn't exist, it
    is created.

    Args:
        driver: The Selenium driver accessing the page.
        page: The html of the saved page.
        url: The URL of the saved page.
    """
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    # Use a context manager so the file handle is always closed
    with open(filePath, 'wb') as f:
        f.write(cleanPage.encode('utf-8'))
    return


def getFullPathName(url):
    """Gets the full path name.

    Gets the full path of the page to be saved along with its appropriate file name.
    Determines which subdirectory to save the page in, based on whether it is a
    description or listing page.

    Args:
        url: The URL of the page.
    """
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    # Build the path with os.path.join instead of hard-coded backslashes so it
    # is correct on any platform.
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Description', fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Listing', fileName + '.html')
    return fullPath
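
# Illustrative only: with a shared_folder of '/data' and a hypothetical
# CURRENT_DATE of '01012024' (the real value comes from markets_mining),
# a listing page would be saved to something like
#   /data/MarketPlaces/DarkDock/HTML_Pages/01012024/Listing/<fileName>.html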


def getNameFromURL(url):
    """Creates the file name from the passed URL.

    Generates a file name containing only the URL's alphanumeric characters.
    If the URL contains no alphanumeric characters, it is given a unique
    numeric name from a global counter.

    Args:
        url: The URL of the selected page from the crawler as it crawls through the site.
    """
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
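
# For example, getNameFromURL(baseURL + 'category/carding') strips every
# non-alphanumeric character, yielding
# 'httpoirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktydonioncategorycarding'.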


def getInterestedLinks():
    """Returns the list of urls the crawler runs through.

    Returns a list of the different urls of interest that the crawler runs through.
    An example of this can be different categories of a market related to hacking,
    such as Software and Malware, Guides and Tutorials, or Digital Products.
    """
    links = []
    categories = [
        'civil_softwares',
        'carding',
        'theft',
        'mining',
        'worms',
        'dump',
        'viruses',
        'trojans',
        'botnets',
        'security_technology',
        'computers',
        'confidential_info',
        'network_services',
        'database',
        'surveillance',
        'digital_forensics',
        '0day',
        'intelligence',
        'private_security'
    ]
    for category in categories:
        links.append(baseURL + "category/" + category)

    return links
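
# Each link of interest is simply the category path appended to the base URL,
# e.g. baseURL + 'category/carding'.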


def crawlMarket(driver):
    """Crawls and saves each page of a link of interest.

    Accesses, saves, and crawls through each link of interest. For DarkDock, each
    link of interest is a category, so we crawl through all numbered pages of the
    category. We find the URLs of all descriptions/products on the category page
    and save each individual description/product page.

    Args:
        driver: The Selenium driver accessing the site.
    """
    print("Crawling the DarkDock market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        baseCategoryLink = linksToCrawl[i]
        link = linksToCrawl[i]
        print('Crawling :', link)

        try:
            has_next_page = True
            count = 2

            while has_next_page:

                # Try to access the current link and reload if it fails
                try:
                    driver.get(link)
                except Exception:
                    driver.refresh()

                # Save the html page
                html = driver.page_source
                savePage(driver, html, linksToCrawl[i] + f"page{count}")

                # Parse the product/description pages
                productLinks = descriptionPages(html)
                for item in productLinks:
                    # Build the item URL by joining the base url with the item sub url
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except Exception:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    # Go back to the previous category page
                    driver.back()

                    # # Add a break for testing if we are checking only the first description/product page
                    # break

                # # Add a break for testing based on how many pages to test
                # if count == 3:
                #     break

                # Try to find the next page: numbered category pages follow the
                # pattern <category>/<count>/
                try:
                    link = f"{baseCategoryLink}/{count}/"
                    driver.find_element(By.XPATH, f'//a[@href="{urlparse.urlparse(link).path}"]')
                    count += 1
                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the DarkDock market done.")


def isDescriptionLink(url):
    """Returns whether the url is for a description page.

    Args:
        url: The url of a crawled page.

    Returns:
        'True' if the url is for a description page, 'False' otherwise.
    """
    if 'product' in url:
        return True
    return False


def isListingLink(url):
    """Returns whether the url is for a listing page.

    Args:
        url: The url of a crawled page.

    Returns:
        'True' if the url is for a listing page, 'False' otherwise.
    """
    if 'category' in url:
        return True
    return False
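
# Both predicates are simple substring checks, e.g.:
#   isDescriptionLink(baseURL + 'product/abc123')  -> True
#   isListingLink(baseURL + 'category/carding')    -> True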


def descriptionPages(html):
    """Returns all product/description links on the current page.

    Passes the html of the category/listing page and parses it for
    any description/product links.

    Args:
        html: The html of the selected category/listing page.
    """
    soup = BeautifulSoup(html, "html.parser")
    return darkdock_links_parser(soup)
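
# Hypothetical illustration: given markup like '<a href="/product/abc123">x</a>',
# darkdock_links_parser(soup) is expected to return the product hrefs, e.g.
# ['/product/abc123'], which crawlMarket then joins onto baseURL.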


def crawler():
    """Starts the crawler.
    """
    startCrawling()