This is based on the calsyslab project.

__author__ = 'cern'

'''
BlackPyramid Market Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver import ActionChains
import selenium.webdriver.support.ui as uiClasses
from selenium.webdriver.common.keys import Keys

import urllib.parse as urlparse
import os, time

from bs4 import BeautifulSoup

from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.BlackPyramid.parser import BlackPyramid_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/'


# Opens Tor Browser and crawls the website
def startCrawling():
    marketName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closetor(driver)

    new_parse(marketName, baseURL, True)


# Logs into the market
def login(driver):
    # wait for the login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "//input[@name='username_login']")))

    # enter username and password into the input boxes
    usernameBox = driver.find_element(by=By.XPATH, value="//input[@name='username_login']")
    # username here
    usernameBox.send_keys('ChipotleSteakBurrito')
    passwordBox = driver.find_element(by=By.XPATH, value="//input[@name='password_login']")
    # password here
    passwordBox.send_keys('BlackBeans')

    input("Press ENTER when the CAPTCHA is completed and the newsletter is closed\n")

    # wait for the listing page to show up (this XPath may need to change for a different seed URL)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, '//*[@id="form93b"]')))


# Returns the name of the website
def getMKTName():
    name = 'BlackPyramid'
    return name


# Returns the link of the website
def getFixedURL():
    url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1'
    return url


# Closes Tor Browser
def closetor(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return


# Creates a Firefox 'driver' and configures its profile
# to use the Tor proxy and socket
def createFFDriver():
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
    driver.maximize_window()

    return driver
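

# For reference, a minimal sketch of the [TOR] config entries read above.
# The section and key names come from the config.get() calls in
# createFFDriver(); the paths are placeholders, not values from this project:
#
#   [TOR]
#   firefox_binary_path = /path/to/tor-browser/Browser/firefox
#   firefox_profile_path = /path/to/tor-browser/Browser/TorBrowser/Data/Browser/profile.default
#   geckodriver_path = /path/to/geckodriver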


# Opens the seed URL and returns the driver, or the string 'down' on failure
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except Exception:
        driver.close()
        return 'down'


# Saves the crawled html page
def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as file:
        file.write(cleanPage.encode('utf-8'))
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Description', fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Listing', fileName + '.html')
    return fullPath
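

# With shared_folder read from the [Project] section of the same config file,
# pages end up under a layout like (illustrative, not a recorded path):
#   <shared_folder>/MarketPlaces/BlackPyramid/HTML_Pages/<CURRENT_DATE>/Description/<fileName>.html
#   <shared_folder>/MarketPlaces/BlackPyramid/HTML_Pages/<CURRENT_DATE>/Listing/<fileName>.html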


# Creates the file name from the passed URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
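

# e.g. getNameFromURL('http://host.onion/product?id=5') returns
# 'httphostonionproductid5' (a made-up URL, shown only to illustrate that every
# non-alphanumeric character is stripped); empty results fall back to the counter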


def page_is_fully_loaded(driver):
    return driver.execute_script("return document.readyState") == "complete"


# Navigates to a category page by hovering over the Digital menu and clicking
# the entry whose input name matches 'page'
def goToPage(driver, page):
    # hover over digital -> hacking tools
    a = ActionChains(driver)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "//li[@class='dig940']/div/a")))

    # hover
    digitalB = driver.find_element(By.XPATH, "//li[@class='dig940']/div/a")
    time.sleep(1)
    a.move_to_element(digitalB).perform()
    # print(digitalB)

    # delay for the website to register the hover
    time.sleep(5)

    # click
    xpath = "//input[@name='" + page + "']"
    link = driver.find_element(By.XPATH, xpath)
    time.sleep(1)
    a.move_to_element(link).click().perform()
    # print(link)

    # wait for the website to load
    time.sleep(10)
    WebDriverWait(driver, 100).until(page_is_fully_loaded)
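

# Example usage, with one of the category codes from getInterestedLinks() below:
#   goToPage(driver, 'h11')   # opens the Hacking Tools listing via the hover menu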


def getInterestedLinks():
    links = []

    # h11 -> Hacking Tools
    links.append('h11')
    # g3 -> Guides, Hacking
    links.append('g3')
    # se11 -> Services
    links.append('se11')
    # f11 -> Fraud
    links.append('f11')

    return links
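

# these short codes are the name attributes of the category inputs that
# goToPage() clicks ("//input[@name='<code>']")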


def crawlForum(driver):
    print("Crawling the BlackPyramid market")

    pages = getInterestedLinks()

    for listing in pages:
        print('Crawling :', listing)
        try:
            driver.get(baseURL)
            goToPage(driver, listing)

            has_next_page = True
            count = 0
            currentPage = 1

            while has_next_page:
                html = driver.page_source
                savePage(driver, html, listing + "page" + str(currentPage))

                # get a list of urls for each listing
                links = productPages(html)
                for item in links:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except Exception:
                        # driver.refresh()
                        continue
                    savePage(driver, driver.page_source, item)
                    # the back button can't be used on BlackPyramid
                    # driver.back()

                    # # comment out
                    # break

                    # # comment out
                    # if count == 1:
                    #     break

                # go to the next page of the market
                try:
                    # scroll to the top of the page to see the navigation bar
                    driver.find_element(by=By.XPATH, value="//body").send_keys(Keys.CONTROL + Keys.HOME)
                    goToPage(driver, listing)

                    nav = driver.find_element(by=By.XPATH, value="//input[@name='next_page']")
                    if nav.is_enabled():
                        # select the next page ('pageination' matches the spelling in the site's own markup)
                        pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']"))
                        # print("pg options:", pgnum.options)
                        numberOfPages = len(pgnum.options)

                        if currentPage >= numberOfPages:
                            raise NoSuchElementException
                        pgnum.select_by_index(currentPage)
                        currentPage += 1

                        # click the 'go to page' button
                        pgbutton = driver.find_element(by=By.XPATH, value="//input[@value='go to page']")
                        pgbutton.click()

                        # wait for the website to load
                        time.sleep(10)
                        WebDriverWait(driver, 100).until(page_is_fully_loaded)
                    else:
                        raise NoSuchElementException
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(listing, e)

    print("Crawling the BlackPyramid market done.")


# Returns True if the link is a product description link
def isDescriptionLink(url):
    if 'product' in url:
        return True
    return False


# Returns True if the link is a listing page link
def isListingLink(url):
    if 'category=' in url:
        return True
    return False
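

# e.g. isDescriptionLink('/product?id=5') and isListingLink('/search?category=h11')
# both return True (made-up URLs; the checks only look for the 'product' and
# 'category=' substrings)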


# calling the parser to define the links
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return BlackPyramid_links_parser(soup)


def crawler():
    startCrawling()
    # print("Crawling and Parsing BlackPyramid .... DONE!")
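

# crawler() is presumably invoked by the surrounding framework (the config is
# imported from MarketPlaces.Initialization.markets_mining); if the module were
# ever run directly, a minimal (assumed) entry point would be:
#
#   if __name__ == '__main__':
#       crawler()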