# khangtran / dark_web_forums

import codecs
import os
import time
import urllib.parse as urlparse
from datetime import date

from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from MarketPlaces.DarkFox.parser import darkfox_links_parser
# Read the machine-specific paths from the shared config file, one per line:
#   line 1 -> Firefox binary, line 2 -> Firefox profile, line 3 -> geckodriver.
# The file is read inside a `with` block so the handle is closed immediately.
with open('../../path.txt', 'r') as file:
    lines = file.readlines()

# torexe = os.popen(lines[0].strip())  # path for tor.exe
binary = FirefoxBinary(lines[0].strip())  # full path for firefox.exe
# options = Options()
profile = FirefoxProfile(lines[1].strip())  # full path for profile.default

# Send browser traffic through the SOCKS proxy at 127.0.0.1:9150 and let the
# proxy resolve host names too.
# NOTE(review): presumably this is the local Tor client's SOCKS port - confirm.
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.socks', '127.0.0.1')
profile.set_preference('network.proxy.socks_port', 9150)
profile.set_preference("network.proxy.socks_remote_dns", True)
profile.update_preferences()

service = Service(lines[2].strip())  # full path for geckodriver.exe

# NOTE(review): the `firefox_binary` / `firefox_profile` keyword arguments
# depend on the installed Selenium version - confirm they are still accepted.
driver = webdriver.Firefox(firefox_binary=binary, firefox_profile=profile,
                           service=service)
# Manual captcha solver
def captcha(driver):
    """Let a human operator solve the entry captcha.

    Waits for the captcha form, saves a screenshot of the captcha element,
    shows it in an image viewer, reads the answer from the terminal, submits
    it, and then waits for the listing page heading to become visible.

    Args:
        driver: Selenium WebDriver currently on the captcha page.

    Raises:
        selenium.common.exceptions.TimeoutException: if the captcha form or
            the listing page does not become visible within 100 seconds.
    """
    # One path for both writing and reading the screenshot, so the operator
    # always sees the captcha that was just captured (the previous code read
    # from a hard-coded absolute path that could point at a stale file).
    captcha_file = "captcha.png"
    submit_xpath = "/html/body/div/div/form/button[1]"

    # wait for captcha page show up
    WebDriverWait(driver, 100).until(
        EC.visibility_of_element_located((By.XPATH, submit_xpath)))

    # save captcha to local
    captcha_element = driver.find_element(
        by=By.XPATH, value="/html/body/div/div/form/div[1]/div[1]")
    captcha_element.screenshot(captcha_file)

    # open the saved image and show it in the default image viewer;
    # the context manager releases the file handle afterwards
    with Image.open(captcha_file) as im:
        im.show()

    # locate the input box for the captcha answer
    inputBox = driver.find_element(
        by=By.XPATH, value="/html/body/div/div/form/div[1]/div[2]/input")

    # ask user input captcha solution in terminal
    userIn = input("Enter solution: ")

    # send user solution into the input space
    inputBox.send_keys(userIn)

    # click the verify(submit) button
    driver.find_element(by=By.XPATH, value=submit_xpath).click()

    # wait for listing page show up
    # (This Xpath may need to change based on different seed url)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/main/div/div/div[2]/div[1]/div[1]/form/div[1]/h1")))

# Saves the crawled html page
def savePage(page, url):
    """Write a crawled page to disk under the path derived from its URL.

    Args:
        page: encoded page content (bytes) to write.
        url: URL the page was fetched from; passed to getFullPathName to
            choose the destination file.

    Returns:
        None.
    """
    filePath = getFullPathName(url)
    # Create the dated Listing/Description folder on first use.
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    # `with` guarantees the handle is flushed and closed even if write fails.
    with open(filePath, 'wb') as out:
        out.write(page)
    return

# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    """Build the destination file path for a crawled page.

    The path is <HTML_Pages root>/<MMDDYYYY>/<Description|Listing>/<name>.html,
    where the sub-folder is chosen by isDescriptionLink(url) and the name
    comes from getNameFromURL(url).

    Args:
        url: URL of the crawled page.

    Returns:
        The full path as a string.
    """
    fileName = getNameFromURL(url)

    # Read the date once so month/day/year cannot straddle midnight.
    today = date.today()
    dateFolder = "%02d%02d%04d" % (today.month, today.day, today.year)

    subFolder = 'Description' if isDescriptionLink(url) else 'Listing'

    # The doubled backslashes are kept exactly as before so the returned
    # string is unchanged (a raw string cannot end in a single backslash).
    root = r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\HTML_Pages\\'
    fullPath = root + dateFolder + r'\\' + subFolder + r'\\' + fileName + '.html'
    return fullPath

# Creates the name of the file based on URL
def getNameFromURL(url):
    """Derive a file name from a URL.

    Keeps only the alphanumeric characters of the URL. If nothing is left,
    falls back to a running number stored in the module-level `counter`.

    Args:
        url: URL string.

    Returns:
        The alphanumeric characters of `url`, or the current counter value
        as a string when the URL contains none.
    """
    global counter
    name = ''.join(ch for ch in url if ch.isalnum())
    if not name:
        # `counter` is not initialised at module level in this file, so a
        # plain read would raise NameError; start at 1 on first use.
        current = globals().get('counter', 1)
        name = str(current)
        counter = current + 1
    return name

def getInterestedLinks():
    """Return a fresh list of the category listing URLs to crawl."""
    return [
        # Guides and Tutorials
        'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/30739153-1fcd-45cd-b919-072b439c6e06',
        # Digital Products
        'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781',
        # Software and Malware
        'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc',
        # Services
        'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280',
        # Miscellaneous
        'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb',
        # Hosting and Security
        'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14',
    ]

def isDescriptionLink(url):
    """Return True when the URL contains the substring 'product'."""
    return 'product' in url

def isListingLink(url):
    """Return True when the URL contains the substring 'category'."""
    return 'category' in url

def productPages(html):
    """Parse a listing page and return what darkfox_links_parser extracts.

    Args:
        html: page markup handed to BeautifulSoup's "html.parser".
    """
    return darkfox_links_parser(BeautifulSoup(html, "html.parser"))

def isSignOut(url):
    """Return True when the URL contains 'signout' or 'logout' (any case)."""
    # absURL = urlparse.urljoin(url.base_url, url.url)
    lowered = url.lower()
    return any(marker in lowered for marker in ('signout', 'logout'))

# dark fox seed url
baseurl = 'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion'
driver.get(baseurl)
captcha(driver)

# visited = set()
# visited.add(br.geturl())
linksToCrawl = getInterestedLinks()
initialTime = time.time()

# Fetch and save every category listing page. A failure on one link is
# reported and the crawl moves on to the next link.
for link in linksToCrawl:
    print('Crawling :', link)
    try:
        driver.get(link)
        # The second positional argument of str.encode is the error-handler
        # name, not a URL; the previous `encode('utf-8', link)` would fail
        # with LookupError as soon as an encoding error had to be handled.
        html = driver.page_source.encode('utf-8')
        savePage(html, link)

        # NOTE: product-page crawling and pagination are currently disabled.
        # The original (inactive) logic is kept here for reference:
        #
        # has_next_page = True
        # while has_next_page:
        #     j = 0
        #     list = productPages(html)
        #     for item in list:
        #         if j == 1:
        #             break
        #         itemURL = str(item)
        #         driver.get(itemURL)
        #         savePage(driver.page_source.encode('utf-8'), item)
        #         driver.back()
        #         j += 1
        #
        #     try:
        #         link = driver.find_element(by=By.XPATH, value=
        #             '/html/body/main/div/div[2]/div/div[2]/div/div/div/nav/a[2]').get_attribute('href')
        #         driver.get(link)
        #         html = driver.page_source.encode('utf-8', link)
        #         savePage(html, link)
        #     except NoSuchElementException:
        #         has_next_page = False
    except Exception as e:
        # Python 3 exceptions have no `.message` attribute; printing the
        # exception itself shows its message without raising AttributeError.
        print(link, e)

# finalTime = time.time()
# print finalTime - initialTime

input("Crawling DarkFox marketplace done sucessfully. Press ENTER to continue\n")