khangtran
/
dark_web_forums


								from selenium import webdriver

								from selenium.common.exceptions import NoSuchElementException

								from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

								from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

								from selenium.webdriver.firefox.service import Service

								import os

								from selenium.webdriver.support.ui import WebDriverWait

								from selenium.webdriver.support import expected_conditions as EC

								from selenium.webdriver.common.by import By

								from selenium.webdriver.firefox.options import Options

								from PIL import Image


								import codecs

								import time

								from datetime import date

								import urllib.parse as urlparse

								import os

								from bs4 import BeautifulSoup

								from MarketPlaces.DarkFox.parser import darkfox_links_parser


								# Machine-specific locations are read from path.txt, one per line:
								#   line 1: full path for firefox.exe
								#   line 2: full path for profile.default
								#   line 3: full path for geckodriver.exe
								# The context manager closes the handle as soon as the lines are read.
								with open('../../path.txt', 'r') as file:
								    lines = file.readlines()

								# Fail with a clear message instead of a bare IndexError further down.
								if len(lines) < 3:
								    raise ValueError(
								        "path.txt must contain at least three lines: the Firefox binary, "
								        "the Firefox profile and the geckodriver path")

								# torexe = os.popen(lines[0].strip())  # path for tor.exe
								binary = FirefoxBinary(lines[0].strip())  # full path for firefox.exe

								# options = Options()
								profile = FirefoxProfile(lines[1].strip())  # full path for profile.default

								# Send all browser traffic, including DNS lookups, through the local SOCKS
								# proxy on 127.0.0.1:9150 (presumably the Tor SOCKS listener - confirm).
								profile.set_preference('network.proxy.type', 1)
								profile.set_preference('network.proxy.socks', '127.0.0.1')
								profile.set_preference('network.proxy.socks_port', 9150)
								profile.set_preference("network.proxy.socks_remote_dns", True)
								profile.update_preferences()

								service = Service(lines[2].strip())  # full path for geckodriver.exe

								# NOTE(review): the firefox_binary/firefox_profile keyword arguments are
								# deprecated in Selenium 4; confirm the pinned Selenium version before
								# upgrading.
								driver = webdriver.Firefox(firefox_binary=binary, firefox_profile=profile,
								                           service=service)


								# Manual captcha solver
								def captcha(driver):
								    """Let a human operator solve the landing-page captcha.

								    Waits for the captcha form, saves a screenshot of the captcha element,
								    shows it in an image viewer, reads the solution from the terminal, types
								    it into the form and submits it. Returns once the listing page heading is
								    visible.

								    Args:
								        driver: Selenium WebDriver already pointed at the captcha page.

								    Raises:
								        selenium.common.exceptions.TimeoutException: if the captcha form or
								            the listing page does not become visible within 100 seconds.
								        selenium.common.exceptions.NoSuchElementException: if one of the
								            hard-coded XPaths no longer matches the page.
								    """
								    # wait for captcha page show up
								    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
								        (By.XPATH, "/html/body/div/div/form/button[1]")))

								    # Save and open the captcha through the SAME path. The screenshot was
								    # previously written relative to the working directory but opened from a
								    # hard-coded absolute location, which could show a stale or missing file.
								    captchaPath = os.path.abspath("captcha.png")
								    driver.find_element(
								        by=By.XPATH,
								        value="/html/body/div/div/form/div[1]/div[1]").screenshot(captchaPath)

								    # show the saved captcha in the default image viewer
								    im = Image.open(captchaPath)
								    im.show()

								    # locate the input box for the captcha solution
								    inputBox = driver.find_element(
								        by=By.XPATH, value="/html/body/div/div/form/div[1]/div[2]/input")

								    # ask user to input the captcha solution in the terminal
								    userIn = input("Enter solution: ")

								    # send user solution into the input space
								    inputBox.send_keys(userIn)

								    # click the verify(submit) button
								    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()

								    # wait for listing page show up (This Xpath may need to change based on different seed url)
								    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
								        (By.XPATH, "/html/body/main/div/div/div[2]/div[1]/div[1]/form/div[1]/h1")))


								# Saves the crawled html page
								def savePage(page, url):
								    """Write a crawled page to the file derived from its URL.

								    Args:
								        page: encoded page content (bytes); the file is opened in binary mode.
								        url: URL the page was fetched from; passed to getFullPathName to
								            choose the destination file.

								    Returns:
								        None.
								    """
								    filePath = getFullPathName(url)
								    os.makedirs(os.path.dirname(filePath), exist_ok=True)
								    # The context manager flushes and closes the file even if write() fails;
								    # the bare open(...).write(...) left that to the garbage collector.
								    with open(filePath, 'wb') as outFile:
								        outFile.write(page)


								# Gets the full path of the page to be saved along with its appropriate file name
								def getFullPathName(url):
								    """Build the destination path for a crawled page.

								    The path has the form
								    <HTML_Pages root>\\<MMDDYYYY>\\<Description|Listing>\\<name>.html, where
								    <name> comes from getNameFromURL and the sub-folder is 'Description' when
								    isDescriptionLink(url) is true and 'Listing' otherwise.

								    Args:
								        url: URL of the page being saved.

								    Returns:
								        The full path as a string.
								    """
								    fileName = getNameFromURL(url)

								    # Take one snapshot of today's date so month, day and year cannot come
								    # from different days when the call straddles midnight.
								    today = date.today()
								    dateFolder = "%02d%02d%04d" % (today.month, today.day, today.year)

								    pageKind = 'Description' if isDescriptionLink(url) else 'Listing'

								    # The doubled backslashes are kept so the returned string is identical to
								    # what earlier runs produced.
								    rootDir = r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\HTML_Pages\\'
								    return rootDir + dateFolder + r'\\' + pageKind + r'\\' + fileName + '.html'


								# Creates the name of the file based on URL

								# Fallback file-name counter used when a URL contains no alphanumeric
								# characters. It must exist at module level because getNameFromURL declares
								# it global; without this initialisation the fallback raised NameError.
								counter = 1


								def getNameFromURL(url):
								    """Return a file name derived from a URL.

								    The name is the URL with every non-alphanumeric character removed. If
								    nothing remains, the current value of the module-level ``counter`` is
								    used instead and the counter is incremented.

								    Args:
								        url: URL string to convert.

								    Returns:
								        The derived file name, without extension.
								    """
								    global counter
								    name = ''.join(e for e in url if e.isalnum())
								    if not name:
								        name = str(counter)
								        counter += 1
								    return name


								def getInterestedLinks():
								    """Return the category listing URLs that seed the crawl, in crawl order."""
								    categoryRoot = 'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/'
								    categoryIds = (
								        '30739153-1fcd-45cd-b919-072b439c6e06',  # Guides and Tutorials
								        '0e384d5f-26ef-4561-b5a3-ff76a88ab781',  # Digital Products
								        '6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc',  # Software and Malware
								        'b9dc5846-5024-421e-92e6-09ba96a03280',  # Services
								        'fd1c989b-1a74-4dc0-92b0-67d8c1c487cb',  # Miscellaneous
								        '5233fd6a-72e6-466d-b108-5cc61091cd14',  # Hosting and Security
								    )
								    return [categoryRoot + categoryId for categoryId in categoryIds]


								def isDescriptionLink(url):
								    """Return True when the URL contains 'product', i.e. is treated as a description page."""
								    return 'product' in url


								def isListingLink(url):
								    """Return True when the URL contains 'category', i.e. is treated as a listing page."""
								    return 'category' in url


								def productPages(html):
								    """Parse a listing page with html.parser and return what darkfox_links_parser extracts from it."""
								    return darkfox_links_parser(BeautifulSoup(html, "html.parser"))


								def isSignOut(url):
								    """Return True when the URL contains 'signout' or 'logout', ignoring case."""
								    lowered = url.lower()
								    return any(marker in lowered for marker in ('signout', 'logout'))


								# dark fox seed url
								baseurl = 'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion'
								driver.get(baseurl)
								captcha(driver)

								# visited = set()
								# visited.add(br.geturl())
								linksToCrawl = getInterestedLinks()
								initialTime = time.time()

								# Fetch and save every seed listing page. A failure on one link is reported
								# and the crawl moves on to the next link.
								for link in linksToCrawl:
								    print('Crawling :', link)
								    try:
								        driver.get(link)
								        # The second positional argument of str.encode is the error handler
								        # name, not a URL, so only the encoding is passed here.
								        html = driver.page_source.encode('utf-8')
								        savePage(html, link)

								        # Disabled: follow product links and "next page" navigation.
								        # has_next_page = True
								        # while has_next_page:
								        #     j = 0
								        #     list = productPages(html)
								        #     for item in list:
								        #         if j == 1:
								        #             break
								        #         itemURL = str(item)
								        #         driver.get(itemURL)
								        #         savePage(driver.page_source.encode('utf-8'), item)
								        #         driver.back()
								        #         j += 1
								        #
								        #     try:
								        #         link = driver.find_element(by=By.XPATH, value=
								        #             '/html/body/main/div/div[2]/div/div[2]/div/div/div/nav/a[2]').get_attribute('href')
								        #         driver.get(link)
								        #         html = driver.page_source.encode('utf-8')
								        #         savePage(html, link)
								        #     except NoSuchElementException:
								        #         has_next_page = False
								    except Exception as e:
								        # Python 3 exceptions have no .message attribute; reading it raised
								        # AttributeError inside this handler and aborted the whole crawl.
								        print(link, e)

								# finalTime = time.time()
								# print(finalTime - initialTime)

								input("Crawling DarkFox marketplace done sucessfully. Press ENTER to continue\n")