"""Selenium crawler for the DarkFox marketplace.

Running this module starts Firefox through a local SOCKS proxy on
127.0.0.1:9150, opens the marketplace seed URL, asks the operator to solve
the captcha in the terminal, then downloads every category page returned by
``getInterestedLinks`` and stores the raw HTML on disk.

The locations of the Firefox binary, the Firefox profile and geckodriver
are read from ``../../path.txt`` (one path per line, in that order).
"""

import codecs
import os
import time
import urllib.parse as urlparse
from datetime import date

from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from MarketPlaces.DarkFox.parser import darkfox_links_parser

# Root folder under which the crawled HTML pages are stored.
HTML_PAGES_ROOT = r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\HTML_Pages\\'

# Fallback file-name counter used by getNameFromURL when a URL contains no
# alphanumeric characters at all.
counter = 1

# path.txt layout: line 0 = firefox.exe, line 1 = profile.default,
# line 2 = geckodriver.exe.
with open('../../path.txt', 'r') as path_file:
    lines = path_file.readlines()

# torexe = os.popen(lines[0].strip())  # path for tor.exe
binary = FirefoxBinary(lines[0].strip())  # full path for firefox.exe

# options = Options()
profile = FirefoxProfile(lines[1].strip())  # full path for profile.default
# Send all traffic (including DNS lookups) through the local SOCKS proxy.
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.socks', '127.0.0.1')
profile.set_preference('network.proxy.socks_port', 9150)
profile.set_preference("network.proxy.socks_remote_dns", True)
profile.update_preferences()

service = Service(lines[2].strip())  # full path for geckodriver.exe
# NOTE(review): the firefox_binary / firefox_profile keyword arguments are
# believed to be deprecated in Selenium 4 and removed in later 4.x releases;
# confirm against the pinned Selenium version before upgrading.
driver = webdriver.Firefox(firefox_binary=binary, firefox_profile=profile,
                           service=service)


# Manual captcha solver
def captcha(driver):
    """Let the operator solve the captcha shown on the landing page.

    Waits for the captcha form, saves a screenshot of the captcha element,
    opens it in the default image viewer, reads the solution from the
    terminal, submits it, and finally waits for the listing page to appear.

    Args:
        driver: the Selenium WebDriver already pointed at the captcha page.
    """
    # wait for the captcha page to show up
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div/form/button[1]")))

    # Save the captcha locally. The same absolute path is used for writing
    # and for reading so the image shown is always the one just captured.
    captchaPath = os.path.abspath("captcha.png")
    driver.find_element(
        by=By.XPATH,
        value="/html/body/div/div/form/div[1]/div[1]").screenshot(captchaPath)

    # show the captcha in the system image viewer
    im = Image.open(captchaPath)
    im.show()

    # locate the input box for the solution
    inputBox = driver.find_element(
        by=By.XPATH, value="/html/body/div/div/form/div[1]/div[2]/input")

    # ask the user for the captcha solution in the terminal
    userIn = input("Enter solution: ")

    # type the user's solution into the input box
    inputBox.send_keys(userIn)

    # click the verify (submit) button
    driver.find_element(
        by=By.XPATH, value="/html/body/div/div/form/button[1]").click()

    # wait for the listing page to show up
    # (this XPath may need to change for a different seed url)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/main/div/div/div[2]/div[1]/div[1]/form/div[1]/h1")))


# Saves the crawled html page
def savePage(page, url):
    """Write the encoded page to the file derived from url.

    Args:
        page: the page content as bytes.
        url: the URL the page was fetched from; determines the file path.
    """
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    # Context manager guarantees the handle is flushed and closed.
    with open(filePath, 'wb') as out:
        out.write(page)
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    """Return the output path for url.

    The path is ``<HTML_PAGES_ROOT><MMDDYYYY>\\\\<Description|Listing>\\\\<name>.html``
    where the sub-folder is chosen by ``isDescriptionLink``.
    """
    fileName = getNameFromURL(url)
    # Read the date once so month/day/year cannot disagree across midnight.
    today = date.today()
    stamp = "%02d" % today.month + "%02d" % today.day + "%04d" % today.year
    subFolder = r'Description\\' if isDescriptionLink(url) else r'Listing\\'
    return HTML_PAGES_ROOT + stamp + r'\\' + subFolder + fileName + '.html'


# Creates the name of the file based on URL
def getNameFromURL(url):
    """Return url stripped to its alphanumeric characters.

    If nothing is left, the module-level ``counter`` is used as the name and
    then incremented.
    """
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


def getInterestedLinks():
    """Return the list of category URLs to crawl."""
    links = [
        # Guides and Tutorials
        'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/30739153-1fcd-45cd-b919-072b439c6e06',
        # Digital Products
        'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781',
        # Software and Malware
        'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc',
        # Services
        'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280',
        # Miscellaneous
        'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb',
        # Hosting and Security
        'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14',
    ]
    return links


def isDescriptionLink(url):
    """Return True when url contains the substring 'product'."""
    return 'product' in url


def isListingLink(url):
    """Return True when url contains the substring 'category'."""
    return 'category' in url


def productPages(html):
    """Parse html and return the result of ``darkfox_links_parser``.

    Presumably the result is the collection of product links found on a
    listing page; verify against the parser module.
    """
    soup = BeautifulSoup(html, "html.parser")
    return darkfox_links_parser(soup)


def isSignOut(url):
    """Return True when url looks like a sign-out / log-out link."""
    # absURL = urlparse.urljoin(url.base_url, url.url)
    lowered = url.lower()
    return 'signout' in lowered or 'logout' in lowered


# dark fox seed url
baseurl = 'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion'
driver.get(baseurl)
captcha(driver)

# visited = set()
# visited.add(br.geturl())
linksToCrawl = getInterestedLinks()
initialTime = time.time()

for link in linksToCrawl:
    print('Crawling :', link)
    try:
        driver.get(link)
        # encode() takes (encoding, errors); the URL must not be passed as
        # the error-handler name.
        html = driver.page_source.encode('utf-8')
        savePage(html, link)

        # Disabled: follow product links and pagination of each listing.
        # has_next_page = True
        # while has_next_page:
        #     j = 0
        #     list = productPages(html)
        #     for item in list:
        #         if j == 1:
        #             break
        #         itemURL = str(item)
        #         driver.get(itemURL)
        #         savePage(driver.page_source.encode('utf-8'), item)
        #         driver.back()
        #         j += 1
        #
        #     try:
        #         link = driver.find_element(by=By.XPATH, value=
        #         '/html/body/main/div/div[2]/div/div[2]/div/div/div/nav/a[2]').get_attribute('href')
        #         driver.get(link)
        #         html = driver.page_source.encode('utf-8')
        #         savePage(html, link)
        #     except NoSuchElementException:
        #         has_next_page = False
    except Exception as e:
        # Best-effort crawl: report the failure and move on to the next link.
        # Python 3 exceptions have no `.message`; print the exception itself.
        print(link, e)

# finalTime = time.time()
# print finalTime - initialTime

input("Crawling DarkFox marketplace done sucessfully. Press ENTER to continue\n")