__author__ = 'DarkWeb'

'''
Quest Marketplace Crawler (Selenium)

Drives a Tor-proxied Firefox instance to log in to the Quest market,
walk each category listing page, save the listing and product
description HTML to disk, and finally hand the pages to the parser.
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.Quest.parser import quest_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

# Fallback name used by getNameFromURL when a URL has no alphanumerics.
counter = 1
baseURL = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion'


def startCrawling():
    """Entry point: open the Tor driver, log in, crawl every category,
    close the browser, then parse the saved HTML pages."""
    mktName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(mktName, baseURL, True)


# NOTE: the original file defined getMKTName twice with identical bodies;
# the duplicate has been removed and the annotated version kept.
def getMKTName() -> str:
    """Return the name of the marketplace."""
    name = 'Quest'
    return name


def getFixedURL():
    """Return the base .onion URL of the marketplace."""
    url = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion'
    return url


def closeDriver(driver):
    """Close the Tor Browser window bound to *driver*.

    A short sleep gives the browser time to shut down cleanly before the
    caller moves on (e.g. to parsing the saved pages).
    """
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return


def createFFDriver():
    """Create a Firefox WebDriver configured to route through the local
    Tor SOCKS proxy (127.0.0.1:9150) with history, cookies, images and
    JavaScript disabled for stealth and speed.

    Returns the ready, maximized driver.
    """
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)  # 3 = block images
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
    driver.maximize_window()

    return driver


def getAccess():
    """'Get' the seed URL with a fresh driver.

    Returns the driver on success, or the sentinel string 'down' when the
    site cannot be reached (the failed driver is closed first).
    """
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except Exception:
        driver.close()
        return 'down'


def login(driver):
    """Fill in the login form and wait for a human to solve the CAPTCHA.

    Blocks until the username field is visible, types the credentials,
    then pauses on input() so the operator can complete the CAPTCHA and
    press ENTER. Finally waits for the post-login nav logo to appear.
    """
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, '//*[@id="username"]')))

    # entering username and password into input boxes
    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
    # Username here
    usernameBox.send_keys('CashCarti')
    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
    # Password here
    passwordBox.send_keys('Mahogany')

    input("Press ENTER when CAPTCHA is completed\n")

    # wait for listing page show up (This Xpath may need to change based on different seed url)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div[1]/nav/div/a/img')))


def savePage(driver, page, url):
    """Clean *page* HTML and write it (UTF-8) to the path derived from *url*,
    creating any missing directories first."""
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    # with-block guarantees the file handle is closed even on write errors
    with open(filePath, 'wb') as f:
        f.write(cleanPage.encode('utf-8'))
    return


def getFullPathName(url):
    """Map *url* to an absolute HTML file path under the shared folder,
    split into Description/ and Listing/ subtrees by link type.

    Fixed: the original concatenated raw 'CURRENT_DATE\\\\Description\\\\'
    segments into os.path.join, which embedded literal backslashes in the
    path and broke on POSIX; each segment is now a separate join argument.
    """
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'),
                           "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Description', fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Listing', fileName + '.html')
    return fullPath


def getNameFromURL(url):
    """Derive a filesystem-safe name from *url* by keeping only its
    alphanumeric characters; fall back to a global counter when the URL
    contains none."""
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


def getInterestedLinks():
    """Return the list of category listing URLs to crawl."""
    links = []

    ## Services
    links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/8ae67900-22ed-11ec-a710-31f963ce8d35')
    ## Software
    links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77')
    ## Tutorial
    links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/9d1592b0-22ed-11ec-b82d-c3d2878a8716')
    ## Malware
    links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/a35bae90-22ed-11ec-ad2e-410f5a5339b5')
    ## Hacking
    links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/b4252cf0-22ed-11ec-8032-751549438ed5')
    ## Exploits
    links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/c0c3ac60-22ed-11ec-9e97-41cd1912fdee')
    ## Carding
    links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/cbe06b00-22ec-11ec-ab3a-816857220dec')

    return links


def crawlForum(driver):
    """Walk every category in getInterestedLinks(): save each listing
    page, visit and save each product page it links to, and follow the
    'next' pagination link until it disappears."""
    print("Crawling the Quest market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    driver.get(link)
                except Exception:
                    # a flaky Tor circuit can abort the GET; retry via refresh
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                # renamed from `list` — the original shadowed the builtin
                productLinks = productPages(html)
                for item in productLinks:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except Exception:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    # return to the listing page for the next product
                    driver.back()

                try:
                    link_elem = driver.find_element(by=By.CSS_SELECTOR, value='a.page-link[rel="next"]')
                    link = link_elem.get_attribute('href')
                    if link == "":
                        raise NoSuchElementException
                    count += 1
                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the Quest market done.")


def isDescriptionLink(url):
    """Return True if *url* points at a product description page."""
    if 'product' in url:
        return True
    return False


def isListingLink(url):
    """Return True if *url* points at a category listing page."""
    if 'category' in url:
        return True
    return False


def productPages(html):
    """Parse listing-page *html* and return the product links found by
    the Quest parser."""
    soup = BeautifulSoup(html, "html.parser")
    return quest_links_parser(soup)


def crawler():
    """Alias entry point kept for the mining framework."""
    startCrawling()