__author__ = 'cern'

'''
BlackPyramid Market Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver import ActionChains
import selenium.webdriver.support.ui as uiClasses
from selenium.webdriver.common.keys import Keys

from PIL import Image

import urllib.parse as urlparse
import os, re, time
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.BlackPyramid.parser import BlackPyramid_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

import traceback

counter = 1
baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/'


# Opens Tor Browser and crawls the website
def startCrawling():
    marketName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closetor(driver)

    new_parse(marketName, baseURL, True)


# Logs into the marketplace
def login(driver):
    # wait for the login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "//input[@name='username_login']")))

    # enter username and password into the input boxes
    usernameBox = driver.find_element(by=By.XPATH, value="//input[@name='username_login']")
    # Username here
    usernameBox.send_keys('ChipotleSteakBurrito')
    passwordBox = driver.find_element(by=By.XPATH, value="//input[@name='password_login']")
    # Password here
    passwordBox.send_keys('BlackBeans')

    input("Press ENTER when the CAPTCHA is completed and you have closed the newsletter\n")

    # wait for the listing page to show up (this XPath may need to change for a different seed URL)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, '//*[@id="form93b"]')))


# Returns the name of the website
def getMKTName():
    name = 'BlackPyramid'
    return name


# Returns the link of the website
def getFixedURL():
    url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1'
    return url


# Closes Tor Browser
def closetor(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return


# Creates a Firefox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver():
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") ff_prof.set_preference('network.proxy.type', 1) ff_prof.set_preference("network.proxy.socks_version", 5) ff_prof.set_preference('network.proxy.socks', '127.0.0.1') ff_prof.set_preference('network.proxy.socks_port', 9150) ff_prof.set_preference('network.proxy.socks_remote_dns', True) ff_prof.set_preference("javascript.enabled", False) ff_prof.update_preferences() service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) driver.maximize_window() return driver def getAccess(): url = getFixedURL() driver = createFFDriver() try: driver.get(url) return driver except: driver.close() return 'down' # Saves the crawled html page def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) os.makedirs(os.path.dirname(filePath), exist_ok=True) open(filePath, 'wb').write(cleanPage.encode('utf-8')) return # Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath # Creates the file name from passed URL def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) if name == '': name = str(counter) counter = counter + 1 return name def page_is_fully_loaded(driver): return driver.execute_script("return document.readyState") == "complete" def goToPage(driver, page): # hover over digital -> hacking tools a = ActionChains(driver) WebDriverWait(driver, 100).until(EC.visibility_of_element_located( (By.XPATH, "//li[@class='dig940']/div/a"))) # hover digitalB = driver.find_element(By.XPATH, "//li[@class='dig940']/div/a") time.sleep(1) a.move_to_element(digitalB).perform() # print(digitalB) # delay for website to register hover time.sleep(5) # click xpath = "//input[@name='" + page + "']" link = driver.find_element(By.XPATH, xpath) time.sleep(1) a.move_to_element(link).click().perform() # print(link) # wait for website to load time.sleep(10) WebDriverWait(driver, 100).until(page_is_fully_loaded) def getInterestedLinks(): links = [] # h11 -> Hacking Tools links.append('h11') # g3 -> Guides, Hacking links.append('g3') # se3 -> Services links.append('se11') # f6 -> Fraud links.append('f11') return links def crawlForum(driver): print("Crawling the BlackPyramid market") pages = getInterestedLinks() i = 0 for listing in pages: print('Crawling :', listing) try: driver.get(baseURL) goToPage(driver, listing) has_next_page = True count = 0 currentPage = 1 while has_next_page: html = driver.page_source savePage(driver, html, listing + "page" + str(currentPage)) # get a list of urls for each listing list = productPages(html) for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: driver.get(itemURL) except: # driver.refresh() continue savePage(driver, driver.page_source, item) # can't use the back button in dark pyramid # driver.back() # # comment out # break # # # 
                # comment out
                # if count == 1:
                #     break

                # go to the next page of the market
                try:
                    # scroll to the top of the page to see the navigation bar
                    driver.find_element(by=By.XPATH, value="//body").send_keys(Keys.CONTROL + Keys.HOME)

                    goToPage(driver, listing)

                    nav = driver.find_element(by=By.XPATH, value="//input[@name='next_page']")

                    if nav.is_enabled():
                        # select next page
                        pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']"))
                        # print("pg options:", pgnum.options)
                        numberOfPages = len(pgnum.options)

                        if currentPage >= numberOfPages:
                            raise NoSuchElementException
                        pgnum.select_by_index(currentPage)
                        currentPage += 1

                        # click button
                        pgbutton = driver.find_element(by=By.XPATH, value="//input[@value='go to page']")
                        pgbutton.click()

                        # wait for website to load
                        time.sleep(10)
                        WebDriverWait(driver, 100).until(page_is_fully_loaded)
                    else:
                        raise NoSuchElementException
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(listing, e)
        i += 1

    print("Crawling the BlackPyramid market done.")


# Returns True if the link is a product description link
def isDescriptionLink(url):
    if 'product' in url:
        return True
    return False


# Returns True if the link is a listing page link
def isListingLink(url):
    if 'category=' in url:
        return True
    return False


# calls the parser to extract the product links from a listing page
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return BlackPyramid_links_parser(soup)


def crawler():
    startCrawling()
    # print("Crawling and Parsing BlackPyramid .... DONE!")