__author__ = 'DarkWeb'

'''
Abacus Marketplace Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.Abacus.parser import abacus_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion'


# Opens Tor Browser, logs in, crawls the market, then closes the driver
def startCrawling():
    mktName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

        # new_parse(mktName, baseURL, True)


# Returns the name of the website
def getMKTName():
    name = 'Abacus'
    return name


# Return the base link of the website
def getFixedURL():
    url = 'http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion'
    return url


# Closes Tor Browser
def closeDriver(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return


# Creates the FireFox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver():
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

    driver.maximize_window()

    return driver


# The driver 'gets' the url, attempting to get on the site; if it can't access, return 'down'
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'
# Manual login: the operator solves the CAPTCHA, then the crawler fills in the credentials
def login(driver):
    input("Press ENTER when CAPTCHA is complete and login page has loaded\n")

    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div/div/div[1]/div/form/div[3]/input[1]')))

    # entering username and password into input boxes
    try:
        usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[1]/div/form/div[3]/input[1]')
        # Username here
        usernameBox.send_keys('ct1234')
        passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[1]/div/form/div[3]/input[2]')
        # Password here
        passwordBox.send_keys('DementedBed123-')
    except:
        usernameBox = driver.find_element(by=By.CSS_SELECTOR, value='input.border-solid:nth-child(2)')
        # Username here
        usernameBox.send_keys('ct1234')
        passwordBox = driver.find_element(by=By.CSS_SELECTOR, value='input.border-solid:nth-child(4)')
        # Password here
        passwordBox.send_keys('DementedBed123-')

    input("Press ENTER AFTER phishing is completed (there is a captcha first and then an antiphishing check)\n")

    # wait for the listing page to show up (this XPath may need to change based on different seed url)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div/div/div[2]/div/div[2]')))


# Saves the cleaned HTML of a page to the shared folder
def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    open(filePath, 'wb').write(cleanPage.encode('utf-8'))
    return


# Builds the full file path for a saved page (Description vs. Listing subfolder)
def getFullPathName(url):
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
    return fullPath


def getMKTName() -> str:
    name = 'Abacus'
    return name


# Derives a file name from a URL (alphanumeric characters only; falls back to a global counter)
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


# Seed listing-page URLs (product categories) to crawl
def getInterestedLinks():
    links = []

    # botnets and malware
    links.append('http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion/search?fcats[]=475756f633d0cc71f0c868bd&cats=2&s_quick=1')
    # # social engineering
    # links.append('http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion/search?fcats[]=1c29a89f7a4022133cab877d&cats=2&s_quick=1')
    # digital
    links.append('http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion/search?fcats[]=475756f633d0cc71f0c868bd&cats=2&s_quick=1')
    # # hacking
    # links.append('http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion/search?fcats[]=a0773b3de70bdaca38acda2f&cats=2&s_quick=1')
    # # carding
    # links.append('http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion/search?fcats[]=1b17857dc74c11953df85c55&cats=2&s_quick=1')

    return links
# Visits each interested listing page, saves the listing and product pages, and follows pagination
def crawlForum(driver):
    print("Crawling the Abacus market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    print('waiting ten seconds to avoid ddos check')
                    time.sleep(10)
                    driver.get(link)
                except:
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                productList = productPages(html)
                for item in productList:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        print(itemURL)
                        print('waiting 5 sec to avoid ddos check')
                        time.sleep(5)
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    print('waiting 20 seconds to avoid ddos check')
                    time.sleep(20)
                    driver.back()

                    # comment out
                    break

                # # comment out
                # if count == 3:
                #     break

                # find the 'next page' chevron in the pagination bar
                try:
                    chev = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[2]/div/div[3]/div[4]')
                    a_tags = chev.find_elements(by=By.TAG_NAME, value='a')
                    try:
                        for a_tag in a_tags:
                            try:
                                temp = a_tag.find_element(by=By.CLASS_NAME, value='gg-chevron-right')
                            except:
                                temp = ''
                            if temp:
                                link = a_tag.get_attribute('href')
                                print(link)
                                if link == '#':
                                    link = ''
                                break
                            else:
                                link = ''
                    except:
                        # fallback: check the second-to-last anchor for the chevron icon
                        # and take the href from that anchor itself
                        try:
                            a_tag = a_tags[-2].find_element(by=By.CLASS_NAME, value='gg-chevron-right')
                            if a_tag:
                                link = a_tags[-2].get_attribute('href')
                                if link == '#':
                                    link = ''
                            else:
                                link = ''
                        except:
                            link = ''

                    if link == "":
                        raise NoSuchElementException
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the Abacus market done.")


# Returns 'True' if the link is a Topic link, may need to change for every website
def isDescriptionLink(url):
    if 'listing' in url:
        return True
    return False


# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
    if 'search' in url:
        return True
    return False


# Extracts product (description) links from a listing page
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return abacus_links_parser(soup)


def crawler():
    startCrawling()
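
# Optional convenience entry point (not part of the original module): a minimal sketch for
# running this crawler directly, assuming the TOR/Project settings read by
# MarketPlaces.Initialization.markets_mining.config are already in place. The framework
# normally calls crawler() itself, so this guard is purely a convenience.
if __name__ == '__main__':
    crawler()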