diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py index 6f7e45a..c34a6cb 100644 --- a/MarketPlaces/BlackPyramid/crawler_selenium.py +++ b/MarketPlaces/BlackPyramid/crawler_selenium.py @@ -27,24 +27,15 @@ from MarketPlaces.Utilities.utilities import cleanHTML import traceback -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 -baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1' +baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/' # Opens Tor Browser, crawls the website def startCrawling(): - # Opening tor beforehand gives "Tor exited during startup error" - # opentor() - - marketName = getMarketName() - + marketName = getMKTName() driver = getAccess() - # Wait for website to load - input("Press ENTER when website has loaded") - if driver != 'down': try: login(driver) @@ -56,19 +47,12 @@ def startCrawling(): new_parse(marketName, baseURL, False) -# Opens Tor Browser -def opentor(): - global pid - print("Connecting Tor...") - pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) - pid = pro.pid - time.sleep(7.5) - input('Tor Connected. Press ENTER to continue\n') - return - - # Login def login(driver): + # wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "//input[@name='username_login']"))) + # entering username and password into input boxes usernameBox = driver.find_element(by=By.XPATH, value="//input[@name='username_login']") # Username here @@ -77,15 +61,15 @@ def login(driver): # Password here passwordBox.send_keys('BlackBeans') - input("Press ENTER when CAPTCHA is completed\n") + input("Press ENTER when CAPTCHA is completed and you closed the newsletter\n") # wait for listing page show up (This Xpath may need to change based on different seed url) - #WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - # (By.XPATH, '/html/body/div[2]/div[3]/div[3]/div[1]/div[3]/nav/ul/li[10]/a'))) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="form93b"]'))) # Returns the name of the website -def getMarketName(): +def getMKTName(): name = 'BlackPyramid' return name @@ -103,7 +87,7 @@ def closetor(driver): # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") print('Closing Tor...') - driver.quit() + driver.close() time.sleep(3) return @@ -111,6 +95,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -121,8 +107,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -139,13 +125,14 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + driver.maximize_window() + return driver def getAccess(): url = getFixedURL() driver = createFFDriver() - input('Tor Connected. Press ENTER to continue\n') try: driver.get(url) return driver @@ -155,8 +142,8 @@ def getAccess(): # Saves the crawled html page -def savePage(page, url): - cleanPage = cleanHTML(page) +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) os.makedirs(os.path.dirname(filePath), exist_ok=True) open(filePath, 'wb').write(cleanPage.encode('utf-8')) @@ -165,19 +152,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): - global counter - from MarketPlaces.Initialization.markets_mining import CURRENT_DATE + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - if (os.path.exists(r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html')): - fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + "(" + str(counter) + ")" + '.html' - else: - fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - if (os.path.exists(r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html')): - fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + "(" + str(counter) + ")" + '.html' - else: - fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath @@ -190,15 +172,23 @@ def getNameFromURL(url): counter = counter + 1 return name + +def page_is_fully_loaded(driver): + return driver.execute_script("return document.readyState") == "complete" + + def goToPage(driver, page): # hover over digital -> hacking tools a = ActionChains(driver) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "//li[@class='dig940']/div/a"))) + # hover digitalB = driver.find_element(By.XPATH, "//li[@class='dig940']/div/a") time.sleep(1) a.move_to_element(digitalB).perform() - print(digitalB) + # print(digitalB) # delay for website to register hover time.sleep(10) @@ -208,11 +198,11 @@ def goToPage(driver, page): link = driver.find_element(By.XPATH, xpath) time.sleep(1) a.move_to_element(link).click().perform() - print(link) + # print(link) # wait for website to load - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/center/div[4]/div[1]/div[3]/article/div[1]/h1/a'))) + time.sleep(10) + WebDriverWait(driver, 100).until(page_is_fully_loaded) def getInterestedLinks(): @@ -226,82 +216,72 @@ def getInterestedLinks(): def crawlForum(driver): + print("Crawling the BlackPyramid market") - #linksToCrawl = getInterestedLinks() - #pages = ["Hacking Tools"] pages = getInterestedLinks() - #visited = set(linksToCrawl) - initialTime = time.time() i = 0 - count = 0 - for listing in pages: - #link = linksToCrawl[i] print('Crawling :', listing) - try: - try: - goToPage(driver, listing) - except: - print("Try block 1") - driver.refresh() - time.sleep(5) - html = driver.page_source - savePage(html, listing) + goToPage(driver, listing) has_next_page = True + count = 0 currentPage = 1 - numberOfPages = 1 + while has_next_page: + + html = driver.page_source + savePage(driver, html, listing + "page" + str(currentPage)) + # get a list of urls for each listing list = productPages(html) - for item in list: + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: driver.get(itemURL) except: - print("Try block 2") - driver.refresh() - savePage(driver.page_source, item) + # driver.refresh() + continue + savePage(driver, driver.page_source, item) # can't use the back button in dark pyramid # driver.back() # comment out - # break + break # comment out - # if count == 1: - # count = 0 - # break + if count == 1: + break # go to next page of market try: goToPage(driver, listing) nav = driver.find_element(by=By.XPATH, value="//input[@name='next_page']") - if not nav.is_enabled(): - raise NoSuchElementException - try: + if nav.is_enabled(): # select next page pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']")) - print("pg options:", pgnum.options) - pgnum.select_by_index(currentPage) + # print("pg options:", pgnum.options) numberOfPages = len(pgnum.options) + if currentPage >= numberOfPages: + raise NoSuchElementException + + pgnum.select_by_index(currentPage) + currentPage += 1 + # click button pgbutton = driver.find_element(by=By.XPATH, value="//input[@value='go to page']") pgbutton.click() - except Exception as e: - print(e) - raise NoSuchElementException - time.sleep(10) - html = driver.page_source - savePage(html, listing) - currentPage += 1 - if currentPage > numberOfPages: + + # wait for website to load + time.sleep(10) + WebDriverWait(driver, 100).until(page_is_fully_loaded) + else: raise NoSuchElementException count += 1 @@ -309,14 +289,10 @@ def crawlForum(driver): has_next_page = False except Exception as e: - traceback.print_exc() print(listing, e) i += 1 - # finalTime = time.time() - # print finalTime - initialTime - - input("Crawling Dark Pyramid done successfully. Press ENTER to continue\n") + print("Crawling the BlackPyramid market done.") # Returns 'True' if the link is Topic link @@ -342,6 +318,3 @@ def productPages(html): def crawler(): startCrawling() # print("Crawling and Parsing BestCardingWorld .... DONE!") - -if __name__ == '__main__': - startCrawling() \ No newline at end of file diff --git a/MarketPlaces/BlackPyramid/parser.py b/MarketPlaces/BlackPyramid/parser.py index 4b45ee7..ecc1dcb 100644 --- a/MarketPlaces/BlackPyramid/parser.py +++ b/MarketPlaces/BlackPyramid/parser.py @@ -179,7 +179,6 @@ def BlackPyramid_listing_parser(soup): # Adding the url to the list of urls link = bae[2].get('href') - link = cleanLink(link) href.append(link) # Finding the Product @@ -276,10 +275,7 @@ def BlackPyramid_links_parser(soup): for item in listing: - container = item.find('a', {"class": "ah39063"}) - - if container: - link = item.find('a', {"class": "ah39063"})['href'] - href.append(link) + link = item.find('a', {"class": "ah39063"})['href'] + href.append(link) return href