From 598e3398cbaa3a8dca009b3367d0607b725e1d2d Mon Sep 17 00:00:00 2001 From: Anita Mehr Date: Sat, 2 Dec 2023 22:20:44 +0000 Subject: [PATCH] first try. --- MarketPlaces/WeTheNorth/crawler_selenium.py | 43 +++++++++------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/MarketPlaces/WeTheNorth/crawler_selenium.py b/MarketPlaces/WeTheNorth/crawler_selenium.py index c6d5b70..4b95513 100644 --- a/MarketPlaces/WeTheNorth/crawler_selenium.py +++ b/MarketPlaces/WeTheNorth/crawler_selenium.py @@ -14,6 +14,8 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait from PIL import Image +import codecs +import socks, socket, time import urllib.parse as urlparse import os, re, time from datetime import date @@ -22,6 +24,7 @@ from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.WeTheNorth.parser import wethenorth_links_parser from MarketPlaces.Utilities.utilities import cleanHTML +import selenium counter = 1 baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion' @@ -40,25 +43,24 @@ def startCrawling(): print(driver.current_url, e) closeDriver(driver) - new_parse(marketName, False) + new_parse(marketName, baseURL, True) # Login using premade account credentials and do login captcha manually def login(driver): time.sleep(3) #wait for login page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[1]/input"))) + input("Press ENTER when CAPTCHA is completed\n") #entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[1]/input') + usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="login"]') #Username here usernameBox.send_keys('blabri') - passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[2]/input') + passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="pass"]') #Password here passwordBox.send_keys('fishowal') - ''' + # wait for captcha page show up WebDriverWait(driver, 100).until(EC.visibility_of_element_located( (By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img"))) @@ -83,27 +85,27 @@ def login(driver): # click the verify(submit) button driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div[2]/div/div[3]/form/div[5]/input").click() - ''' + input("Press ENTER when CAPTCHA is completed\n") # wait for listing page show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 50).until(EC.visibility_of_element_located( (By.XPATH, '//*[@id="information"]'))) - # Returns the name of the website def getMarketName(): name = 'WeTheNorth' return name +def getMKTName() -> str: + name = 'WeTheNorth' + return name # Return the link of the website def getFixedURL(): url = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion' - return url - # Closes Tor Browser def closeDriver(driver): # global pid @@ -123,13 +125,13 @@ def createFFDriver(): ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) - # ff_prof.set_preference("places.history.enabled", False) - # ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) - # ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) - # ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) - # ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) - # ff_prof.set_preference("signon.rememberSignons", False) - # ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + ff_prof.set_preference("places.history.enabled", False) + ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) # ff_prof.set_preference("network.dns.disablePrefetch", True) # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) @@ -206,7 +208,6 @@ def getInterestedLinks(): # Software and Malware links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=10') - return links @@ -243,12 +244,6 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break try: nav = driver.find_element(by=By.XPATH, value=