Browse Source

first try.

main
Anita Mehr 1 year ago
parent
commit
598e3398cb
1 changed file with 19 additions and 24 deletions
  1. +19
    -24
      MarketPlaces/WeTheNorth/crawler_selenium.py

+ 19
- 24
MarketPlaces/WeTheNorth/crawler_selenium.py View File

@ -14,6 +14,8 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image from PIL import Image
import codecs
import socks, socket, time
import urllib.parse as urlparse import urllib.parse as urlparse
import os, re, time import os, re, time
from datetime import date from datetime import date
@ -22,6 +24,7 @@ from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.WeTheNorth.parser import wethenorth_links_parser from MarketPlaces.WeTheNorth.parser import wethenorth_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML from MarketPlaces.Utilities.utilities import cleanHTML
import selenium
counter = 1 counter = 1
baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion' baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion'
@ -40,25 +43,24 @@ def startCrawling():
print(driver.current_url, e) print(driver.current_url, e)
closeDriver(driver) closeDriver(driver)
new_parse(marketName, False)
new_parse(marketName, baseURL, True)
# Login using premade account credentials and do login captcha manually # Login using premade account credentials and do login captcha manually
def login(driver): def login(driver):
time.sleep(3) time.sleep(3)
#wait for login page #wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[1]/input")))
input("Press ENTER when CAPTCHA is completed\n")
#entering username and password into input boxes #entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[1]/input')
usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="login"]')
#Username here #Username here
usernameBox.send_keys('blabri') usernameBox.send_keys('blabri')
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[2]/input')
passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="pass"]')
#Password here #Password here
passwordBox.send_keys('fishowal') passwordBox.send_keys('fishowal')
'''
# wait for captcha page show up # wait for captcha page show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located( WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img"))) (By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img")))
@ -83,27 +85,27 @@ def login(driver):
# click the verify(submit) button # click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div[2]/div/div[3]/form/div[5]/input").click() driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div[2]/div/div[3]/form/div[5]/input").click()
'''
input("Press ENTER when CAPTCHA is completed\n") input("Press ENTER when CAPTCHA is completed\n")
# wait for listing page show up (This Xpath may need to change based on different seed url) # wait for listing page show up (This Xpath may need to change based on different seed url)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="information"]'))) (By.XPATH, '//*[@id="information"]')))
# Returns the name of the website # Returns the name of the website
def getMarketName(): def getMarketName():
name = 'WeTheNorth' name = 'WeTheNorth'
return name return name
def getMKTName() -> str:
name = 'WeTheNorth'
return name
# Return the link of the website # Return the link of the website
def getFixedURL(): def getFixedURL():
url = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion' url = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion'
return url return url
# Closes Tor Browser # Closes Tor Browser
def closeDriver(driver): def closeDriver(driver):
# global pid # global pid
@ -123,13 +125,13 @@ def createFFDriver():
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
# ff_prof.set_preference("places.history.enabled", False)
# ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
# ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
# ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
# ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
# ff_prof.set_preference("signon.rememberSignons", False)
# ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True) # ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0) # ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("permissions.default.image", 3)
@ -206,7 +208,6 @@ def getInterestedLinks():
# Software and Malware # Software and Malware
links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=10') links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=10')
return links return links
@ -243,12 +244,6 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item) savePage(driver, driver.page_source, item)
driver.back() driver.back()
# comment out
break
# comment out
if count == 1:
break
try: try:
nav = driver.find_element(by=By.XPATH, value= nav = driver.find_element(by=By.XPATH, value=


Loading…
Cancel
Save