Browse Source

first try.

main
Anita Mehr 1 year ago
parent
commit
598e3398cb
1 changed file with 19 additions and 24 deletions
  1. +19
    -24
      MarketPlaces/WeTheNorth/crawler_selenium.py

+ 19
- 24
MarketPlaces/WeTheNorth/crawler_selenium.py View File

@ -14,6 +14,8 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image
import codecs
import socks, socket, time
import urllib.parse as urlparse
import os, re, time
from datetime import date
@ -22,6 +24,7 @@ from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.WeTheNorth.parser import wethenorth_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
import selenium
counter = 1
baseURL = 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion'
@ -40,25 +43,24 @@ def startCrawling():
print(driver.current_url, e)
closeDriver(driver)
new_parse(marketName, False)
new_parse(marketName, baseURL, True)
# Login using premade account credentials and do login captcha manually
def login(driver):
time.sleep(3)
#wait for login page
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[1]/input")))
input("Press ENTER when CAPTCHA is completed\n")
#entering username and password into input boxes
usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[1]/input')
usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="login"]')
#Username here
usernameBox.send_keys('blabri')
passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div[2]/div/div[3]/form/div[2]/input')
passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="pass"]')
#Password here
passwordBox.send_keys('fishowal')
'''
# wait for captcha page show up
WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
(By.XPATH, "/html/body/div/div[2]/div[2]/div/div[3]/form/div[3]/div/img")))
@ -83,27 +85,27 @@ def login(driver):
# click the verify(submit) button
driver.find_element(by=By.XPATH, value="/html/body/div/div[2]/div[2]/div/div[3]/form/div[5]/input").click()
'''
input("Press ENTER when CAPTCHA is completed\n")
# wait for the listing page to show up (this XPath may need to change depending on the seed URL)
WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
(By.XPATH, '//*[@id="information"]')))
# Returns the name of the website
def getMarketName() -> str:
    """Return the name of the website this crawler targets."""
    return 'WeTheNorth'
def getMKTName() -> str:
    """Return the name of the website this crawler targets."""
    return 'WeTheNorth'
# Return the link of the website
def getFixedURL() -> str:
    """Return the link of the website (the marketplace's .onion address)."""
    return 'http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion'
# Closes Tor Browser
def closeDriver(driver):
# global pid
@ -123,13 +125,13 @@ def createFFDriver():
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
# ff_prof.set_preference("places.history.enabled", False)
# ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
# ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
# ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
# ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
# ff_prof.set_preference("signon.rememberSignons", False)
# ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
ff_prof.set_preference("places.history.enabled", False)
ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
ff_prof.set_preference("signon.rememberSignons", False)
ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
# ff_prof.set_preference("network.dns.disablePrefetch", True)
# ff_prof.set_preference("network.http.sendRefererHeader", 0)
ff_prof.set_preference("permissions.default.image", 3)
@ -206,7 +208,6 @@ def getInterestedLinks():
# Software and Malware
links.append('http://hn2paw7zaahbikbejiv6h22zwtijlam65y2c77xj2ypbilm2xs4bnbid.onion/items.php?category=10')
return links
@ -243,12 +244,6 @@ def crawlForum(driver):
savePage(driver, driver.page_source, item)
driver.back()
# comment out
break
# comment out
if count == 1:
break
try:
nav = driver.find_element(by=By.XPATH, value=


Loading…
Cancel
Save