
refactored BlackPyramid

Branch: main
westernmeadow committed 1 year ago
Parent commit: bbe020ceda

2 changed files with 67 additions and 98 deletions:
  1. MarketPlaces/BlackPyramid/crawler_selenium.py (+65 -92)
  2. MarketPlaces/BlackPyramid/parser.py (+2 -6)

MarketPlaces/BlackPyramid/crawler_selenium.py (+65 -92)

@@ -27,24 +27,15 @@ from MarketPlaces.Utilities.utilities import cleanHTML
 import traceback

-config = configparser.ConfigParser()
-config.read('../../setup.ini')
-
 counter = 1

-baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1'
+baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/'


 # Opens Tor Browser, crawls the website
 def startCrawling():
-    # Opening tor beforehand gives "Tor exited during startup error"
-    # opentor()
-
-    marketName = getMarketName()
+    marketName = getMKTName()
     driver = getAccess()

-    # Wait for website to load
-    input("Press ENTER when website has loaded")
-
     if driver != 'down':
         try:
             login(driver)
@@ -56,19 +47,12 @@ def startCrawling():
     new_parse(marketName, baseURL, False)


-# Opens Tor Browser
-def opentor():
-    global pid
-    print("Connecting Tor...")
-    pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
-    pid = pro.pid
-    time.sleep(7.5)
-    input('Tor Connected. Press ENTER to continue\n')
-    return
-
-
 # Login
 def login(driver):
+    # wait for login page
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, "//input[@name='username_login']")))
+
     # entering username and password into input boxes
     usernameBox = driver.find_element(by=By.XPATH, value="//input[@name='username_login']")
     # Username here
@@ -77,15 +61,15 @@ def login(driver):
     # Password here
     passwordBox.send_keys('BlackBeans')

-    input("Press ENTER when CAPTCHA is completed\n")
+    input("Press ENTER when CAPTCHA is completed and you closed the newsletter\n")

     # wait for listing page show up (This Xpath may need to change based on different seed url)
-    # WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-    #     (By.XPATH, '/html/body/div[2]/div[3]/div[3]/div[1]/div[3]/nav/ul/li[10]/a')))
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '//*[@id="form93b"]')))


 # Returns the name of the website
-def getMarketName():
+def getMKTName():
     name = 'BlackPyramid'
     return name
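
Note: this commit replaces manual input() pauses with Selenium explicit waits. For reference, a minimal, self-contained sketch of the pattern (the URL is a placeholder; the XPath matches the one used above):

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    driver = webdriver.Firefox()
    driver.get("http://example.onion/login")  # placeholder URL

    # block (up to 100 s) until the login form is actually rendered,
    # instead of pausing on input() and hoping the page is ready
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "//input[@name='username_login']")))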
@@ -103,7 +87,7 @@ def closetor(driver):
     # os.system("taskkill /pid " + str(pro.pid))
     # os.system("taskkill /t /f /im tor.exe")
     print('Closing Tor...')
-    driver.quit()
+    driver.close()
     time.sleep(3)
     return
@@ -111,6 +95,8 @@ def closetor(driver):
 # Creates FireFox 'driver' and configure its 'Profile'
 # to use Tor proxy and socket
 def createFFDriver():
+    from MarketPlaces.Initialization.markets_mining import config
+
     ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

     ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@@ -121,8 +107,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    ff_prof.set_preference("network.dns.disablePrefetch", True)
-    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    # ff_prof.set_preference("network.dns.disablePrefetch", True)
+    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -139,13 +125,14 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver


 def getAccess():
     url = getFixedURL()
     driver = createFFDriver()
+    input('Tor Connected. Press ENTER to continue\n')
     try:
         driver.get(url)
         return driver
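
Note: only the success path of getAccess() is visible in this hunk. Based on the `if driver != 'down'` check in startCrawling(), the full pattern is presumably the following (a sketch; the except branch is assumed, not shown in the diff):

    def getAccess():
        url = getFixedURL()
        driver = createFFDriver()
        input('Tor Connected. Press ENTER to continue\n')
        try:
            driver.get(url)      # may raise if the onion service is unreachable
            return driver
        except:
            driver.close()       # assumed cleanup
            return 'down'        # sentinel consumed by startCrawling()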
@@ -155,8 +142,8 @@ def getAccess():
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -165,19 +152,14 @@ def savePage(page, url):
 # Gets the full path of the page to be saved along with its appropriate file name
 def getFullPathName(url):
-    global counter
-    from MarketPlaces.Initialization.markets_mining import CURRENT_DATE
+    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
+    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
     fileName = getNameFromURL(url)
     if isDescriptionLink(url):
-        if (os.path.exists(r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html')):
-            fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + "(" + str(counter) + ")" + '.html'
-        else:
-            fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html'
+        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
     else:
-        if (os.path.exists(r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html')):
-            fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + "(" + str(counter) + ")" + '.html'
-        else:
-            fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html'
+        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
     return fullPath
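
Note: the new version still embeds Windows-style separators (r'\\Description\\' inside a raw string stays a literal pair of backslashes). A fully portable variant would pass each path component to os.path.join separately; a sketch of that alternative, not what the commit does (hypothetical signature, the real function reads config and module globals):

    import os

    def full_path_name(main_dir, current_date, url):
        file_name = getNameFromURL(url)
        subdir = 'Description' if isDescriptionLink(url) else 'Listing'
        # os.path.join inserts the correct separator for the host OS
        return os.path.join(main_dir, current_date, subdir, file_name + '.html')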
@@ -190,15 +172,23 @@ def getNameFromURL(url):
         counter = counter + 1
     return name


+def page_is_fully_loaded(driver):
+    return driver.execute_script("return document.readyState") == "complete"
+
+
 def goToPage(driver, page):
     # hover over digital -> hacking tools
     a = ActionChains(driver)
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, "//li[@class='dig940']/div/a")))

     # hover
     digitalB = driver.find_element(By.XPATH, "//li[@class='dig940']/div/a")
     time.sleep(1)
     a.move_to_element(digitalB).perform()
-    print(digitalB)
+    # print(digitalB)

     # delay for website to register hover
     time.sleep(10)
@@ -208,11 +198,11 @@ def goToPage(driver, page):
     link = driver.find_element(By.XPATH, xpath)
     time.sleep(1)
     a.move_to_element(link).click().perform()
-    print(link)
+    # print(link)

     # wait for website to load
-    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, '/html/body/center/div[4]/div[1]/div[3]/article/div[1]/h1/a')))
+    time.sleep(10)
+    WebDriverWait(driver, 100).until(page_is_fully_loaded)


 def getInterestedLinks():
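
Note: page_is_fully_loaded() works because WebDriverWait.until() accepts any callable that takes the driver and returns a truthy value; document.readyState reports "complete" once the page and its subresources have finished loading. The same wait can also be written inline:

    WebDriverWait(driver, 100).until(
        lambda d: d.execute_script("return document.readyState") == "complete")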
@@ -226,82 +216,72 @@ def getInterestedLinks():
 def crawlForum(driver):
     print("Crawling the BlackPyramid market")

-    # linksToCrawl = getInterestedLinks()
-    # pages = ["Hacking Tools"]
     pages = getInterestedLinks()
-    # visited = set(linksToCrawl)
-    initialTime = time.time()

     i = 0
-    count = 0
     for listing in pages:
-        # link = linksToCrawl[i]
         print('Crawling :', listing)
         try:
-            try:
-                goToPage(driver, listing)
-            except:
-                print("Try block 1")
-                driver.refresh()
-            time.sleep(5)
-            html = driver.page_source
-            savePage(html, listing)
+            goToPage(driver, listing)

             has_next_page = True
+            count = 0
             currentPage = 1
+            numberOfPages = 1

             while has_next_page:
+                html = driver.page_source
+                savePage(driver, html, listing + "page" + str(currentPage))
+
                 # get a list of urls for each listing
                 list = productPages(html)
-                for item in list:
+                for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
                     try:
                         driver.get(itemURL)
                     except:
-                        print("Try block 2")
-                        driver.refresh()
-                    savePage(driver.page_source, item)
+                        # driver.refresh()
+                        continue
+                    savePage(driver, driver.page_source, item)

                     # can't use the back button in dark pyramid
                     # driver.back()

                     # comment out
-                    # break
+                    break

                 # comment out
-                # if count == 1:
-                #     count = 0
-                #     break
+                if count == 1:
+                    break

                 # go to next page of market
                 try:
                     goToPage(driver, listing)
                     nav = driver.find_element(by=By.XPATH, value="//input[@name='next_page']")
-                    if not nav.is_enabled():
-                        raise NoSuchElementException
-                    try:
+                    if nav.is_enabled():
                         # select next page
                         pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']"))
-                        print("pg options:", pgnum.options)
-                        pgnum.select_by_index(currentPage)
+                        # print("pg options:", pgnum.options)
                         numberOfPages = len(pgnum.options)
+                        if currentPage >= numberOfPages:
+                            raise NoSuchElementException
+                        pgnum.select_by_index(currentPage)
+                        currentPage += 1

                         # click button
                         pgbutton = driver.find_element(by=By.XPATH, value="//input[@value='go to page']")
                         pgbutton.click()
-                    except Exception as e:
-                        print(e)
-                        raise NoSuchElementException
-                    time.sleep(10)
-                    html = driver.page_source
-                    savePage(html, listing)
-                    currentPage += 1
-                    if currentPage > numberOfPages:
+
+                        # wait for website to load
+                        time.sleep(10)
+                        WebDriverWait(driver, 100).until(page_is_fully_loaded)
+                    else:
                         raise NoSuchElementException

                     count += 1
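
Note: Select.select_by_index() is zero-based, so with currentPage starting at 1, selecting index currentPage picks the option one past the page just crawled, and the new currentPage >= numberOfPages guard ends the loop after the last option. The pagination step in isolation, assuming the same imports and variables as the function above:

    # assumes 'driver' is on a listing page showing the pagination <select>
    pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']"))
    numberOfPages = len(pgnum.options)       # total pages offered by the dropdown
    if currentPage < numberOfPages:
        pgnum.select_by_index(currentPage)   # 0-based: index 1 == page 2 when currentPage == 1
        currentPage += 1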
@@ -309,14 +289,10 @@ def crawlForum(driver):
                     has_next_page = False

         except Exception as e:
+            traceback.print_exc()
             print(listing, e)
         i += 1

-    # finalTime = time.time()
-    # print finalTime - initialTime
-
-    input("Crawling Dark Pyramid done successfully. Press ENTER to continue\n")
+    print("Crawling the BlackPyramid market done.")


 # Returns 'True' if the link is Topic link
@@ -342,6 +318,3 @@ def productPages(html):
 def crawler():
     startCrawling()
     # print("Crawling and Parsing BestCardingWorld .... DONE!")
-
-if __name__ == '__main__':
-    startCrawling()

MarketPlaces/BlackPyramid/parser.py (+2 -6)

@@ -179,7 +179,6 @@ def BlackPyramid_listing_parser(soup):
         # Adding the url to the list of urls
         link = bae[2].get('href')
-        link = cleanLink(link)
         href.append(link)

         # Finding the Product
@@ -276,10 +275,7 @@ def BlackPyramid_links_parser(soup):
     for item in listing:
-        container = item.find('a', {"class": "ah39063"})
-        if container:
-            link = item.find('a', {"class": "ah39063"})['href']
-            href.append(link)
+        link = item.find('a', {"class": "ah39063"})['href']
+        href.append(link)

     return href
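
Note: dropping the `if container:` guard means a listing row without an <a class="ah39063"> anchor now raises TypeError ('NoneType' object is not subscriptable) instead of being skipped. If skipping was intended, a defensive variant would be (not what this commit does):

    for item in listing:
        container = item.find('a', {"class": "ah39063"})
        if container and container.get('href'):
            href.append(container['href'])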
