
added incomplete abacus crawler manually

main · LynnTaka committed 8 months ago · commit f0f6753ab3
1 changed file with 312 additions and 0 deletions:

MarketPlaces/Abacus/crawler_selenium.py  (+312, -0)

@@ -0,0 +1,312 @@
__author__ = 'DarkWeb'

'''
Abacus Marketplace Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.Abacus.parser import abacus_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion'


def startCrawling():
    mktName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    # new_parse(mktName, baseURL, True)


# Returns the name of the website
def getMKTName():
    name = 'Abacus'
    return name


# Returns the base link of the website
def getFixedURL():
    url = 'http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion'
    return url


# Closes Tor Browser
def closeDriver(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return


# Creates FireFox 'driver' and configures its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

    driver.maximize_window()

    return driver


# The driver 'gets' the url, attempting to access the site; if it can't, it returns 'down'
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'
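

# Logs into the marketplace; the CAPTCHA and anti-phishing checks are completed manually by the user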
def login(driver):
    input("Press ENTER when CAPTCHA is complete and login page has loaded\n")

    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div/div/div[1]/div/form/div[3]/input[1]')))

    # entering username and password into input boxes
    try:
        usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[1]/div/form/div[3]/input[1]')
        # Username here
        usernameBox.send_keys('ct1234')
        passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[1]/div/form/div[3]/input[2]')
        # Password here
        passwordBox.send_keys('DementedBed123-')
    except:
        usernameBox = driver.find_element(by=By.CSS_SELECTOR, value='input.border-solid:nth-child(2)')
        # Username here
        usernameBox.send_keys('ct1234')
        passwordBox = driver.find_element(by=By.CSS_SELECTOR, value='input.border-solid:nth-child(4)')
        # Password here
        passwordBox.send_keys('DementedBed123-')

    input("Press ENTER AFTER phishing is completed (there is a captcha first and then an antiphishing check)\n")

    # wait for the listing page to show up (this XPath may need to change for a different seed url)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div/div/div[2]/div/div[2]')))
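

# Saves the cleaned html of the crawled page to the shared folder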
def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    open(filePath, 'wb').write(cleanPage.encode('utf-8'))
    return
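

# Builds the full file path (Description or Listing folder) for the page being saved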
def getFullPathName(url):
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
    return fullPath


def getMKTName() -> str:
    name = 'Abacus'
    return name
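

# Derives a file name from the url (alphanumeric characters only); falls back to a counter if empty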
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
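

# Returns the list of category links (seed urls) to crawl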
def getInterestedLinks():
    links = []

    # botnets and malware
    links.append('http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion/search?fcats[]=475756f633d0cc71f0c868bd&cats=2&s_quick=1')
    # # social engineering
    # links.append(
    #     'http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion/search?fcats[]=1c29a89f7a4022133cab877d&cats=2&s_quick=1')
    # digital
    links.append(
        'http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion/search?fcats[]=475756f633d0cc71f0c868bd&cats=2&s_quick=1')
    # # hacking
    # links.append(
    #     'http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion/search?fcats[]=a0773b3de70bdaca38acda2f&cats=2&s_quick=1')
    # # carding
    # links.append('http://abacuseeettcn3n2zxo7tqy5vsxhqpha2jtjqs7cgdjzl2jascr4liad.onion/search?fcats[]=1b17857dc74c11953df85c55&cats=2&s_quick=1')

    return links
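

# Crawls each interested link, saving the listing page and each product page, then following pagination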
def crawlForum(driver):
    print("Crawling the Abacus market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    print('waiting ten seconds to avoid ddos check')
                    time.sleep(10)
                    driver.get(link)
                except:
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                list = productPages(html)

                for item in list:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        print(itemURL)
                        print('waiting 5 sec to avoid ddos check')
                        time.sleep(5)
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    print('waiting 20 seconds to avoid ddos check')
                    time.sleep(20)
                    driver.back()

                    # comment out
                    break

                # # comment out
                # if count == 3:
                #     break

                try:
                    chev = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[2]/div/div[3]/div[4]')
                    a_tags = chev.find_elements(by=By.TAG_NAME, value='a')
                    try:
                        for a_tag in a_tags:
                            try:
                                temp = a_tag.find_element(by=By.CLASS_NAME, value='gg-chevron-right')
                            except:
                                temp = ''

                            if temp:
                                link = a_tag.get_attribute('href')
                                print(link)
                                if link == '#':
                                    link = ''
                                break
                            else:
                                link = ''
                    except:
                        try:
                            a_tag = a_tags[-2].find_element(by=By.CLASS_NAME, value='gg-chevron-right')
                            if a_tag:
                                link = a_tag.get_attribute('href')
                                if link == '#':
                                    link = ''
                                break
                            else:
                                link = ''
                        except:
                            link = ''

                    if link == "":
                        raise NoSuchElementException
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the Abacus market done.")


# Returns 'True' if the link is a description (product) link, may need to change for every website
def isDescriptionLink(url):
    if 'listing' in url:
        return True
    return False


# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
    if 'search' in url:
        return True
    return False
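

# Extracts product page links from a listing page using the Abacus links parser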
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return abacus_links_parser(soup)
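

# Entry point of the crawler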
def crawler():
    startCrawling()
