
completed GoFish

main
Helium 11 months ago
parent commit dcc07f5d2b
2 changed files with 118 additions and 207 deletions
  1. MarketPlaces/GoFish/crawler_selenium.py (+31, -33)
  2. MarketPlaces/GoFish/parser.py (+87, -174)

MarketPlaces/GoFish/crawler_selenium.py (+31, -33)

@@ -31,15 +31,15 @@ baseURL = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion

 def startCrawling():
     mktName = getMKTName()
-    driver = getAccess()
+    # driver = getAccess()

-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closeDriver(driver)
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closeDriver(driver)

     new_parse(mktName, baseURL, True)
@@ -82,8 +82,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    # ff_prof.set_preference("network.dns.disablePrefetch", True)
-    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    ff_prof.set_preference("network.dns.disablePrefetch", True)
+    ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -118,9 +118,9 @@ def getAccess():

 def login(driver):
     # input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, '//*[@id="username"]')))
+        (By.XPATH, '//*[@id="js-off"]')))

     # entering username and password into input boxes
     usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
@@ -129,15 +129,12 @@ def login(driver):
     passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
     # Password here
     passwordBox.send_keys('DementedBed123-')

-    # submit
-    submit = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/form/div[7]/input')
-    submit.click()
-    input("Press ENTER when CAPTCHA is complete and home page has loaded\n")
+    input("Press ENTER when the CAPTCHA is solved and the login button has been pressed\n")

     # wait for the listing page to show up (this XPath may need to change based on the seed URL)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, '/html/body/div/div[3]/div[2]/div[1]')))
+        (By.XPATH, '/html/body/div/div[3]/div[1]/div[3]')))

 def savePage(driver, page, url):
@@ -177,18 +174,20 @@ def getNameFromURL(url):

 def getInterestedLinks():
     links = []

-    # Hosting and Security
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
-    # Exploits and Kits
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107')
-    # Botnets and Malware
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
-    # Other Software
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
-    # Hacking Guide
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
-    # Fraud (mostly carding)
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')
+    # Fraud Software
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=135')
+    # # hacking guide
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
+    # # malware tutorial
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=130')
+    # # programming tutorial
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=131')
+    # # social engineering tutorial
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=86')
+    # # botnets
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
+    # # exploits
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107')

     return links
@@ -233,10 +232,9 @@ def crawlForum(driver):
                 # break

             try:
-                link = driver.find_element(by=By.XPATH, value='/html/body/div/div[3]/div[2]/div[2]/nav/ul/li[3]/a').get_attribute('href')
+                link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                 if link == "":
                     raise NoSuchElementException
-                link = urlparse.urljoin(baseURL, str(link))
                 count += 1

             except NoSuchElementException:
@@ -251,14 +249,14 @@ def crawlForum(driver):

 # Returns 'True' if the link is a description (product) link; may need to change for every website
 def isDescriptionLink(url):
-    if 'a=' in url:
+    if '.onion/?c' not in url:
         return True
     return False


 # Returns 'True' if the link is a listing page link; may need to change for every website
 def isListingLink(url):
-    if 'c=' in url:
+    if '.onion/?c' in url:
         return True
     return False
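
The revised classifiers route URLs purely on the '.onion/?c' substring: category listings keep that query marker, and everything else is treated as a product description page. A minimal, self-contained sketch of that routing (the '?a=' product URL is a hypothetical example, not taken from the site):

    def isDescriptionLink(url):
        # listing pages carry a '?c=<category>' query; everything else is a product page
        return '.onion/?c' not in url

    def isListingLink(url):
        return '.onion/?c' in url

    base = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion'
    assert isListingLink(base + '/?c=135')       # category listing -> keep paging
    assert isDescriptionLink(base + '/?a=1234')  # hypothetical product URL -> parse as product

Note that any URL without the marker, including the base URL itself, classifies as a description link under this rule.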


MarketPlaces/GoFish/parser.py (+87, -174)

@@ -2,6 +2,7 @@ __author__ = 'DarkWeb'

 # Here, we are importing the auxiliary functions to clean or convert data
 from MarketPlaces.Utilities.utilities import *

 # Here, we are importing BeautifulSoup to search through the HTML tree
 from bs4 import BeautifulSoup
@@ -35,105 +36,50 @@ def gofish_description_parser(soup):
     image = "-1"                                # 19 Product_Image
     vendor_image = "-1"                         # 20 Vendor_Image

-    # Finding Product Name
-    divmb = soup.find('div', {'class': "p-3 mb-1 fs-3 fw-bold border border-2 bg-white rounded"})
-    if divmb is None:
-        divmb = soup.find('div', {'class': "p-3 mb-1 fs-4 fw-bold border border-2 bg-white rounded"})
-    name = divmb.text
-    name = name.replace('\n', ' ')
-    name = name.replace('\r', ' ')
-    name = name.replace('\t', ' ')
-    name = name.replace(",", "")
-    name = name.strip()
-
-    # Finding Vendor
-    vendor = soup.find('div', {'class': 'my-1'}).find('a').text.strip()
-
-    # Finding Vendor Rating
-    # temp = soup.find('div', {'class': ""}).text
-    # temp = temp.split('(')
-    # rating = temp[0].replace("Vendor's Review : ", "")
-    # rating = rating.replace("%", "")
-    # rating_vendor = rating.strip()
-
-    # Finding the Product Rating and Number of Product Reviews
-    # reviews = temp[2].replace(" review)", "")
-    # reviews = reviews.strip()
-    # temp = temp[1].split(")")
-    # rating = temp[1].replace("Product Review : ", "")
-    # rating = rating.replace("%", "")
-    # rating_item = rating.strip()
-
-    # Finding Prices
-    precios = soup.findAll('td', {'class': "text-end text-nowrap"})
-    USD = precios[0].text.strip().replace('$', '')
-
-    # Finding the Product Category
-    # pmb = soup.findAll('p', {'class': "mb-1"})
-    # category = pmb[-1].text
-    # category = category.replace("Category: ", "").strip()
-
-    # Finding the Product Quantity Available
-    # left = divmb[-1].text
-    # left = left.split(",", 1)[1]
-    # left = left.replace("in stock", "")
-    # left = left.strip()
-
-    # Finding Number Sold
-    # sold = divmb[-1].text
-    # sold = sold.split(",", 1)[0]
-    # sold = sold.replace("sold", "")
-    # sold = sold.strip()
-
-    # Finding Shipment Information (Origin)
-    origin = soup.findAll('div', {'class': "p-3 mt-2 mb-3 border border-2 bg-white rounded"})
-    remove = origin[0].find('span').text.strip()
-    origin = origin[0].text.strip()
-    origin = origin.replace(remove, '')
-    shipFrom = origin.strip()
-
-    # Finding Shipment Information (Destination)
-    dest = soup.findAll('div', {'class': 'p-3 mb-3 overflow-auto border border-2 bg-white rounded'})
-    dest = dest[-1].text.strip()
-    dest = dest.replace('[', '')
-    dest = dest.replace(']', '')
-    shipTo = dest[1:].strip()
-
-    # Finding the Product description
-    cardbody = soup.findAll('div', {'class': "p-3 mb-3 overflow-auto border border-2 bg-white rounded"})
-    describe = cardbody[0].text
-    describe = describe.replace('\n', ' ')
-    describe = describe.strip()
+    temp = soup.find('div', {'class': 'col-lg-5'})
+
+    # find vendor name
+    vendor = temp.find('a', {'class': 'text-decoration-none fw-bold'}).text.strip()
+    if vendor is None:
+        print('vendor')
+
+    # find product name
+    temp2 = soup.find('nav', {'aria-label': 'breadcrumb'}).findAll('li', {'class': 'breadcrumb-item'})
+    name = soup.find('li', {'class': 'breadcrumb-item active text-truncate'}).text.strip()
+    if name is None:
+        print('name')
+
+    describe = soup.find('div', {'class': 'p-3 mb-3 overflow-auto border border-2 bg-white rounded'}).text
+    describe = cleanString(describe)
+    if describe is None:
+        print('describe')
+
+    category = temp2[2].text
+    if category is None:
+        print('category')
+
+    USD = soup.find('td', {'class': 'text-end text-nowrap'}).text
+    if USD is None:
+        print('USD')
+
+    shipFrom = soup.find('span', {'class': 'lh-1 me-2 fs-4'}).text
+    print(shipFrom)
+
+    shipTo = soup.find('span', {'class': 'lh-1 me-2 fs-4'}).text
+    if not shipTo.isalnum():
+        shipTo = 'Worldwide'

-    # Finding Product Image
-    img = soup.findAll('figure', {'class': 'image-feature'})[0]
-    image = img.find('img', {'class': 'image-block rounded'})
-    image = image.get('src')
-    image = image.split('base64,')[-1]
-
-    # Searching for CVE and MS categories
-    cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-    if cve:
-        CVE = " "
-        for idx in cve:
-            CVE += (idx)
-            CVE += " "
-            CVE = CVE.replace(',', ' ')
-            CVE = CVE.replace('\n', '')
-
-    ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
-    if ms:
-        MS = " "
-        for im in ms:
-            MS += (im)
-            MS += " "
-            MS = MS.replace(',', ' ')
-            MS = MS.replace('\n', '')
+    # Finding Product Image
+    image = soup.find('figure', {"class": 'image-feature'}).find('img')
+    if image is not None:
+        image = image.get('src')
+        image = image.split('base64,')[-1]
+    else:
+        print('img')
+        image = "-1"

     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
@@ -175,20 +121,26 @@ def gofish_listing_parser(soup):
     image_vendor = []                           # 21 Vendor_Image
     href = []                                   # 22 Product_Links

-    listing = soup.find('tbody', {"class": "border border-2 align-middle"}).findAll('tr')
+    temp = soup.find('div', {"class": "col-9"})
+    cat = temp.find('nav', {'aria-label': 'breadcrumb'}).find('li', {'class': 'breadcrumb-item active'}).text.strip()
+    cat = cleanString(cat)
+    listing = temp.find('tbody', {"class": 'border border-2 align-middle'}).findAll('tr')

     # Populating the Number of Products
     nm = len(listing)

     for a in listing:
-        bae = a.findAll('a', href=True)
+        category.append(cat)

         # Adding the url to the list of urls
-        link = bae[0].get('href')
+        link = a.find('a').get('href')
         link = cleanLink(link)
         href.append(link)

-        # Finding the Product
-        product = bae[1].text
+        # Finding the Product name
+        product = a.find('a', {"class": 'text-decoration-none'}).text
         product = product.replace('\n', ' ')
         product = product.replace(",", "")
         product = product.replace("...", "")
@@ -196,83 +148,50 @@ def gofish_listing_parser(soup):
         name.append(product)

         # Finding Product Image
-        product_image = bae[0].find('img')
+        product_image = a.find('img')
         product_image = product_image.get('src')
         product_image = product_image.split('base64,')[-1]
         image.append(product_image)

-        # Finding Prices
-        price = a.find('span', {"class": "fw-bold text-nowrap"}).text
-        price = price.replace("$","")
-        price = price.strip()
-        USD.append(price)
-
         # Finding the Vendor
-        vendor_name = bae[-1].text
+        vendor_name = a.find('a', {"class": 'text-decoration-none fw-bold'}).text
         vendor_name = vendor_name.replace(",", "")
         vendor_name = vendor_name.strip()
         vendor.append(vendor_name)

         # image vendor
         image_vendor.append("-1")

-        # Finding the Category
-        # cat = lb[-1].find("span").text
-        # cat = cat.replace("class:", "")
-        # cat = cat.strip()
-        # category.append(cat)
-
-        # span = lb[0].findAll("span")
-
-        # Finding Number of Views
-        # num = span[0].text
-        # num = num.replace("views:", "")
-        # num = num.strip()
-        # sold.append(num)
-
-        # Finding Number Sold
-        # num = span[2].text
-        # num = num.replace("Sold:", "")
-        # num = num.strip()
-        # sold.append(num)
-
-        # Finding Quantity Left
-        # quant = span[1].text
-        # quant = quant.replace("stock:", "")
-        # quant = quant.strip()
-        # qLeft.append(quant)
-
-        # add shipping information
-        # ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
-        # shipFrom.append(ship[0].replace("Ship from ", "").strip())
-        # shipTo.append(ship[1].replace("to ", "").strip())
-
-        # Searching for CVE and MS categories
-        cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-        if not cve:
-            cveValue = "-1"
-        else:
-            cee = " "
-            for idx in cve:
-                cee += (idx)
-                cee += " "
-                cee = cee.replace(',', ' ')
-                cee = cee.replace('\n', '')
-            cveValue = cee
-        CVE.append(cveValue)
-
-        ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
-        if not ms:
-            MSValue = "-1"
+        # USD
+        usd = a.find('div', {'class': 'text-nowrap'}).find('span', {'class': 'fw-bold text-nowrap'}).text.strip()
+        USD.append(usd)
+
+        temp = a.findAll('span', {'class': 'fs-4 lh-1'})
+        shipF = temp[0].text
+        shipFrom.append(shipF)
+        shipT = temp[1].text
+        if shipT.isalnum():
+            shipTo.append(shipT)
         else:
-            me = " "
-            for im in ms:
-                me += (im)
-                me += " "
-                me = me.replace(',', ' ')
-                me = me.replace('\n', '')
-            MSValue = me
-        MS.append(MSValue)
+            shipTo.append('Worldwide')
+
+        rating_vendor.append('-1')
+        success.append('-1')
+        CVE.append('-1')
+        MS.append('-1')
+        describe.append('-1')
+        views.append('-1')
+        reviews.append('-1')
+        rating_item.append('-1')
+        addDate.append('-1')
+        BTC.append('-1')
+        EURO.append('-1')
+        sold.append('-1')
+        qLeft.append('-1')

     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
@@ -286,13 +205,7 @@ def gofish_links_parser(soup):
     # Returning all links that should be visited by the Crawler
     href = []

-    listing = soup.find('tbody', {'class': 'border border-2 align-middle'})
-    listing = soup.findAll('tr')
-    listing = listing[1:]
-    # for a in listing:
-    #     bae = a.find('a', {"class": "text-info"}, href=True)
-    #     link = bae['href']
-    #     href.append(link)
+    listing = soup.find('div', {"class": "col-9"}).find('tbody', {'class': 'border border-2 align-middle'}).findAll('tr')

     for a in listing:
         bae = a.findAll('a', href=True)
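
The listing parser now takes the category once from the page breadcrumb and stamps it on every row, falling back to 'Worldwide' whenever the destination span is not plain alphanumeric text. A minimal sketch of that extraction against a hypothetical HTML fragment shaped like the selectors above (the fragment is illustrative, not captured from the live site):

    from bs4 import BeautifulSoup

    # hypothetical listing-page fragment mirroring the diff's selectors
    html = """
    <div class="col-9">
      <nav aria-label="breadcrumb">
        <li class="breadcrumb-item">Home</li>
        <li class="breadcrumb-item active">Fraud Software</li>
      </nav>
      <table><tbody class="border border-2 align-middle">
        <tr>
          <td><a class="text-decoration-none" href="/?a=1234">Card checker</a></td>
          <td><span class="fs-4 lh-1">US</span> <span class="fs-4 lh-1">[ Multiple ]</span></td>
        </tr>
      </tbody></table>
    </div>
    """

    soup = BeautifulSoup(html, 'html.parser')
    temp = soup.find('div', {'class': 'col-9'})
    # one breadcrumb category shared by every row on the page
    cat = temp.find('nav', {'aria-label': 'breadcrumb'}).find('li', {'class': 'breadcrumb-item active'}).text.strip()
    for row in temp.find('tbody', {'class': 'border border-2 align-middle'}).findAll('tr'):
        spans = row.findAll('span', {'class': 'fs-4 lh-1'})
        shipT = spans[1].text
        dest = shipT if shipT.isalnum() else 'Worldwide'  # '[ Multiple ]' is not alphanumeric
        print(cat, row.find('a').get('href'), spans[0].text, dest)
    # prints: Fraud Software /?a=1234 US Worldwide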

