completed GoFish

main · Helium · 1 year ago · commit dcc07f5d2b

2 changed files with 118 additions and 207 deletions:
  1. MarketPlaces/GoFish/crawler_selenium.py (+31, -33)
  2. MarketPlaces/GoFish/parser.py (+87, -174)

MarketPlaces/GoFish/crawler_selenium.py (+31, -33)

@@ -31,15 +31,15 @@ baseURL = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion

 def startCrawling():
     mktName = getMKTName()
-    driver = getAccess()
-
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closeDriver(driver)
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closeDriver(driver)

     new_parse(mktName, baseURL, True)

@@ -82,8 +82,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    # ff_prof.set_preference("network.dns.disablePrefetch", True)
-    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    ff_prof.set_preference("network.dns.disablePrefetch", True)
+    ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)

@@ -118,9 +118,9 @@ def getAccess():

 def login(driver):
+    # input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, '//*[@id="username"]')))
+        (By.XPATH, '//*[@id="js-off"]')))

     # entering username and password into input boxes
     usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')

@@ -129,15 +129,12 @@ def login(driver):
     passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
     # Password here
     passwordBox.send_keys('DementedBed123-')

-    # submit
-    submit = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/form/div[7]/input')
-    submit.click()
-
-    input("Press ENTER when CAPTCHA is complete and home page has loaded\n")
+    input("Press ENTER when CAPTCHA and login is pressed\n")

     # wait for listing page show up (This Xpath may need to change based on different seed url)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, '/html/body/div/div[3]/div[2]/div[1]')))
+        (By.XPATH, '/html/body/div/div[3]/div[1]/div[3]')))


 def savePage(driver, page, url):

@@ -177,18 +174,20 @@ def getNameFromURL(url):

 def getInterestedLinks():
     links = []

-    # Hosting and Security
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=84')
-    # Exploits and Kits
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107')
-    # Botnets and Malware
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
-    # Other Software
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=108')
-    # Hacking Guide
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
-    # Fraud (mostly carding)
-    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=128')
+    # # Fraud Software
+    links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=135')
+    # # hacking guide
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129')
+    # # malware tutorial
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=130')
+    # # programming tutorial
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=131')
+    # # social engineering tutorial
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=86')
+    # # botnets
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97')
+    # # exploits
+    # links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107')

     return links

@@ -233,10 +232,9 @@ def crawlForum(driver):
                 # break

             try:
-                link = driver.find_element(by=By.XPATH, value='/html/body/div/div[3]/div[2]/div[2]/nav/ul/li[3]/a').get_attribute('href')
+                link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                 if link == "":
                     raise NoSuchElementException
-                link = urlparse.urljoin(baseURL, str(link))
                 count += 1

             except NoSuchElementException:

@@ -251,14 +249,14 @@ def crawlForum(driver):

 # Returns 'True' if the link is Topic link, may need to change for every website
 def isDescriptionLink(url):
-    if 'a=' in url:
+    if '.onion/?c' not in url:
         return True
     return False


 # Returns True if the link is a listingPage link, may need to change for every website
 def isListingLink(url):
-    if 'c=' in url:
+    if '.onion/?c' in url:
         return True
     return False


MarketPlaces/GoFish/parser.py (+87, -174)

@@ -2,6 +2,7 @@ __author__ = 'DarkWeb'

 # Here, we are importing the auxiliary functions to clean or convert data
 from MarketPlaces.Utilities.utilities import *
+# Here, we are importing BeautifulSoup to search through the HTML tree
 from bs4 import BeautifulSoup

@@ -35,105 +36,50 @@ def gofish_description_parser(soup):
     image = "-1"                            # 19 Product_Image
     vendor_image = "-1"                     # 20 Vendor_Image

-    # Finding Product Name
-    divmb = soup.find('div', {'class': "p-3 mb-1 fs-3 fw-bold border border-2 bg-white rounded"})
-    if divmb is None:
-        divmb = soup.find('div', {'class': "p-3 mb-1 fs-4 fw-bold border border-2 bg-white rounded"})
-
-    name = divmb.text
-    name = name.replace('\n', ' ')
-    name = name.replace('\r', ' ')
-    name = name.replace('\t', ' ')
-    name = name.replace(",", "")
-    name = name.strip()
-
-    # Finding Vendor
-    vendor = soup.find('div', {'class': 'my-1'}).find('a').text.strip()
-
-    # Finding Vendor Rating
-    # temp = soup.find('div', {'class': ""}).text
-    # temp = temp.split('(')
-    # rating = temp[0].replace("Vendor's Review : ", "")
-    # rating = rating.replace("%", "")
-    # rating_vendor = rating.strip()
-
-    # Finding the Product Rating and Number of Product Reviews
-    # reviews = temp[2].replace(" review)", "")
-    # reviews = reviews.strip()
-
-    # temp = temp[1].split(")")
-    # rating = temp[1].replace("Product Review : ", "")
-    # rating = rating.replace("%", "")
-    # rating_item = rating.strip()
-
-    # Finding Prices
-    precios = soup.findAll('td', {'class': "text-end text-nowrap"})
-    USD = precios[0].text.strip().replace('$', '')
-
-    # Finding the Product Category
-    # pmb = soup.findAll('p', {'class': "mb-1"})
-    # category = pmb[-1].text
-    # category = category.replace("Category: ", "").strip()
-
-    # Finding the Product Quantity Available
-    # left = divmb[-1].text
-    # left = left.split(",", 1)[1]
-    # left = left.replace("in stock", "")
-    # left = left.strip()
-
-    # Finding Number Sold
-    # sold = divmb[-1].text
-    # sold = sold.split(",", 1)[0]
-    # sold = sold.replace("sold", "")
-    # sold = sold.strip()
-
-    # Finding Shipment Information (Origin)
-    origin = soup.findAll('div', {'class': "p-3 mt-2 mb-3 border border-2 bg-white rounded"})
-    remove = origin[0].find('span').text.strip()
-    origin = origin[0].text.strip()
-    origin = origin.replace(remove, '')
-    shipFrom = origin.strip()
-
-    # Finding Shipment Information (Destination)
-    dest = soup.findAll('div', {'class': 'p-3 mb-3 overflow-auto border border-2 bg-white rounded'})
-    dest = dest[-1].text.strip()
-    dest = dest.replace('[', '')
-    dest = dest.replace(']', '')
-    shipTo = dest[1:].strip()
-
-    # Finding the Product description
-    cardbody = soup.findAll('div', {'class': "p-3 mb-3 overflow-auto border border-2 bg-white rounded"})
-    describe = cardbody[0].text
-    describe = describe.replace('\n', ' ')
-    describe = describe.strip()
+    temp = soup.find('div', {'class': 'col-lg-5'})
+
+    # find vendor name
+    vendor = temp.find('a', {'class': 'text-decoration-none fw-bold'}).text.strip()
+    if vendor is None:
+        print('vendor')
+
+    # find product name
+    temp2 = soup.find('nav', {'aria-label': 'breadcrumb'}).findAll('li', {'class': 'breadcrumb-item'})
+    name = soup.find('li', {'class': 'breadcrumb-item active text-truncate'}).text.strip()
+    if name is None:
+        print('name')
+
+    describe = soup.find('div', {'class': 'p-3 mb-3 overflow-auto border border-2 bg-white rounded'}).text
+    describe = cleanString(describe)
+    if describe is None:
+        print('describe')
+
+    category = temp2[2].text
+    if category is None:
+        print('category')
+
+    USD = soup.find('td', {'class': 'text-end text-nowrap'}).text
+    if USD is None:
+        print('USD')
+
+    shipFrom = soup.find('span', {'class': 'lh-1 me-2 fs-4'}).text
+    print(shipFrom)
+
+    shipTo = soup.find('span', {'class': 'lh-1 me-2 fs-4'}).text
+    if shipTo.isalnum():
+        shipTo = shipTo
+    else:
+        shipTo = 'Worldwide'

     # Finding Product Image
-    img = soup.findAll('figure', {'class': 'image-feature'})[0]
-    image = img.find('img', {'class': 'image-block rounded'})
-    image = image.get('src')
-    image = image.split('base64,')[-1]
-
-    # Searching for CVE and MS categories
-    cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-    if cve:
-        CVE = " "
-        for idx in cve:
-            CVE += (idx)
-            CVE += " "
-            CVE = CVE.replace(',', ' ')
-            CVE = CVE.replace('\n', '')
-    ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
-    if ms:
-        MS = " "
-        for im in ms:
-            MS += (im)
-            MS += " "
-            MS = MS.replace(',', ' ')
-            MS = MS.replace('\n', '')
+    image = soup.find('figure', {"class": 'image-feature'}).find('img')
+    if image is not None:
+        image = image.get('src')
+        image = image.split('base64,')[-1]
+    else:
+        print('img')
+        image = "-1"

     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,

@@ -175,20 +121,26 @@ def gofish_listing_parser(soup):
     image_vendor = []                       # 21 Vendor_Image
     href = []                               # 22 Product_Links

-    listing = soup.find('tbody', {"class": "border border-2 align-middle"}).findAll('tr')
+    temp = soup.find('div', {"class": "col-9"})
+    cat = temp.find('nav', {'aria-label': 'breadcrumb'}).find('li', {'class': 'breadcrumb-item active'}).text.strip()
+    cat = cleanString(cat)
+
+    listing = temp.find('tbody', {"class": 'border border-2 align-middle'}).findAll('tr')

     # Populating the Number of Products
     nm = len(listing)

     for a in listing:
-        bae = a.findAll('a', href=True)
+        category.append(cat)

         # Adding the url to the list of urls
-        link = bae[0].get('href')
-        link = cleanLink(link)
+        link = a.find('a').get('href')
         href.append(link)

-        # Finding the Product
-        product = bae[1].text
+        # Finding the Product name
+        product = a.find('a', {"class": 'text-decoration-none'}).text
         product = product.replace('\n', ' ')
         product = product.replace(",", "")
         product = product.replace("...", "")

@@ -196,83 +148,50 @@ def gofish_listing_parser(soup):
         name.append(product)

         # Finding Product Image
-        product_image = bae[0].find('img')
+        product_image = a.find('img')
         product_image = product_image.get('src')
         product_image = product_image.split('base64,')[-1]
         image.append(product_image)

-        # Finding Prices
-        price = a.find('span', {"class": "fw-bold text-nowrap"}).text
-        price = price.replace("$","")
-        price = price.strip()
-        USD.append(price)
-
         # Finding the Vendor
-        vendor_name = bae[-1].text
+        vendor_name = a.find('a', {"class": 'text-decoration-none fw-bold'}).text
         vendor_name = vendor_name.replace(",", "")
         vendor_name = vendor_name.strip()
         vendor.append(vendor_name)

+        # image vendor
         image_vendor.append("-1")

-        # Finding the Category
-        # cat = lb[-1].find("span").text
-        # cat = cat.replace("class:", "")
-        # cat = cat.strip()
-        # category.append(cat)
-
-        # span = lb[0].findAll("span")
-
-        # Finding Number of Views
-        # num = span[0].text
-        # num = num.replace("views:", "")
-        # num = num.strip()
-        # sold.append(num)
-
-        # Finding Number Sold
-        # num = span[2].text
-        # num = num.replace("Sold:", "")
-        # num = num.strip()
-        # sold.append(num)
-
-        # Finding Quantity Left
-        # quant = span[1].text
-        # quant = quant.replace("stock:", "")
-        # quant = quant.strip()
-        # qLeft.append(quant)
-
-        # add shipping information
-        # ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
-        # shipFrom.append(ship[0].replace("Ship from ", "").strip())
-        # shipTo.append(ship[1].replace("to ", "").strip())
-
-        # Searching for CVE and MS categories
-        cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-        if not cve:
-            cveValue = "-1"
-        else:
-            cee = " "
-            for idx in cve:
-                cee += (idx)
-                cee += " "
-                cee = cee.replace(',', ' ')
-                cee = cee.replace('\n', '')
-            cveValue = cee
-        CVE.append(cveValue)
-
-        ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
-        if not ms:
-            MSValue = "-1"
-        else:
-            me = " "
-            for im in ms:
-                me += (im)
-                me += " "
-                me = me.replace(',', ' ')
-                me = me.replace('\n', '')
-            MSValue = me
-        MS.append(MSValue)
+        # USD
+        usd = a.find('div', {'class': 'text-nowrap'}).find('span', {'class': 'fw-bold text-nowrap'}).text.strip()
+        USD.append(usd)
+
+        temp = a.findAll('span', {'class': 'fs-4 lh-1'})
+        shipF = temp[0].text
+        shipFrom.append(shipF)
+        shipT = temp[1].text
+        if shipT.isalnum():
+            shipTo.append(shipT)
+        else:
+            shipTo.append('Worldwide')
+
+        rating_vendor.append('-1')
+        success.append('-1')
+        CVE.append('-1')
+        MS.append('-1')
+        describe.append('-1')
+        views.append('-1')
+        reviews.append('-1')
+        rating_item.append('-1')
+        addDate.append('-1')
+        BTC.append('-1')
+        EURO.append('-1')
+        sold.append('-1')
+        qLeft.append('-1')

     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,

@@ -286,13 +205,7 @@ def gofish_links_parser(soup):

     # Returning all links that should be visited by the Crawler
     href = []

-    listing = soup.find('tbody', {'class': 'border border-2 align-middle'})
-    listing = soup.findAll('tr')
-    listing = listing[1:]
-
-    # for a in listing:
-    #     bae = a.find('a', {"class": "text-info"}, href=True)
-    #     link = bae['href']
-    #     href.append(link)
+    listing = soup.find('div', {"class": "col-9"}).find('tbody', {'class': 'border border-2 align-middle'}).findAll('tr')

     for a in listing:
         bae = a.findAll('a', href=True)
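
The parser rewrite swaps positional anchors (bae[0], bae[1], precios[0]) for Bootstrap class and breadcrumb selectors, and back-fills every field it no longer scrapes (CVE, MS, reviews, BTC, etc.) with '-1' placeholders so organizeProducts still receives equal-length columns. A minimal, self-contained sketch of the selector strategy; the HTML fragment below is invented for illustration, and only the class names and breadcrumb layout come from the selectors in this commit:

    from bs4 import BeautifulSoup

    # Hypothetical page fragment modeled on the classes this commit targets.
    html = """
    <div class="col-lg-5"><a class="text-decoration-none fw-bold">someVendor</a></div>
    <nav aria-label="breadcrumb"><ol>
      <li class="breadcrumb-item">Home</li>
      <li class="breadcrumb-item">Digital Goods</li>
      <li class="breadcrumb-item">Fraud Software</li>
      <li class="breadcrumb-item active text-truncate">Some Product</li>
    </ol></nav>
    <td class="text-end text-nowrap">$19.99</td>
    """

    # html.parser keeps the stray <td>; stricter builders like lxml may drop it.
    soup = BeautifulSoup(html, 'html.parser')

    # Multi-class filters match the exact class attribute string in bs4.
    vendor = soup.find('div', {'class': 'col-lg-5'}).find(
        'a', {'class': 'text-decoration-none fw-bold'}).text.strip()
    crumbs = soup.find('nav', {'aria-label': 'breadcrumb'}).findAll(
        'li', {'class': 'breadcrumb-item'})
    name = soup.find('li', {'class': 'breadcrumb-item active text-truncate'}).text.strip()
    category = crumbs[2].text   # third breadcrumb item holds the category
    usd = soup.find('td', {'class': 'text-end text-nowrap'}).text

    print(vendor, '|', name, '|', category, '|', usd)
    # someVendor | Some Product | Fraud Software | $19.99

One caveat this sketch makes visible: category = temp2[2].text assumes a fixed breadcrumb depth, so pages with a shallower or deeper category path would mislabel the product.
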

