Browse Source

gofish parser done. now testing

main
Joshua 1 year ago
parent
commit
77565cfa22
3 changed files with 82 additions and 71 deletions
  1. +11
    -11
      MarketPlaces/GoFish/crawler_selenium.py
  2. +66
    -60
      MarketPlaces/GoFish/parser.py
  3. +5
    -0
      MarketPlaces/Initialization/prepare_parser.py

+ 11
- 11
MarketPlaces/GoFish/crawler_selenium.py View File

@ -31,17 +31,17 @@ baseURL = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion
def startCrawling(): def startCrawling():
mktName = getMKTName() mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closeDriver(driver)
# new_parse(mktName, baseURL, True)
# driver = getAccess()
#
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closeDriver(driver)
new_parse(mktName, baseURL, True)
# Returns the name of the website # Returns the name of the website


+ 66
- 60
MarketPlaces/GoFish/parser.py View File

@ -2,7 +2,6 @@ __author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data # Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import * from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree # Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -37,67 +36,75 @@ def gofish_description_parser(soup):
vendor_image = "-1" # 20 Vendor_Image vendor_image = "-1" # 20 Vendor_Image
# Finding Product Name # Finding Product Name
divmb = soup.findAll('div', {'class': "mb-1"})
divmb = soup.find('div', {'class': "p-3 mb-1 fs-3 fw-bold border border-2 bg-white rounded"})
name = divmb[0].text
name = divmb.text
name = name.replace('\n', ' ') name = name.replace('\n', ' ')
name = name.replace(",", "") name = name.replace(",", "")
name = name.strip() name = name.strip()
# Finding Vendor # Finding Vendor
vendor = divmb[1].find('a').text.strip()
vendor = soup.find('div', {'class': 'my-1'}).text.strip()
# Finding Vendor Rating # Finding Vendor Rating
temp = soup.find('div', {'class': ""}).text
temp = temp.split('(')
rating = temp[0].replace("Vendor's Review : ", "")
rating = rating.replace("%", "")
rating_vendor = rating.strip()
# temp = soup.find('div', {'class': ""}).text
# temp = temp.split('(')
# rating = temp[0].replace("Vendor's Review : ", "")
# rating = rating.replace("%", "")
# rating_vendor = rating.strip()
# Finding the Product Rating and Number of Product Reviews # Finding the Product Rating and Number of Product Reviews
reviews = temp[2].replace(" review)", "")
reviews = reviews.strip()
# reviews = temp[2].replace(" review)", "")
# reviews = reviews.strip()
temp = temp[1].split(")")
rating = temp[1].replace("Product Review : ", "")
rating = rating.replace("%", "")
rating_item = rating.strip()
# temp = temp[1].split(")")
# rating = temp[1].replace("Product Review : ", "")
# rating = rating.replace("%", "")
# rating_item = rating.strip()
# Finding Prices # Finding Prices
USD = soup.find('div', {'class': "h3 text-primary"}).text.strip()
precios = soup.findall('td', {'class': "text-end text-nowrap"})
USD = precios[0].text.strip().replace('$', '')
# Finding the Product Category # Finding the Product Category
pmb = soup.findAll('p', {'class': "mb-1"})
# pmb = soup.findAll('p', {'class': "mb-1"})
category = pmb[-1].text
category = category.replace("Category: ", "").strip()
# category = pmb[-1].text
# category = category.replace("Category: ", "").strip()
# Finding the Product Quantity Available # Finding the Product Quantity Available
left = divmb[-1].text
left = left.split(",", 1)[1]
left = left.replace("in stock", "")
left = left.strip()
# left = divmb[-1].text
# left = left.split(",", 1)[1]
# left = left.replace("in stock", "")
# left = left.strip()
# Finding Number Sold # Finding Number Sold
sold = divmb[-1].text
sold = sold.split(",", 1)[0]
sold = sold.replace("sold", "")
sold = sold.strip()
# sold = divmb[-1].text
# sold = sold.split(",", 1)[0]
# sold = sold.replace("sold", "")
# sold = sold.strip()
# Finding Shipment Information (Origin) # Finding Shipment Information (Origin)
pmb[0].text
shipFrom = shipFrom.replace("Ships from: ", "").strip()
origin = soup.findall('div', {'class': "p-3 mt-2 mb-3 border border-2 bg-white rounded"})
origin = origin[0].text.strip()
shipFrom = origin[1:].strip()
# Finding Shipment Information (Destination) # Finding Shipment Information (Destination)
pmb[1].text
shipTo = shipTo.replace("Ships to: ", "").strip()
dest = soup.findall('div', {'class': 'p-3 mb-3 overflow-auto border border-2 bg-white rounded'})
dest = dest[-1].text.strip()
shipTo = dest[1:].strip()
# Finding the Product description # Finding the Product description
cardbody = soup.findAll('div', {'class': "card-body"})
describe = cardbody[1].text.strip()
cardbody = soup.findAll('div', {'class': "p-3 mb-3 overflow-auto border border-2 bg-white rounded"})
describe = cardbody[0].text
describe = describe.replace('\n', ' ')
describe = describe.strip()
# Finding Product Image # Finding Product Image
image = soup.find('div', {'class': 'product-primary'}).find('img')
img = soup.findall('figure', {'class': 'image-feature'})[0]
image = img.find('img', {'class': 'image-block rounded'})
image = image.get('src') image = image.get('src')
image = image.split('base64,')[-1] image = image.split('base64,')[-1]
@ -159,21 +166,20 @@ def gofish_listing_parser(soup):
image_vendor = [] # 21 Vendor_Image image_vendor = [] # 21 Vendor_Image
href = [] # 22 Product_Links href = [] # 22 Product_Links
listing = soup.findAll('div', {"id": "itembox"})
listing = soup.find('tbody', {"class": "border border-2 align-middle"}).findall('tr')
# Populating the Number of Products # Populating the Number of Products
nm = len(listing) nm = len(listing)
for a in listing: for a in listing:
bae = a.findAll('a', href=True) bae = a.findAll('a', href=True)
lb = a.findAll('div', {"id": "littlebox"})
# Adding the url to the list of urls # Adding the url to the list of urls
link = bae[0].get('href') link = bae[0].get('href')
href.append(link) href.append(link)
# Finding the Product # Finding the Product
product = lb[1].find('a').text
product = bae[1].text
product = product.replace('\n', ' ') product = product.replace('\n', ' ')
product = product.replace(",", "") product = product.replace(",", "")
product = product.replace("...", "") product = product.replace("...", "")
@ -181,19 +187,19 @@ def gofish_listing_parser(soup):
name.append(product) name.append(product)
# Finding Product Image # Finding Product Image
product_image = a.find('img')
product_image = bae[0].find('img')
product_image = product_image.get('src') product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1] product_image = product_image.split('base64,')[-1]
image.append(product_image) image.append(product_image)
# Finding Prices # Finding Prices
price = lb[-1].find('div', {"class": "mb-1"}).text
price = a.find('span', {"class": "fw-bold text-nowrap"}).text
price = price.replace("$","") price = price.replace("$","")
price = price.strip() price = price.strip()
USD.append(price) USD.append(price)
# Finding the Vendor # Finding the Vendor
vendor_name = lb[-1].find("a").text
vendor_name = bae[-1].text
vendor_name = vendor_name.replace(",", "") vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.strip() vendor_name = vendor_name.strip()
vendor.append(vendor_name) vendor.append(vendor_name)
@ -201,35 +207,35 @@ def gofish_listing_parser(soup):
image_vendor.append("-1") image_vendor.append("-1")
# Finding the Category # Finding the Category
cat = lb[-1].find("span").text
cat = cat.replace("class:", "")
cat = cat.strip()
category.append(cat)
# cat = lb[-1].find("span").text
# cat = cat.replace("class:", "")
# cat = cat.strip()
# category.append(cat)
span = lb[0].findAll("span")
# span = lb[0].findAll("span")
# Finding Number of Views # Finding Number of Views
num = span[0].text
num = num.replace("views:", "")
num = num.strip()
sold.append(num)
# num = span[0].text
# num = num.replace("views:", "")
# num = num.strip()
# sold.append(num)
# Finding Number Sold # Finding Number Sold
num = span[2].text
num = num.replace("Sold:", "")
num = num.strip()
sold.append(num)
# num = span[2].text
# num = num.replace("Sold:", "")
# num = num.strip()
# sold.append(num)
# Finding Quantity Left # Finding Quantity Left
quant = span[1].text
quant = quant.replace("stock:", "")
quant = quant.strip()
qLeft.append(quant)
# quant = span[1].text
# quant = quant.replace("stock:", "")
# quant = quant.strip()
# qLeft.append(quant)
# add shipping information # add shipping information
ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
shipFrom.append(ship[0].replace("Ship from ", "").strip())
shipTo.append(ship[1].replace("to ", "").strip())
# ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
# shipFrom.append(ship[0].replace("Ship from ", "").strip())
# shipTo.append(ship[1].replace("to ", "").strip())
# Searching for CVE and MS categories # Searching for CVE and MS categories


+ 5
- 0
MarketPlaces/Initialization/prepare_parser.py View File

@ -19,6 +19,7 @@ from MarketPlaces.DarkBazar.parser import *
from MarketPlaces.Sonanza.parser import * from MarketPlaces.Sonanza.parser import *
from MarketPlaces.Kingdom.parser import * from MarketPlaces.Kingdom.parser import *
from MarketPlaces.Ares.parser import * from MarketPlaces.Ares.parser import *
from MarketPlaces.GoFish.parser import *
from MarketPlaces.Classifier.classify_product import predict from MarketPlaces.Classifier.classify_product import predict
@ -142,6 +143,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
rw = sonanza_listing_parser(soup) rw = sonanza_listing_parser(soup)
elif marketPlace == "Kingdom": elif marketPlace == "Kingdom":
rw = kingdom_listing_parser(soup) rw = kingdom_listing_parser(soup)
elif marketPlace == "GoFish":
rw = gofish_listing_parser(soup)
else: else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception raise Exception
@ -184,6 +187,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
rmm = sonanza_description_parser(soup) rmm = sonanza_description_parser(soup)
elif marketPlace == "Kingdom": elif marketPlace == "Kingdom":
rmm = kingdom_description_parser(soup) rmm = kingdom_description_parser(soup)
elif marketPlace == "GoFish":
rmm = gofish_description_parser(soup)
else: else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception raise Exception


Loading…
Cancel
Save