
Bring parser up to date and improve crawler

main
chris committed 1 year ago
commit 9e21f6bfa4

2 changed files with 76 additions and 53 deletions:
  1. MarketPlaces/BlackPyramid/crawler_selenium.py  (+10, -1)
  2. MarketPlaces/BlackPyramid/parser.py  (+66, -52)

MarketPlaces/BlackPyramid/crawler_selenium.py  (+10, -1)

@@ -14,6 +14,7 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver import ActionChains
 import selenium.webdriver.support.ui as uiClasses
+from selenium.webdriver.common.keys import Keys
 from PIL import Image
 import urllib.parse as urlparse
@@ -191,7 +192,7 @@ def goToPage(driver, page):
     # print(digitalB)
     # delay for website to register hover
-    time.sleep(10)
+    time.sleep(5)
     # click
     xpath = "//input[@name='" + page + "']"
@@ -259,6 +260,9 @@ def crawlForum(driver):
            # go to next page of market
            try:
+               # Scroll to top of page to see navigation bar
+               driver.find_element(by=By.XPATH, value="//body").send_keys(Keys.CONTROL + Keys.HOME)
+
                goToPage(driver, listing)
                nav = driver.find_element(by=By.XPATH, value="//input[@name='next_page']")
@@ -318,3 +322,8 @@ def productPages(html):
 def crawler():
     startCrawling()
     # print("Crawling and Parsing BestCardingWorld .... DONE!")
+
+
+if __name__ == "__main__":
+    #crawler()
+    new_parse("BlackPyramid", baseURL, False)
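
Note on the pagination change above: the 'next_page' input sits in BlackPyramid's top navigation bar, so the crawler now sends Ctrl+Home to scroll back up before clicking through. A minimal sketch of that pattern, assuming a Selenium 4 driver; the advance_listing helper name is illustrative, while goToPage and the 'next_page' input come from the file above:

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

def advance_listing(driver, listing):
    # Ctrl+Home sent to <body> returns the viewport to the top of the page,
    # where the market renders its navigation inputs.
    driver.find_element(by=By.XPATH, value="//body").send_keys(Keys.CONTROL + Keys.HOME)
    goToPage(driver, listing)  # hover/click helper defined in crawler_selenium.py
    # Locate the next-page button once the nav bar is in view.
    return driver.find_element(by=By.XPATH, value="//input[@name='next_page']")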

MarketPlaces/BlackPyramid/parser.py  (+66, -52)

@@ -11,33 +11,31 @@ from bs4 import BeautifulSoup
 #stores info it needs in different lists, these lists are returned after being organized
 #@param: soup object looking at html page of description page
 #return: 'row' that contains a variety of lists that each hold info on the description page
-def BlackPyramid_description_parser(soup):
+def blackpyramid_description_parser(soup):
 
     # Fields to be parsed
-    name = "-1"             # 0 Product_Name
-    describe = "-1"         # 1 Product_Description
-    lastSeen = "-1"         # 2 Product_LastViewDate
-    rules = "-1"            # 3 NOT USED ...
-    CVE = "-1"              # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
-    MS = "-1"               # 5 Product_MS_Classification (Microsoft Security)
-    review = "-1"           # 6 Product_Number_Of_Reviews
+    vendor = "-1"           # 0 *Vendor_Name
+    success = "-1"          # 1 Vendor_Successful_Transactions
+    rating_vendor = "-1"    # 2 Vendor_Rating
+    name = "-1"             # 3 *Product_Name
+    describe = "-1"         # 4 Product_Description
+    CVE = "-1"              # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+    MS = "-1"               # 6 Product_MS_Classification (Microsoft Security)
     category = "-1"         # 7 Product_Category
-    shipFrom = "-1"         # 8 Product_ShippedFrom
-    shipTo = "-1"           # 9 Product_ShippedTo
-    left = "-1"             # 10 Product_QuantityLeft
-    escrow = "-1"           # 11 Vendor_Warranty
-    terms = "-1"            # 12 Vendor_TermsAndConditions
-    vendor = "-1"           # 13 Vendor_Name
-    sold = "-1"             # 14 Product_QuantitySold
-    addDate = "-1"          # 15 Product_AddedDate
-    available = "-1"        # 16 NOT USED ...
-    endDate = "-1"          # 17 NOT USED ...
-    BTC = "-1"              # 18 Product_BTC_SellingPrice
-    USD = "-1"              # 19 Product_USD_SellingPrice
-    rating = "-1"           # 20 Vendor_Rating
-    success = "-1"          # 21 Vendor_Successful_Transactions
-    EURO = "-1"             # 22 Product_EURO_SellingPrice
+    views = "-1"            # 8 Product_Number_Of_Views
+    reviews = "-1"          # 9 Product_Number_Of_Reviews
+    rating_item = "-1"      # 10 Product_Rating
+    addDate = "-1"          # 11 Product_AddedDate
+    BTC = "-1"              # 12 Product_BTC_SellingPrice
+    USD = "-1"              # 13 Product_USD_SellingPrice
+    EURO = "-1"             # 14 Product_EURO_SellingPrice
+    sold = "-1"             # 15 Product_QuantitySold
+    left = "-1"             # 16 Product_QuantityLeft
+    shipFrom = "-1"         # 17 Product_ShippedFrom
+    shipTo = "-1"           # 18 Product_ShippedTo
+    image = "-1"            # 19 Product_Image
+    vendor_image = "-1"     # 20 Vendor_Image
 
 
     # Finding Product Name
     name = soup.find('div', {'class': 'panel39002'}).find('span').next_sibling
@@ -106,6 +104,15 @@ def BlackPyramid_description_parser(soup):
     negative = soup.find('span', {'class': 'ti9400005 ti90088 can39953'}).text
     review = int(positive) + int(neutral) + int(negative)
 
+    # Finding product image
+    image = soup.find('img', {'class': 'img0390503'})
+    image = image.get('src')
+    image = image.split('base64,')[-1]
+
+    vendor_image = soup.find('img', {'class': 'img0390503'})
+    vendor_image = vendor_image.get('src')
+    vendor_image = vendor_image.split('base64,')[-1]
+
     # Searching for CVE and MS categories
     cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
     if cve:
@@ -125,8 +132,8 @@
         MS = MS.replace('\n', '')
 
     # Populating the final variable (this should be a list with all fields scraped)
-    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
-           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)
+    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
 
     # Sending the results
     return row
@@ -136,33 +143,33 @@ def BlackPyramid_description_parser(soup):
 #stores info it needs in different lists, these lists are returned after being organized
 #@param: soup object looking at html page of listing page
 #return: 'row' that contains a variety of lists that each hold info on the listing page
-def BlackPyramid_listing_parser(soup):
+def blackpyramid_listing_parser(soup):
 
     # Fields to be parsed
-    nm = 0                      # Total_Products (Should be Integer)
-    mktName = "BlackPyramid"    # 0 Marketplace_Name
-    name = []                   # 1 Product_Name
-    CVE = []                    # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
-    MS = []                     # 3 Product_MS_Classification (Microsoft Security)
-    category = []               # 4 Product_Category
-    describe = []               # 5 Product_Description
-    escrow = []                 # 6 Vendor_Warranty
-    views = []                  # 7 Product_Number_Of_Views
-    reviews = []                # 8 Product_Number_Of_Reviews
-    addDate = []                # 9 Product_AddDate
-    lastSeen = []               # 10 Product_LastViewDate
-    BTC = []                    # 11 Product_BTC_SellingPrice
-    USD = []                    # 12 Product_USD_SellingPrice
-    EURO = []                   # 13 Product_EURO_SellingPrice
-    sold = []                   # 14 Product_QuantitySold
-    qLeft =[]                   # 15 Product_QuantityLeft
-    shipFrom = []               # 16 Product_ShippedFrom
-    shipTo = []                 # 17 Product_ShippedTo
-    rating_item = []            # 18 Product_Rating
-    vendor = []                 # 19 Vendor
-    rating = []                 # 20 Vendor_Rating
-    success = []                # 21 Vendor_Successful_Transactions
-    href = []                   # 23 Product_Links (Urls)
+    nm = 0                      # *Total_Products (Should be Integer)
+    mktName = "Black Pyramid"   # 0 *Marketplace_Name
+    vendor = []                 # 1 *Vendor y
+    rating_vendor = []          # 2 Vendor_Rating
+    success = []                # 3 Vendor_Successful_Transactions
+    name = []                   # 4 *Product_Name y
+    CVE = []                    # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) don't worry about this
+    MS = []                     # 6 Product_MS_Classification (Microsoft Security) don't worry about this
+    category = []               # 7 Product_Category y
+    describe = []               # 8 Product_Description
+    views = []                  # 9 Product_Number_Of_Views
+    reviews = []                # 10 Product_Number_Of_Reviews
+    rating_item = []            # 11 Product_Rating
+    addDate = []                # 12 Product_AddDate
+    BTC = []                    # 13 Product_BTC_SellingPrice
+    USD = []                    # 14 Product_USD_SellingPrice y
+    EURO = []                   # 15 Product_EURO_SellingPrice
+    sold = []                   # 16 Product_QuantitySold
+    qLeft = []                  # 17 Product_QuantityLeft
+    shipFrom = []               # 18 Product_ShippedFrom
+    shipTo = []                 # 19 Product_ShippedTo
+    image = []                  # 20 Product_Image
+    image_vendor = []           # 21 Vendor_Image
+    href = []                   # 22 Product_Links
 
 
     listing = soup.findAll('article', {"class": "product"})
@@ -231,6 +238,12 @@ def BlackPyramid_listing_parser(soup):
             qsold = qsold.strip()
             sold.append(qsold)
 
+        # Finding product image
+        product_image = card.find('img')
+        product_image = product_image.get('src')
+        product_image = product_image.split('base64,')[-1]
+        image.append(product_image)
+
         # Searching for CVE and MS categories
         cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
         if not cve:
@@ -259,8 +272,9 @@ def BlackPyramid_listing_parser(soup):
             MS.append(MSValue)
 
     # Populate the final variable (this should be a list with all fields scraped)
-    return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views, reviews, rating,
-                            addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image,
+                            image_vendor)
 
 
 #called by the crawler to get description links on a listing page
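
Note on the image fields added above: BlackPyramid inlines pictures as base64 data URIs, and both parsers keep only the payload after 'base64,' (the description parser currently reuses the same img0390503 selector for the product and the vendor image). A minimal, self-contained sketch of that extraction; the sample markup is illustrative:

from bs4 import BeautifulSoup

# Illustrative markup: the site inlines images as data URIs.
html = '<img class="img0390503" src="data:image/png;base64,iVBORw0KGgo=">'
soup = BeautifulSoup(html, 'html.parser')

def extract_base64_payload(img_tag):
    # split('base64,')[-1] keeps only the encoded payload; a src that is
    # not a data URI passes through unchanged, matching the parsers.
    return img_tag.get('src').split('base64,')[-1]

print(extract_base64_payload(soup.find('img', {'class': 'img0390503'})))  # iVBORw0KGgo=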

