From 9e21f6bfa429dec14391d8ce38aa4f2a94aee80e Mon Sep 17 00:00:00 2001 From: chris Date: Sun, 5 Nov 2023 18:35:34 -0800 Subject: [PATCH] Bring parser up to date and improved crawler --- MarketPlaces/BlackPyramid/crawler_selenium.py | 11 +- MarketPlaces/BlackPyramid/parser.py | 118 ++++++++++-------- 2 files changed, 76 insertions(+), 53 deletions(-) diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py index c34a6cb..a008bf5 100644 --- a/MarketPlaces/BlackPyramid/crawler_selenium.py +++ b/MarketPlaces/BlackPyramid/crawler_selenium.py @@ -14,6 +14,7 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver import ActionChains import selenium.webdriver.support.ui as uiClasses +from selenium.webdriver.common.keys import Keys from PIL import Image import urllib.parse as urlparse @@ -191,7 +192,7 @@ def goToPage(driver, page): # print(digitalB) # delay for website to register hover - time.sleep(10) + time.sleep(5) # click xpath = "//input[@name='" + page + "']" @@ -259,6 +260,9 @@ def crawlForum(driver): # go to next page of market try: + # Scroll to top of page to see navigation bar + driver.find_element(by=By.XPATH, value="//body").send_keys(Keys.CONTROL + Keys.HOME) + goToPage(driver, listing) nav = driver.find_element(by=By.XPATH, value="//input[@name='next_page']") @@ -318,3 +322,8 @@ def productPages(html): def crawler(): startCrawling() # print("Crawling and Parsing BestCardingWorld .... DONE!") + + +if __name__ == "__main__": + #crawler() + new_parse("BlackPyramid", baseURL, False) diff --git a/MarketPlaces/BlackPyramid/parser.py b/MarketPlaces/BlackPyramid/parser.py index ecc1dcb..c1ea43d 100644 --- a/MarketPlaces/BlackPyramid/parser.py +++ b/MarketPlaces/BlackPyramid/parser.py @@ -11,33 +11,31 @@ from bs4 import BeautifulSoup #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of description page #return: 'row' that contains a variety of lists that each hold info on the description page -def BlackPyramid_description_parser(soup): +def blackpyramid_description_parser(soup): # Fields to be parsed - name = "-1" # 0 Product_Name - describe = "-1" # 1 Product_Description - lastSeen = "-1" # 2 Product_LastViewDate - rules = "-1" # 3 NOT USED ... - CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 5 Product_MS_Classification (Microsoft Security) - review = "-1" # 6 Product_Number_Of_Reviews + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) category = "-1" # 7 Product_Category - shipFrom = "-1" # 8 Product_ShippedFrom - shipTo = "-1" # 9 Product_ShippedTo - left = "-1" # 10 Product_QuantityLeft - escrow = "-1" # 11 Vendor_Warranty - terms = "-1" # 12 Vendor_TermsAndConditions - vendor = "-1" # 13 Vendor_Name - sold = "-1" # 14 Product_QuantitySold - addDate = "-1" # 15 Product_AddedDate - available = "-1" # 16 NOT USED ... - endDate = "-1" # 17 NOT USED ... - BTC = "-1" # 18 Product_BTC_SellingPrice - USD = "-1" # 19 Product_USD_SellingPrice - rating = "-1" # 20 Vendor_Rating - success = "-1" # 21 Vendor_Successful_Transactions - EURO = "-1" # 22 Product_EURO_SellingPrice + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name name = soup.find('div', {'class': 'panel39002'}).find('span').next_sibling @@ -106,6 +104,15 @@ def BlackPyramid_description_parser(soup): negative = soup.find('span', {'class': 'ti9400005 ti90088 can39953'}).text review = int(positive) + int(neutral) + int(negative) + # Finding product image + image = soup.find('img', {'class': 'img0390503'}) + image = image.get('src') + image = image.split('base64,')[-1] + + vendor_image = soup.find('img', {'class': 'img0390503'}) + vendor_image = vendor_image.get('src') + vendor_image = vendor_image.split('base64,')[-1] + # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if cve: @@ -125,8 +132,8 @@ def BlackPyramid_description_parser(soup): MS = MS.replace('\n', '') # Populating the final variable (this should be a list with all fields scraped) - row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor, - sold, addDate, available, endDate, BTC, USD, rating, success, EURO) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row @@ -136,33 +143,33 @@ def BlackPyramid_description_parser(soup): #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of listing page #return: 'row' that contains a variety of lists that each hold info on the listing page -def BlackPyramid_listing_parser(soup): +def blackpyramid_listing_parser(soup): # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "BlackPyramid" # 0 Marketplace_Name - name = [] # 1 Product_Name - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category - describe = [] # 5 Product_Description - escrow = [] # 6 Vendor_Warranty - views = [] # 7 Product_Number_Of_Views - reviews = [] # 8 Product_Number_Of_Reviews - addDate = [] # 9 Product_AddDate - lastSeen = [] # 10 Product_LastViewDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft =[] # 15 Product_QuantityLeft - shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 Product_ShippedTo - rating_item = [] # 18 Product_Rating - vendor = [] # 19 Vendor - rating = [] # 20 Vendor_Rating - success = [] # 21 Vendor_Successful_Transactions - href = [] # 23 Product_Links (Urls) + nm = 0 # *Total_Products (Should be Integer) + mktName = "Black Pyramid" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links listing = soup.findAll('article', {"class": "product"}) @@ -231,6 +238,12 @@ def BlackPyramid_listing_parser(soup): qsold = qsold.strip() sold.append(qsold) + # Finding product image + product_image = card.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) + # Searching for CVE and MS categories cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: @@ -259,8 +272,9 @@ def BlackPyramid_listing_parser(soup): MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views, reviews, rating, - addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, + image_vendor) #called by the crawler to get description links on a listing page