From e07b3cdcf6e6424e409a0aa4d091633e11afc7f4 Mon Sep 17 00:00:00 2001 From: ericssonmarin-cpp <85146518+ericssonmarin-cpp@users.noreply.github.com> Date: Fri, 1 Sep 2023 13:25:15 -0700 Subject: [PATCH] Image tracking. --- MarketPlaces/Nexus/parser.py | 2 +- MarketPlaces/ThiefWorld/parser.py | 42 +++++++-- MarketPlaces/TorBay/parser.py | 143 ++++++++++++++---------------- 3 files changed, 104 insertions(+), 83 deletions(-) diff --git a/MarketPlaces/Nexus/parser.py b/MarketPlaces/Nexus/parser.py index 093188e..5b9636b 100644 --- a/MarketPlaces/Nexus/parser.py +++ b/MarketPlaces/Nexus/parser.py @@ -138,7 +138,7 @@ def nexus_listing_parser(soup): #everything else appends a -1 rating_vendor.append("-1") USD.append("-1") - vendor.append(mktName) + vendor.append("-1") success.append("-1") CVE.append("-1") MS.append("-1") diff --git a/MarketPlaces/ThiefWorld/parser.py b/MarketPlaces/ThiefWorld/parser.py index dbf7584..bd6c371 100644 --- a/MarketPlaces/ThiefWorld/parser.py +++ b/MarketPlaces/ThiefWorld/parser.py @@ -30,12 +30,19 @@ def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple: left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image name = soup.find("h1", {'class': 'title'}).text name = cleanString(name.strip()) describe = soup.find('div', {'id': 'descriptionContent'}).text describe = cleanString(describe.strip()) + + # Finding Product Image + image = soup.find('div', {'class': 'product_img_big'}).find('img') + image = image.get('src') + image = image.split('base64,')[-1] commentListTag: Tag = soup.find('ul', {'class': 'comment_list scrollbar'}) commentList = commentListTag.find_all('li') @@ -46,7 +53,7 @@ def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple: vendor = soup.find('h1', {'class': 'title over'}).text vendor = cleanString(vendor.strip()) - + usdTag: Tag = soup.find('div', {'class': 'product_price__big'}).find('span') usdText = usdTag.text.strip('/')[0] # usdText format: " USD " (i.e., "70 000 USD ") @@ -54,10 +61,16 @@ def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple: ratingDiv = soup.find('div', {'class': 'rating_star'}) rating_vendor = ratingDiv.get('title').split(' ')[1] + + rating_item = soup.find('div', {'class': 'product_rate'}).text + rating_item = rating_item.replace("rating", "") + rating_item = cleanString(rating_item.strip()) + + category = "Hacking, DOSS" # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row @@ -87,7 +100,9 @@ def thiefWorld_listing_parser(soup: BeautifulSoup): qLeft =[] # 17 Product_QuantityLeft shipFrom = [] # 18 Product_ShippedFrom shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links productList: ResultSet[Tag] = soup.find_all('div', {'class': 'catalog_item'}) @@ -99,13 +114,19 @@ def thiefWorld_listing_parser(soup: BeautifulSoup): productName = cleanString(productTitle.text.strip()) name.append(productName) - + + # Finding Product Image + product_image = product.find('noscript').find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) + productHref = productTitle.get('href') href.append(productHref) CVE.append('-1') MS.append('-1') - category.append('-1') + category.append('Hacking, DOSS') productDescription = product.find('div', {'class': 'text'}).text productDescription = cleanString(productDescription.strip()) @@ -130,15 +151,22 @@ def thiefWorld_listing_parser(soup: BeautifulSoup): productVendor = product.find('div', {'class': 'market over'}).find('a').text productVendor = cleanString(productVendor.strip()) vendor.append(productVendor) + + image_vendor.append('-1') rating_vendor.append('-1') - rating_item.append('-1') + #rating_item.append('-1') + + rating = product.find('div', {'class': 'rating_star_yellow'}).attrs.get('style') + rating = rating.replace("width: ", "") + rating_item.append(cleanString(rating)) + success.append('-1') # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) diff --git a/MarketPlaces/TorBay/parser.py b/MarketPlaces/TorBay/parser.py index 5c4f164..f20b7c9 100644 --- a/MarketPlaces/TorBay/parser.py +++ b/MarketPlaces/TorBay/parser.py @@ -13,13 +13,14 @@ from bs4 import BeautifulSoup def torbay_description_parser(soup): # Fields to be parsed + vendor = "-1" # 0 *Vendor_Name success = "-1" # 1 Vendor_Successful_Transactions rating_vendor = "-1" # 2 Vendor_Rating name = "-1" # 3 *Product_Name describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) category = "-1" # 7 Product_Category views = "-1" # 8 Product_Number_Of_Views reviews = "-1" # 9 Product_Number_Of_Reviews @@ -32,38 +33,32 @@ def torbay_description_parser(soup): left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name try: product_name = soup.find('div', {'class': 'product-information'}).find('h1').text name = cleanString(product_name.strip()) except: - try: - product_name = soup.find('div', {'class': 'profile-info'}).find('h2').text - name = cleanString(product_name.strip()) - except: - # print(e) - print("product name") + product_name = soup.find('div', {'class': 'profile-info'}).find('h2').text + name = cleanString(product_name.strip()) # Finding Vendor FIx - try: - vendor_name = soup.find('div', {"class": "profile-info"}).find('h2').text - vendor = cleanString(vendor_name.strip()) - except: - print("description vendor name failed\n") + vendor_name = soup.find('div', {"class": "profile-info"}).find('h2').text + vendor = cleanString(vendor_name.strip()) + + # Finding Vendor Image + vendor_image = soup.find('div', {'class': 'avatar'}).find('img') + vendor_image = vendor_image.get('src') + vendor_image = vendor_image.split('base64,')[-1] # Finding Prices - try: - USD = soup.find('div', {'class': "total-price"}).find('span').text.strip() - except: - print("description price failed\n") + USD = soup.find('div', {'class': "total-price"}).find('span').text.strip() # Finding the Product Category - try: - cat = soup.find('div', {'class': "profile-info"}).find('p').text - category = cleanString(cat.strip()) - except: - print("description product category failed") + cat = soup.find('div', {'class': "profile-info"}).find('p').text + category = cleanString(cat.strip()) # Finding the Product description try: @@ -74,15 +69,17 @@ def torbay_description_parser(soup): describe = cleanString(describe.strip()) except: # print("product desc") - try: - describe = soup.find('div', {'class': 'info'}).text - describe = cleanString(describe.strip()) - except: - print("Product description") + describe = soup.find('div', {'class': 'info'}).text + describe = cleanString(describe.strip()) + + # Finding Product Image + image = soup.find('div', {'class': 'image text-center'}).find('img') + image = image.get('src') + image = image.split('base64,')[-1] # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row @@ -95,28 +92,30 @@ def torbay_description_parser(soup): def torbay_listing_parser(soup): # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "TorBay" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this - MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft = [] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links + nm = 0 # *Total_Products (Should be Integer) + mktName = "TorBay" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = [] # 6 Product_MS_Classification (Microsoft Security) + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft =[] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links listing = soup.findAll('div', {"class": "product-card"}) @@ -125,30 +124,24 @@ def torbay_listing_parser(soup): for a in listing: - try: - product_name = a.find('p', {'class': 'name'}).text - name.append(cleanString(product_name.strip())) - except: - print("product name") - - try: - prod = a.find('p', {'class': 'price'}).text # price - USD.append(cleanString(prod.strip())) - except: - print("USD") - - try: - ven = a.find('div', {'class': 'pc-footer'}).find('div').find('a').text # pc-footer - vendor.append(cleanString(ven.strip())) - # print(ven) - except: - print("vendor") - - try: - h = a.find('p', {'class': 'name'}).find('a').get('href') - href.append(h) - except: - print("in href") + product_name = a.find('p', {'class': 'name'}).text + name.append(cleanString(product_name.strip())) + + # Finding Product Image + image.append("-1") + + prod = a.find('p', {'class': 'price'}).text # price + USD.append(cleanString(prod.strip())) + + ven = a.find('div', {'class': 'pc-footer'}).find('div').find('a').text # pc-footer + vendor.append(cleanString(ven.strip())) + # print(ven) + + # Finding Vendor Image + image_vendor.append("-1") + + h = a.find('p', {'class': 'name'}).find('a').get('href') + href.append(h) CVE.append("-1") MS.append("-1") @@ -169,7 +162,7 @@ def torbay_listing_parser(soup): # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) #called by the crawler to get description links on a listing page #@param: beautifulsoup object that is using the correct html page (listing page) #return: list of description links from a listing page