From 4856d79540250d102eaf8db651a6a61bd8da4dd5 Mon Sep 17 00:00:00 2001 From: Nathan Pham Date: Tue, 29 Aug 2023 01:09:00 -0700 Subject: [PATCH] finished PabloEscobarMarket parser --- .idea/DW_Pipeline_Test.iml | 2 +- .idea/misc.xml | 2 +- MarketPlaces/PabloEscobarMarket/parser.py | 145 ++++++++-------------- 3 files changed, 51 insertions(+), 98 deletions(-) diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml index 6363711..7f59c2c 100644 --- a/.idea/DW_Pipeline_Test.iml +++ b/.idea/DW_Pipeline_Test.iml @@ -2,7 +2,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 11f1ea0..dc9ea49 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/MarketPlaces/PabloEscobarMarket/parser.py b/MarketPlaces/PabloEscobarMarket/parser.py index be89b24..ecdd086 100644 --- a/MarketPlaces/PabloEscobarMarket/parser.py +++ b/MarketPlaces/PabloEscobarMarket/parser.py @@ -11,13 +11,12 @@ from bs4 import BeautifulSoup # stores info it needs in different lists, these lists are returned after being organized # @param: soup object looking at html page of description page # return: 'row' that contains a variety of lists that each hold info on the description page -def darkfox_description_parser(soup): +def pabloescobarmarket_description_parser(soup): # Fields to be parsed name = "-1" # 0 Product_Name describe = "-1" # 1 Product_Description lastSeen = "-1" # 2 Product_LastViewDate - rules = "-1" # 3 NOT USED ... CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures) MS = "-1" # 5 Product_MS_Classification (Microsoft Security) review = "-1" # 6 Product_Number_Of_Reviews @@ -30,8 +29,6 @@ def darkfox_description_parser(soup): vendor = "-1" # 13 Vendor_Name sold = "-1" # 14 Product_QuantitySold addDate = "-1" # 15 Product_AddedDate - available = "-1" # 16 NOT USED ... - endDate = "-1" # 17 NOT USED ... BTC = "-1" # 18 Product_BTC_SellingPrice USD = "-1" # 19 Product_USD_SellingPrice rating = "-1" # 20 Vendor_Rating @@ -39,67 +36,44 @@ def darkfox_description_parser(soup): EURO = "-1" # 22 Product_EURO_SellingPrice # Finding Product Name - name = soup.find('h1').text - name = name.replace('\n', ' ') - name = name.replace(",", "") - name = name.strip() + # NA + divmd7 = soup.find('div', {'class': "col-md-7"}) + ptag = soup.findAll('p') # Finding Vendor - vendor = soup.find('h3').find('a').text.strip() + vendor = divmd7.find('a').text.strip() # Finding Vendor Rating - rating = soup.find('span', {'class': "tag is-dark"}).text.strip() + # NA # Finding Successful Transactions - success = soup.find('h3').text - success = success.replace("Vendor: ", "") - success = success.replace(vendor, "") - success = success.replace("(", "") - success = success.replace(")", "") - success = success.strip() - - bae = soup.find('div', {'class': "box"}).find_all('ul') + success = soup.find('span', {'class': "badge-primary"}) # Finding Prices - USD = bae[1].find('strong').text.strip() + USD = soup.find('span', {'class': "total"}).text.strip() - li = bae[2].find_all('li') + BTC = soup.find('div', {'class': "text-center"}).text.strip() # Finding Escrow - escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip() + escrow = ptag[-1].text.strip() # Finding the Product Category - category = li[1].find('span', {'class': "tag is-dark"}).text.strip() + category = ptag[-2].text.strip() # Finding the Product Quantity Available - left = li[3].find('span', {'class': "tag is-dark"}).text.strip() + # NA # Finding Number Sold - sold = li[4].find('span', {'class': "tag is-dark"}).text.strip() - - li = bae[3].find_all('li') + # NA # Finding Shipment Information (Origin) - if "Ships from:" in li[-2].text: - shipFrom = li[-2].text - shipFrom = shipFrom.replace("Ships from: ", "") - # shipFrom = shipFrom.replace(",", "") - shipFrom = shipFrom.strip() + # NA # Finding Shipment Information (Destination) - shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text - shipTo = shipTo.replace("Ships to: ", "") - shipTo = shipTo.strip() - if "certain countries" in shipTo: - countries = "" - tags = li[-1].find_all('span', {'class': "tag"}) - for tag in tags: - country = tag.text.strip() - countries += country + ", " - shipTo = countries.strip(", ") + # NA # Finding the Product description - describe = soup.find('div', {'class': "pre-line"}).text + describe = soup.find('div', {'class': "text-white"}).text describe = describe.replace("\n", " ") describe = describe.strip() @@ -135,8 +109,8 @@ def darkfox_description_parser(soup): MS = MS.replace('\n', '') # Populating the final variable (this should be a list with all fields scraped) - row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor, - sold, addDate, available, endDate, BTC, USD, rating, success, EURO) + row = (name, describe, lastSeen, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor, + sold, addDate, BTC, USD, rating, success, EURO) # Sending the results return row @@ -146,10 +120,10 @@ def darkfox_description_parser(soup): # stores info it needs in different lists, these lists are returned after being organized # @param: soup object looking at html page of listing page # return: 'row' that contains a variety of lists that each hold info on the listing page -def darkfox_listing_parser(soup): +def pabloescobarmarket_listing_parser(soup): # Fields to be parsed nm = 0 # Total_Products (Should be Integer) - mktName = "DarkFox" # 0 Marketplace_Name + mktName = "PabloEscobarMarket" # 0 Marketplace_Name name = [] # 1 Product_Name CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) MS = [] # 3 Product_MS_Classification (Microsoft Security) @@ -172,7 +146,7 @@ def darkfox_listing_parser(soup): success = [] # 20 Vendor_Successful_Transactions href = [] # 23 Product_Links (Urls) - listing = soup.findAll('div', {"class": "card"}) + listing = soup.findAll('div', {"class": "p-4"}) # Populating the Number of Products nm = len(listing) @@ -186,58 +160,37 @@ def darkfox_listing_parser(soup): href.append(link) # Finding the Product - product = bae[1].find('p').text + product = a.find('h4').text product = product.replace('\n', ' ') product = product.replace(",", "") product = product.replace("...", "") product = product.strip() name.append(product) - bae = a.find('div', {'class': "media-content"}).find('div').find_all('div') - - if len(bae) >= 5: - # Finding Prices - price = bae[0].text - ud = price.replace(" USD", " ") - # u = ud.replace("$","") - u = ud.replace(",", "") - u = u.strip() - USD.append(u) - # bc = (prc[1]).strip(' BTC') - # BTC.append(bc) - - # Finding the Vendor - vendor_name = bae[1].find('a').text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding the Category - cat = bae[2].find('small').text - cat = cat.replace("Category: ", "") - cat = cat.replace(",", "") - cat = cat.strip() - category.append(cat) - - # Finding Number Sold and Quantity Left - num = bae[3].text - num = num.replace("Sold: ", "") - num = num.strip() - sold.append(num) - - quant = bae[4].find('small').text - quant = quant.replace("In stock: ", "") - quant = quant.strip() - qLeft.append(quant) - - # Finding Successful Transactions - freq = bae[1].text - freq = freq.replace(vendor_name, "") - freq = re.sub(r'Vendor Level \d+', "", freq) - freq = freq.replace("(", "") - freq = freq.replace(")", "") - freq = freq.strip() - success.append(freq) + + # Finding Prices + price = a.find('div', {"class": "price"}).text + tempUSD = price.split("~")[0] + tempUSD = tempUSD.replace("$", "") + tempUSD = tempUSD.strip() + USD.append(tempUSD) + + tempBTC = price.split("~")[1] + tempBTC = tempBTC.replace("BTC", "") + tempBTC = tempBTC.strip() + BTC.append(tempBTC) + + # Finding the Vendor + #NA + + # Finding the Category + # NA + + # Finding Number Sold and Quantity Left + # NA + + # Finding Successful Transactions + # NA # Searching for CVE and MS categories cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -273,12 +226,12 @@ def darkfox_listing_parser(soup): # called by the crawler to get description links on a listing page # @param: beautifulsoup object that is using the correct html page (listing page) -# return: list of description links from a listing page -def metaversemarket_links_parser(soup): +# return: list of description links from a listing page FIX +def pabloescobarmarket_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] - listing = soup.findAll('div', {"class": "col-12 p-0"}) + listing = soup.findAll('div', {"class": "p-4"}) for a in listing: bae = a.find('a', href=True)