__author__ = 'DarkWeb'

import re

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup


# This is the method to parse the Description Pages (one page for each Product in the Listing Pages)
def kerberos_description_parser(soup):

    # Fields to be parsed

    name = "-1"                             # 0 Product_Name    y
    describe = "-1"                         # 1 Product_Description    y
    lastSeen = "-1"                         # 2 Product_LastViewDate
    rules = "-1"                            # 3 NOT USED ...
    CVE = "-1"                              # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"                               # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"                           # 6 Product_Number_Of_Reviews
    category = "-1"                         # 7 Product_Category
    shipFrom = "-1"                         # 8 Product_ShippedFrom
    shipTo = "-1"                           # 9 Product_ShippedTo
    left = "-1"                             # 10 Product_QuantityLeft    y
    escrow = "-1"                           # 11 Vendor_Warranty    y
    terms = "-1"                            # 12 Vendor_TermsAndConditions
    vendor = "-1"                           # 13 Vendor_Name    y
    sold = "-1"                             # 14 Product_QuantitySold    y
    addDate = "-1"                          # 15 Product_AddedDate
    available = "-1"                        # 16 NOT USED ...
    endDate = "-1"                          # 17 NOT USED ...
    BTC = "-1"                              # 18 Product_BTC_SellingPrice    y
    USD = "-1"                              # 19 Product_USD_SellingPrice    y
    rating = "-1"                           # 20 Vendor_Rating
    success = "-1"                          # 21 Vendor_Successful_Transactions
    EURO = "-1"                             # 22 Product_EURO_SellingPrice

    bae = soup.find('div', {'class': "col-9"})

    # Finding Product Name
    name = bae.find('h2').text
    name = name.replace('\n', ' ')
    name = name.replace(",", "")
    name = name.strip()

    mb = bae.findAll('div', {"class": "mb-1"})

    # Finding Vendor
    vendor = mb[0].text
    vendor = vendor.replace(",", "")
    vendor = vendor.replace("Sold by:", "")
    vendor = vendor.strip()

    # # Finding Vendor Rating
    # full_stars = bae[2].find_all('i', {'class': "fas fa-star"})
    # half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"})
    # rating = len(full_stars) + (0.5 if half_star is not None else 0)

    # Finding Warranty
    escrow = mb[2].text
    escrow = escrow.replace("Payment:", "")
    escrow = escrow.strip()

    # Finding Quantity Sold and Left
    temp = mb[4].text.split(',')

    sold = temp[0].replace("sold", "")
    sold = sold.strip()

    left = temp[1].replace("in stock", "")
    left = left.strip()

    # Finding USD
    USD = bae.find('div', {"class": "h3 text-secondary"}).text
    USD = USD.replace("$", "")
    USD = USD.strip()

    # Finding BTC
    temp = bae.find('div', {"class": "small"}).text.split("BTC")
    BTC = temp[0].strip()

    # shipping_info = bae[4].text
    # if "Digital" not in shipping_info:
    #     shipping_info = shipping_info.split(" ")
    #
    #     # Finding Shipment Information (Origin)
    #     shipFrom = shipping_info[0].strip()
    #
    #     # Finding Shipment Information (Destination)
    #     shipTo = shipping_info[1].strip()

    # Finding the Product description
    describe = bae.find('div', {"class": "card border-top-0"}).text
    describe = describe.replace("\n", " ")
    describe = describe.replace("\r", " ")
    describe = describe.strip()

    # Searching for CVE and MS categories
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " "
        for idx in cve:
            CVE += (idx)
            CVE += " "
        CVE = CVE.replace(',', ' ')
        CVE = CVE.replace('\n', '')

    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
    if ms:
        MS = " "
        for im in ms:
            MS += (im)
            MS += " "
        MS = MS.replace(',', ' ')
        MS = MS.replace('\n', '')

    # Populating the final variable (this should be a list with all fields scraped)
    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)

    # Sending the results
    return row
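

# Usage sketch (illustrative only, not part of the crawler pipeline): the parser above
# expects a BeautifulSoup object built from an already-downloaded description page.
# The file name 'kerberos_description.html' is an assumption for demonstration.
#
#   with open('kerberos_description.html', 'r', encoding='utf-8') as f:
#       soup = BeautifulSoup(f.read(), 'html.parser')
#   row = kerberos_description_parser(soup)
#   # row[0] is Product_Name, row[19] is Product_USD_SellingPrice (see index comments above)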


# This is the method to parse the Listing Pages
def kerberos_listing_parser(soup):

    # Fields to be parsed
    nm = 0                                  # Total_Products (Should be Integer)
    mktName = "Kerberos"                    # 0 Marketplace_Name
    name = []                               # 1 Product_Name    y
    CVE = []                                # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                                 # 3 Product_MS_Classification (Microsoft Security)
    category = []                           # 4 Product_Category    y
    describe = []                           # 5 Product_Description
    escrow = []                             # 6 Vendor_Warranty
    views = []                              # 7 Product_Number_Of_Views
    reviews = []                            # 8 Product_Number_Of_Reviews    y
    addDate = []                            # 9 Product_AddDate
    lastSeen = []                           # 10 Product_LastViewDate
    BTC = []                                # 11 Product_BTC_SellingPrice
    USD = []                                # 12 Product_USD_SellingPrice    y
    EURO = []                               # 13 Product_EURO_SellingPrice
    sold = []                               # 14 Product_QuantitySold
    qLeft = []                              # 15 Product_QuantityLeft
    shipFrom = []                           # 16 Product_ShippedFrom
    shipTo = []                             # 17 Product_ShippedTo
    vendor = []                             # 18 Vendor    y
    rating = []                             # 19 Vendor_Rating
    success = []                            # 20 Vendor_Successful_Transactions
    href = []                               # 24 Product_Links (Urls)

    listing = soup.findAll('div', {"class": "card product-card mb-3"})

    # Populating the Number of Products
    nm = len(listing)

    # Finding Category
    cat = soup.find("div", {"class": "col-9"})
    cat = cat.find("h2").text
    cat = cat.replace("Category: ", "")
    cat = cat.replace(",", "")
    cat = cat.strip()

    for card in listing:
        category.append(cat)

        bae = card.findAll('a')

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding Product Name
        product = bae[1].text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.strip()
        name.append(product)

        # Finding Vendor
        vendor_name = bae[2].text
        vendor_name = vendor_name.replace(",", "")
        vendor_name = vendor_name.strip()
        vendor.append(vendor_name)

        # Finding USD
        usd = card.find('div', {"class": "mb-1"}).text
        usd = usd.replace("$", "")
        usd = usd.strip()
        USD.append(usd)

        # Finding Reviews
        num = card.find("span", {"class": "rate-count"}).text
        num = num.replace("(", "")
        num = num.replace("review)", "")
        num = num.replace("reviews)", "")
        num = num.strip()
        reviews.append(num)

        # Searching for CVE and MS categories
        cve = card.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
            cee = cee.replace(',', ' ')
            cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = card.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
            me = me.replace(',', ' ')
            me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)


def kerberos_links_parser(soup):

    # Returning all links that should be visited by the Crawler
    href = []

    content = soup.find('div', {"id": "content-pos"})
    listing = content.findAll('div', {"class": "item-block"})

    for div in listing:
        # The second anchor inside each 'ae zx300' container points to the product page
        ae = div.find('div', {"class": "ae zx300"})
        links = ae.findAll('a')
        href.append(links[1]['href'])

    return href
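

# A minimal smoke test, kept behind the __main__ guard so importing this module has no
# side effects. In the real pipeline the soup objects come from the crawler; the file
# name 'kerberos_listing.html' below is an assumption for local debugging only.
if __name__ == '__main__':
    with open('kerberos_listing.html', 'r', encoding='utf-8') as f:
        listing_soup = BeautifulSoup(f.read(), 'html.parser')

    # organizeProducts comes from MarketPlaces.Utilities.utilities
    print(kerberos_listing_parser(listing_soup))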