__author__ = 'DarkWeb' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * # Here, we are importing BeautifulSoup to search through the HTML tree from bs4 import BeautifulSoup # This is the method to parse the Description Pages (one page to each Product in the Listing Pages) def wethenorth_description_parser(soup): # Fields to be parsed vendor = "-1" # 0 *Vendor_Name success = "-1" # 1 Vendor_Successful_Transactions rating_vendor = "-1" # 2 Vendor_Rating name = "-1" # 3 *Product_Name describe = "-1" # 4 Product_Description CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) MS = "-1" # 6 Product_MS_Classification (Microsoft Security) category = "-1" # 7 Product_Category views = "-1" # 8 Product_Number_Of_Views reviews = "-1" # 9 Product_Number_Of_Reviews rating_item = "-1" # 10 Product_Rating addDate = "-1" # 11 Product_AddedDate BTC = "-1" # 12 Product_BTC_SellingPrice USD = "-1" # 13 Product_USD_SellingPrice EURO = "-1" # 14 Product_EURO_SellingPrice sold = "-1" # 15 Product_QuantitySold left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo # Finding Product Name listDes = soup.find('div', {'class': "listDes"}) name = listDes.find('h2').text name = name.replace('\n', ' ') name = name.replace(",", "") name = name.strip() # Finding Vendor vendor = listDes.find('b').text vendor = vendor.replace(",", "") vendor = vendor.replace("...", "") vendor = vendor.replace("-", "") vendor = vendor.strip() # Finding Vendor Rating rating = listDes.find('span',{'class':'levelSet'}) rating = rating.text rating = rating.replace('\n', ' ') rating = rating.replace(",", "") rating = rating.strip() # Finding Successful Transactions success = listDes.find_all('p')[1] success = success.find('span').text success = success.split() success = success[0].strip() # Finding Prices - all prices in We The North are in CAD, I left the CAD in the resulting String so that it would show CAD for all prices padp = listDes.find('p',{'class':'padp'}) USD = padp.find('span').text USD = USD.strip() # Finding Escrow - no escrow on WTN market shipping_info = listDes.find('tbody') if "Digital" not in shipping_info: shipping_info = shipping_info.find_all('tr') row1 = shipping_info[0].find_all('td') # Finding Shipment Information (Origin) shipFrom = row1[-1].text shipFrom=shipFrom.strip() if shipFrom=="": shipFrom="-1" row2 = shipping_info[1].find_all('td') # Finding Shipment Information (Destination) shipTo = row2[-1].text shipTo= shipTo.strip() if shipTo == "": shipTo = "-1" # Finding the Product description describe = soup.find("div",{'class':'tabcontent'}) describe = describe.find('p').text describe = describe.replace("\n", " ") describe = describe.replace("\r", " ") describe = describe.strip() # cannot find any tag for these ''' # Finding the Number of Product Reviews tag = soup.findAll(text=re.compile('Reviews')) for index in tag: reviews = index par = reviews.find('(') if par >=0: reviews = reviews.replace("Reviews (","") reviews = reviews.replace(")","") reviews = reviews.split(",") review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) else : review = "-1" ''' # Searching for CVE and MS categories # no CVE or MS for WTN market # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, BTC, USD, EURO, sold, left, shipFrom, shipTo) # Sending the results return row # This is the method to parse the Listing Pages def wethenorth_listing_parser(soup): # Fields to be parsed nm = 0 # *Total_Products (Should be Integer) mktName = "WeTheNorth" # 0 *Marketplace_Name vendor = [] # 1 *Vendor y rating_vendor = [] # 2 Vendor_Rating success = [] # 3 Vendor_Successful_Transactions name = [] # 4 *Product_Name y CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) MS = [] # 6 Product_MS_Classification (Microsoft Security) category = [] # 7 Product_Category y describe = [] # 8 Product_Description views = [] # 9 Product_Number_Of_Views reviews = [] # 10 Product_Number_Of_Reviews rating_item = [] # 11 Product_Rating addDate = [] # 12 Product_AddDate BTC = [] # 13 Product_BTC_SellingPrice USD = [] # 14 Product_USD_SellingPrice y EURO = [] # 15 Product_EURO_SellingPrice sold = [] # 16 Product_QuantitySold qLeft =[] # 17 Product_QuantityLeft shipFrom = [] # 18 Product_ShippedFrom shipTo = [] # 19 Product_ShippedTo href = [] # 20 Product_Links right_content = soup.find('div', {"class": "right-content"}) listing = right_content.findAll('div', {"class": "col-1search"}) listing = listing[3:] # Populating the Number of Products nm = len(listing) for a in listing: bae = a.findAll('a', href=True) # Adding the url to the list of urls link = bae[0].get('href') link = cleanLink(link) href.append(link) # Finding the Vendor vendor_name = a.find('p', {'class': 'padp'}) vendor_name = vendor_name.find('a').text vendor_name = vendor_name.replace(",", "") vendor_name = vendor_name.strip() vendor.append(vendor_name) # Finding the Product product = bae[0].text product = product.replace('\n', ' ') product = product.replace(",", "") product = product.strip() name.append(product) # Finding the Category category_name = a.find('p', {'class': 'padp'}).text first_dash = category_name.find('-') second_dash = category_name[first_dash+1:].find('-') category_name = category_name[first_dash+1:second_dash] category_name=category_name.strip() category.append(category_name) # Finding Views view_count = a.text view_count = view_count[view_count.find('Views:'): view_count.find('Sales:')] view_count = view_count.replace('Views:', ' ') view_count = view_count.replace('/', ' ') view_count = view_count.strip() views.append(view_count) # Finding success sales sold_count = a.text sold_count = sold_count[sold_count.find('Sales:'): sold_count.find('Short')] sold_count = sold_count.replace('Sales:', ' ') sold_count = sold_count.replace('/', ' ') sold_count = sold_count.strip() success.append(sold_count) # Searching for CVE and MS categories # no CVE or MS in WTN market cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: cveValue="-1" else: cee = " " for idx in cve: cee += (idx) cee += " " cee = cee.replace(',', ' ') cee = cee.replace('\n', '') cveValue=cee CVE.append(cveValue) ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) if not ms: MSValue="-1" else: me = " " for im in ms: me += (im) me += " " me = me.replace(',', ' ') me = me.replace('\n', '') MSValue=me MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) def wethenorth_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] right_content = soup.find('div',{"class": "right-content"}) listing = right_content.findAll('div', {"class": "col-1search"}) #cut out the irrelevant products that are in blue, the first three products of each page usually unrelated listing = listing[3:] for a in listing: link = a.find('a') link = link['href'] href.append(link) return href