__author__ = 'chris'

import re
import traceback

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup

# Imports for the test run
import glob
import os
import codecs
import shutil


# This is the method to parse the Description Pages (one page for each Product in the Listing Pages)
def Robinhood_description_parser(soup):

    # Fields to be parsed

    vendor = "-1"                       # 0 *Vendor_Name
    success = "-1"                      # 1 Vendor_Successful_Transactions
    rating_vendor = "-1"                # 2 Vendor_Rating
    name = "-1"                         # 3 *Product_Name
    describe = "-1"                     # 4 Product_Description
    CVE = "-1"                          # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"                           # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"                     # 7 Product_Category
    views = "-1"                        # 8 Product_Number_Of_Views
    reviews = "-1"                      # 9 Product_Number_Of_Reviews
    rating_item = "-1"                  # 10 Product_Rating
    addDate = "-1"                      # 11 Product_AddedDate
    BTC = "-1"                          # 12 Product_BTC_SellingPrice
    USD = "-1"                          # 13 Product_USD_SellingPrice
    EURO = "-1"                         # 14 Product_EURO_SellingPrice
    sold = "-1"                         # 15 Product_QuantitySold
    left = "-1"                         # 16 Product_QuantityLeft
    shipFrom = "-1"                     # 17 Product_ShippedFrom
    shipTo = "-1"                       # 18 Product_ShippedTo

    # Finding Product Name
    name = soup.find('h1').text
    name = name.replace('\n', ' ')
    name = name.replace(",", "")
    name = name.strip()

    # Finding description
    desc = ''
    primary = soup.find('div', {'id': 'primary'})
    product = primary.findAll('div')[1]
    commerce = product.findAll('div', recursive=False)[2]
    descDiv = commerce.findAll('div')[0]
    # descDiv = soup.find('div', {'class': 'woocommerce-Tabs-panel woocommerce-Tabs-panel--description panel entry-content wc-tab'})

    descText = descDiv.findAll('p')
    for para in descText:
        desc = desc + para.text
    describe = desc

    # Finding Vendor
    vendor = soup.find('a', {'class': 'wcfm_dashboard_item_title'}).text
    vendor = vendor.replace(",", "")
    vendor = vendor.replace("Sold by:", "")
    vendor = vendor.strip()

    # Finding Category
    catSpan = soup.find('span', {'class': 'posted_in'})
    category = catSpan.find('a').text

    # Finding USD
    priceText = soup.find('p', {'class': 'price'}).text
    USD = str(priceText).strip()

    # Searching for CVE and MS categories
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " "
        for idx in cve:
            CVE += (idx)
            CVE += " "
            CVE = CVE.replace(',', ' ')
            CVE = CVE.replace('\n', '')
    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
    if ms:
        MS = " "
        for im in ms:
            MS += (im)
            MS += " "
            MS = MS.replace(',', ' ')
            MS = MS.replace('\n', '')

    # Populating the final variable (this should be a list with all fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item,
           addDate, BTC, USD, EURO, sold, left, shipFrom, shipTo)

    # Sending the results
    return row
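
# The parser above returns one flat tuple per description page. The helper below is a minimal,
# illustrative sketch (not part of the original Robinhood module) showing how those tuples could be
# written to a CSV file for manual inspection. The function name dump_description_rows, the default
# output path, and the header labels are assumptions; the column order simply mirrors the tuple
# built in Robinhood_description_parser.
def dump_description_rows(rows, out_path="robinhood_descriptions.csv"):
    import csv  # imported locally so this sketch stays self-contained

    header = ["vendor", "rating_vendor", "success", "name", "describe", "CVE", "MS", "category",
              "views", "reviews", "rating_item", "addDate", "BTC", "USD", "EURO", "sold", "left",
              "shipFrom", "shipTo"]
    with open(out_path, "w", newline="", encoding="utf-8") as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(header)    # one header row with the field names above
        writer.writerows(rows)     # one row per parsed description page

# Example usage (assuming `soups` is a list of BeautifulSoup objects for description pages):
#   dump_description_rows([Robinhood_description_parser(s) for s in soups])
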
# This is the method to parse the Listing Pages
def Robinhood_listing_parser(soup):

    # Fields to be parsed
    nm = 0                                    # *Total_Products (Should be Integer)
    mktName = "Robinhood Market"              # 0 *Marketplace_Name
    vendor = []                               # 1 *Vendor y
    rating_vendor = []                        # 2 Vendor_Rating
    success = []                              # 3 Vendor_Successful_Transactions
    name = []                                 # 4 *Product_Name y
    CVE = []                                  # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                                   # 6 Product_MS_Classification (Microsoft Security)
    category = []                             # 7 Product_Category y
    describe = []                             # 8 Product_Description
    views = []                                # 9 Product_Number_Of_Views
    reviews = []                              # 10 Product_Number_Of_Reviews
    rating_item = []                          # 11 Product_Rating
    addDate = []                              # 12 Product_AddDate
    BTC = []                                  # 13 Product_BTC_SellingPrice
    USD = []                                  # 14 Product_USD_SellingPrice y
    EURO = []                                 # 15 Product_EURO_SellingPrice
    sold = []                                 # 16 Product_QuantitySold
    qLeft = []                                # 17 Product_QuantityLeft
    shipFrom = []                             # 18 Product_ShippedFrom
    shipTo = []                               # 19 Product_ShippedTo
    href = []                                 # 20 Product_Links

    listing = soup.find('ul', {"class": "products columns-4"})
    items = listing.findAll('li')

    # Populating the Number of Products
    nm = len(items)

    for card in items:
        # Finding Category
        cat = soup.find("h1").text
        cat = cat.replace('\n', ' ')
        cat = cat.replace(",", "")
        cat = cat.strip()
        category.append(cat)

        bae = card.findAll('a')

        # Adding the url to the list of urls
        link = card.find('a').get('href')
        href.append(link)

        # Finding Product Name
        product = card.find("h2").text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.strip()
        name.append(product)

        info = card.find('div', {'class': 'wcfmmp_sold_by_container'})

        # Finding Vendor
        vendor_name = info.find('a', {'class': 'wcfm_dashboard_item_title'}).text
        vendor_name = vendor_name.replace(",", "")
        vendor_name = vendor_name.strip()
        vendor.append(vendor_name)

        # Finding USD
        span = card.find('span', {'class': 'price'})
        if span is not None:
            bdi = span.find('bdi')
            usdText = bdi.find('span').next_sibling
            usdVal = usdText.text
        else:
            # No price listed for this product
            usdVal = "0"
        USD.append(usdVal)

        # Searching for CVE and MS categories
        cve = card.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = card.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)

    # print(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
    #       reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)


def Robinhood_links_parser(soup):

    # Returning all links that should be visited by the Crawler
    href = []
    # list = soup.findAll('div', {"class": "woocommerce columns-4"})
    listing = soup.find('ul', {"class": "products columns-4"}).findAll('li')

    for item in listing:
        link = item.find('a')['href']
        href.append(link)

    return href


if __name__ == '__main__':

    nError = 0
    marketPlace = 'RobinhoodMarket'

    lines = []  # listing pages
    lns = []    # description pages
    detPage = {}

    '''
    # reading description pages
    count = 0
    for fileDescription in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\08082023\\Description", '*.html')):
        count += 1
        lns.append(fileDescription)
        # if count > 5:
        #     break

    for index, line2 in enumerate(lns):

        print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(line2) +
              "', index= " + str(index + 1) + " ... " + str(len(lns)))

        try:
            html = codecs.open(line2.strip('\n'), encoding='utf8')
            soup = BeautifulSoup(html, "html.parser")
            html.close()
        except:
            try:
                html = open(line2.strip('\n'))
                soup = BeautifulSoup(html, "html.parser")
                html.close()
            except:
                nError += 1
                print("There was a problem reading the file " + line2 + " in the Description section!")
                # if createLog:
                #     logFile.write(str(nError) + ". There was a problem reading the file " + line2 + " in the Description section.\n")
                continue

        try:
            print(Robinhood_description_parser(soup))
        except:
            traceback.print_exc()
            print("There was a problem parsing the file " + line2 + " in the Description section!")
    '''

    # reading listing pages
    count = 0
    for fileListing in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\08082023\\Listing", '*.html')):
        count += 1
        lines.append(fileListing)
        # if count > 1:
        #     break

    for index, line1 in enumerate(lines):

        print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(line1) +
              "', index= " + str(index + 1) + " ... " + str(len(lines)))

        readError = False
        try:
            html = codecs.open(line1.strip('\n'), encoding='utf8')
            soup = BeautifulSoup(html, "html.parser")
            html.close()
        except:
            try:
                html = open(line1.strip('\n'))
                soup = BeautifulSoup(html, "html.parser")
                html.close()
            except:
                print("There was a problem reading the file " + line1 + " in the Listing section!")
                readError = True

        if not readError:
            parseError = False
            try:
                test = Robinhood_listing_parser(soup)
                print(test)
            except:
                traceback.print_exc()
                print("There was a problem parsing the file " + line1 + " in the Listing section!")
                parseError = True

    print("DONE")