diff --git a/MarketPlaces/DarkMatter/parser.py b/MarketPlaces/DarkMatter/parser.py
index 8f71ded..4c02f5d 100644
--- a/MarketPlaces/DarkMatter/parser.py
+++ b/MarketPlaces/DarkMatter/parser.py
@@ -15,129 +15,66 @@ def darkfox_description_parser(soup):
     # Fields to be parsed
-    name = "-1"             # 0 Product_Name
-    describe = "-1"         # 1 Product_Description
-    lastSeen = "-1"         # 2 Product_LastViewDate
-    rules = "-1"            # 3 NOT USED ...
-    CVE = "-1"              # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
-    MS = "-1"               # 5 Product_MS_Classification (Microsoft Security)
-    review = "-1"           # 6 Product_Number_Of_Reviews
-    category = "-1"         # 7 Product_Category
-    shipFrom = "-1"         # 8 Product_ShippedFrom
-    shipTo = "-1"           # 9 Product_ShippedTo
-    left = "-1"             # 10 Product_QuantityLeft
-    escrow = "-1"           # 11 Vendor_Warranty
-    terms = "-1"            # 12 Vendor_TermsAndConditions
-    vendor = "-1"           # 13 Vendor_Name
-    sold = "-1"             # 14 Product_QuantitySold
-    addDate = "-1"          # 15 Product_AddedDate
-    available = "-1"        # 16 NOT USED ...
-    endDate = "-1"          # 17 NOT USED ...
-    BTC = "-1"              # 18 Product_BTC_SellingPrice
-    USD = "-1"              # 19 Product_USD_SellingPrice
-    rating = "-1"           # 20 Vendor_Rating
-    success = "-1"          # 21 Vendor_Successful_Transactions
-    EURO = "-1"             # 22 Product_EURO_SellingPrice
-
-    # Finding Product Name
-    name = soup.find('h1').text
-    name = name.replace('\n', ' ')
-    name = name.replace(",", "")
-    name = name.strip()
-
-    # Finding Vendor
-    vendor = soup.find('h3').find('a').text.strip()
-
-    # Finding Vendor Rating
-    rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding Successful Transactions
-    success = soup.find('h3').text
-    success = success.replace("Vendor: ", "")
-    success = success.replace(vendor, "")
-    success = success.replace("(", "")
-    success = success.replace(")", "")
-    success = success.strip()
-
-    bae = soup.find('div', {'class': "box"}).find_all('ul')
-
-    # Finding Prices
-    USD = bae[1].find('strong').text.strip()
-
-    li = bae[2].find_all('li')
-
-    # Finding Escrow
-    escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding the Product Category
-    category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding the Product Quantity Available
-    left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding Number Sold
-    sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
-
-    li = bae[3].find_all('li')
-
-    # Finding Shipment Information (Origin)
-    if "Ships from:" in li[-2].text:
-        shipFrom = li[-2].text
-        shipFrom = shipFrom.replace("Ships from: ", "")
-        # shipFrom = shipFrom.replace(",", "")
-        shipFrom = shipFrom.strip()
-
-    # Finding Shipment Information (Destination)
-    shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
-    shipTo = shipTo.replace("Ships to: ", "")
-    shipTo = shipTo.strip()
-    if "certain countries" in shipTo:
-        countries = ""
-        tags = li[-1].find_all('span', {'class': "tag"})
-        for tag in tags:
-            country = tag.text.strip()
-            countries += country + ", "
-        shipTo = countries.strip(", ")
-
-    # Finding the Product description
-    describe = soup.find('div', {'class': "pre-line"}).text
-    describe = describe.replace("\n", " ")
-    describe = describe.strip()
-
-    '''# Finding the Number of Product Reviews
-    tag = soup.findAll(text=re.compile('Reviews'))
-    for index in tag:
-        reviews = index
-        par = reviews.find('(')
-        if par >=0:
-            reviews = reviews.replace("Reviews (","")
-            reviews = reviews.replace(")","")
-            reviews = reviews.split(",")
-            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
-        else :
-            review = "-1"'''
-
-    # Searching for CVE and MS categories
-    cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-    if cve:
-        CVE = " "
-        for idx in cve:
-            CVE += (idx)
-            CVE += " "
-            CVE = CVE.replace(',', ' ')
-            CVE = CVE.replace('\n', '')
-    ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
-    if ms:
-        MS = " "
-        for im in ms:
-            MS += (im)
-            MS += " "
-            MS = MS.replace(',', ' ')
-            MS = MS.replace('\n', '')
+    vendor = "-1"           # 0 *Vendor_Name
+    success = "-1"          # 1 Vendor_Successful_Transactions
+    rating_vendor = "-1"    # 2 Vendor_Rating
+    name = "-1"             # 3 *Product_Name
+    describe = "-1"         # 4 Product_Description
+    CVE = "-1"              # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+    MS = "-1"               # 6 Product_MS_Classification (Microsoft Security)
+    category = "-1"         # 7 Product_Category
+    views = "-1"            # 8 Product_Number_Of_Views
+    reviews = "-1"          # 9 Product_Number_Of_Reviews
+    rating_item = "-1"      # 10 Product_Rating
+    addDate = "-1"          # 11 Product_AddedDate
+    BTC = "-1"              # 12 Product_BTC_SellingPrice
+    USD = "-1"              # 13 Product_USD_SellingPrice
+    EURO = "-1"             # 14 Product_EURO_SellingPrice
+    sold = "-1"             # 15 Product_QuantitySold
+    left = "-1"             # 16 Product_QuantityLeft
+    shipFrom = "-1"         # 17 Product_ShippedFrom
+    shipTo = "-1"           # 18 Product_ShippedTo
+
+    vendor = "-1"           # 0 *Vendor_Name
+    success = "-1"          # 1 Vendor_Successful_Transactions
+    rating_vendor = "-1"    # 2 Vendor_Rating
+
+
+    # product name
+    try:
+        name = soup.find('head').find('title').text
+        name = cleanString(name.strip())
+    except:
+        print("name")
+
+    #product description
+    try:
+        temp = soup.find('pre', {'class', 'description'}).text
+        temp = temp.replace('\n', ' ')
+        describe = cleanString(temp.strip())
+    except:
+        print("description")
+
+
+
+    CVE = "-1"              # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+    MS = "-1"               # 6 Product_MS_Classification (Microsoft Security)
+    category = "-1"         # 7 Product_Category
+    views = "-1"            # 8 Product_Number_Of_Views
+    reviews = "-1"          # 9 Product_Number_Of_Reviews
+    rating_item = "-1"      # 10 Product_Rating
+    addDate = "-1"          # 11 Product_AddedDate
+    BTC = "-1"              # 12 Product_BTC_SellingPrice
+    USD = "-1"              # 13 Product_USD_SellingPrice
+    EURO = "-1"             # 14 Product_EURO_SellingPrice
+    sold = "-1"             # 15 Product_QuantitySold
+    left = "-1"             # 16 Product_QuantityLeft
+    shipFrom = "-1"         # 17 Product_ShippedFrom
+    shipTo = "-1"           # 18 Product_ShippedTo

     # Populating the final variable (this should be a list with all fields scraped)
-    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
-           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)
+    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+           BTC, USD, EURO, sold, left, shipFrom, shipTo)

     # Sending the results
     return row
@@ -281,7 +218,7 @@ def darkmatter_links_parser(soup):

     # Returning all links that should be visited by the Crawler
     href = []
-    listing = soup.findAll('td', {"class": "lefted", 'colspan': '2'})
+    listing = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", 'colspan': '3'})

     for a in listing:
         bae = a.find('a', href=True)
diff --git a/MarketPlaces/M00nkeyMarket/parser.py b/MarketPlaces/M00nkeyMarket/parser.py
index db54c4b..b726310 100644
--- a/MarketPlaces/M00nkeyMarket/parser.py
+++ b/MarketPlaces/M00nkeyMarket/parser.py
@@ -42,7 +42,8 @@ def m00nkey_description_parser(soup):
     temp = soup.findAll('div', {'class','text-center text-truncate column-flex ml-1 mr-1'})  #card sidebar-menu mb-4 card sidebar-menu mb-4
     temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'})
     temp = temp2[1].text
-    success = (cleanString(temp.strip()))
+    success = (temp.strip())
+    # print(success)


     #vendor rating 5
@@ -52,9 +53,12 @@ def m00nkey_description_parser(soup):
     rating_vendor = (cleanString(temp.strip()))

     # product name
-    temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').text
-    name = (cleanString(temp.strip()))
-
+    try:
+        temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').text
+        name = (cleanString(temp.strip()))
+    except:
+        temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').find("div").text
+        name = (cleanString(temp.strip()))

     # product description
     describe = soup.find('div', {'class': "box rounded flex-fill"}).find('pre').text
@@ -180,7 +184,7 @@ def m00nkey_listing_parser(soup):

         # vendor
         try:
-            temp = a.find('col-5 justify-content-between mx-auto').find('a').text
+            temp = a.find('div', {'class','col-5 justify-content-between mx-auto'}).find('a').text
             vendor.append(cleanString(temp.strip()))
         except:
             print('vendor')
@@ -188,16 +192,16 @@ def m00nkey_listing_parser(soup):

         #vendor rating

-        #successful transactions
+        #successful transactions CHECK AGAIN HERE
         try:
-            temp = a.find('col-5 justify-content-between mx-auto').find('div').text
-            success.append(cleanString(temp.strip()))
+            temp = a.find('div', {'class','col-5 justify-content-between mx-auto'}).find('div').text
+            success.append(temp.strip())
         except:
             print('successful transactions')

         # product name
         try:
-            temp = a.find('card-title rounded text-truncate').find('a').text
+            temp = a.find('h5', {'class','card-title rounded text-truncate'}).find('a').text
             name.append(cleanString(temp.strip()))
         except:
             print('product name')
@@ -205,27 +209,70 @@ def m00nkey_listing_parser(soup):

         CVE.append('-1')
         MS.append('-1')
-        rating_vendor.append("-1")
+        rating_vendor.append('-1')
+
         # product category
         try:
-            temp = a.findAll('btn btn-block btn-primary')
+            temp = soup.find('div', {'class', 'card-sidebar-menu box mb-2 flex-column'}).find('h3').find('span').text
+            if "Search Results for: " in temp:
+                temp = temp.replace("Search Results for: ", "")
+            category.append(cleanString(temp.strip()))
+
         except:
             print("Error in product category")

-        category = []  # 7 Product_Category y
-        describe = []  # 8 Product_Description
-        views = []  # 9 Product_Number_Of_Views
-        reviews = []  # 10 Product_Number_Of_Reviews
-        rating_item = []  # 11 Product_Rating
-        addDate = []  # 12 Product_AddDate
-        BTC = []  # 13 Product_BTC_SellingPrice
-        USD = []  # 14 Product_USD_SellingPrice y
-        EURO = []  # 15 Product_EURO_SellingPrice
-        sold = []  # 16 Product_QuantitySold
-        qLeft = []  # 17 Product_QuantityLeft
-        shipFrom = []  # 18 Product_ShippedFrom
-        shipTo = []  # 19 Product_ShippedTo
-        href = []  # 20 Product_Links
+        describe.append('-1')
+
+        # product views
+        try:
+            temp = a.find('h6', {'class', 'card-subtitle mb-1 text-muted text-truncate'})
+            temp2 = temp.find('i').text
+            views.append(cleanString(temp2.strip()))
+        except:
+            print("Error in views")
+
+        reviews.append('-1')  # 10 Product_Number_Of_Reviews
+        rating_item.append('-1')  # 11 Product_Rating
+        addDate.append('-1')  # 12 Product_AddDate
+
+        # BTC
+        try:
+            temp = a.find('div', {'class', 'col-3 justify-content-between mx-auto'})
+            temp2 = temp.findAll('p')
+            temp = temp2[1].text
+            BTC.append(cleanString(temp.strip()))
+        except:
+            print("BTC")
+
+        #USD ERROR get rid of $
+        try:
+            temp = a.find('div', {'class', 'col-12 justify-content-between mx-auto'}).find('i').text
+            if '$' in temp:
+                temp = temp.replace("$", "")
+            USD.append(cleanString(temp.strip()))  # 14 Product_USD_SellingPrice
+        except:
+            print("USD")
+
+        EURO.append("-1")  # 15 Product_EURO_SellingPrice
+
+        #product sold
+        try:
+            temp = a.find('div', {'class', 'col-12 mx-auto text-truncate text-center flex-fill'}).findAll('p', {'class', 'card-text mb-0'})
+            temp2 = temp[1].find('i').text
+            sold.append(cleanString(temp2.strip()))
+        except:
+            print("product sold")
+
+        qLeft.append('-1')  # 17 Product_QuantityLeft
+        shipFrom.append('-1')  # 18 Product_ShippedFrom
+        shipTo.append('-1')  # 19 Product_ShippedTo
+
+        #href
+        try:
+            temp = a.find('h5', {'class', 'card-title rounded text-truncate'}).find('a').get('href')
+            href.append(temp)  # 20 Product_Links
+        except:
+            print("href")

     # Populate the final variable (this should be a list with all fields scraped)
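Both description parsers now return the same 19-field row, ordered as in the final row tuple above. Note that the inline comments still number success as field 1 and the vendor rating as field 2, while the tuple itself places rating_vendor second; the sketch below simply follows the tuple as written. A minimal sketch, not part of the patch, for pairing the positional row with named columns downstream (the column labels are illustrative):

# Minimal sketch (illustrative, not part of the patch): map the positional row
# returned by the rewritten description parsers onto named columns.
ROW_COLUMNS = [
    "vendor", "rating_vendor", "success", "name", "describe",
    "CVE", "MS", "category", "views", "reviews", "rating_item",
    "addDate", "BTC", "USD", "EURO", "sold", "left", "shipFrom", "shipTo",
]

def row_to_dict(row):
    # Fail loudly if the parser output and the expected schema ever drift apart.
    if len(row) != len(ROW_COLUMNS):
        raise ValueError(f"expected {len(ROW_COLUMNS)} fields, got {len(row)}")
    return dict(zip(ROW_COLUMNS, row))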
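The listing-parser selector changes all fix the same mistake: calls like a.find('col-5 justify-content-between mx-auto') pass the class string where BeautifulSoup expects a tag name, so they always return None and the chained .find('a') raises, falling into the bare except. A standalone illustration, with hypothetical HTML and the attrs written as an explicit dict for clarity:

# Standalone illustration of the selector fix; assumes BeautifulSoup 4 is installed.
from bs4 import BeautifulSoup

html = '<div class="col-5 justify-content-between mx-auto"><a href="/p/1">seller42</a></div>'
soup = BeautifulSoup(html, "html.parser")

# Old style: the whole class string is treated as a tag name, so nothing matches.
print(soup.find('col-5 justify-content-between mx-auto'))  # None

# Fixed style: match on the tag name plus the exact class attribute value.
tag = soup.find('div', {'class': 'col-5 justify-content-between mx-auto'})
print(tag.find('a').text)  # seller42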