diff --git a/MarketPlaces/Apocalypse/parser.py b/MarketPlaces/Apocalypse/parser.py index bc139fe..b7a4f63 100644 --- a/MarketPlaces/Apocalypse/parser.py +++ b/MarketPlaces/Apocalypse/parser.py @@ -43,7 +43,7 @@ def apocalypse_description_parser(soup: Tag): # Finding Product Image image = soup.find('div', {'class': 'col-md-7 text-center'}).find('img') - image = image.get('src') + image = image.get('src').split('base64,')[-1] product_reviews_list: Tag = content.find("table", {"class": "table product_reviews"}) \ .find_all("li") @@ -122,7 +122,7 @@ def apocalypse_listing_parser(soup: Tag): # Finding Product Image product_image = prod.find('img', {'class': 'customHeight'}) - product_image = product_image.get('src') + product_image = product_image.get('src').split('base64,')[-1] image.append(product_image) CVE.append("-1") diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index 81e3a1c..664f6e8 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -186,7 +186,7 @@ def create_vendor(cur, row, marketId): recset = cur.fetchall() - #aes_decryption(recset[0][5]) trying to decrypt the image + # decode_decrypt_image_in_base64(recset[0][5]) if (str(recset[0][3]) != str(row[2] if row[2] != '-1' else None) or # there was a change in the vendor information str(recset[0][4]) != str(row[3] if row[3] != '-1' else None) or @@ -266,6 +266,8 @@ def create_items(cur, row, marketId, vendorId): recset = cur.fetchall() + # decode_decrypt_image_in_base64(recset[0][20]) + if (str(recset[0][4]) != str(row[5] if row[5] != '-1' else None) or str(recset[0][5]) != str(row[6] if row[6] != '-1' else None) or str(recset[0][6]) != str(row[7] if row[7] != '-1' else None) or str(recset[0][7]) != str(row[8] if row[8] != '-1' else None) or str(recset[0][8]) != str(row[9] if row[9] != '-1' else None) or str(recset[0][9]) != str(row[10] if row[10] != '-1' else None) or diff --git a/MarketPlaces/DarkBazar/parser.py b/MarketPlaces/DarkBazar/parser.py index ccb7266..9b2d823 100644 --- a/MarketPlaces/DarkBazar/parser.py +++ b/MarketPlaces/DarkBazar/parser.py @@ -210,6 +210,8 @@ def darkbazar_listing_parser(soup): quant = quant.strip() qLeft.append(quant) + # add shipping information + # Searching for CVE and MS categories cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) diff --git a/MarketPlaces/DarkMatter/parser.py b/MarketPlaces/DarkMatter/parser.py index 17d775c..692db80 100644 --- a/MarketPlaces/DarkMatter/parser.py +++ b/MarketPlaces/DarkMatter/parser.py @@ -42,30 +42,21 @@ def darkmatter_description_parser(soup): temp = soup.find('table', {'class', 'vtable'}) temp = temp.findAll('tr') temp2 = temp[3].find('a').text - name = cleanString(temp2.strip()) + vendor = cleanString(temp2.strip()) except: - try: - temp = soup.find('table', {'class', 'vtable'}) - temp = temp.findAll('tr') - temp2 = temp[4].find('a').text - name = cleanString(temp2.strip()) - except: - print("vendor") + temp = soup.find('table', {'class', 'vtable'}) + temp = temp.findAll('tr') + temp2 = temp[4].find('a').text + vendor = cleanString(temp2.strip()) # product name - try: - name = soup.find('div', {'class', 'title-h2'}).text - name = cleanString(name.strip()) - except: - print("name") + name = soup.find('div', {'class', 'title-h2'}).text + name = cleanString(name.strip()) #product description - try: - temp = soup.find('pre', {'class', 'description'}).text - temp = temp.replace('\n', ' ') - describe = cleanString(temp.strip()) - except: - print("description") + temp = soup.find('pre', {'class', 'description'}).text + temp = temp.replace('\n', ' ') + describe = cleanString(temp.strip()) # Finding Product Image #image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img') @@ -81,44 +72,37 @@ def darkmatter_description_parser(soup): temp2 = temp[4].find('a').text category = cleanString(temp2.strip()) except: - try: - temp = soup.find('table', {'class', 'vtable'}) - temp = temp.findAll('tr') - temp2 = temp[5].find('th').text - temp2 = cleanString(temp2.strip) - if (temp2 == "Category"): - temp2 = temp[5].find('a').text - category = cleanString(temp2.strip()) - except: - print('category') - - # usd - try: - temp = soup.find('table', {'class', 'vtable'}) - temp = temp.findAll('tr') - temp2 = temp[1].find('td').text - temp2 = temp2.replace(' USD', '') - USD = cleanString(temp2) - except: - print('USD') - - # 15 Product_QuantitySold - try: temp = soup.find('table', {'class', 'vtable'}) temp = temp.findAll('tr') temp2 = temp[5].find('th').text - temp2 = cleanString(temp2) - temp3 = temp[6].find('th').text - temp3 = cleanString(temp3) - if (temp2 == "Sold"): - temp2 = temp[5].find('td').text - sold = cleanString(temp2.strip()) - elif (temp3 == "Sold"): - temp2 = temp[6].find('td').text - sold = cleanString(temp2.strip()) - except: - print('sold') + temp2 = cleanString(temp2.strip) + if (temp2 == "Category"): + temp2 = temp[5].find('a').text + category = cleanString(temp2.strip()) + # usd + temp = soup.find('table', {'class', 'vtable'}) + temp = temp.findAll('tr') + temp2 = temp[1].find('td').text + temp2 = temp2.replace(' USD', '') + USD = cleanString(temp2) + + # 15 Product_QuantitySold + temp = soup.find('table', {'class', 'vtable'}) + temp = temp.findAll('tr') + temp2 = temp[5].find('th').text + temp2 = cleanString(temp2) + temp3 = temp[6].find('th').text + temp3 = cleanString(temp3) + if (temp2 == "Sold"): + temp2 = temp[5].find('td').text + sold = cleanString(temp2.strip()) + elif (temp3 == "Sold"): + temp2 = temp[6].find('td').text + sold = cleanString(temp2.strip()) + + image = soup.find('td', {"class": "vtop"}).find('img').get('src') + image = image.split('base64,')[-1] # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, @@ -134,35 +118,8 @@ def darkmatter_description_parser(soup): #return: 'row' that contains a variety of lists that each hold info on the listing page def darkmatter_listing_parser(soup): - """ # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "DarkMatter" # 0 Marketplace_Name - name = [] # 1 Product_Name - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category - describe = [] # 5 Product_Description - escrow = [] # 6 Vendor_Warranty - views = [] # 7 Product_Number_Of_Views - reviews = [] # 8 Product_Number_Of_Reviews - addDate = [] # 9 Product_AddDate - rating_item = [] # 11 Product_Rating - lastSeen = [] # 10 Product_LastViewDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft =[] # 15 Product_QuantityLeft - shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions - href = [] # 23 Product_Links (Urls) - """ - # Fields to be parsed nm = 0 # *Total_Products (Should be Integer) mktName = "DarkMatter" # 0 *Marketplace_Name vendor = [] # 1 *Vendor y @@ -191,6 +148,7 @@ def darkmatter_listing_parser(soup): names = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", "colspan": "3"}) left = soup.find('div', {"class": "content"}).findAll('table', {"class": "vtable"}) right = soup.find('div', {"class": "content"}).findAll('td', {"class": "vtop centered"}) + images = soup.find('div', {"class": "content"}).findAll('td', {"class": "vcentered"}) # vtop centered count = 0 @@ -199,18 +157,15 @@ def darkmatter_listing_parser(soup): for a in names: # product name - try: - temp = a.find('a').text - if ("pcs x " in temp): - index = temp.index("pcs x ") - result = temp[index + len("pcs x "):] - name.append(cleanString(result)) - elif("pks x " in temp): - index = temp.index("pks x ") - result = temp[index + len("pks x "):] - name.append(cleanString(temp)) - except Exception as e: - print("product name", e) + temp = a.find('a').text + if ("pcs x " in temp): + index = temp.index("pcs x ") + result = temp[index + len("pcs x "):] + name.append(cleanString(result)) + elif("pks x " in temp): + index = temp.index("pks x ") + result = temp[index + len("pks x "):] + name.append(cleanString(result)) # Finding Product Image #product_image = a.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'}) @@ -225,11 +180,8 @@ def darkmatter_listing_parser(soup): length_2 = len(temp2) - 1 # category - try: - temp = temp2[1].find('td').text - category.append(cleanString(temp.strip())) - except: - print('category') + temp = temp2[1].find('td').text + category.append(cleanString(temp.strip())) describe.append("-1") #escrow.append("-1") @@ -238,63 +190,49 @@ def darkmatter_listing_parser(soup): addDate.append("-1") #lastSeen.append("-1") BTC.append("-1") - image.append("-1") image_vendor.append("-1") # usd - try: - temp3 = right[count*2].find('span').text - temp = temp3.replace(' USD', '') - USD.append(cleanString(temp)) - except: - print('USD') + temp3 = right[count*2].find('span').text + temp = temp3.replace(' USD', '') + USD.append(cleanString(temp)) EURO.append("-1") # 14 Product_QuantitySold - try: - temp3 = temp2[length_2].find('th').text - temp3 = cleanString(temp3) - if (temp3 == "Sold:"): - temp = temp2[length_2].find('td').text - sold.append(cleanString(temp.strip())) - else: - sold.append("-1") - except Exception as e: + temp3 = temp2[length_2].find('th').text + temp3 = cleanString(temp3) + if (temp3 == "Sold:"): + temp = temp2[length_2].find('td').text + sold.append(cleanString(temp.strip())) + else: sold.append("-1") - print('sold', e) qLeft.append("-1") shipFrom.append("-1") # ship to - try: - temp3 = temp2[length_2].find('th').text - temp3 = cleanString(temp3) - if (temp3 == "Ship To:"): - temp = temp2[length_2].find('td').text - shipTo.append(cleanString(temp.strip())) - else: - shipTo.append("-1") - except Exception as e: + temp3 = temp2[length_2].find('th').text + temp3 = cleanString(temp3) + if (temp3 == "Ship To:"): + temp = temp2[length_2].find('td').text + shipTo.append(cleanString(temp.strip())) + else: shipTo.append("-1") - print('shopto') # vendor - try: - temp = temp2[0].find('a').text - vendor.append(cleanString(temp.strip())) - except: - print('vendor') + temp = temp2[0].find('a').text + vendor.append(cleanString(temp.strip())) + # add product rating (stars) rating.append("-1") success.append("-1") - try: - temp = a.find('a').get('href') - href.append(temp) - except: - print('href') + temp = a.find('a').get('href') + href.append(temp) + + image = images[count*2].find('img').get('src') + image = image.split('base64,')[-1] count += 1 diff --git a/MarketPlaces/DigitalThriftShop/parser.py b/MarketPlaces/DigitalThriftShop/parser.py index f55c1e6..ad275e2 100644 --- a/MarketPlaces/DigitalThriftShop/parser.py +++ b/MarketPlaces/DigitalThriftShop/parser.py @@ -46,7 +46,7 @@ def digitalThriftShop_description_parser(soup: Tag): # Finding Product Image image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img') - image = image.get('src') + image = image.get('src').split('base64,')[-1] product_category = soup.find("span", {"class": "posted_in"}).find("a").text category = cleanString(product_category.strip()) @@ -115,7 +115,7 @@ def digitalThriftShop_listing_parser(soup: Tag): for product in products_list: nm += 1 - vendor.append("-1") + vendor.append(mktName) rating_vendor.append("-1") success.append("-1") @@ -124,7 +124,7 @@ def digitalThriftShop_listing_parser(soup: Tag): # Finding Product Image product_image = product.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'}) - product_image = product_image.get('src') + product_image = product_image.get('src').split('base64,')[-1] image.append(product_image) CVE.append("-1") diff --git a/MarketPlaces/HiddenMarket/parser.py b/MarketPlaces/HiddenMarket/parser.py index 0dc4bc9..b1783e0 100644 --- a/MarketPlaces/HiddenMarket/parser.py +++ b/MarketPlaces/HiddenMarket/parser.py @@ -88,7 +88,7 @@ def hiddenmarket_description_parser(soup): # Finding Product Image image = soup.find('div', {"class": "thumbnails"}).find('img', {"class": "bigthumbnail"}) - image = image.get('src') + image = image.get('src').split('base64,')[-1] # Finding the Product Category category = mb[-4].text diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py index b93ef96..7779f10 100644 --- a/MarketPlaces/Initialization/markets_mining.py +++ b/MarketPlaces/Initialization/markets_mining.py @@ -25,6 +25,7 @@ from MarketPlaces.RobinhoodMarket.crawler_selenium import crawler as crawlerRobi from MarketPlaces.Nexus.crawler_selenium import crawler as crawlerNexus from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher from MarketPlaces.DarkBazar.crawler_selenium import crawler as crawlerDarkBazar +from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar import configparser import os @@ -140,5 +141,7 @@ if __name__ == '__main__': crawlerCypher() elif mkt == "DarkBazar": crawlerDarkBazar() + elif mkt == "PabloEscobarMarket": + crawlerPabloEscobar() print("\nScraping process completed!") diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index 68ba3b9..c7699bd 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -22,6 +22,7 @@ from MarketPlaces.RobinhoodMarket.parser import * from MarketPlaces.Nexus.parser import * from MarketPlaces.MikesGrandStore.parser import * from MarketPlaces.DarkBazar.parser import * +from MarketPlaces.PabloEscobarMarket.parser import * from MarketPlaces.Classifier.classify_product import predict @@ -73,9 +74,9 @@ def mergePages(rmm, rec): rec[18] = rmm[17] if rec[19] == "-1": # shippedto_item rec[19] = rmm[18] - if rec[20] == "-1": # image + if rmm[19] != "-1": # image rec[20] = rmm[19] - if rec[21] == "-1": # image_vendor + if rmm[20] != "-1": # image_vendor rec[21] = rmm[20] return rec @@ -155,6 +156,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): rw = mikesGrandStore_listing_parser(soup) elif marketPlace == "DarkBazar": rw = darkbazar_listing_parser(soup) + elif marketPlace == "PabloEscobarMarket": + rw = pabloescobarmarket_listing_parser(soup) else: print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -208,6 +211,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): rmm = mikesGrandStore_description_parser(soup) elif marketPlace == "DarkBazar": rmm = darkbazar_description_parser(soup) + elif marketPlace == "PabloEscobarMarket": + rmm = pabloescobarmarket_description_parser(soup) else: print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") raise Exception diff --git a/MarketPlaces/RobinhoodMarket/crawler_selenium.py b/MarketPlaces/RobinhoodMarket/crawler_selenium.py index 06d8bc9..232fac7 100644 --- a/MarketPlaces/RobinhoodMarket/crawler_selenium.py +++ b/MarketPlaces/RobinhoodMarket/crawler_selenium.py @@ -162,8 +162,8 @@ def getInterestedLinks(): # Hacking links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/hacking/') - # # Other Software - # links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/') + # Other Software + links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/') return links @@ -191,7 +191,7 @@ def crawlForum(driver): savePage(driver, html, link) list = productPages(html) - for item in list: + for c, item in enumerate(list): itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -202,11 +202,12 @@ def crawlForum(driver): driver.back() # comment out - # break + # if c == 4: + # break # comment out - if count == 1: - break + # if count == 1: + # break # go to next page of market try: diff --git a/MarketPlaces/RobinhoodMarket/parser.py b/MarketPlaces/RobinhoodMarket/parser.py index 1a3bdb8..5de7a70 100644 --- a/MarketPlaces/RobinhoodMarket/parser.py +++ b/MarketPlaces/RobinhoodMarket/parser.py @@ -50,20 +50,17 @@ def Robinhood_description_parser(soup): # Finding description desc = '' - primary = soup.find('div', {'id': 'primary'}) - product = primary.findAll('div')[1] - commerce = product.findAll('div', recursive=False)[2] - descDiv = commerce.findAll('div')[0] - # descDiv = soup.find('div', {'class': 'woocommerce-Tabs-panel woocommerce-Tabs-panel--description panel entry-content wc-tab'}) - - descText = descDiv.findAll('p') - for para in descText: - desc = desc + para.text - describe = desc + tab = soup.find('div', {"id": "tab-description"}) + for p in tab.findAll('p'): + desc += p.text + if desc == '': + desc = soup.find('div', {"class": "woocommerce-product-details__short-description"}).text + describe = cleanString(desc.strip()) # Finding Product Image image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img') image = image.get('src') + image = image.split('base64,')[-1] # Finding Vendor vendor = soup.find('a', {'class': 'wcfm_dashboard_item_title'}).text @@ -74,6 +71,7 @@ def Robinhood_description_parser(soup): # Finding Vendor Image vendor_image = soup.find('div', {'class': 'wcfmmp_sold_by_container_left'}).find('img') vendor_image = vendor_image.get('src') + vendor_image = vendor_image.split('base64,')[-1] # Finding Category catSpan = soup.find('span', {'class': 'posted_in'}) @@ -168,6 +166,7 @@ def Robinhood_listing_parser(soup): # Finding Product Image product_image = card.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'}) product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] image.append(product_image) info = card.find('div', {'class': 'wcfmmp_sold_by_container'}) @@ -181,6 +180,7 @@ def Robinhood_listing_parser(soup): # Finding Vendor Image vendor_icon = info.find('img', {'class', 'wcfmmp_sold_by_logo'}) vendor_icon = vendor_icon.get('src') + vendor_icon = vendor_icon.split('base64,')[-1] image_vendor.append(vendor_icon) # Finding USD diff --git a/MarketPlaces/Utilities/utilities.py b/MarketPlaces/Utilities/utilities.py index df74e92..fb9b122 100644 --- a/MarketPlaces/Utilities/utilities.py +++ b/MarketPlaces/Utilities/utilities.py @@ -342,7 +342,6 @@ def aes_encryption(item): def aes_decryption(item): to_bytes = bytes(item) - #to_bytes = bytes(item, 'utf-8') decrypted_bytes = decryptCipher.decrypt(to_bytes) @@ -368,29 +367,24 @@ def encrypt_encode_image_to_base64(driver, xpath): return None -def decode_decrypt_image_in_base64(html_content): +def decode_decrypt_image_in_base64(string_image): - soup = BeautifulSoup(html_content, 'html.parser') - - for img_tag in soup.find_all('img'): - - src_attr = img_tag.get('src') + try: - if src_attr and src_attr.startswith('data:image'): + base64_image = bytes(string_image, encoding='utf-8') + encrypted_image = base64.b64decode(base64_image) + decrypted_image = aes_decryption(encrypted_image) - try: + im = Image.open(io.BytesIO(decrypted_image)) + im.show() - string_image = src_attr.split('base64,')[-1] - base64_image = bytes(string_image, encoding='utf-8') - encrypted_image = base64.b64decode(base64_image) - decrypted_image = aes_decryption(encrypted_image) + return decrypted_image - im = Image.open(io.BytesIO(decrypted_image)) - im.show() + except Exception as e: + print(e) + pass - except Exception as e: - print(e) - pass + return None def replace_image_sources(driver, html_content): @@ -408,7 +402,7 @@ def replace_image_sources(driver, html_content): string_image = encrypt_encode_image_to_base64(driver, img_xpath) if string_image: - img_tag.set('src', f'data:image/png;base64;{string_image}') + img_tag.set('src', f'data:image/png;base64,{string_image}') else: img_tag.getparent().remove(img_tag) @@ -420,7 +414,6 @@ def replace_image_sources(driver, html_content): def cleanHTML(driver, html): clean_html = replace_image_sources(driver, html) - # decode_decrypt_image_in_base64(clean_html) formats = [ "jpg", "jpeg", "jfif", "pjpeg", "pjp",