Browse Source

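Image debug for marketplaces: parsers now keep only the base64 payload of image `src` values (`split('base64,')[-1]`), and the data-URI separator written by `replace_image_sources` is corrected from `base64;` to `base64,`.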

main
westernmeadow 1 year ago
parent
commit
aac0b87f14
11 changed files with 123 additions and 179 deletions
  1. MarketPlaces/Apocalypse/parser.py (+2, -2)
  2. MarketPlaces/DB_Connection/db_connection.py (+3, -1)
  3. MarketPlaces/DarkBazar/parser.py (+2, -0)
  4. MarketPlaces/DarkMatter/parser.py (+72, -134)
  5. MarketPlaces/DigitalThriftShop/parser.py (+3, -3)
  6. MarketPlaces/HiddenMarket/parser.py (+1, -1)
  7. MarketPlaces/Initialization/markets_mining.py (+3, -0)
  8. MarketPlaces/Initialization/prepare_parser.py (+7, -2)
  9. MarketPlaces/RobinhoodMarket/crawler_selenium.py (+7, -6)
  10. MarketPlaces/RobinhoodMarket/parser.py (+10, -10)
  11. MarketPlaces/Utilities/utilities.py (+13, -20)

+ 2
- 2
MarketPlaces/Apocalypse/parser.py View File

@ -43,7 +43,7 @@ def apocalypse_description_parser(soup: Tag):
# Finding Product Image
image = soup.find('div', {'class': 'col-md-7 text-center'}).find('img')
image = image.get('src')
image = image.get('src').split('base64,')[-1]
product_reviews_list: Tag = content.find("table", {"class": "table product_reviews"}) \
.find_all("li")
@ -122,7 +122,7 @@ def apocalypse_listing_parser(soup: Tag):
# Finding Product Image
product_image = prod.find('img', {'class': 'customHeight'})
product_image = product_image.get('src')
product_image = product_image.get('src').split('base64,')[-1]
image.append(product_image)
CVE.append("-1")


+ 3
- 1
MarketPlaces/DB_Connection/db_connection.py View File

@ -186,7 +186,7 @@ def create_vendor(cur, row, marketId):
recset = cur.fetchall()
#aes_decryption(recset[0][5]) trying to decrypt the image
# decode_decrypt_image_in_base64(recset[0][5])
if (str(recset[0][3]) != str(row[2] if row[2] != '-1' else None) or # there was a change in the vendor information
str(recset[0][4]) != str(row[3] if row[3] != '-1' else None) or
@ -266,6 +266,8 @@ def create_items(cur, row, marketId, vendorId):
recset = cur.fetchall()
# decode_decrypt_image_in_base64(recset[0][20])
if (str(recset[0][4]) != str(row[5] if row[5] != '-1' else None) or str(recset[0][5]) != str(row[6] if row[6] != '-1' else None) or
str(recset[0][6]) != str(row[7] if row[7] != '-1' else None) or str(recset[0][7]) != str(row[8] if row[8] != '-1' else None) or
str(recset[0][8]) != str(row[9] if row[9] != '-1' else None) or str(recset[0][9]) != str(row[10] if row[10] != '-1' else None) or


+ 2
- 0
MarketPlaces/DarkBazar/parser.py View File

@ -210,6 +210,8 @@ def darkbazar_listing_parser(soup):
quant = quant.strip()
qLeft.append(quant)
# add shipping information
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))


+ 72
- 134
MarketPlaces/DarkMatter/parser.py View File

@ -42,30 +42,21 @@ def darkmatter_description_parser(soup):
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[3].find('a').text
name = cleanString(temp2.strip())
vendor = cleanString(temp2.strip())
except:
try:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[4].find('a').text
name = cleanString(temp2.strip())
except:
print("vendor")
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[4].find('a').text
vendor = cleanString(temp2.strip())
# product name
try:
name = soup.find('div', {'class', 'title-h2'}).text
name = cleanString(name.strip())
except:
print("name")
name = soup.find('div', {'class', 'title-h2'}).text
name = cleanString(name.strip())
#product description
try:
temp = soup.find('pre', {'class', 'description'}).text
temp = temp.replace('\n', ' ')
describe = cleanString(temp.strip())
except:
print("description")
temp = soup.find('pre', {'class', 'description'}).text
temp = temp.replace('\n', ' ')
describe = cleanString(temp.strip())
# Finding Product Image
#image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
@ -81,44 +72,37 @@ def darkmatter_description_parser(soup):
temp2 = temp[4].find('a').text
category = cleanString(temp2.strip())
except:
try:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[5].find('th').text
temp2 = cleanString(temp2.strip)
if (temp2 == "Category"):
temp2 = temp[5].find('a').text
category = cleanString(temp2.strip())
except:
print('category')
# usd
try:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[1].find('td').text
temp2 = temp2.replace(' USD', '')
USD = cleanString(temp2)
except:
print('USD')
# 15 Product_QuantitySold
try:
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[5].find('th').text
temp2 = cleanString(temp2)
temp3 = temp[6].find('th').text
temp3 = cleanString(temp3)
if (temp2 == "Sold"):
temp2 = temp[5].find('td').text
sold = cleanString(temp2.strip())
elif (temp3 == "Sold"):
temp2 = temp[6].find('td').text
sold = cleanString(temp2.strip())
except:
print('sold')
temp2 = cleanString(temp2.strip)
if (temp2 == "Category"):
temp2 = temp[5].find('a').text
category = cleanString(temp2.strip())
# usd
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[1].find('td').text
temp2 = temp2.replace(' USD', '')
USD = cleanString(temp2)
# 15 Product_QuantitySold
temp = soup.find('table', {'class', 'vtable'})
temp = temp.findAll('tr')
temp2 = temp[5].find('th').text
temp2 = cleanString(temp2)
temp3 = temp[6].find('th').text
temp3 = cleanString(temp3)
if (temp2 == "Sold"):
temp2 = temp[5].find('td').text
sold = cleanString(temp2.strip())
elif (temp3 == "Sold"):
temp2 = temp[6].find('td').text
sold = cleanString(temp2.strip())
image = soup.find('td', {"class": "vtop"}).find('img').get('src')
image = image.split('base64,')[-1]
# Populating the final variable (this should be a list with all fields scraped)
row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
@ -134,35 +118,8 @@ def darkmatter_description_parser(soup):
#return: 'row' that contains a variety of lists that each hold info on the listing page
def darkmatter_listing_parser(soup):
"""
# Fields to be parsed
nm = 0 # Total_Products (Should be Integer)
mktName = "DarkMatter" # 0 Marketplace_Name
name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security)
category = [] # 4 Product_Category
describe = [] # 5 Product_Description
escrow = [] # 6 Vendor_Warranty
views = [] # 7 Product_Number_Of_Views
reviews = [] # 8 Product_Number_Of_Reviews
addDate = [] # 9 Product_AddDate
rating_item = [] # 11 Product_Rating
lastSeen = [] # 10 Product_LastViewDate
BTC = [] # 11 Product_BTC_SellingPrice
USD = [] # 12 Product_USD_SellingPrice
EURO = [] # 13 Product_EURO_SellingPrice
sold = [] # 14 Product_QuantitySold
qLeft =[] # 15 Product_QuantityLeft
shipFrom = [] # 16 Product_ShippedFrom
shipTo = [] # 17 Product_ShippedTo
vendor = [] # 18 Vendor
rating = [] # 19 Vendor_Rating
success = [] # 20 Vendor_Successful_Transactions
href = [] # 23 Product_Links (Urls)
"""
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
mktName = "DarkMatter" # 0 *Marketplace_Name
vendor = [] # 1 *Vendor y
@ -191,6 +148,7 @@ def darkmatter_listing_parser(soup):
names = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", "colspan": "3"})
left = soup.find('div', {"class": "content"}).findAll('table', {"class": "vtable"})
right = soup.find('div', {"class": "content"}).findAll('td', {"class": "vtop centered"})
images = soup.find('div', {"class": "content"}).findAll('td', {"class": "vcentered"})
# vtop centered
count = 0
@ -199,18 +157,15 @@ def darkmatter_listing_parser(soup):
for a in names:
# product name
try:
temp = a.find('a').text
if ("pcs x " in temp):
index = temp.index("pcs x ")
result = temp[index + len("pcs x "):]
name.append(cleanString(result))
elif("pks x " in temp):
index = temp.index("pks x ")
result = temp[index + len("pks x "):]
name.append(cleanString(temp))
except Exception as e:
print("product name", e)
temp = a.find('a').text
if ("pcs x " in temp):
index = temp.index("pcs x ")
result = temp[index + len("pcs x "):]
name.append(cleanString(result))
elif("pks x " in temp):
index = temp.index("pks x ")
result = temp[index + len("pks x "):]
name.append(cleanString(result))
# Finding Product Image
#product_image = a.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
@ -225,11 +180,8 @@ def darkmatter_listing_parser(soup):
length_2 = len(temp2) - 1
# category
try:
temp = temp2[1].find('td').text
category.append(cleanString(temp.strip()))
except:
print('category')
temp = temp2[1].find('td').text
category.append(cleanString(temp.strip()))
describe.append("-1")
#escrow.append("-1")
@ -238,63 +190,49 @@ def darkmatter_listing_parser(soup):
addDate.append("-1")
#lastSeen.append("-1")
BTC.append("-1")
image.append("-1")
image_vendor.append("-1")
# usd
try:
temp3 = right[count*2].find('span').text
temp = temp3.replace(' USD', '')
USD.append(cleanString(temp))
except:
print('USD')
temp3 = right[count*2].find('span').text
temp = temp3.replace(' USD', '')
USD.append(cleanString(temp))
EURO.append("-1")
# 14 Product_QuantitySold
try:
temp3 = temp2[length_2].find('th').text
temp3 = cleanString(temp3)
if (temp3 == "Sold:"):
temp = temp2[length_2].find('td').text
sold.append(cleanString(temp.strip()))
else:
sold.append("-1")
except Exception as e:
temp3 = temp2[length_2].find('th').text
temp3 = cleanString(temp3)
if (temp3 == "Sold:"):
temp = temp2[length_2].find('td').text
sold.append(cleanString(temp.strip()))
else:
sold.append("-1")
print('sold', e)
qLeft.append("-1")
shipFrom.append("-1")
# ship to
try:
temp3 = temp2[length_2].find('th').text
temp3 = cleanString(temp3)
if (temp3 == "Ship To:"):
temp = temp2[length_2].find('td').text
shipTo.append(cleanString(temp.strip()))
else:
shipTo.append("-1")
except Exception as e:
temp3 = temp2[length_2].find('th').text
temp3 = cleanString(temp3)
if (temp3 == "Ship To:"):
temp = temp2[length_2].find('td').text
shipTo.append(cleanString(temp.strip()))
else:
shipTo.append("-1")
print('shopto')
# vendor
try:
temp = temp2[0].find('a').text
vendor.append(cleanString(temp.strip()))
except:
print('vendor')
temp = temp2[0].find('a').text
vendor.append(cleanString(temp.strip()))
# add product rating (stars)
rating.append("-1")
success.append("-1")
try:
temp = a.find('a').get('href')
href.append(temp)
except:
print('href')
temp = a.find('a').get('href')
href.append(temp)
image = images[count*2].find('img').get('src')
image = image.split('base64,')[-1]
count += 1


+ 3
- 3
MarketPlaces/DigitalThriftShop/parser.py View File

@ -46,7 +46,7 @@ def digitalThriftShop_description_parser(soup: Tag):
# Finding Product Image
image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
image = image.get('src')
image = image.get('src').split('base64,')[-1]
product_category = soup.find("span", {"class": "posted_in"}).find("a").text
category = cleanString(product_category.strip())
@ -115,7 +115,7 @@ def digitalThriftShop_listing_parser(soup: Tag):
for product in products_list:
nm += 1
vendor.append("-1")
vendor.append(mktName)
rating_vendor.append("-1")
success.append("-1")
@ -124,7 +124,7 @@ def digitalThriftShop_listing_parser(soup: Tag):
# Finding Product Image
product_image = product.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
product_image = product_image.get('src')
product_image = product_image.get('src').split('base64,')[-1]
image.append(product_image)
CVE.append("-1")


+ 1
- 1
MarketPlaces/HiddenMarket/parser.py View File

@ -88,7 +88,7 @@ def hiddenmarket_description_parser(soup):
# Finding Product Image
image = soup.find('div', {"class": "thumbnails"}).find('img', {"class": "bigthumbnail"})
image = image.get('src')
image = image.get('src').split('base64,')[-1]
# Finding the Product Category
category = mb[-4].text


+ 3
- 0
MarketPlaces/Initialization/markets_mining.py View File

@ -25,6 +25,7 @@ from MarketPlaces.RobinhoodMarket.crawler_selenium import crawler as crawlerRobi
from MarketPlaces.Nexus.crawler_selenium import crawler as crawlerNexus
from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
from MarketPlaces.DarkBazar.crawler_selenium import crawler as crawlerDarkBazar
from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar
import configparser
import os
@ -140,5 +141,7 @@ if __name__ == '__main__':
crawlerCypher()
elif mkt == "DarkBazar":
crawlerDarkBazar()
elif mkt == "PabloEscobarMarket":
crawlerPabloEscobar()
print("\nScraping process completed!")

+ 7
- 2
MarketPlaces/Initialization/prepare_parser.py View File

@ -22,6 +22,7 @@ from MarketPlaces.RobinhoodMarket.parser import *
from MarketPlaces.Nexus.parser import *
from MarketPlaces.MikesGrandStore.parser import *
from MarketPlaces.DarkBazar.parser import *
from MarketPlaces.PabloEscobarMarket.parser import *
from MarketPlaces.Classifier.classify_product import predict
@ -73,9 +74,9 @@ def mergePages(rmm, rec):
rec[18] = rmm[17]
if rec[19] == "-1": # shippedto_item
rec[19] = rmm[18]
if rec[20] == "-1": # image
if rmm[19] != "-1": # image
rec[20] = rmm[19]
if rec[21] == "-1": # image_vendor
if rmm[20] != "-1": # image_vendor
rec[21] = rmm[20]
return rec
@ -155,6 +156,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
rw = mikesGrandStore_listing_parser(soup)
elif marketPlace == "DarkBazar":
rw = darkbazar_listing_parser(soup)
elif marketPlace == "PabloEscobarMarket":
rw = pabloescobarmarket_listing_parser(soup)
else:
print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
raise Exception
@ -208,6 +211,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
rmm = mikesGrandStore_description_parser(soup)
elif marketPlace == "DarkBazar":
rmm = darkbazar_description_parser(soup)
elif marketPlace == "PabloEscobarMarket":
rmm = pabloescobarmarket_description_parser(soup)
else:
print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
raise Exception


+ 7
- 6
MarketPlaces/RobinhoodMarket/crawler_selenium.py View File

@ -162,8 +162,8 @@ def getInterestedLinks():
# Hacking
links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/hacking/')
# # Other Software
# links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/')
# Other Software
links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/')
return links
@ -191,7 +191,7 @@ def crawlForum(driver):
savePage(driver, html, link)
list = productPages(html)
for item in list:
for c, item in enumerate(list):
itemURL = urlparse.urljoin(baseURL, str(item))
try:
@ -202,11 +202,12 @@ def crawlForum(driver):
driver.back()
# comment out
# break
# if c == 4:
# break
# comment out
if count == 1:
break
# if count == 1:
# break
# go to next page of market
try:


+ 10
- 10
MarketPlaces/RobinhoodMarket/parser.py View File

@ -50,20 +50,17 @@ def Robinhood_description_parser(soup):
# Finding description
desc = ''
primary = soup.find('div', {'id': 'primary'})
product = primary.findAll('div')[1]
commerce = product.findAll('div', recursive=False)[2]
descDiv = commerce.findAll('div')[0]
# descDiv = soup.find('div', {'class': 'woocommerce-Tabs-panel woocommerce-Tabs-panel--description panel entry-content wc-tab'})
descText = descDiv.findAll('p')
for para in descText:
desc = desc + para.text
describe = desc
tab = soup.find('div', {"id": "tab-description"})
for p in tab.findAll('p'):
desc += p.text
if desc == '':
desc = soup.find('div', {"class": "woocommerce-product-details__short-description"}).text
describe = cleanString(desc.strip())
# Finding Product Image
image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
image = image.get('src')
image = image.split('base64,')[-1]
# Finding Vendor
vendor = soup.find('a', {'class': 'wcfm_dashboard_item_title'}).text
@ -74,6 +71,7 @@ def Robinhood_description_parser(soup):
# Finding Vendor Image
vendor_image = soup.find('div', {'class': 'wcfmmp_sold_by_container_left'}).find('img')
vendor_image = vendor_image.get('src')
vendor_image = vendor_image.split('base64,')[-1]
# Finding Category
catSpan = soup.find('span', {'class': 'posted_in'})
@ -168,6 +166,7 @@ def Robinhood_listing_parser(soup):
# Finding Product Image
product_image = card.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
product_image = product_image.get('src')
product_image = product_image.split('base64,')[-1]
image.append(product_image)
info = card.find('div', {'class': 'wcfmmp_sold_by_container'})
@ -181,6 +180,7 @@ def Robinhood_listing_parser(soup):
# Finding Vendor Image
vendor_icon = info.find('img', {'class', 'wcfmmp_sold_by_logo'})
vendor_icon = vendor_icon.get('src')
vendor_icon = vendor_icon.split('base64,')[-1]
image_vendor.append(vendor_icon)
# Finding USD


+ 13
- 20
MarketPlaces/Utilities/utilities.py View File

@ -342,7 +342,6 @@ def aes_encryption(item):
def aes_decryption(item):
to_bytes = bytes(item)
#to_bytes = bytes(item, 'utf-8')
decrypted_bytes = decryptCipher.decrypt(to_bytes)
@ -368,29 +367,24 @@ def encrypt_encode_image_to_base64(driver, xpath):
return None
def decode_decrypt_image_in_base64(html_content):
def decode_decrypt_image_in_base64(string_image):
soup = BeautifulSoup(html_content, 'html.parser')
for img_tag in soup.find_all('img'):
src_attr = img_tag.get('src')
try:
if src_attr and src_attr.startswith('data:image'):
base64_image = bytes(string_image, encoding='utf-8')
encrypted_image = base64.b64decode(base64_image)
decrypted_image = aes_decryption(encrypted_image)
try:
im = Image.open(io.BytesIO(decrypted_image))
im.show()
string_image = src_attr.split('base64,')[-1]
base64_image = bytes(string_image, encoding='utf-8')
encrypted_image = base64.b64decode(base64_image)
decrypted_image = aes_decryption(encrypted_image)
return decrypted_image
im = Image.open(io.BytesIO(decrypted_image))
im.show()
except Exception as e:
print(e)
pass
except Exception as e:
print(e)
pass
return None
def replace_image_sources(driver, html_content):
@ -408,7 +402,7 @@ def replace_image_sources(driver, html_content):
string_image = encrypt_encode_image_to_base64(driver, img_xpath)
if string_image:
img_tag.set('src', f'data:image/png;base64;{string_image}')
img_tag.set('src', f'data:image/png;base64,{string_image}')
else:
img_tag.getparent().remove(img_tag)
@ -420,7 +414,6 @@ def replace_image_sources(driver, html_content):
def cleanHTML(driver, html):
clean_html = replace_image_sources(driver, html)
# decode_decrypt_image_in_base64(clean_html)
formats = [
"jpg", "jpeg", "jfif", "pjpeg", "pjp",


Loading…
Cancel
Save