image debug for marketplaces

1 year ago · aac0b87f14
--- a/MarketPlaces/Apocalypse/parser.py
+++ b/MarketPlaces/Apocalypse/parser.py
@ -43,7 +43,7 @@ def apocalypse_description_parser(soup: Tag):

    # Finding Product Image
    image = soup.find('div', {'class': 'col-md-7 text-center'}).find('img')
    image = image.get('src')
    image = image.get('src').split('base64,')[-1]

    product_reviews_list: Tag = content.find("table", {"class": "table product_reviews"}) \
                               .find_all("li")
@ -122,7 +122,7 @@ def apocalypse_listing_parser(soup: Tag):

        # Finding Product Image
        product_image = prod.find('img', {'class': 'customHeight'})
        product_image = product_image.get('src')
        product_image = product_image.get('src').split('base64,')[-1]
        image.append(product_image)
        
        CVE.append("-1")
--- a/MarketPlaces/DB_Connection/db_connection.py
+++ b/MarketPlaces/DB_Connection/db_connection.py
@ -186,7 +186,7 @@ def create_vendor(cur, row, marketId):

        recset = cur.fetchall()

        #aes_decryption(recset[0][5]) trying to decrypt the image
        # decode_decrypt_image_in_base64(recset[0][5])

        if (str(recset[0][3]) != str(row[2] if row[2] != '-1' else None) or # there was a change in the vendor information
            str(recset[0][4]) != str(row[3] if row[3] != '-1' else None) or
@ -266,6 +266,8 @@ def create_items(cur, row, marketId, vendorId):

            recset = cur.fetchall()

            # decode_decrypt_image_in_base64(recset[0][20])

            if (str(recset[0][4]) != str(row[5] if row[5] != '-1' else None) or str(recset[0][5]) != str(row[6] if row[6] != '-1' else None) or
                str(recset[0][6]) != str(row[7] if row[7] != '-1' else None) or str(recset[0][7]) != str(row[8] if row[8] != '-1' else None) or
                str(recset[0][8]) != str(row[9] if row[9] != '-1' else None) or str(recset[0][9]) != str(row[10] if row[10] != '-1' else None) or
--- a/MarketPlaces/DarkBazar/parser.py
+++ b/MarketPlaces/DarkBazar/parser.py
@ -210,6 +210,8 @@ def darkbazar_listing_parser(soup):
        quant = quant.strip()
        qLeft.append(quant)

        # add shipping information


        # Searching for CVE and MS categories
        cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
--- a/MarketPlaces/DarkMatter/parser.py
+++ b/MarketPlaces/DarkMatter/parser.py
@ -42,30 +42,21 @@ def darkmatter_description_parser(soup):
        temp = soup.find('table', {'class', 'vtable'})
        temp = temp.findAll('tr')
        temp2 = temp[3].find('a').text
        name = cleanString(temp2.strip())
        vendor = cleanString(temp2.strip())
    except:
        try:
            temp = soup.find('table', {'class', 'vtable'})
            temp = temp.findAll('tr')
            temp2 = temp[4].find('a').text
            name = cleanString(temp2.strip())
        except:
            print("vendor")
        temp = soup.find('table', {'class', 'vtable'})
        temp = temp.findAll('tr')
        temp2 = temp[4].find('a').text
        vendor = cleanString(temp2.strip())

    # product name
    try:
        name = soup.find('div', {'class', 'title-h2'}).text
        name = cleanString(name.strip())
    except:
        print("name")
    name = soup.find('div', {'class', 'title-h2'}).text
    name = cleanString(name.strip())

    #product description
    try:
        temp = soup.find('pre', {'class', 'description'}).text
        temp = temp.replace('\n', ' ')
        describe = cleanString(temp.strip())
    except:
        print("description")
    temp = soup.find('pre', {'class', 'description'}).text
    temp = temp.replace('\n', ' ')
    describe = cleanString(temp.strip())

    # Finding Product Image
    #image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
@ -81,44 +72,37 @@ def darkmatter_description_parser(soup):
            temp2 = temp[4].find('a').text
            category = cleanString(temp2.strip())
    except:
        try:
            temp = soup.find('table', {'class', 'vtable'})
            temp = temp.findAll('tr')
            temp2 = temp[5].find('th').text
            temp2 = cleanString(temp2.strip)
            if (temp2 == "Category"):
                temp2 = temp[5].find('a').text
                category = cleanString(temp2.strip())
        except:
            print('category')

    # usd
    try:
        temp = soup.find('table', {'class', 'vtable'})
        temp = temp.findAll('tr')
        temp2 = temp[1].find('td').text
        temp2 = temp2.replace(' USD', '')
        USD = cleanString(temp2)
    except:
        print('USD')

    # 15 Product_QuantitySold
    try:
        temp = soup.find('table', {'class', 'vtable'})
        temp = temp.findAll('tr')
        temp2 = temp[5].find('th').text
        temp2 = cleanString(temp2)
        temp3 = temp[6].find('th').text
        temp3 = cleanString(temp3)
        if (temp2 == "Sold"):
            temp2 = temp[5].find('td').text
            sold = cleanString(temp2.strip())
        elif (temp3 == "Sold"):
            temp2 = temp[6].find('td').text
            sold = cleanString(temp2.strip())
    except:
        print('sold')
        temp2 = cleanString(temp2.strip)
        if (temp2 == "Category"):
            temp2 = temp[5].find('a').text
            category = cleanString(temp2.strip())

    # usd
    temp = soup.find('table', {'class', 'vtable'})
    temp = temp.findAll('tr')
    temp2 = temp[1].find('td').text
    temp2 = temp2.replace(' USD', '')
    USD = cleanString(temp2)

    # 15 Product_QuantitySold
    temp = soup.find('table', {'class', 'vtable'})
    temp = temp.findAll('tr')
    temp2 = temp[5].find('th').text
    temp2 = cleanString(temp2)
    temp3 = temp[6].find('th').text
    temp3 = cleanString(temp3)
    if (temp2 == "Sold"):
        temp2 = temp[5].find('td').text
        sold = cleanString(temp2.strip())
    elif (temp3 == "Sold"):
        temp2 = temp[6].find('td').text
        sold = cleanString(temp2.strip())

    image = soup.find('td', {"class": "vtop"}).find('img').get('src')
    image = image.split('base64,')[-1]

    # Populating the final variable (this should be a list with all fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
@ -134,35 +118,8 @@ def darkmatter_description_parser(soup):
 #return: 'row' that contains a variety of lists that each hold info on the listing page
 def darkmatter_listing_parser(soup):

    """
    # Fields to be parsed
    nm = 0                                    # Total_Products (Should be Integer)
    mktName = "DarkMatter"                       # 0 Marketplace_Name
    name = []                                 # 1 Product_Name
    CVE = []                                  # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                                   # 3 Product_MS_Classification (Microsoft Security)
    category = []                             # 4 Product_Category
    describe = []                             # 5 Product_Description
    escrow = []                               # 6 Vendor_Warranty
    views = []                                # 7 Product_Number_Of_Views
    reviews = []                              # 8 Product_Number_Of_Reviews
    addDate = []                              # 9 Product_AddDate
    rating_item = []  # 11 Product_Rating
    lastSeen = []                             # 10 Product_LastViewDate
    BTC = []                                  # 11 Product_BTC_SellingPrice
    USD = []                                  # 12 Product_USD_SellingPrice
    EURO = []                                 # 13 Product_EURO_SellingPrice
    sold = []                                 # 14 Product_QuantitySold
    qLeft =[]                                 # 15 Product_QuantityLeft
    shipFrom = []                             # 16 Product_ShippedFrom
    shipTo = []                               # 17 Product_ShippedTo
    vendor = []                               # 18 Vendor
    rating = []                               # 19 Vendor_Rating
    success = []                              # 20 Vendor_Successful_Transactions
    href = []                                 # 23 Product_Links (Urls)
    """

    # Fields to be parsed
    nm = 0                                    # *Total_Products (Should be Integer)
    mktName = "DarkMatter"                    # 0 *Marketplace_Name
    vendor = []                               # 1 *Vendor y
@ -191,6 +148,7 @@ def darkmatter_listing_parser(soup):
    names = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", "colspan": "3"})
    left = soup.find('div', {"class": "content"}).findAll('table', {"class": "vtable"})
    right = soup.find('div', {"class": "content"}).findAll('td', {"class": "vtop centered"})
    images = soup.find('div', {"class": "content"}).findAll('td', {"class": "vcentered"})

    # vtop centered
    count = 0
@ -199,18 +157,15 @@ def darkmatter_listing_parser(soup):

    for a in names:
        # product name
        try:
            temp = a.find('a').text
            if ("pcs x " in temp):
                index = temp.index("pcs x ")
                result = temp[index + len("pcs x "):]
                name.append(cleanString(result))
            elif("pks x " in temp):
                index = temp.index("pks x ")
                result = temp[index + len("pks x "):]
                name.append(cleanString(temp))
        except Exception as e:
            print("product name", e)
        temp = a.find('a').text
        if ("pcs x " in temp):
            index = temp.index("pcs x ")
            result = temp[index + len("pcs x "):]
            name.append(cleanString(result))
        elif("pks x " in temp):
            index = temp.index("pks x ")
            result = temp[index + len("pks x "):]
            name.append(cleanString(result))

        # Finding Product Image
        #product_image = a.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
@ -225,11 +180,8 @@ def darkmatter_listing_parser(soup):
        length_2 = len(temp2) - 1

        # category
        try:
            temp = temp2[1].find('td').text
            category.append(cleanString(temp.strip()))
        except:
            print('category')
        temp = temp2[1].find('td').text
        category.append(cleanString(temp.strip()))

        describe.append("-1")
        #escrow.append("-1")
@ -238,63 +190,49 @@ def darkmatter_listing_parser(soup):
        addDate.append("-1")
        #lastSeen.append("-1")
        BTC.append("-1")
        image.append("-1")
        image_vendor.append("-1")

        # usd
        try:
            temp3 = right[count*2].find('span').text
            temp = temp3.replace(' USD', '')
            USD.append(cleanString(temp))
        except:
            print('USD')
        temp3 = right[count*2].find('span').text
        temp = temp3.replace(' USD', '')
        USD.append(cleanString(temp))

        EURO.append("-1")

        # 14 Product_QuantitySold
        try:
            temp3 = temp2[length_2].find('th').text
            temp3 = cleanString(temp3)
            if (temp3 == "Sold:"):
                temp = temp2[length_2].find('td').text
                sold.append(cleanString(temp.strip()))
            else:
                sold.append("-1")
        except Exception as e:
        temp3 = temp2[length_2].find('th').text
        temp3 = cleanString(temp3)
        if (temp3 == "Sold:"):
            temp = temp2[length_2].find('td').text
            sold.append(cleanString(temp.strip()))
        else:
            sold.append("-1")
            print('sold', e)

        qLeft.append("-1")
        shipFrom.append("-1")

        # ship to
        try:
            temp3 = temp2[length_2].find('th').text
            temp3 = cleanString(temp3)
            if (temp3 == "Ship To:"):
                temp = temp2[length_2].find('td').text
                shipTo.append(cleanString(temp.strip()))
            else:
                shipTo.append("-1")
        except Exception as e:
        temp3 = temp2[length_2].find('th').text
        temp3 = cleanString(temp3)
        if (temp3 == "Ship To:"):
            temp = temp2[length_2].find('td').text
            shipTo.append(cleanString(temp.strip()))
        else:
            shipTo.append("-1")
            print('shopto')

        # vendor
        try:
            temp = temp2[0].find('a').text
            vendor.append(cleanString(temp.strip()))
        except:
            print('vendor')
        temp = temp2[0].find('a').text
        vendor.append(cleanString(temp.strip()))

        # add product rating (stars)
        rating.append("-1")
        success.append("-1")

        try:
            temp = a.find('a').get('href')
            href.append(temp)
        except:
            print('href')
        temp = a.find('a').get('href')
        href.append(temp)

        image = images[count*2].find('img').get('src')
        image = image.split('base64,')[-1]

        count += 1

--- a/MarketPlaces/DigitalThriftShop/parser.py
+++ b/MarketPlaces/DigitalThriftShop/parser.py
@ -46,7 +46,7 @@ def digitalThriftShop_description_parser(soup: Tag):

    # Finding Product Image
    image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
    image = image.get('src')
    image = image.get('src').split('base64,')[-1]

    product_category = soup.find("span", {"class": "posted_in"}).find("a").text
    category = cleanString(product_category.strip())
@ -115,7 +115,7 @@ def digitalThriftShop_listing_parser(soup: Tag):
    
    for product in products_list:
        nm += 1
        vendor.append("-1")
        vendor.append(mktName)
        rating_vendor.append("-1")
        success.append("-1")
        
@ -124,7 +124,7 @@ def digitalThriftShop_listing_parser(soup: Tag):

        # Finding Product Image
        product_image = product.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
        product_image = product_image.get('src')
        product_image = product_image.get('src').split('base64,')[-1]
        image.append(product_image)
        
        CVE.append("-1")
--- a/MarketPlaces/HiddenMarket/parser.py
+++ b/MarketPlaces/HiddenMarket/parser.py
@ -88,7 +88,7 @@ def hiddenmarket_description_parser(soup):

    # Finding Product Image
    image = soup.find('div', {"class": "thumbnails"}).find('img', {"class": "bigthumbnail"})
    image = image.get('src')
    image = image.get('src').split('base64,')[-1]

    # Finding the Product Category
    category = mb[-4].text
--- a/MarketPlaces/Initialization/markets_mining.py
+++ b/MarketPlaces/Initialization/markets_mining.py
@ -25,6 +25,7 @@ from MarketPlaces.RobinhoodMarket.crawler_selenium import crawler as crawlerRobi
 from MarketPlaces.Nexus.crawler_selenium import crawler as crawlerNexus
 from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
 from MarketPlaces.DarkBazar.crawler_selenium import crawler as crawlerDarkBazar
 from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar

 import configparser
 import os
@ -140,5 +141,7 @@ if __name__ == '__main__':
            crawlerCypher()
        elif mkt == "DarkBazar":
            crawlerDarkBazar()
        elif mkt == "PabloEscobarMarket":
            crawlerPabloEscobar()

    print("\nScraping process completed!")
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@ -22,6 +22,7 @@ from MarketPlaces.RobinhoodMarket.parser import *
 from MarketPlaces.Nexus.parser import *
 from MarketPlaces.MikesGrandStore.parser import *
 from MarketPlaces.DarkBazar.parser import *
 from MarketPlaces.PabloEscobarMarket.parser import *

 from MarketPlaces.Classifier.classify_product import predict

@ -73,9 +74,9 @@ def mergePages(rmm, rec):
        rec[18] = rmm[17]
    if rec[19] == "-1":         # shippedto_item
        rec[19] = rmm[18]
    if rec[20] == "-1":         # image
    if rmm[19] != "-1":         # image
        rec[20] = rmm[19]
    if rec[21] == "-1":         # image_vendor
    if rmm[20] != "-1":         # image_vendor
        rec[21] = rmm[20]

    return rec
@ -155,6 +156,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
            rw = mikesGrandStore_listing_parser(soup)
        elif marketPlace == "DarkBazar":
            rw = darkbazar_listing_parser(soup)
        elif marketPlace == "PabloEscobarMarket":
            rw = pabloescobarmarket_listing_parser(soup)
        else:
            print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
            raise Exception
@ -208,6 +211,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
            rmm = mikesGrandStore_description_parser(soup)
        elif marketPlace == "DarkBazar":
            rmm = darkbazar_description_parser(soup)
        elif marketPlace == "PabloEscobarMarket":
            rmm = pabloescobarmarket_description_parser(soup)
        else:
            print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
            raise Exception
--- a/MarketPlaces/RobinhoodMarket/crawler_selenium.py
+++ b/MarketPlaces/RobinhoodMarket/crawler_selenium.py
@ -162,8 +162,8 @@ def getInterestedLinks():

    # Hacking
    links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/hacking/')
    # # Other Software
    # links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/')
    # Other Software
    links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/')

    return links

@ -191,7 +191,7 @@ def crawlForum(driver):
                savePage(driver, html, link)

                list = productPages(html)
                for item in list:
                for c, item in enumerate(list):

                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
@ -202,11 +202,12 @@ def crawlForum(driver):
                    driver.back()

                    # comment out
                    # break
                    # if c == 4:
                    #     break

                # comment out
                if count == 1:
                    break
                # if count == 1:
                #     break

                # go to next page of market
                try:
--- a/MarketPlaces/RobinhoodMarket/parser.py
+++ b/MarketPlaces/RobinhoodMarket/parser.py
@ -50,20 +50,17 @@ def Robinhood_description_parser(soup):

    # Finding description
    desc = ''
    primary = soup.find('div', {'id': 'primary'})
    product = primary.findAll('div')[1]
    commerce = product.findAll('div', recursive=False)[2]
    descDiv = commerce.findAll('div')[0]
    # descDiv = soup.find('div', {'class': 'woocommerce-Tabs-panel woocommerce-Tabs-panel--description panel entry-content wc-tab'})

    descText = descDiv.findAll('p')
    for para in descText:
        desc = desc + para.text
    describe = desc
    tab = soup.find('div', {"id": "tab-description"})
    for p in tab.findAll('p'):
        desc += p.text
    if desc == '':
        desc = soup.find('div', {"class": "woocommerce-product-details__short-description"}).text
    describe = cleanString(desc.strip())

    # Finding Product Image
    image = soup.find('div', {'class': 'woocommerce-product-gallery__image'}).find('img')
    image = image.get('src')
    image = image.split('base64,')[-1]

    # Finding Vendor
    vendor = soup.find('a', {'class': 'wcfm_dashboard_item_title'}).text
@ -74,6 +71,7 @@ def Robinhood_description_parser(soup):
    # Finding Vendor Image
    vendor_image = soup.find('div', {'class': 'wcfmmp_sold_by_container_left'}).find('img')
    vendor_image = vendor_image.get('src')
    vendor_image = vendor_image.split('base64,')[-1]

    # Finding Category
    catSpan = soup.find('span', {'class': 'posted_in'})
@ -168,6 +166,7 @@ def Robinhood_listing_parser(soup):
        # Finding Product Image
        product_image = card.find('img', {'class': 'attachment-woocommerce_thumbnail size-woocommerce_thumbnail'})
        product_image = product_image.get('src')
        product_image = product_image.split('base64,')[-1]
        image.append(product_image)

        info = card.find('div', {'class': 'wcfmmp_sold_by_container'})
@ -181,6 +180,7 @@ def Robinhood_listing_parser(soup):
        # Finding Vendor Image
        vendor_icon = info.find('img', {'class', 'wcfmmp_sold_by_logo'})
        vendor_icon = vendor_icon.get('src')
        vendor_icon = vendor_icon.split('base64,')[-1]
        image_vendor.append(vendor_icon)

        # Finding USD
--- a/MarketPlaces/Utilities/utilities.py
+++ b/MarketPlaces/Utilities/utilities.py
@ -342,7 +342,6 @@ def aes_encryption(item):
 def aes_decryption(item):

    to_bytes = bytes(item)
    #to_bytes = bytes(item, 'utf-8')

    decrypted_bytes = decryptCipher.decrypt(to_bytes)

@ -368,29 +367,24 @@ def encrypt_encode_image_to_base64(driver, xpath):
    return None


 def decode_decrypt_image_in_base64(html_content):
 def decode_decrypt_image_in_base64(string_image):

    soup = BeautifulSoup(html_content, 'html.parser')

    for img_tag in soup.find_all('img'):

        src_attr = img_tag.get('src')
    try:

        if src_attr and src_attr.startswith('data:image'):
        base64_image = bytes(string_image, encoding='utf-8')
        encrypted_image = base64.b64decode(base64_image)
        decrypted_image = aes_decryption(encrypted_image)

            try:
        im = Image.open(io.BytesIO(decrypted_image))
        im.show()

                string_image = src_attr.split('base64,')[-1]
                base64_image = bytes(string_image, encoding='utf-8')
                encrypted_image = base64.b64decode(base64_image)
                decrypted_image = aes_decryption(encrypted_image)
        return decrypted_image

                im = Image.open(io.BytesIO(decrypted_image))
                im.show()
    except Exception as e:
        print(e)
        pass

            except Exception as e:
                print(e)
                pass
    return None


 def replace_image_sources(driver, html_content):
@ -408,7 +402,7 @@ def replace_image_sources(driver, html_content):
        string_image = encrypt_encode_image_to_base64(driver, img_xpath)

        if string_image:
            img_tag.set('src', f'data:image/png;base64;{string_image}')
            img_tag.set('src', f'data:image/png;base64,{string_image}')
        else:
            img_tag.getparent().remove(img_tag)

@ -420,7 +414,6 @@ def replace_image_sources(driver, html_content):
 def cleanHTML(driver, html):

    clean_html = replace_image_sources(driver, html)
    # decode_decrypt_image_in_base64(clean_html)

    formats = [
        "jpg", "jpeg", "jfif", "pjpeg", "pjp",