fixed change tracking bug and image tracking for AnonMarket

westernmeadow · 1 year ago · branch main · commit d1d53d9b23

5 changed files with 103 additions and 90 deletions
  1. Forums/Utilities/utilities.py (+15 -11)
  2. MarketPlaces/AnonMarket/crawler_selenium.py (+4 -3)
  3. MarketPlaces/AnonMarket/parser.py (+69 -62)
  4. MarketPlaces/DB_Connection/db_connection.py (+2 -2)
  5. MarketPlaces/Utilities/utilities.py (+13 -12)

Forums/Utilities/utilities.py (+15 -11)

@@ -195,12 +195,16 @@ def cleanLink(originalLink):
 def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate):
-    day = time.strftime("%m/%d/%Y")
-    ahora = time.strftime("%I:%M:%S")
     rw = []
+    current_time = datetime.now()
+    day = current_time.strftime("%m/%d/%Y")
     for n in range(nm):
+        current_time += timedelta(seconds=2)
+        ahora = current_time.strftime("%I:%M:%S")
         lne = forum # 0
         lne += ","
         lne += board # 1
@@ -400,19 +404,19 @@ def cleanHTML(driver, html):
     ]
     # remove images
-    clean_html = re.sub(r"<svg.*?>", "", clean_html)
+    clean_html = re.sub(r"<svg[\s\S]*?svg>", "", clean_html)
     for fmat in formats:
-        clean_html = re.sub(r"<object.*" + fmat + ".*?>", "", clean_html)
-    clean_html = re.sub(r"<canvas.*?>", "", clean_html)
+        clean_html = re.sub(r"<object.*" + fmat + "[\s\S]*?object>", "", clean_html)
+    clean_html = re.sub(r"<canvas[\s\S]*?canvas>", "", clean_html)
     # remove JavaScript
-    clean_html = re.sub(r"<script.*?>", "", clean_html)
-    clean_html = re.sub(r"<object.*javascript.*?>", "", clean_html)
-    clean_html = re.sub(r"<aplet.*mayscript?>", "", clean_html)
-    clean_html = re.sub(r"<embed.*scriptable?>", "", clean_html)
+    clean_html = re.sub(r"<script[\s\S]*?script>", "", clean_html)
+    clean_html = re.sub(r"<object.*javascript[\s\S]*?object>", "", clean_html)
+    clean_html = re.sub(r"<aplet.*mayscript[\s\S]*?aplet>", "", clean_html)
+    clean_html = re.sub(r"<embed.*scriptable[\s\S]*?embed>", "", clean_html)
     # image and JavaScript
-    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);", "", clean_html)
+    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html)
     return clean_html
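
Side note on the cleanHTML change above (an illustration, not part of the diff): "." in a Python regex does not match newlines, so the old pattern r"<svg.*?>" stripped only the opening tag and left the element body in the page, while "[\s\S]" matches any character including newlines, so the new pattern consumes the whole element through its closing tag. A minimal sketch of the difference:

    import re

    html = '<p>keep</p><svg width="10">\n<circle r="5"/>\n</svg><p>keep</p>'

    # old pattern: removes only the opening <svg ...> tag
    print(repr(re.sub(r"<svg.*?>", "", html)))

    # new pattern: removes the element through its closing </svg> tag
    print(repr(re.sub(r"<svg[\s\S]*?svg>", "", html)))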


MarketPlaces/AnonMarket/crawler_selenium.py (+4 -3)

@@ -159,9 +159,8 @@ def getNameFromURL(url):
 #as you can see they are categories of products
 def getInterestedLinks():
     links = []
-    # # Software
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/civil_softwares')
-    # # Malware
+    # Malware
     links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/malware')
     # # Bootkits
     # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/bootkits')
@@ -195,6 +194,8 @@ def getInterestedLinks():
     # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/exploit_kit')
     # # Security
     # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security')
+    # # Ransomware
+    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/ransomware')
     return links


MarketPlaces/AnonMarket/parser.py (+69 -62)

@@ -49,26 +49,29 @@ def AnonMarket_description_parser(soup):
     info_div = soup.find('div', {'class': 'information'})
     table = info_div.find('table') if info_div else None
     if table:
-        # Find all table rows
-        rows = table.find_all('tr')
-        # Parse each row to get relevant data
-        data = {}
-        for row in rows:
-            columns = row.find_all('td')
-            if len(columns) == 3:
-                key = columns[0].text.strip()
-                value = columns[2].text.strip()
-                data[key] = value
-        # Extract specific data from the dictionary and assign them to individual variables
-        vendor = data.get('Vendor', '-1')
-        shipFrom = data.get('Location', '-1')
-        shipTo = data.get('Ships to', '-1')
-        category = data.get('Category', '-1')
-        USD = data.get('Price', '-1').split()[0]
-        left = data.get('Stock', '-1')
+        # Find all table rows
+        rows = table.find_all('tr')
+        # Parse each row to get relevant data
+        data = {}
+        for row in rows:
+            columns = row.find_all('td')
+            if len(columns) == 3:
+                key = columns[0].text.strip()
+                value = columns[2].text.strip()
+                data[key] = value
+        # Extract specific data from the dictionary and assign them to individual variables
+        vendor = data.get('Vendor', '-1')
+        shipFrom = data.get('Location', '-1')
+        shipTo = data.get('Ships to', '-1')
+        category = data.get('Category', '-1')
+        USD = data.get('Price', '-1').split()[0]
+        left = data.get('Stock', '-1')
+    # image
+    image = soup.find('img', {"class": "bigthumbnail"})
+    image = image.get('src').split('base64,')[-1]
     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
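
The three added image lines are the "image tracking" half of the commit message: AnonMarket apparently inlines the product thumbnail as a base64 data URI, and splitting the src on 'base64,' keeps only the encoded payload. A minimal sketch of that idea (the sample markup is invented for illustration):

    import base64
    from bs4 import BeautifulSoup

    # hypothetical data-URI thumbnail, mirroring the "bigthumbnail" lookup above
    page = '<img class="bigthumbnail" src="data:image/png;base64,aGVsbG8=">'
    soup = BeautifulSoup(page, 'html.parser')

    image = soup.find('img', {"class": "bigthumbnail"})
    payload = image.get('src').split('base64,')[-1]   # 'aGVsbG8='
    print(base64.b64decode(payload))                  # b'hello'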
@@ -111,51 +114,55 @@ def AnonMarket_listing_parser(soup):
     href = [] # 22 Product_Links
     base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion"
+    cat = soup.find("div", {'class': 'heading'}).text
     products_list = soup.find_all('div', {'class': 'item'})
     nm = 0
     for product in products_list:
-        try:
-            name_of_product = product.find("div", {"class": "title"}).text.strip()
-            name.append(name_of_product)
-            name_of_vendor = product.find("a", {'class': 'seller'}).text.strip()
-            vendor.append(name_of_vendor)
-            cat = soup.find("div", {'class': 'heading'}).text
-            category.append(cat)
-            product_link_element = product.find("div", {"class": "title"}).find_parent('a')
-            if product_link_element:
-                link = product_link_element['href']
-                if "/product/" in link and "/user/" not in link:
-                    full_link = base_url + link
-                    href.append(full_link)
-                else:
-                    href.append("-1")
-            else:
-                href.append("-1")
-            # Append '-1' for unavailable data
-            rating_vendor.append("-1")
-            success.append("-1")
-            CVE.append("-1")
-            MS.append("-1")
-            describe.append("-1")
-            views.append("-1")
-            reviews.append("-1")
-            addDate.append("-1")
-            BTC.append("-1")
-            EURO.append("-1")
-            sold.append("-1")
-            qLeft.append("-1")
-            shipFrom.append("-1")
-            shipTo.append("-1")
-            nm += 1
-        except AttributeError as e:
-            print("I'm somewhere I don't belong. I'm going to leave")
-            continue
+        name_of_product = product.find("div", {"class": "title"}).text.strip()
+        name.append(name_of_product)
+        name_of_vendor = product.find("a", {'class': 'seller'}).text.strip()
+        vendor.append(name_of_vendor)
+        category.append(cat)
+        tbody = product.find('div', {"class": "info"}).find('tbody')
+        # rating_item
+        width = tbody.find('div', {"class": "stars2"}).get('style')
+        rating_item.append(cleanNumbers(width.strip()))
+        tr = tbody.findAll('tr', recursive=False)
+        td = tr[2].findAll('td')
+        # sold
+        sold.append(td[0].text.strip())
+        # reviews
+        reviews.append(td[1].text.strip())
+        product_link_element = product.find("div", {"class": "title"}).find_parent('a')
+        link = product_link_element['href']
+        full_link = base_url + link
+        href.append(full_link)
+        # Append '-1' for unavailable data
+        rating_vendor.append("-1")
+        success.append("-1")
+        CVE.append("-1")
+        MS.append("-1")
+        describe.append("-1")
+        views.append("-1")
+        addDate.append("-1")
+        BTC.append("-1")
+        USD.append("-1")
+        EURO.append("-1")
+        qLeft.append("-1")
+        shipFrom.append("-1")
+        shipTo.append("-1")
+        nm += 1
     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
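
The rewritten loop drops the try/except and starts scraping real values for rating_item, sold, and reviews instead of '-1'. The rating comes from the CSS width of the 'stars2' div; cleanNumbers is the repo's own helper, so the stand-in and sample markup below are assumptions for illustration only:

    import re
    from bs4 import BeautifulSoup

    def clean_numbers_stub(text):
        # hypothetical stand-in for the repo's cleanNumbers helper:
        # pull the numeric part out of a string like 'width: 84%'
        match = re.search(r"\d+(\.\d+)?", text)
        return match.group() if match else "-1"

    item = BeautifulSoup('<div class="stars2" style="width: 84%"></div>', 'html.parser')
    width = item.find('div', {"class": "stars2"}).get('style')
    print(clean_numbers_stub(width.strip()))   # '84'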


MarketPlaces/DB_Connection/db_connection.py (+2 -2)

@@ -266,7 +266,7 @@ def create_items(cur, row, marketId, vendorId):
         recset = cur.fetchall()
-        #decode_decrypt_image_in_base64(recset[0][20])
+        # decode_decrypt_image_in_base64(recset[0][20])
         if (str(recset[0][4]) != str(row[5] if row[5] != '-1' else None) or str(recset[0][5]) != str(row[6] if row[6] != '-1' else None) or
             str(recset[0][6]) != str(row[7] if row[7] != '-1' else None) or str(recset[0][7]) != str(row[8] if row[8] != '-1' else None) or
@@ -332,7 +332,7 @@ def create_items(cur, row, marketId, vendorId):
             'shippedto_item': row[19] if row[19] != '-1' else None,
             'dateinserted_item': row[23],
             'lastseen_item': row[23],
-            'image_item': row[20],
+            'image_item': row[20] if row[20] != '-1' else None,
             'itemId': itemId})
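
The image_item change applies the same '-1' sentinel convention the surrounding columns already use: the parsers emit '-1' for missing fields, and the guard converts that to None so the database driver stores SQL NULL rather than the literal string. A minimal sketch of the convention:

    def sentinel_to_null(value):
        # '-1' is the scrapers' "no data" marker; None maps to SQL NULL
        return value if value != '-1' else None

    print(sentinel_to_null('abc123'))   # 'abc123'
    print(sentinel_to_null('-1'))       # None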


MarketPlaces/Utilities/utilities.py (+13 -12)

@@ -8,8 +8,6 @@ import base64
 import io
 import configparser
 from datetime import datetime, timedelta
-import datetime as fulldatetime
-from bs4 import BeautifulSoup
 from lxml import html as lxml
 from selenium.webdriver.common.by import By
 from Crypto.Cipher import AES
@@ -246,11 +244,14 @@ def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nom
     rw = []
-    day = time.strftime("%m/%d/%Y")
-    ahora = time.strftime("%I:%M:%S")
+    current_time = datetime.now()
+    day = current_time.strftime("%m/%d/%Y")
     for n in range(nm):
+        current_time += timedelta(seconds=2)
+        ahora = current_time.strftime("%I:%M:%S")
         lne = marketplace # 0
         lne += ","
         lne += vendor[n] # 1
@@ -422,19 +423,19 @@ def cleanHTML(driver, html):
     ]
     # remove images
-    clean_html = re.sub(r"<svg.*?>", "", clean_html)
+    clean_html = re.sub(r"<svg[\s\S]*?svg>", "", clean_html)
     for fmat in formats:
-        clean_html = re.sub(r"<object.*" + fmat + ".*?>", "", clean_html)
-    clean_html = re.sub(r"<canvas.*?>", "", clean_html)
+        clean_html = re.sub(r"<object.*" + fmat + "[\s\S]*?object>", "", clean_html)
+    clean_html = re.sub(r"<canvas[\s\S]*?canvas>", "", clean_html)
     # remove JavaScript
-    clean_html = re.sub(r"<script.*?>", "", clean_html)
-    clean_html = re.sub(r"<object.*javascript.*?>", "", clean_html)
-    clean_html = re.sub(r"<aplet.*mayscript?>", "", clean_html)
-    clean_html = re.sub(r"<embed.*scriptable?>", "", clean_html)
+    clean_html = re.sub(r"<script[\s\S]*?script>", "", clean_html)
+    clean_html = re.sub(r"<object.*javascript[\s\S]*?object>", "", clean_html)
+    clean_html = re.sub(r"<aplet.*mayscript[\s\S]*?aplet>", "", clean_html)
+    clean_html = re.sub(r"<embed.*scriptable[\s\S]*?embed>", "", clean_html)
     # image and JavaScript
-    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);", "", clean_html)
+    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html)
     return clean_html
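
The organizeProducts change in this file (and its twin organizeTopics change in Forums/Utilities/utilities.py above) looks like the "change tracking" half of the commit message: the old code stamped every record in a scrape with the same wall-clock time, while the new loop advances a datetime by two seconds per record, so each row gets a distinct, monotonically increasing timestamp. A minimal sketch of the effect, assuming that interpretation:

    from datetime import datetime, timedelta

    nm = 3   # number of records, as in the loop above
    current_time = datetime.now()
    stamps = []
    for n in range(nm):
        current_time += timedelta(seconds=2)   # unique time per record
        stamps.append(current_time.strftime("%I:%M:%S"))

    print(stamps)                    # e.g. ['10:15:02', '10:15:04', '10:15:06']
    assert len(set(stamps)) == nm    # no two records share a timestamp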

