Browse Source

Completed and tested the parsers for DigitalThriftShop

main
Khoi 1 year ago
parent
commit
115449492e
4 changed files with 67 additions and 21 deletions
  1. +13
    -13
      MarketPlaces/DigitalThriftShop/crawler_selenium.py
  2. +46
    -6
      MarketPlaces/DigitalThriftShop/parser.py
  3. +1
    -1
      MarketPlaces/Initialization/marketsList.txt
  4. +7
    -1
      MarketPlaces/Initialization/prepare_parser.py

+ 13
- 13
MarketPlaces/DigitalThriftShop/crawler_selenium.py View File

@ -32,19 +32,19 @@ baseURL = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
opentor()
# mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# new_parse(forumName, baseURL, False)
# opentor()
mktName = getMKTName()
# driver = getAccess()
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
new_parse(mktName, baseURL, False)
# Opens Tor Browser


+ 46
- 6
MarketPlaces/DigitalThriftShop/parser.py View File

@ -4,7 +4,7 @@ __author__ = 'DarkWeb'
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup, ResultSet, Tag
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
@ -40,7 +40,7 @@ def digitalThriftShop_description_parser(soup: Tag):
product_name = soup.find("h1", {"class": "product_title entry-title"}).text
name = cleanString(product_name.strip())
product_description = soup.find("id", {"tab-description"}).find("p").text
product_description = soup.find("div", {"id": "tab-description"}).find("p").text
describe = cleanString(product_description.strip())
product_category = soup.find("span", {"class": "posted_in"}).find("a").text
@ -52,7 +52,7 @@ def digitalThriftShop_description_parser(soup: Tag):
reviews = product_rating.find("span", {"Class": "rating"}).text
except Exception as e:
raise e
pass
product_BTC = soup.find("div", {"id": "price-BTC"}).find("span", {"class": "priceinfo cw-noselect"}).text
BTC = cleanString(product_BTC.strip())
@ -74,7 +74,7 @@ def digitalThriftShop_description_parser(soup: Tag):
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def digitalThriftShop_listing_parser(soup):
def digitalThriftShop_listing_parser(soup: Tag):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
@ -100,13 +100,53 @@ def digitalThriftShop_listing_parser(soup):
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
product_category = soup.find("h1", {"class": "woocommerce-products-header__title page-title"}).text
products_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-5"}).find_all("li")
for product in products_list:
nm += 1
vendor.append("-1")
rating_vendor.append("-1")
success.append("-1")
product_name = product.find("h2", {"class": "woocommerce-loop-product__title"}).text
name.append(cleanString(product_name.strip()))
CVE.append("-1")
MS.append("-1")
category.append(cleanString(product_category.strip()))
describe.append("-1")
views.append("-1")
reviews.append("-1")
try:
product_rating = product.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
rating_item.append(cleanString(product_rating.strip()))
except:
rating_item.append("-1")
addDate.append("-1")
BTC.append("-1")
product_USD = product.find("span", {"class": "price"}).text
USD.append(product_USD.replace("$", "").strip())
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
product_href = product.find("a", {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}).get("href")
href.append(cleanString(product_href.strip()))
# Populate the final variable (this should be a list with all fields scraped)
# return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
# reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
#called by the crawler to get description links on a listing page


+ 1
- 1
MarketPlaces/Initialization/marketsList.txt View File

@ -1 +1 @@
M00nkeyMarket
DigitalThriftShop

+ 7
- 1
MarketPlaces/Initialization/prepare_parser.py View File

@ -14,6 +14,7 @@ from MarketPlaces.ViceCity.parser import *
from MarketPlaces.TorBay.parser import *
from MarketPlaces.M00nkeyMarket.parser import *
from MarketPlaces.DarkMatter.parser import *
from MarketPlaces.DigitalThriftShop.parser import *
from MarketPlaces.Classifier.classify_product import predict
@ -160,6 +161,8 @@ def new_parse(marketPlace, url, createLog):
rmm = m00nkey_description_parser(soup)
elif marketPlace == "DarkMatter":
rmm = darkmatter_description_parser(soup)
elif marketPlace == "DigitalThriftShop":
rmm = digitalThriftShop_description_parser(soup)
# key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2]
key = u"Url:" + os.path.basename(line2).replace(".html", "")
@ -167,7 +170,8 @@ def new_parse(marketPlace, url, createLog):
# save file address with description record in memory
detPage[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
except :
except Exception as e:
raise e
nError += 1
print("There was a problem to parse the file " + line2 + " in the Description section!")
@ -221,6 +225,8 @@ def new_parse(marketPlace, url, createLog):
rw = m00nkey_listing_parser(soup)
elif marketPlace == "DarkMatter":
rw = darkmatter_listing_parser(soup)
elif marketPlace == "DigitalThriftShop":
rw = digitalThriftShop_listing_parser(soup)
else:
parseError = True


Loading…
Cancel
Save