From 115449492e6a8df62cc0539906fffa78f548526e Mon Sep 17 00:00:00 2001
From: Khoi
Date: Thu, 20 Jul 2023 13:50:29 -0700
Subject: [PATCH] Completed and tested the parsers for DigitalThriftShop

---
 .../DigitalThriftShop/crawler_selenium.py     | 26 +++++-----
 MarketPlaces/DigitalThriftShop/parser.py      | 52 ++++++++++++++++---
 MarketPlaces/Initialization/marketsList.txt   |  2 +-
 MarketPlaces/Initialization/prepare_parser.py |  8 ++-
 4 files changed, 67 insertions(+), 21 deletions(-)

diff --git a/MarketPlaces/DigitalThriftShop/crawler_selenium.py b/MarketPlaces/DigitalThriftShop/crawler_selenium.py
index 984e0f5..58c833a 100644
--- a/MarketPlaces/DigitalThriftShop/crawler_selenium.py
+++ b/MarketPlaces/DigitalThriftShop/crawler_selenium.py
@@ -32,19 +32,19 @@ baseURL = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion
 
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
-    # mktName = getMKTName()
-    driver = getAccess()
-
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-
-    # new_parse(forumName, baseURL, False)
+    # opentor()
+    mktName = getMKTName()
+    # driver = getAccess()
+
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
+
+    new_parse(mktName, baseURL, False)
 
 
 # Opens Tor Browser
diff --git a/MarketPlaces/DigitalThriftShop/parser.py b/MarketPlaces/DigitalThriftShop/parser.py
index 8706076..b45c3b9 100644
--- a/MarketPlaces/DigitalThriftShop/parser.py
+++ b/MarketPlaces/DigitalThriftShop/parser.py
@@ -4,7 +4,7 @@ __author__ = 'DarkWeb'
 from MarketPlaces.Utilities.utilities import *
 
 # Here, we are importing BeautifulSoup to search through the HTML tree
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup, ResultSet, Tag
 
 
 #parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
@@ -40,7 +40,7 @@ def digitalThriftShop_description_parser(soup: Tag):
     product_name = soup.find("h1", {"class": "product_title entry-title"}).text
     name = cleanString(product_name.strip())
 
-    product_description = soup.find("id", {"tab-description"}).find("p").text
+    product_description = soup.find("div", {"id": "tab-description"}).find("p").text
     describe = cleanString(product_description.strip())
 
     product_category = soup.find("span", {"class": "posted_in"}).find("a").text
@@ -52,7 +52,7 @@ def digitalThriftShop_description_parser(soup: Tag):
         reviews = product_rating.find("span", {"Class": "rating"}).text
 
     except Exception as e:
-        raise e
+        pass
 
     product_BTC = soup.find("div", {"id": "price-BTC"}).find("span", {"class": "priceinfo cw-noselect"}).text
     BTC = cleanString(product_BTC.strip())
@@ -74,7 +74,7 @@ def digitalThriftShop_description_parser(soup: Tag):
 #stores info it needs in different lists, these lists are returned after being organized
 #@param: soup object looking at html page of listing page
 #return: 'row' that contains a variety of lists that each hold info on the listing page
-def digitalThriftShop_listing_parser(soup):
+def digitalThriftShop_listing_parser(soup: Tag):
 
     # Fields to be parsed
     nm = 0                                    # *Total_Products (Should be Integer)
@@ -100,13 +100,53 @@ def digitalThriftShop_listing_parser(soup: Tag):
     shipTo = []                               # 19 Product_ShippedTo
     href = []                                 # 20 Product_Links
 
+    product_category = soup.find("h1", {"class": "woocommerce-products-header__title page-title"}).text
+
+    products_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-5"}).find_all("li")
+
+    for product in products_list:
+        nm += 1
+        vendor.append("-1")
+        rating_vendor.append("-1")
+        success.append("-1")
+
+        product_name = product.find("h2", {"class": "woocommerce-loop-product__title"}).text
+        name.append(cleanString(product_name.strip()))
+
+        CVE.append("-1")
+        MS.append("-1")
+        category.append(cleanString(product_category.strip()))
+        describe.append("-1")
+        views.append("-1")
+        reviews.append("-1")
+
+        try:
+            product_rating = product.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
+            rating_item.append(cleanString(product_rating.strip()))
+        except Exception:
+            rating_item.append("-1")
+
+        addDate.append("-1")
+        BTC.append("-1")
+
+        product_USD = product.find("span", {"class": "price"}).text
+        USD.append(product_USD.replace("$", "").strip())
+
+        EURO.append("-1")
+        sold.append("-1")
+        qLeft.append("-1")
+        shipFrom.append("-1")
+        shipTo.append("-1")
+
+        product_href = product.find("a", {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}).get("href")
+        href.append(cleanString(product_href.strip()))
 
     # Populate the final variable (this should be a list with all fields scraped)
-    # return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
-    #                         reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
 
 
 #called by the crawler to get description links on a listing page
diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt
index 032ecf3..e7488f3 100644
--- a/MarketPlaces/Initialization/marketsList.txt
+++ b/MarketPlaces/Initialization/marketsList.txt
@@ -1 +1 @@
-M00nkeyMarket
\ No newline at end of file
+DigitalThriftShop
\ No newline at end of file
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index 9fa557c..f2ff464 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -14,6 +14,7 @@ from MarketPlaces.ViceCity.parser import *
 from MarketPlaces.TorBay.parser import *
 from MarketPlaces.M00nkeyMarket.parser import *
 from MarketPlaces.DarkMatter.parser import *
+from MarketPlaces.DigitalThriftShop.parser import *
 
 from MarketPlaces.Classifier.classify_product import predict
 
@@ -160,6 +161,8 @@ def new_parse(marketPlace, url, createLog):
                 rmm = m00nkey_description_parser(soup)
             elif marketPlace == "DarkMatter":
                 rmm = darkmatter_description_parser(soup)
+            elif marketPlace == "DigitalThriftShop":
+                rmm = digitalThriftShop_description_parser(soup)
 
             # key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2]
             key = u"Url:" + os.path.basename(line2).replace(".html", "")
@@ -167,7 +170,8 @@ def new_parse(marketPlace, url, createLog):
 
             # save file address with description record in memory
             detPage[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
-        except :
+        except Exception as e:
+            print(e)
             nError += 1
             print("There was a problem to parse the file " + line2 + " in the Description section!")
 
@@ -221,6 +225,8 @@ def new_parse(marketPlace, url, createLog):
                 rw = m00nkey_listing_parser(soup)
             elif marketPlace == "DarkMatter":
                 rw = darkmatter_listing_parser(soup)
+            elif marketPlace == "DigitalThriftShop":
+                rw = digitalThriftShop_listing_parser(soup)
             else:
                 parseError = True
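
Note: as a quick standalone smoke test of the new listing parser (outside the
full new_parse pipeline), a small driver along the following lines can be
used. The saved-page filename and the "html.parser" backend are illustrative
assumptions, not taken from this patch.

    from bs4 import BeautifulSoup

    from MarketPlaces.DigitalThriftShop.parser import digitalThriftShop_listing_parser

    # Load one DigitalThriftShop listing page previously saved to disk by the
    # crawler (the filename here is hypothetical).
    with open("digitalthriftshop_listing.html", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    # The parser fills its 20 per-product field lists (vendor, name, category,
    # USD price, href, ...) and hands them to organizeProducts, whose result it
    # returns -- one organized record per <li> product on the page.
    row = digitalThriftShop_listing_parser(soup)
    print(row)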