Browse Source

Completed and tested the parsers for DigitalThriftShop

main
Khoi 1 year ago
parent
commit
115449492e
4 changed files with 67 additions and 21 deletions
  1. +13
    -13
      MarketPlaces/DigitalThriftShop/crawler_selenium.py
  2. +46
    -6
      MarketPlaces/DigitalThriftShop/parser.py
  3. +1
    -1
      MarketPlaces/Initialization/marketsList.txt
  4. +7
    -1
      MarketPlaces/Initialization/prepare_parser.py

+ 13
- 13
MarketPlaces/DigitalThriftShop/crawler_selenium.py View File

@ -32,19 +32,19 @@ baseURL = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion
# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
opentor()
# mktName = getMKTName()
driver = getAccess()
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
print(driver.current_url, e)
closetor(driver)
# new_parse(forumName, baseURL, False)
# opentor()
mktName = getMKTName()
# driver = getAccess()
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
new_parse(mktName, baseURL, False)
# Opens Tor Browser


+ 46
- 6
MarketPlaces/DigitalThriftShop/parser.py View File

@ -4,7 +4,7 @@ __author__ = 'DarkWeb'
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup, ResultSet, Tag
#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
@ -40,7 +40,7 @@ def digitalThriftShop_description_parser(soup: Tag):
product_name = soup.find("h1", {"class": "product_title entry-title"}).text
name = cleanString(product_name.strip())
product_description = soup.find("id", {"tab-description"}).find("p").text
product_description = soup.find("div", {"id": "tab-description"}).find("p").text
describe = cleanString(product_description.strip())
product_category = soup.find("span", {"class": "posted_in"}).find("a").text
@ -52,7 +52,7 @@ def digitalThriftShop_description_parser(soup: Tag):
reviews = product_rating.find("span", {"Class": "rating"}).text
except Exception as e:
raise e
pass
product_BTC = soup.find("div", {"id": "price-BTC"}).find("span", {"class": "priceinfo cw-noselect"}).text
BTC = cleanString(product_BTC.strip())
@ -74,7 +74,7 @@ def digitalThriftShop_description_parser(soup: Tag):
#stores info it needs in different lists, these lists are returned after being organized
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def digitalThriftShop_listing_parser(soup):
def digitalThriftShop_listing_parser(soup: Tag):
# Fields to be parsed
nm = 0 # *Total_Products (Should be Integer)
@ -100,13 +100,53 @@ def digitalThriftShop_listing_parser(soup):
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
product_category = soup.find("h1", {"class": "woocommerce-products-header__title page-title"}).text
products_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-5"}).find_all("li")
for product in products_list:
nm += 1
vendor.append("-1")
rating_vendor.append("-1")
success.append("-1")
product_name = product.find("h2", {"class": "woocommerce-loop-product__title"}).text
name.append(cleanString(product_name.strip()))
CVE.append("-1")
MS.append("-1")
category.append(cleanString(product_category.strip()))
describe.append("-1")
views.append("-1")
reviews.append("-1")
try:
product_rating = product.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
rating_item.append(cleanString(product_rating.strip()))
except:
rating_item.append("-1")
addDate.append("-1")
BTC.append("-1")
product_USD = product.find("span", {"class": "price"}).text
USD.append(product_USD.replace("$", "").strip())
EURO.append("-1")
sold.append("-1")
qLeft.append("-1")
shipFrom.append("-1")
shipTo.append("-1")
product_href = product.find("a", {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}).get("href")
href.append(cleanString(product_href.strip()))
# Populate the final variable (this should be a list with all fields scraped)
# return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
# reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
#called by the crawler to get description links on a listing page


+ 1
- 1
MarketPlaces/Initialization/marketsList.txt View File

@ -1 +1 @@
M00nkeyMarket
DigitalThriftShop

+ 7
- 1
MarketPlaces/Initialization/prepare_parser.py View File

@ -14,6 +14,7 @@ from MarketPlaces.ViceCity.parser import *
from MarketPlaces.TorBay.parser import *
from MarketPlaces.M00nkeyMarket.parser import *
from MarketPlaces.DarkMatter.parser import *
from MarketPlaces.DigitalThriftShop.parser import *
from MarketPlaces.Classifier.classify_product import predict
@ -160,6 +161,8 @@ def new_parse(marketPlace, url, createLog):
rmm = m00nkey_description_parser(soup)
elif marketPlace == "DarkMatter":
rmm = darkmatter_description_parser(soup)
elif marketPlace == "DigitalThriftShop":
rmm = digitalThriftShop_description_parser(soup)
# key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2]
key = u"Url:" + os.path.basename(line2).replace(".html", "")
@ -167,7 +170,8 @@ def new_parse(marketPlace, url, createLog):
# save file address with description record in memory
detPage[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
except :
except Exception as e:
raise e
nError += 1
print("There was a problem to parse the file " + line2 + " in the Description section!")
@ -221,6 +225,8 @@ def new_parse(marketPlace, url, createLog):
rw = m00nkey_listing_parser(soup)
elif marketPlace == "DarkMatter":
rw = darkmatter_listing_parser(soup)
elif marketPlace == "DigitalThriftShop":
rw = digitalThriftShop_listing_parser(soup)
else:
parseError = True


Loading…
Cancel
Save