From 74ac3c188c130563159aa5847198af519deba452 Mon Sep 17 00:00:00 2001
From: Joshua
Date: Mon, 9 Oct 2023 16:48:00 -0700
Subject: [PATCH 1/5] set up configs for ares crawler and added relevant ares
 links

---
 MarketPlaces/Ares/crawler_selenium.py         | 168 ++++------
 MarketPlaces/Ares/parser.py                   | 288 +++++++++++-------
 MarketPlaces/Initialization/marketsList.txt   |   2 +-
 MarketPlaces/Initialization/markets_mining.py |   3 +
 MarketPlaces/Initialization/prepare_parser.py |   5 +
 5 files changed, 249 insertions(+), 217 deletions(-)

diff --git a/MarketPlaces/Ares/crawler_selenium.py b/MarketPlaces/Ares/crawler_selenium.py
index fbed2b1..169c06c 100644
--- a/MarketPlaces/Ares/crawler_selenium.py
+++ b/MarketPlaces/Ares/crawler_selenium.py
@@ -1,7 +1,7 @@
 __author__ = 'DarkWeb'
 
 '''
-Ares Market Crawler (Selenium)
+Ares Marketplace Crawler (Selenium)
 '''
 
 from selenium import webdriver
@@ -9,27 +9,28 @@
 from selenium.common.exceptions import NoSuchElementException
 from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
 from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
 from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
-from PIL import Image
+from selenium.webdriver.support.ui import Select
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from PIL import Image
 
 import urllib.parse as urlparse
-import os, time
+import os, re, time
 from datetime import date
 import subprocess
+import configparser
 from bs4 import BeautifulSoup
 from MarketPlaces.Initialization.prepare_parser import new_parse
 from MarketPlaces.Ares.parser import ares_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML
 
 counter = 1
-baseURL = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion'
+baseURL = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/'
 
 
-# Opens Tor Browser, crawls the website
 def startCrawling():
-    marketName = getMarketName()
+    mktName = getMKTName()
     driver = getAccess()
 
     if driver != 'down':
@@ -40,66 +41,18 @@ def startCrawling():
             print(driver.current_url, e)
         closeDriver(driver)
 
-    new_parse(marketName, False)
-
-
-# Login using premade account credentials and do login captcha manually
-def login(driver):
-    #wait for login page
-    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/center")))
-
-    #entering username and password into input boxes
-    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
-    #Username here
-    usernameBox.send_keys('blabri')
-    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
-    #Password here
-    passwordBox.send_keys('fishowal')
-
-    '''
-    # wait for captcha page show up
-    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img")))
-
-    # save captcha to local
-    driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img').screenshot(
-        r'..\Ares\captcha.png')
-
-    # This method will show image in any image viewer
-    im = Image.open(r'..\Ares\captcha.png')
-
-    im.show()
-
-    # wait until input space show up
-    inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/input')
-
-    # ask user input captcha solution in terminal
-    userIn = input("Enter solution: ")
-
-    # send user solution into the input space
-    inputBox.send_keys(userIn)
-
-    # click the verify(submit) button
-    driver.find_element(by=By.XPATH, value="/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[4]/div/div/button").click()
-    '''
-    input("Press ENTER when CAPTCHA is completed\n")
-
-    # wait for listing page show up (This Xpath may need to change based on different seed url)
-    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
-        (By.XPATH, '/html/body/div[7]/div[3]/div[2]/div[1]/div[1]')))
+    new_parse(mktName, baseURL, True)
 
 
 # Returns the name of the website
-def getMarketName():
+def getMKTName():
     name = 'Ares'
     return name
 
 
-# Return the link of the website
+# Return the base link of the website
 def getFixedURL():
-    url = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion'
-
+    url = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/'
     return url
 
 
@@ -109,7 +62,7 @@ def closeDriver(driver):
     # os.system("taskkill /pid " + str(pro.pid))
     # os.system("taskkill /t /f /im tor.exe")
     print('Closing Tor...')
-    driver.quit()
+    driver.close()
     time.sleep(3)
     return
 
@@ -129,8 +82,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    ff_prof.set_preference("network.dns.disablePrefetch", True)
-    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    # ff_prof.set_preference("network.dns.disablePrefetch", True)
+    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -146,12 +99,13 @@ def createFFDriver():
     service = Service(config.get('TOR', 'geckodriver_path'))
 
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
+    driver.maximize_window()
 
     return driver
 
 
+# the driver 'gets' the url, attempting to access the site; if it can't, return 'down'
 def getAccess():
     url = getFixedURL()
     driver = createFFDriver()
@@ -163,7 +117,24 @@ def getAccess():
         return 'down'
 
 
-# Saves the crawled html page
+def login(driver):
+    input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
+
+    # entering username and password into input boxes
+    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
+    # Username here
+    usernameBox.send_keys('blabri')
+    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
+    # Password here
+    passwordBox.send_keys('fishowal')
+
+    input("Press ENTER when BROKEN CIRCLE is pressed\n")
+
+    # wait for the listing page to show up (this XPath may need to change for a different seed url)
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div[6]/div[3]/div[2]/div[1]/div[1]')))
+
+
 def savePage(driver, page, url):
     cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
@@ -172,7 +143,6 @@ def savePage(driver, page, url):
     return
 
 
-# Gets the full path of the page to be saved along with its appropriate file name
 def getFullPathName(url):
     from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
 
@@ -185,7 +155,11 @@ def getFullPathName(url):
     return fullPath
 
 
-# Creates the file name from passed URL
+def getMKTName() -> str:
+    name = 'Ares'
+    return name
+
+
 def getNameFromURL(url):
     global counter
     name = ''.join(e for e in url if e.isalnum())
@@ -198,33 +172,26 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # # Digital - Other
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/91ecd5d0-002c-11ec-9b46-ede2378c5d3c')
-    # # Digital - VPN
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9431b830-002b-11ec-86d6-cdaf65cd97f1')
-    # # Digital - Coding
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/948b7400-a939-11ec-adc5-2f775203130c')
     # Digital - Malware
     links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2')
-    # # Digital - Guides
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662')
-    # # Digital - Hacking
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921')
-    # # Digital - Malware
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/b3258c50-002b-11ec-b658-876d3d651145')
-    # # Digital - Services
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/bae64840-002b-11ec-bbcc-a93431540099')
-    # # Digital - Software
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1')
-    # # Digital - Exploits
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6')
-    # # Digital - Tutorials
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8')
+    # Digital - Guides (Mostly carding, some useful hacking guides. probably don't use)
+    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662')
+    # Digital - Hacking
+    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921')
+    # Digital - Malware2
+    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/b3258c50-002b-11ec-b658-876d3d651145')
+    # Digital - Software (50/50 hacking stuff and cracked software)
+    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1')
+    # Digital - Exploits
+    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6')
+    # Digital - Tutorials (Mostly random stuff, some useful tutorials, probably don't use)
+    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8')
 
     return links
 
 
 def crawlForum(driver):
+    print("Crawling the Ares market")
 
     linksToCrawl = getInterestedLinks()
 
@@ -246,6 +213,7 @@ def crawlForum(driver):
                 savePage(driver, html, link)
 
                 list = productPages(html)
+
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
                     try:
@@ -255,19 +223,15 @@ def crawlForum(driver):
                     savePage(driver, driver.page_source, item)
                     driver.back()
 
-                # comment out
-                break
-
-                # comment out
-                if count == 1:
-                    break
+                # # comment out
+                # break
+                #
+                # # comment out
+                # if count == 1:
+                #     break
 
                 try:
-                    nav = driver.find_element(by=By.XPATH, value=
-                    '/html/body/div[7]/div[3]/div/div[2]/nav')
-                    a = nav.find_element(by=By.LINK_TEXT, value="Next")
-                    link = a.get_attribute('href')
-
+                    link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
                     count += 1
@@ -279,24 +243,23 @@ def crawlForum(driver):
             print(link, e)
             i += 1
 
-    input("Crawling Ares market done sucessfully. Press ENTER to continue\n")
+    print("Crawling the Ares market done.")
 
 
-# Returns 'True' if the link is Topic link
+# Returns 'True' if the link is a Topic link, may need to change for every website
 def isDescriptionLink(url):
     if 'product' in url:
         return True
     return False
 
 
-# Returns True if the link is a listingPage link
+# Returns True if the link is a listingPage link, may need to change for every website
 def isListingLink(url):
     if 'category' in url:
         return True
     return False
 
 
-# calling the parser to define the links
 def productPages(html):
     soup = BeautifulSoup(html, "html.parser")
     return ares_links_parser(soup)
 
@@ -304,4 +267,3 @@ def productPages(html):
 
 def crawler():
     startCrawling()
-    # print("Crawling and Parsing BestCardingWorld .... DONE!")
DONE!") diff --git a/MarketPlaces/Ares/parser.py b/MarketPlaces/Ares/parser.py index 3232b0c..83c986b 100644 --- a/MarketPlaces/Ares/parser.py +++ b/MarketPlaces/Ares/parser.py @@ -7,99 +7,99 @@ from MarketPlaces.Utilities.utilities import * from bs4 import BeautifulSoup -# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) +# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of description page +# return: 'row' that contains a variety of lists that each hold info on the description page def ares_description_parser(soup): - # Fields to be parsed - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name - name = soup.find('div', {'class': "col-md-12 my-2"}).text + divmb = soup.findAll('div', {'class': "mb-1"}) + + name = divmb[0].text name = name.replace('\n', ' ') name = name.replace(",", "") name = name.strip() - bae = soup.find('div', {'class': "col-md-7"}).find('span').find_all('span') - # Finding Vendor - vendor = bae[0].text - vendor = vendor.replace(",", "") - vendor = vendor.replace("...", "") - vendor = vendor.strip() + vendor = divmb[1].find('a').text.strip() # Finding Vendor Rating - full_stars = bae[2].find_all('i', {'class': "fas fa-star"}) - half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"}) - rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0) + temp = soup.find('div', {'class': ""}).text + temp = temp.split('(') + rating = temp[0].replace("Vendor's Review : ", "") + rating = rating.replace("%", "") + rating_vendor = rating.strip() - # Finding Successful Transactions - success = bae[4].text - 
success = success.replace("Sales ", "") - success = success.strip() + # Finding the Product Rating and Number of Product Reviews + reviews = temp[2].replace(" review)", "") + reviews = reviews.strip() - bae = soup.find('span', {'class': "text-left"}).find_all('span') + temp = temp[1].split(")") + rating = temp[1].replace("Product Review : ", "") + rating = rating.replace("%", "") + rating_item = rating.strip() # Finding Prices - USD = bae[0].text - USD = USD.replace("\n$", "") - USD = USD.strip() + USD = soup.find('div', {'class': "h3 text-primary"}).text.strip() + + # Finding the Product Category + pmb = soup.findAll('p', {'class': "mb-1"}) - shipping_info = bae[4].text - if "Digital" not in shipping_info: - shipping_info = shipping_info.split(" ") + category = pmb[-1].text + category = category.replace("Category: ", "").strip() - # Finding Shipment Information (Origin) - shipFrom = shipping_info[0].strip() + # Finding the Product Quantity Available + left = divmb[-1].text + left = left.split(",", 1)[1] + left = left.replace("in stock", "") + left = left.strip() - # Finding Shipment Information (Destination) - shipTo = shipping_info[1].strip() + # Finding Number Sold + sold = divmb[-1].text + sold = sold.split(",", 1)[0] + sold = sold.replace("sold", "") + sold = sold.strip() - bae = soup.find_all('textarea') + # Finding Shipment Information (Origin) + pmb[0].text + shipFrom = shipFrom.replace("Ships from: ", "").strip() + + # Finding Shipment Information (Destination) + pmb[1].text + shipTo = shipTo.replace("Ships to: ", "").strip() # Finding the Product description - describe = bae[0].text - describe = describe.replace("\n", " ") - describe = describe.replace("\r", " ") - describe = describe.strip() - - # Finding the Terms and Conditions - terms = bae[1].text - terms = terms.replace("\n", " ") - terms = terms.strip() - - ''' - # Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1" - ''' + cardbody = soup.findAll('div', {'class': "card-body"}) + describe = cardbody[1].text.strip() + + # Finding Product Image + image = soup.find('div', {'class': 'product-primary'}).find('img') + image = image.get('src') + image = image.split('base64,')[-1] # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -121,69 +121,122 @@ def ares_description_parser(soup): # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row -# This is the method to parse the Listing Pages +# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of listing page +# return: 'row' that contains a variety of lists that each hold info on the listing page def ares_listing_parser(soup): # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "Ares" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor - 
rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft = [] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links - - listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"}) + nm = 0 # *Total_Products (Should be Integer) + mktName = "Ares" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links + + listing = soup.findAll('div', {"id": "itembox"}) # Populating the Number of Products nm = len(listing) for a in listing: bae = a.findAll('a', href=True) + lb = a.findAll('div', {"id": "littlebox"}) # Adding the url to the list of urls link = bae[0].get('href') link = cleanLink(link) href.append(link) - # Finding the Vendor - vendor_name = bae[1].text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - # Finding the Product - product = bae[2].find('img').get('alt') + product = lb[1].find('a').text product = product.replace('\n', ' ') product = product.replace(",", "") + product = product.replace("...", "") product = product.strip() name.append(product) + # Finding Product Image + product_image = a.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) + + # Finding Prices + price = lb[-1].find('div', {"class": "mb-1"}).text + price = price.replace("$","") + price = price.strip() + USD.append(price) + + # Finding the Vendor + vendor_name = lb[-1].find("a").text + vendor_name = vendor_name.replace(",", "") + vendor_name = vendor_name.strip() + vendor.append(vendor_name) + + image_vendor.append("-1") + + # Finding the Category + cat = lb[-1].find("span").text + cat = cat.replace("class:", "") + cat = cat.strip() + category.append(cat) + + span = lb[0].findAll("span") + + # Finding Number of Views + num = span[0].text + num = num.replace("views:", "") + num = num.strip() + sold.append(num) + + # Finding Number Sold + num = span[2].text + num = 
num.replace("Sold:", "") + num = num.strip() + sold.append(num) + + # Finding Quantity Left + quant = span[1].text + quant = quant.replace("stock:", "") + quant = quant.strip() + qLeft.append(quant) + + # add shipping information + ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->") + shipFrom.append(ship[0].replace("Ship from ", "").strip()) + shipTo.append(ship[1].replace("to ", "").strip()) + + # Searching for CVE and MS categories cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: - cveValue="-1" + cveValue = "-1" else: cee = " " for idx in cve: @@ -191,12 +244,12 @@ def ares_listing_parser(soup): cee += " " cee = cee.replace(',', ' ') cee = cee.replace('\n', '') - cveValue=cee + cveValue = cee CVE.append(cveValue) - + ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) if not ms: - MSValue="-1" + MSValue = "-1" else: me = " " for im in ms: @@ -204,24 +257,33 @@ def ares_listing_parser(soup): me += " " me = me.replace(',', ' ') me = me.replace('\n', '') - MSValue=me + MSValue = me MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) +# called by the crawler to get description links on a listing page +# @param: beautifulsoup object that is using the correct html page (listing page) +# return: list of description links from a listing page def ares_links_parser(soup): - # Returning all links that should be visited by the Crawler + href = [] + listing = soup.findAll('div', {"id": "itembox"}) - listing = soup.findAll('a', {"class": "btn btn-success w-100 my-1"}) + # for a in listing: + # bae = a.find('a', {"class": "text-info"}, href=True) + # link = bae['href'] + # href.append(link) for a in listing: + bae = a.findAll('a', href=True) - link = a['href'] + # Adding the url to the list of urls + link = bae[0].get('href') href.append(link) return href \ No newline at end of file diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt index 87f811c..9d7692b 100644 --- a/MarketPlaces/Initialization/marketsList.txt +++ b/MarketPlaces/Initialization/marketsList.txt @@ -1 +1 @@ -ThiefWorld \ No newline at end of file +Ares \ No newline at end of file diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py index f85b46c..e5fe69a 100644 --- a/MarketPlaces/Initialization/markets_mining.py +++ b/MarketPlaces/Initialization/markets_mining.py @@ -14,6 +14,7 @@ from MarketPlaces.M00nkeyMarket.crawler_selenium import crawler as crawlerM00nke from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar +from MarketPlaces.Ares.crawler_selenium import crawler as crawlerAres import configparser import os @@ -107,5 +108,7 @@ if __name__ == '__main__': crawlerCypher() elif mkt == "PabloEscobarMarket": crawlerPabloEscobar() + elif mkt == "Ares": + crawlerAres() print("\nScraping process completed!") diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index e075541..786d58b 100644 --- 
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -15,6 +15,7 @@ from MarketPlaces.M00nkeyMarket.parser import *
 from MarketPlaces.MikesGrandStore.parser import *
 from MarketPlaces.PabloEscobarMarket.parser import *
 from MarketPlaces.CityMarket.parser import *
+from MarketPlaces.Ares.parser import *
 
 from MarketPlaces.Classifier.classify_product import predict
 
@@ -130,6 +131,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
             rw = pabloescobarmarket_listing_parser(soup)
         elif marketPlace == "CityMarket":
             rw = city_listing_parser(soup)
+        elif marketPlace == "Ares":
+            rw = ares_listing_parser(soup)
         else:
             print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
             raise Exception
@@ -164,6 +167,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
             rmm = pabloescobarmarket_description_parser(soup)
         elif marketPlace == "CityMarket":
             rmm = city_description_parser(soup)
+        elif marketPlace == "Ares":
+            rmm = ares_description_parser(soup)
         else:
             print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
             raise Exception

From 88b31a468af0ae337a357b96012ce4befd5465db Mon Sep 17 00:00:00 2001
From: Joshua
Date: Wed, 25 Oct 2023 19:30:16 -0700
Subject: [PATCH 2/5] updated market links

---
 MarketPlaces/Ares/crawler_selenium.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MarketPlaces/Ares/crawler_selenium.py b/MarketPlaces/Ares/crawler_selenium.py
index 169c06c..52986ec 100644
--- a/MarketPlaces/Ares/crawler_selenium.py
+++ b/MarketPlaces/Ares/crawler_selenium.py
@@ -26,7 +26,7 @@ from MarketPlaces.Ares.parser import ares_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML
 
 counter = 1
-baseURL = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/'
+baseURL = 'http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/signin'
 
 
 def startCrawling():
@@ -52,7 +52,7 @@ def getMKTName():
 
 # Return the base link of the website
 def getFixedURL():
-    url = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/'
+    url = 'http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/signin'
     return url
 
 

From 0a9d4c56acf1436d8495c875173c8b983a3ec3af Mon Sep 17 00:00:00 2001
From: Joshua
Date: Thu, 26 Oct 2023 08:34:52 -0700
Subject: [PATCH 3/5] finished crawler

---
 MarketPlaces/Ares/crawler_selenium.py | 25 ++++++++++++-------------
 MarketPlaces/Ares/parser.py           |  2 +-
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/MarketPlaces/Ares/crawler_selenium.py b/MarketPlaces/Ares/crawler_selenium.py
index 52986ec..5120f06 100644
--- a/MarketPlaces/Ares/crawler_selenium.py
+++ b/MarketPlaces/Ares/crawler_selenium.py
@@ -26,7 +26,7 @@ from MarketPlaces.Ares.parser import ares_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML
 
 counter = 1
-baseURL = 'http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/signin'
+baseURL = 'http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/'
 
 
 def startCrawling():
@@ -41,7 +41,7 @@ def startCrawling():
             print(driver.current_url, e)
         closeDriver(driver)
 
-    new_parse(mktName, baseURL, True)
+    # new_parse(mktName, baseURL, True)
 
 
 # Returns the name of the website
@@ -52,7 +52,7 @@ def getMKTName():
 
 # Return the base link of the website
 def getFixedURL():
-    url = 'http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/signin'
+    url = 'http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/'
     return url
 
 
@@ -118,7 +118,7 @@ def getAccess():
 
 
 def login(driver):
-    input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
+    # input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
 
     # entering username and password into input boxes
     usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
@@ -173,19 +173,19 @@ def getInterestedLinks():
     links = []
 
     # Digital - Malware
-    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2')
+    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2')
     # Digital - Guides (Mostly carding, some useful hacking guides. probably don't use)
-    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662')
+    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662')
     # Digital - Hacking
-    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921')
+    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921')
     # Digital - Malware2
-    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/b3258c50-002b-11ec-b658-876d3d651145')
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/b3258c50-002b-11ec-b658-876d3d651145')
     # Digital - Software (50/50 hacking stuff and cracked software)
-    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1')
+    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1')
     # Digital - Exploits
-    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6')
+    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6')
     # Digital - Tutorials (Mostly random stuff, some useful tutorials, probably don't use)
-    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8')
+    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8')
 
     return links
 
@@ -211,7 +211,6 @@ def crawlForum(driver):
                     driver.refresh()
                 html = driver.page_source
                 savePage(driver, html, link)
-
                 list = productPages(html)
 
                 for item in list:
@@ -231,7 +230,7 @@ def crawlForum(driver):
                 #     break
 
                 try:
-                    link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
+                    link = driver.find_element(by=By.XPATH, value='/html/body/div[6]/div[3]/div/div[2]/nav/ul/li[4]/a').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
                     count += 1
diff --git a/MarketPlaces/Ares/parser.py b/MarketPlaces/Ares/parser.py
index 83c986b..28e0850 100644
--- a/MarketPlaces/Ares/parser.py
+++ b/MarketPlaces/Ares/parser.py
@@ -272,7 +272,7 @@ def ares_links_parser(soup):
     # Returning all links that should be visited by the Crawler
     href = []
-    listing = soup.findAll('div', {"id": "itembox"})
+    listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
 
     # for a in listing:
     #     bae = a.find('a', {"class": "text-info"}, href=True)

From 1b2047fe6171cf779af5b64958a393d42c6383d3 Mon Sep 17 00:00:00 2001
From: Joshua
Date: Tue, 31 Oct 2023 09:15:26 -0700
Subject: [PATCH 4/5] finished listing parser

---
 MarketPlaces/Ares/parser.py | 132 +++++++++++++++++++-----------------
 1 file changed, 69 insertions(+), 63 deletions(-)

diff --git a/MarketPlaces/Ares/parser.py b/MarketPlaces/Ares/parser.py
index 28e0850..fe68579 100644
--- a/MarketPlaces/Ares/parser.py
+++ b/MarketPlaces/Ares/parser.py
@@ -37,67 +37,72 @@ def ares_description_parser(soup):
     vendor_image = "-1"  # 20 Vendor_Image
 
     # Finding Product Name
-    divmb = soup.findAll('div', {'class': "mb-1"})
+    divmb = soup.find('div', {'class': "col-md-12 my-2"})
 
-    name = divmb[0].text
+    name = divmb.find('span', {'class': "btn btn-sm btn-outline-dark w-100 active rounded-0"}).text
     name = name.replace('\n', ' ')
     name = name.replace(",", "")
     name = name.strip()
 
+    box = soup.find('div', {'class': "col-md-7"}).find('span', {'class': "text-left text-white"})
+    box = box.findAll('span', {'class': "btn btn-mgray btn-sm w-100 active border-danger"})
+
     # Finding Vendor
-    vendor = divmb[1].find('a').text.strip()
+    vendor = box[0].text.strip()
 
     # Finding Vendor Rating
-    temp = soup.find('div', {'class': ""}).text
-    temp = temp.split('(')
-    rating = temp[0].replace("Vendor's Review : ", "")
-    rating = rating.replace("%", "")
-    rating_vendor = rating.strip()
+    temp = box[2]
+    stars = len(temp.findAll('i', {"class": "fas fa-star"}))
+    half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"}))
+    rating_vendor = str((stars - half_stars)/5)
 
     # Finding the Product Rating and Number of Product Reviews
-    reviews = temp[2].replace(" review)", "")
-    reviews = reviews.strip()
+    # reviews = temp[2].replace(" review)", "")
+    # reviews = reviews.strip()
+    #
+    # temp = temp[1].split(")")
+    # rating = temp[1].replace("Product Review : ", "")
+    # rating = rating.replace("%", "")
+    # rating_item = rating.strip()
+
 
-    temp = temp[1].split(")")
-    rating = temp[1].replace("Product Review : ", "")
-    rating = rating.replace("%", "")
-    rating_item = rating.strip()
+    box2 = soup.find('div', {"class": "col-md-4 text-center"}).find('span', {"class": "text-left"}).findAll('span')
 
     # Finding Prices
-    USD = soup.find('div', {'class': "h3 text-primary"}).text.strip()
+    USD = box2[0].text.strip()
 
     # Finding the Product Category
-    pmb = soup.findAll('p', {'class': "mb-1"})
+    # pmb = soup.findAll('p', {'class': "mb-1"})
 
-    category = pmb[-1].text
-    category = category.replace("Category: ", "").strip()
+    # category = pmb[-1].text
+    # category = category.replace("Category: ", "").strip()
 
     # Finding the Product Quantity Available
-    left = divmb[-1].text
-    left = left.split(",", 1)[1]
-    left = left.replace("in stock", "")
-    left = left.strip()
+    # left = divmb[-1].text
+    # left = left.split(",", 1)[1]
+    # left = left.replace("in stock", "")
+    # left = left.strip()
 
     # Finding Number Sold
-    sold = divmb[-1].text
-    sold = sold.split(",", 1)[0]
-    sold = sold.replace("sold", "")
-    sold = sold.strip()
+    # sold = divmb[-1].text
+    # sold = sold.split(",", 1)[0]
+    # sold = sold.replace("sold", "")
+    # sold = sold.strip()
 
     # Finding Shipment Information (Origin)
-    shipFrom = pmb[0].text
-    shipFrom = shipFrom.replace("Ships from: ", "").strip()
+    # shipFrom = pmb[0].text
+    # shipFrom = shipFrom.replace("Ships from: ", "").strip()
 
     # Finding Shipment Information (Destination)
-    shipTo = pmb[1].text
-    shipTo = shipTo.replace("Ships to: ", "").strip()
+    # shipTo = pmb[1].text
+    # shipTo = shipTo.replace("Ships to: ", "").strip()
 
     # Finding the Product description
-    cardbody = soup.findAll('div', {'class': "card-body"})
-    describe = cardbody[1].text.strip()
+    cardbody = soup.find('div', {"class": 'row-md-12'}).find('div', {"class": 'col-md-4'}).find('textarea', {"class": 'disabled form-control form-control-sm w-100 bg-mgray text-white rounded-0 border-danger'})
+    describe = cardbody.text.strip()
 
     # Finding Product Image
-    image = soup.find('div', {'class': 'product-primary'}).find('img')
+    image = soup.find('div', {"class": 'row-md-12'}).find('div', {"class": 'col-md-4 text-center'}).find('img')
     image = image.get('src')
     image = image.split('base64,')[-1]
@@ -159,22 +164,20 @@ def ares_listing_parser(soup):
     image_vendor = []  # 21 Vendor_Image
     href = []  # 22 Product_Links
 
-    listing = soup.findAll('div', {"id": "itembox"})
+    listing = soup.find('div', {"class": 'card-body text-black text-left bg-dark'}).findAll('div', {"class": 'card mb-4 border-danger rounded-0'})
 
     # Populating the Number of Products
     nm = len(listing)
 
     for a in listing:
-        bae = a.findAll('a', href=True)
-        lb = a.findAll('div', {"id": "littlebox"})
 
         # Adding the url to the list of urls
-        link = bae[0].get('href')
+        link = a.find('a', {'class': "badge badge-danger w-100 text-white"}).get('href')
         link = cleanLink(link)
         href.append(link)
 
-        # Finding the Product
-        product = lb[1].find('a').text
+        # Finding the Product name
+        product = a.find('div', {"class": 'marquee-parent'}).find('div', {"class": "marquee-child"}).text
         product = product.replace('\n', ' ')
         product = product.replace(",", "")
         product = product.replace("...", "")
         product = product.strip()
@@ -188,13 +191,18 @@ def ares_listing_parser(soup):
         image.append(product_image)
 
         # Finding Prices
-        price = lb[-1].find('div', {"class": "mb-1"}).text
+        price = a.findAll('a', {"class": "text-white"})[-1].text
         price = price.replace("$", "")
         price = price.strip()
-        USD.append(price)
+        currency = a.find('div', {"class": 'card-header bg-mgray rounded-0'}).findAll('i')[1]
+        if 'bitcoin' in currency.get('class'):
+            BTC.append(price)
+        elif 'USD' in currency.get('class'):
+            USD.append(price)
+
         # Finding the Vendor
-        vendor_name = lb[-1].find("a").text
+        vendor_name = a.find('a', {"class": 'badge badge-dark w-100 text-white my-1'}).text
         vendor_name = vendor_name.replace(",", "")
         vendor_name = vendor_name.strip()
         vendor.append(vendor_name)
@@ -202,35 +210,33 @@ def ares_listing_parser(soup):
         image_vendor.append("-1")
 
         # Finding the Category
-        cat = lb[-1].find("span").text
-        cat = cat.replace("class:", "")
-        cat = cat.strip()
-        category.append(cat)
-
-        span = lb[0].findAll("span")
+        # cat = lb[-1].find("span").text
+        # cat = cat.replace("class:", "")
+        # cat = cat.strip()
+        # category.append(cat)
 
         # Finding Number of Views
-        num = span[0].text
-        num = num.replace("views:", "")
-        num = num.strip()
-        views.append(num)
+        # num = span[0].text
+        # num = num.replace("views:", "")
+        # num = num.strip()
+        # views.append(num)
 
         # Finding Number Sold
-        num = span[2].text
-        num = num.replace("Sold:", "")
-        num = num.strip()
-        sold.append(num)
+        # num = span[2].text
+        # num = num.replace("Sold:", "")
+        # num = num.strip()
+        # sold.append(num)
 
         # Finding Quantity Left
-        quant = span[1].text
-        quant = quant.replace("stock:", "")
-        quant = quant.strip()
-        qLeft.append(quant)
+        # quant = span[1].text
+        # quant = quant.replace("stock:", "")
+        # quant = quant.strip()
+        # qLeft.append(quant)
 
         # add shipping information
-        ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
-        shipFrom.append(ship[0].replace("Ship from ", "").strip())
-        shipTo.append(ship[1].replace("to ", "").strip())
+        # ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
+        # shipFrom.append(ship[0].replace("Ship from ", "").strip())
+        # shipTo.append(ship[1].replace("to ", "").strip())

From 72da7f2f053055234781687f56618fc34f4219ab Mon Sep 17 00:00:00 2001
From: Joshua
Date: Wed, 1 Nov 2023 15:58:39 -0700
Subject: [PATCH 5/5] finished crawler and parser

---
 MarketPlaces/Ares/crawler_selenium.py |  2 +-
 MarketPlaces/Ares/parser.py           | 32 ++++++++++++++++++++++-----
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/MarketPlaces/Ares/crawler_selenium.py b/MarketPlaces/Ares/crawler_selenium.py
index 5120f06..2e0c677 100644
--- a/MarketPlaces/Ares/crawler_selenium.py
+++ b/MarketPlaces/Ares/crawler_selenium.py
@@ -41,7 +41,7 @@ def startCrawling():
             print(driver.current_url, e)
         closeDriver(driver)
 
-    # new_parse(mktName, baseURL, True)
+    new_parse(mktName, baseURL, True)
 
 
 # Returns the name of the website
diff --git a/MarketPlaces/Ares/parser.py b/MarketPlaces/Ares/parser.py
index fe68579..1803233 100644
--- a/MarketPlaces/Ares/parser.py
+++ b/MarketPlaces/Ares/parser.py
@@ -44,17 +44,18 @@ def ares_description_parser(soup):
     name = name.replace(",", "")
     name = name.strip()
 
-    box = soup.find('div', {'class': "col-md-7"}).find('span', {'class': "text-left text-white"})
+    box = soup.find('div', {'class': "col-md-7"}).find('span')
     box = box.findAll('span', {'class': "btn btn-mgray btn-sm w-100 active border-danger"})
 
     # Finding Vendor
-    vendor = box[0].text.strip()
+    vendor = soup.find('a', {'class': "btn btn-sm btn-mgray my-1 w-100 text-white"}).get('href')
+    vendor = vendor.split('otherParty=')[-1].strip()
 
     # Finding Vendor Rating
-    temp = box[2]
+    temp = box[1]
     stars = len(temp.findAll('i', {"class": "fas fa-star"}))
     half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"}))
-    rating_vendor = str((stars - half_stars)/5)
+    rating_vendor = str(((stars - half_stars)/5) * 100)
 
     # Finding the Product Rating and Number of Product Reviews
     # reviews = temp[2].replace(" review)", "")
@@ -69,7 +70,26 @@ def ares_description_parser(soup):
     box2 = soup.find('div', {"class": "col-md-4 text-center"}).find('span', {"class": "text-left"}).findAll('span')
 
     # Finding Prices
-    USD = box2[0].text.strip()
+    price = box2[0].text
+    price = price.replace("$", "")
+    price = price.replace('\n', '')
+    price = price.strip()
+    currency = box2[2].find('i').get('class')
+    if 'bitcoin' in currency:
+        BTC = price
+    elif 'USD' in currency:
+        USD = price
+    elif 'monero' in currency:
+        USD = str(float(price) * 170.97)
+
+    USD = box2[0].text
+    USD = USD.replace('\n', '')
+    USD = USD.replace('$', '')
+    USD = USD.strip()
+
+    # Finding Vendor Image
+    vendor_image = soup.find('img', {"class": 'img-fluid'}).get('src')
+    vendor_image = vendor_image.split('base64,')[-1]
 
     # Finding the Product Category
     # pmb = soup.findAll('p', {'class': "mb-1"})
@@ -199,6 +219,8 @@ def ares_listing_parser(soup):
             BTC.append(price)
         elif 'USD' in currency.get('class'):
             USD.append(price)
+        elif 'monero' in currency.get('class'):
+            USD.append(str(float(price) * 170.97))
 
         # Finding the Vendor
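
---

Editor's note (not part of the commits above): after selector changes like PATCH 4/5, the Ares parsers are easiest to validate offline against one saved page rather than by re-crawling. Below is a minimal smoke-test sketch, assuming the repository root is on PYTHONPATH and that a page saved by savePage() during a crawl is available as listing.html; the script name and file path are hypothetical, and organizeProducts is assumed to return the flattened rows that prepare_parser consumes.

    # check_ares_parser.py -- hypothetical offline smoke test for the Ares parsers
    from bs4 import BeautifulSoup

    from MarketPlaces.Ares.parser import ares_links_parser, ares_listing_parser

    # listing.html stands in for any listing page saved by savePage() during a crawl
    with open('listing.html', 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # description-page links the crawler would queue next
    links = ares_links_parser(soup)
    print(len(links), 'product links found')

    # organized listing rows, as prepare_parser.parse_listing would receive them
    rows = ares_listing_parser(soup)
    print(rows)

Because the Bootstrap class strings these parsers match on are long and brittle, a run like this fails immediately on one file when the market's markup drifts, instead of partway through a live crawl.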