diff --git a/MarketPlaces/Ares/crawler_selenium.py b/MarketPlaces/Ares/crawler_selenium.py
index fbed2b1..2e0c677 100644
--- a/MarketPlaces/Ares/crawler_selenium.py
+++ b/MarketPlaces/Ares/crawler_selenium.py
@@ -1,7 +1,7 @@
 __author__ = 'DarkWeb'
 
 '''
-Ares Market Crawler (Selenium)
+Ares Marketplace Crawler (Selenium)
 '''
 
 from selenium import webdriver
@@ -9,27 +9,28 @@ from selenium.common.exceptions import NoSuchElementException
 from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
 from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
 from selenium.webdriver.firefox.service import Service
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
-from PIL import Image
+from selenium.webdriver.support.ui import Select
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from PIL import Image
 
 import urllib.parse as urlparse
-import os, time
+import os, re, time
 from datetime import date
 import subprocess
+import configparser
 from bs4 import BeautifulSoup
 from MarketPlaces.Initialization.prepare_parser import new_parse
 from MarketPlaces.Ares.parser import ares_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML
 
 counter = 1
-baseURL = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion'
+baseURL = 'http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/'
 
 
-# Opens Tor Browser, crawls the website
 def startCrawling():
-    marketName = getMarketName()
+    mktName = getMKTName()
     driver = getAccess()
 
     if driver != 'down':
@@ -40,66 +41,18 @@ def startCrawling():
         print(driver.current_url, e)
     closeDriver(driver)
 
-    new_parse(marketName, False)
-
-
-# Login using premade account credentials and do login captcha manually
-def login(driver):
-    #wait for login page
-    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/center")))
-
-    #entering username and password into input boxes
-    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
-    #Username here
-    usernameBox.send_keys('blabri')
-    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
-    #Password here
-    passwordBox.send_keys('fishowal')
-
-    '''
-    # wait for captcha page show up
-    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, "/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img")))
-
-    # save captcha to local
-    driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/div/div/img').screenshot(
-        r'..\Ares\captcha.png')
-
-    # This method will show image in any image viewer
-    im = Image.open(r'..\Ares\captcha.png')
-
-    im.show()
-
-    # wait until input space show up
-    inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[3]/input')
-
-    # ask user input captcha solution in terminal
-    userIn = input("Enter solution: ")
-
-    # send user solution into the input space
-    inputBox.send_keys(userIn)
-
-    # click the verify(submit) button
-    driver.find_element(by=By.XPATH, value="/html/body/div[3]/div[3]/div[2]/div/div[2]/div/form/div/div[4]/div/div/button").click()
-    '''
-    input("Press ENTER when CAPTCHA is completed\n")
-
-    # wait for listing page show up (This Xpath may need to change based on different seed url)
-    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
-        (By.XPATH, '/html/body/div[7]/div[3]/div[2]/div[1]/div[1]')))
+    new_parse(mktName, baseURL, True)
 
 
 # Returns the name of the website
-def getMarketName():
+def getMKTName():
     name = 'Ares'
     return name
 
 
-# Return the link of the website
+# Return the base link of the website
 def getFixedURL():
-    url = 'http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion'
-
+    url = 'http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/'
     return url
 
 
@@ -109,7 +62,7 @@ def closeDriver(driver):
     # os.system("taskkill /pid " + str(pro.pid))
     # os.system("taskkill /t /f /im tor.exe")
     print('Closing Tor...')
-    driver.quit()
+    driver.close()
     time.sleep(3)
     return
 
 
@@ -129,8 +82,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    ff_prof.set_preference("network.dns.disablePrefetch", True)
-    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    # ff_prof.set_preference("network.dns.disablePrefetch", True)
+    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -146,12 +99,13 @@
     service = Service(config.get('TOR', 'geckodriver_path'))
 
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
-
+    driver.maximize_window()
 
     return driver
 
 
+# the driver 'gets' the url, attempting to access the site; returns 'down' if it can't
 def getAccess():
     url = getFixedURL()
     driver = createFFDriver()
@@ -163,7 +117,24 @@
         return 'down'
 
 
-# Saves the crawled html page
+def login(driver):
+    # input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
+
+    # entering username and password into input boxes
+    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
+    # Username here
+    usernameBox.send_keys('blabri')
+    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
+    # Password here
+    passwordBox.send_keys('fishowal')
+
+    input("Press ENTER when BROKEN CIRCLE is pressed\n")
+
+    # wait for the listing page to show up (this XPath may need to change for a different seed url)
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div[6]/div[3]/div[2]/div[1]/div[1]')))
+
+
 def savePage(driver, page, url):
     cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
@@ -172,7 +143,6 @@
     return
 
 
-# Gets the full path of the page to be saved along with its appropriate file name
 def getFullPathName(url):
     from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
@@ -185,7 +155,11 @@
     return fullPath
 
 
-# Creates the file name from passed URL
+def getMKTName() -> str:
+    name = 'Ares'
+    return name
+
+
 def getNameFromURL(url):
     global counter
     name = ''.join(e for e in url if e.isalnum())
@@ -198,33 +172,26 @@
 def getInterestedLinks():
     links = []
 
-    # # Digital - Other
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/91ecd5d0-002c-11ec-9b46-ede2378c5d3c')
-    # # Digital - VPN
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9431b830-002b-11ec-86d6-cdaf65cd97f1')
-    # # Digital - Coding
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/948b7400-a939-11ec-adc5-2f775203130c')
     # Digital - Malware
-    links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2')
-    # # Digital - Guides
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662')
-    # # Digital - Hacking
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921')
-    # # Digital - Malware
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/b3258c50-002b-11ec-b658-876d3d651145')
-    # # Digital - Services
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/bae64840-002b-11ec-bbcc-a93431540099')
-    # # Digital - Software
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1')
-    # # Digital - Exploits
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6')
-    # # Digital - Tutorials
-    # links.append('http://sn2sfdqay6cxztroslaxa36covrhoowe6a5xug6wlm6ek7nmeiujgvad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8')
+    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2')
+    # Digital - Guides (Mostly carding, some useful hacking guides. probably don't use)
+    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662')
+    # Digital - Hacking
+    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921')
+    # Digital - Malware2
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/b3258c50-002b-11ec-b658-876d3d651145')
+    # Digital - Software (50/50 hacking stuff and cracked software)
+    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1')
+    # Digital - Exploits
+    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6')
+    # Digital - Tutorials (Mostly random stuff, some useful tutorials, probably don't use)
+    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8')
 
     return links
 
 
 def crawlForum(driver):
+    print("Crawling the Ares market")
 
     linksToCrawl = getInterestedLinks()
 
@@ -244,8 +211,8 @@
                driver.refresh()
            html = driver.page_source
            savePage(driver, html, link)
-
            list = productPages(html)
+
            for item in list:
                itemURL = urlparse.urljoin(baseURL, str(item))
                try:
@@ -255,19 +222,15 @@
                    savePage(driver, driver.page_source, item)
                    driver.back()
 
                    # comment out
-                    break
-
-                # comment out
-                if count == 1:
-                    break
+                    # break
+                #
+                # # comment out
+                # if count == 1:
+                #     break
 
                try:
-                    nav = driver.find_element(by=By.XPATH, value=
-                        '/html/body/div[7]/div[3]/div/div[2]/nav')
-                    a = nav.find_element(by=By.LINK_TEXT, value="Next")
-                    link = a.get_attribute('href')
-
+                    link = driver.find_element(by=By.XPATH, value='/html/body/div[6]/div[3]/div/div[2]/nav/ul/li[4]/a').get_attribute('href')
                    if link == "":
                        raise NoSuchElementException
                    count += 1
@@ -279,24 +242,23 @@
            print(link, e)
        i += 1
 
-    input("Crawling Ares market done sucessfully. Press ENTER to continue\n")
+    print("Crawling the Ares market done.")
 
 
-# Returns 'True' if the link is Topic link
+# Returns 'True' if the link is a Topic link; may need to change for every website
 def isDescriptionLink(url):
     if 'product' in url:
         return True
     return False
 
 
-# Returns True if the link is a listingPage link
+# Returns True if the link is a listingPage link; may need to change for every website
 def isListingLink(url):
     if 'category' in url:
         return True
     return False
 
 
-# calling the parser to define the links
 def productPages(html):
     soup = BeautifulSoup(html, "html.parser")
     return ares_links_parser(soup)
@@ -304,4 +266,3 @@
 
 def crawler():
     startCrawling()
-    # print("Crawling and Parsing BestCardingWorld .... DONE!")
diff --git a/MarketPlaces/Ares/parser.py b/MarketPlaces/Ares/parser.py
index 3232b0c..1803233 100644
--- a/MarketPlaces/Ares/parser.py
+++ b/MarketPlaces/Ares/parser.py
@@ -7,99 +7,124 @@ from MarketPlaces.Utilities.utilities import *
 from bs4 import BeautifulSoup
 
 
-# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
+# parses description pages: takes the html of a description page as a soup object and extracts the info it needs
+# the extracted info is stored in individual fields, which are organized and returned at the end
+# @param: soup object of a description page's html
+# return: 'row' containing every field scraped from the description page
 def ares_description_parser(soup):
 
-    # Fields to be parsed
-    vendor = "-1"  # 0 *Vendor_Name
-    success = "-1"  # 1 Vendor_Successful_Transactions
-    rating_vendor = "-1"  # 2 Vendor_Rating
-    name = "-1"  # 3 *Product_Name
-    describe = "-1"  # 4 Product_Description
-    CVE = "-1"  # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
-    MS = "-1"  # 6 Product_MS_Classification (Microsoft Security)
-    category = "-1"  # 7 Product_Category
-    views = "-1"  # 8 Product_Number_Of_Views
-    reviews = "-1"  # 9 Product_Number_Of_Reviews
-    rating_item = "-1"  # 10 Product_Rating
-    addDate = "-1"  # 11 Product_AddedDate
-    BTC = "-1"  # 12 Product_BTC_SellingPrice
-    USD = "-1"  # 13 Product_USD_SellingPrice
-    EURO = "-1"  # 14 Product_EURO_SellingPrice
-    sold = "-1"  # 15 Product_QuantitySold
-    left = "-1"  # 16 Product_QuantityLeft
-    shipFrom = "-1"  # 17 Product_ShippedFrom
-    shipTo = "-1"  # 18 Product_ShippedTo
+    vendor = "-1"  # 0 *Vendor_Name
+    success = "-1"  # 1 Vendor_Successful_Transactions
+    rating_vendor = "-1"  # 2 Vendor_Rating
+    name = "-1"  # 3 *Product_Name
+    describe = "-1"  # 4 Product_Description
+    CVE = "-1"  # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+    MS = "-1"  # 6 Product_MS_Classification (Microsoft Security)
+    category = "-1"  # 7 Product_Category
+    views = "-1"  # 8 Product_Number_Of_Views
+    reviews = "-1"  # 9 Product_Number_Of_Reviews
+    rating_item = "-1"  # 10 Product_Rating
+    addDate = "-1"  # 11 Product_AddedDate
+    BTC = "-1"  # 12 Product_BTC_SellingPrice
+    USD = "-1"  # 13 Product_USD_SellingPrice
+    EURO = "-1"  # 14 Product_EURO_SellingPrice
+    sold = "-1"  # 15 Product_QuantitySold
+    left = "-1"  # 16 Product_QuantityLeft
+    shipFrom = "-1"  # 17 Product_ShippedFrom
+    shipTo = "-1"  # 18 Product_ShippedTo
+    image = "-1"  # 19 Product_Image
+    vendor_image = "-1"  # 20 Vendor_Image
 
     # Finding Product Name
-    name = soup.find('div', {'class': "col-md-12 my-2"}).text
+    divmb = soup.find('div', {'class': "col-md-12 my-2"})
+
+    name = divmb.find('span', {'class': "btn btn-sm btn-outline-dark w-100 active rounded-0"}).text
     name = name.replace('\n', ' ')
     name = name.replace(",", "")
     name = name.strip()
 
-    bae = soup.find('div', {'class': "col-md-7"}).find('span').find_all('span')
+    box = soup.find('div', {'class': "col-md-7"}).find('span')
+    box = box.findAll('span', {'class': "btn btn-mgray btn-sm w-100 active border-danger"})
 
     # Finding Vendor
-    vendor = bae[0].text
-    vendor = vendor.replace(",", "")
-    vendor = vendor.replace("...", "")
-    vendor = vendor.strip()
+    vendor = soup.find('a', {'class': "btn btn-sm btn-mgray my-1 w-100 text-white"}).get('href')
+    vendor = vendor.split('otherParty=')[-1].strip()
 
     # Finding Vendor Rating
-    full_stars = bae[2].find_all('i', {'class': "fas fa-star"})
-    half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"})
-    rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0)
+    temp = box[1]
+    stars = len(temp.findAll('i', {"class": "fas fa-star"}))
+    half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"}))
+    rating_vendor = str(((stars - half_stars)/5) * 100)
+
+    # Finding the Product Rating and Number of Product Reviews
+    # reviews = temp[2].replace(" review)", "")
+    # reviews = reviews.strip()
+    #
+    # temp = temp[1].split(")")
+    # rating = temp[1].replace("Product Review : ", "")
+    # rating = rating.replace("%", "")
+    # rating_item = rating.strip()
 
-    # Finding Successful Transactions
-    success = bae[4].text
-    success = success.replace("Sales ", "")
-    success = success.strip()
-
-    bae = soup.find('span', {'class': "text-left"}).find_all('span')
+    box2 = soup.find('div', {"class": "col-md-4 text-center"}).find('span', {"class": "text-left"}).findAll('span')
 
     # Finding Prices
-    USD = bae[0].text
-    USD = USD.replace("\n$", "")
+    price = box2[0].text
+    price = price.replace("$", "")
+    price = price.replace('\n', '')
+    price = price.strip()
+    currency = box2[2].find('i').get('class')
+    if 'bitcoin' in currency:
+        BTC = price
+    elif 'USD' in currency:
+        USD = price
+    elif 'monero' in currency:
+        USD = (str(int(price) * 170.97))
+
+    USD = box2[0].text
+    USD = USD.replace('\n', '')
+    USD = USD.replace('$', '')
     USD = USD.strip()
 
-    shipping_info = bae[4].text
-    if "Digital" not in shipping_info:
-        shipping_info = shipping_info.split(" ")
+    # Finding Vendor Image
+    vendor_image = soup.find('img', {"class": 'img-fluid'}).get('src')
+    vendor_image = vendor_image.split('base64,')[-1]
+
+    # Finding the Product Category
+    # pmb = soup.findAll('p', {'class': "mb-1"})
+
+    # category = pmb[-1].text
+    # category = category.replace("Category: ", "").strip()
 
-        # Finding Shipment Information (Origin)
-        shipFrom = shipping_info[0].strip()
+    # Finding the Product Quantity Available
+    # left = divmb[-1].text
+    # left = left.split(",", 1)[1]
+    # left = left.replace("in stock", "")
+    # left = left.strip()
 
-        # Finding Shipment Information (Destination)
-        shipTo = shipping_info[1].strip()
+    # Finding Number Sold
+    # sold = divmb[-1].text
+    # sold = sold.split(",", 1)[0]
+    # sold = sold.replace("sold", "")
+    # sold = sold.strip()
 
-    bae = soup.find_all('textarea')
+    # Finding Shipment Information (Origin)
+    # pmb[0].text
+    # shipFrom = shipFrom.replace("Ships from: ", "").strip()
+
+    # Finding Shipment Information (Destination)
+    # pmb[1].text
+    # shipTo = shipTo.replace("Ships to: ", "").strip()
 
     # Finding the Product description
-    describe = bae[0].text
-    describe = describe.replace("\n", " ")
-    describe = describe.replace("\r", " ")
-    describe = describe.strip()
-
-    # Finding the Terms and Conditions
-    terms = bae[1].text
-    terms = terms.replace("\n", " ")
-    terms = terms.strip()
-
-    '''
-    # Finding the Number of Product Reviews
-    tag = soup.findAll(text=re.compile('Reviews'))
-    for index in tag:
-        reviews = index
-        par = reviews.find('(')
-        if par >=0:
-            reviews = reviews.replace("Reviews (","")
-            reviews = reviews.replace(")","")
-            reviews = reviews.split(",")
-            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
-        else :
-            review = "-1"
-    '''
+    cardbody = soup.find('div', {"class": 'row-md-12'}).find('div', {"class": 'col-md-4'}).find('textarea', {"class": 'disabled form-control form-control-sm w-100 bg-mgray text-white rounded-0 border-danger'})
+    describe = cardbody.text.strip()
+
+    # Finding Product Image
+    image = soup.find('div', {"class": 'row-md-12'}).find('div', {"class": 'col-md-4 text-center'}).find('img')
+    image = image.get('src')
+    image = image.split('base64,')[-1]
 
     # Searching for CVE and MS categories
     cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
@@ -121,69 +146,125 @@
     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
-           BTC, USD, EURO, sold, left, shipFrom, shipTo)
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
 
     # Sending the results
     return row
 
 
-# This is the method to parse the Listing Pages
+# parses listing pages: takes the html of a listing page as a soup object and extracts the info it needs
+# the extracted info is stored in different lists, which are organized and returned at the end
+# @param: soup object of a listing page's html
+# return: 'row' that contains the lists of info scraped from the listing page
 def ares_listing_parser(soup):
 
     # Fields to be parsed
-    nm = 0  # *Total_Products (Should be Integer)
-    mktName = "Ares"  # 0 *Marketplace_Name
-    vendor = []  # 1 *Vendor
-    rating_vendor = []  # 2 Vendor_Rating
-    success = []  # 3 Vendor_Successful_Transactions
-    name = []  # 4 *Product_Name
-    CVE = []  # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
-    MS = []  # 6 Product_MS_Classification (Microsoft Security)
-    category = []  # 7 Product_Category
-    describe = []  # 8 Product_Description
-    views = []  # 9 Product_Number_Of_Views
-    reviews = []  # 10 Product_Number_Of_Reviews
-    rating_item = []  # 11 Product_Rating
-    addDate = []  # 12 Product_AddDate
-    BTC = []  # 13 Product_BTC_SellingPrice
-    USD = []  # 14 Product_USD_SellingPrice
-    EURO = []  # 15 Product_EURO_SellingPrice
-    sold = []  # 16 Product_QuantitySold
-    qLeft = []  # 17 Product_QuantityLeft
-    shipFrom = []  # 18 Product_ShippedFrom
-    shipTo = []  # 19 Product_ShippedTo
-    href = []  # 20 Product_Links
-
-    listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
+    nm = 0  # *Total_Products (Should be Integer)
+    mktName = "Ares"  # 0 *Marketplace_Name
+    vendor = []  # 1 *Vendor y
+    rating_vendor = []  # 2 Vendor_Rating
+    success = []  # 3 Vendor_Successful_Transactions
+    name = []  # 4 *Product_Name y
+    CVE = []  # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) don't worry about this
+    MS = []  # 6 Product_MS_Classification (Microsoft Security) don't worry about this
+    category = []  # 7 Product_Category y
+    describe = []  # 8 Product_Description
+    views = []  # 9 Product_Number_Of_Views
+    reviews = []  # 10 Product_Number_Of_Reviews
+    rating_item = []  # 11 Product_Rating
+    addDate = []  # 12 Product_AddDate
+    BTC = []  # 13 Product_BTC_SellingPrice
+    USD = []  # 14 Product_USD_SellingPrice y
+    EURO = []  # 15 Product_EURO_SellingPrice
+    sold = []  # 16 Product_QuantitySold
+    qLeft = []  # 17 Product_QuantityLeft
+    shipFrom = []  # 18 Product_ShippedFrom
+    shipTo = []  # 19 Product_ShippedTo
+    image = []  # 20 Product_Image
+    image_vendor = []  # 21 Vendor_Image
+    href = []  # 22 Product_Links
+
+    listing = soup.find('div', {"class": 'card-body text-black text-left bg-dark'}).findAll('div', {"class": 'card mb-4 border-danger rounded-0'})
 
     # Populating the Number of Products
     nm = len(listing)
 
     for a in listing:
-        bae = a.findAll('a', href=True)
 
         # Adding the url to the list of urls
-        link = bae[0].get('href')
+        link = a.find('a', {'class': "badge badge-danger w-100 text-white"}).get('href')
         link = cleanLink(link)
         href.append(link)
 
+        # Finding the Product name
+        product = a.find('div', {"class": 'marquee-parent'}).find('div', {"class": "marquee-child"}).text
+        product = product.replace('\n', ' ')
+        product = product.replace(",", "")
+        product = product.replace("...", "")
+        product = product.strip()
+        name.append(product)
+
+        # Finding Product Image
+        product_image = a.find('img')
+        product_image = product_image.get('src')
+        product_image = product_image.split('base64,')[-1]
+        image.append(product_image)
+
+        # Finding Prices
+        price = a.findAll('a', {"class": "text-white"})[-1].text
+        price = price.replace("$","")
+        price = price.strip()
+        currency = a.find('div', {"class": 'card-header bg-mgray rounded-0'}).findAll('i')[1]
+        if 'bitcoin' in currency.get('class'):
+            BTC.append(price)
+        elif 'USD' in currency.get('class'):
+            USD.append(price)
+        elif 'monero' in currency.get('class'):
+            USD.append(str(int(price) * 170.97))
+
         # Finding the Vendor
-        vendor_name = bae[1].text
+        vendor_name = a.find('a', {"class": 'badge badge-dark w-100 text-white my-1'}).text
         vendor_name = vendor_name.replace(",", "")
         vendor_name = vendor_name.strip()
         vendor.append(vendor_name)
 
-        # Finding the Product
-        product = bae[2].find('img').get('alt')
-        product = product.replace('\n', ' ')
-        product = product.replace(",", "")
-        product = product.strip()
-        name.append(product)
+        image_vendor.append("-1")
+
+        # Finding the Category
+        # cat = lb[-1].find("span").text
+        # cat = cat.replace("class:", "")
+        # cat = cat.strip()
+        # category.append(cat)
+
+        # Finding Number of Views
+        # num = span[0].text
+        # num = num.replace("views:", "")
+        # num = num.strip()
+        # sold.append(num)
+
+        # Finding Number Sold
+        # num = span[2].text
+        # num = num.replace("Sold:", "")
+        # num = num.strip()
+        # sold.append(num)
+
+        # Finding Quantity Left
+        # quant = span[1].text
+        # quant = quant.replace("stock:", "")
+        # quant = quant.strip()
+        # qLeft.append(quant)
+
+        # add shipping information
+        # ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
+        # shipFrom.append(ship[0].replace("Ship from ", "").strip())
+        # shipTo.append(ship[1].replace("to ", "").strip())
+
         # Searching for CVE and MS categories
         cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
         if not cve:
-            cveValue="-1"
+            cveValue = "-1"
         else:
             cee = " "
             for idx in cve:
@@ -191,12 +272,12 @@
                 cee += " "
             cee = cee.replace(',', ' ')
             cee = cee.replace('\n', '')
-            cveValue=cee
+            cveValue = cee
         CVE.append(cveValue)
-
+
         ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
         if not ms:
-            MSValue="-1"
+            MSValue = "-1"
         else:
             me = " "
             for im in ms:
@@ -204,24 +285,33 @@
                 me += " "
             me = me.replace(',', ' ')
             me = me.replace('\n', '')
-            MSValue=me
+            MSValue = me
         MS.append(MSValue)
 
     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
-                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
 
 
+# called by the crawler to get description links on a listing page
+# @param: BeautifulSoup object of the correct html page (a listing page)
+# return: list of description links from a listing page
 def ares_links_parser(soup):
-    # Returning all links that should be visited by the Crawler
+    href = []
+    listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
 
-    listing = soup.findAll('a', {"class": "btn btn-success w-100 my-1"})
+    # for a in listing:
+    #     bae = a.find('a', {"class": "text-info"}, href=True)
+    #     link = bae['href']
+    #     href.append(link)
 
     for a in listing:
+        bae = a.findAll('a', href=True)
 
-        link = a['href']
+        # Adding the url to the list of urls
+        link = bae[0].get('href')
         href.append(link)
 
     return href
\ No newline at end of file
diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt
index 87f811c..9d7692b 100644
--- a/MarketPlaces/Initialization/marketsList.txt
+++ b/MarketPlaces/Initialization/marketsList.txt
@@ -1 +1 @@
-ThiefWorld
\ No newline at end of file
+Ares
\ No newline at end of file
diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py
index f85b46c..e5fe69a 100644
--- a/MarketPlaces/Initialization/markets_mining.py
+++ b/MarketPlaces/Initialization/markets_mining.py
@@ -14,6 +14,7 @@ from MarketPlaces.M00nkeyMarket.crawler_selenium import crawler as crawlerM00nke
 from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity
 from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher
 from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar
+from MarketPlaces.Ares.crawler_selenium import crawler as crawlerAres
 
 import configparser
 import os
@@ -107,5 +108,7 @@ if __name__ == '__main__':
         crawlerCypher()
     elif mkt == "PabloEscobarMarket":
         crawlerPabloEscobar()
+    elif mkt == "Ares":
+        crawlerAres()
 
     print("\nScraping process completed!")
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index de6cc79..79a2bdc 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -18,6 +18,7 @@ from MarketPlaces.CityMarket.parser import *
 from MarketPlaces.DarkBazar.parser import *
 from MarketPlaces.Sonanza.parser import *
 from MarketPlaces.Kingdom.parser import *
+from MarketPlaces.Ares.parser import *
 
 from MarketPlaces.Classifier.classify_product import predict
 
@@ -133,6 +134,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         rw = pabloescobarmarket_listing_parser(soup)
     elif marketPlace == "CityMarket":
         rw = city_listing_parser(soup)
+    elif marketPlace == "Ares":
+        rw = ares_listing_parser(soup)
     elif marketPlace == "DarkBazar":
         rw = darkbazar_listing_parser(soup)
     elif marketPlace == "Sonanza":
@@ -173,6 +176,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         rmm = pabloescobarmarket_description_parser(soup)
     elif marketPlace == "CityMarket":
         rmm = city_description_parser(soup)
+    elif marketPlace == "Ares":
+        rmm = ares_description_parser(soup)
     elif marketPlace == "DarkBazar":
         rmm = darkbazar_description_parser(soup)
     elif marketPlace == "Sonanza":