From 3c42d28810d71dbc5e2a5df30310dd332a76e066 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 30 Oct 2023 17:04:48 -0700 Subject: [PATCH] Finished crawler and parser for Quest maketplace --- MarketPlaces/Quest/crawler_selenium.py | 153 +++++++--------- MarketPlaces/Quest/parser.py | 238 +++++++++---------------- 2 files changed, 148 insertions(+), 243 deletions(-) diff --git a/MarketPlaces/Quest/crawler_selenium.py b/MarketPlaces/Quest/crawler_selenium.py index 69287a9..213ab24 100644 --- a/MarketPlaces/Quest/crawler_selenium.py +++ b/MarketPlaces/Quest/crawler_selenium.py @@ -1,7 +1,7 @@ __author__ = 'DarkWeb' ''' -Quest Market Crawler (Selenium) +Quest Marketplace Crawler (Selenium) ''' from selenium import webdriver @@ -9,15 +9,17 @@ from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image +from selenium.webdriver.support.ui import Select +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By +from PIL import Image import urllib.parse as urlparse import os, re, time from datetime import date import subprocess +import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.Quest.parser import quest_links_parser @@ -27,9 +29,8 @@ counter = 1 baseURL = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion' -# Opens Tor Browser, crawls the website def startCrawling(): - marketName = getMarketName() + mktName = getMKTName() driver = getAccess() if driver != 'down': @@ -40,66 +41,18 @@ def startCrawling(): print(driver.current_url, e) closeDriver(driver) - new_parse(marketName, False) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - #wait for login page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button"))) - - #entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - #Username here - usernameBox.send_keys('blabri') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - #Password here - passwordBox.send_keys('fishowal') - - ''' - # wait for captcha page show up - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img"))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img').screenshot( - r'..\Quest\captcha.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Quest\captcha.png') - - im.show() - - # wait until input space show up - inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/input') - - # ask user input captcha solution in terminal - userIn = input("Enter solution: ") - - # send user solution into the input space - inputBox.send_keys(userIn) - - # click the verify(submit) button - driver.find_element(by=By.XPATH, 
value="/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button").click() - ''' - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[5]/div/div/div/span'))) + new_parse(mktName, baseURL, True) # Returns the name of the website -def getMarketName(): +def getMKTName(): name = 'Quest' return name -# Return the link of the website +# Return the base link of the website def getFixedURL(): url = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion' - return url @@ -129,8 +82,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -146,12 +99,13 @@ def createFFDriver(): service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - + driver.maximize_window() return driver +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -163,7 +117,27 @@ def getAccess(): return 'down' -# Saves the crawled html page +def login(driver): + input("Press ENTER when CAPTCHA is complete and login page has loaded\n") + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + # Username here + usernameBox.send_keys('CashCarti') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') + # Password here + passwordBox.send_keys('Mahogany') + # Clicking the login button + login_button = driver.find_element(By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button') + login_button.click() + + input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div[1]/nav/div/a/img'))) + + def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) @@ -172,7 +146,6 @@ def savePage(driver, page, url): return -# Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE @@ -185,7 +158,11 @@ def getFullPathName(url): return fullPath -# Creates the file name from passed URL +def getMKTName() -> str: + name = 'Quest' + return name + + def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) @@ -198,23 +175,24 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # # Digital - Services - # 
links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/8ae67900-22ed-11ec-a710-31f963ce8d35') - # # Digital - Software - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77') - # # Digital - Tutorials - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/9d1592b0-22ed-11ec-b82d-c3d2878a8716') - # # Digital - Malware - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/a35bae90-22ed-11ec-ad2e-410f5a5339b5') - # # Digital - Hacking - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/b4252cf0-22ed-11ec-8032-751549438ed5') - # Digital - Exploits + ## Software + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77') + ## Tutorial + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/9d1592b0-22ed-11ec-b82d-c3d2878a8716') + ## Malware + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/a35bae90-22ed-11ec-ad2e-410f5a5339b5') + ## Hacking + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/b4252cf0-22ed-11ec-8032-751549438ed5') + ## Exploits links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/c0c3ac60-22ed-11ec-9e97-41cd1912fdee') + ## Carding + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/cbe06b00-22ec-11ec-ab3a-816857220dec') return links def crawlForum(driver): + print("Crawling the Quest market") linksToCrawl = getInterestedLinks() @@ -236,6 +214,7 @@ def crawlForum(driver): savePage(driver, html, link) list = productPages(html) + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -245,18 +224,16 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - nav = driver.find_element(by=By.XPATH, value='/html/body/div[6]/nav') - li = nav.find_elements(By.TAG_NAME, 'li') - a = li[-1].find_element(By.TAG_NAME, 'a') - link = a.get_attribute('href') + link_elem = driver.find_element(by=By.CSS_SELECTOR, value='a.page-link[rel="next"]') + link = link_elem.get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -268,24 +245,23 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Quest market done sucessfully. Press ENTER to continue\n") + print("Crawling the Quest market done.") -# Returns 'True' if the link is Topic link +# Returns 'True' if the link is Topic link, may need to change for every website def isDescriptionLink(url): if 'product' in url: return True return False -# Returns True if the link is a listingPage link +# Returns True if the link is a listingPage link, may need to change for every website def isListingLink(url): if 'category' in url: return True return False -# calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") return quest_links_parser(soup) @@ -293,4 +269,3 @@ def productPages(html): def crawler(): startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") diff --git a/MarketPlaces/Quest/parser.py b/MarketPlaces/Quest/parser.py index 6761ed9..ed73d76 100644 --- a/MarketPlaces/Quest/parser.py +++ b/MarketPlaces/Quest/parser.py @@ -7,9 +7,11 @@ from MarketPlaces.Utilities.utilities import * from bs4 import BeautifulSoup -# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) +# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of description page +# return: 'row' that contains a variety of lists that each hold info on the description page def quest_description_parser(soup): - # Fields to be parsed vendor = "-1" # 0 *Vendor_Name @@ -31,111 +33,50 @@ def quest_description_parser(soup): left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo - - row = soup.find_all('div', {'class': "row"}) + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name - name = row[1].text - name = name.replace('\n', ' ') - name = name.replace(",", "") - name = name.strip() - - small = row[3].find_all('small') - - # Finding Vendor - vendor = small[0].text - vendor = vendor.replace("Vendor:", "") - vendor = vendor.replace(",", "") - vendor = vendor.strip() - - # Finding Vendor Rating - full_stars = small[2].find_all('i', {'class': "fas fa-star"}) - half_star = small[2].find('i', {'class': "fas fa-star-half-alt"}) - rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0) - - # Finding Successful Transactions - success = small[4].text - success = success.replace("Total Sales:", "") - success = success.strip() - - small = row[2].find('p', {'class': "text-left"}).find_all('small') - - # Finding Prices - USD = small[1].text - USD = USD.replace("$", "") - USD = USD.strip() - - shipping_info = row[2].find('p', {'class': "text-left"}).find('span').text.strip() - if "Digital" not in shipping_info: - shipping_info = shipping_info.split(" ") - - # Finding Shipment Information (Origin) - shipFrom = shipping_info[0].strip() - - # Finding Shipment Information (Destination) - shipTo = shipping_info[1].strip() - - textarea = row[2].find_all('textarea') - - # Finding the Product description - describe = textarea[0].text - describe = describe.replace("\n", " ") - describe = describe.replace("\r", " ") - describe = describe.strip() - - ''' - # Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1" - ''' - - # Searching for CVE and MS categories - cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if cve: - CVE = " " - for idx in cve: - CVE += (idx) - CVE += " " - CVE = CVE.replace(',', ' ') - CVE = CVE.replace('\n', '') - ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) - if ms: - MS = " " - for im in ms: - MS += (im) - MS += " " - MS = MS.replace(',', ' ') - MS = MS.replace('\n', '') + name = soup.find('div', class_='card-header bg-dark text-white rounded-0 text-center').text.strip() + + # USD Price + USD = soup.find('small', text='Product Price:').find_next('small').text.strip().replace('$', '') + + # Product 
Description + describe = soup.find('textarea').text.strip() + + # Finding Product Image + image = soup.find('img', {'class': 'img-fluid'}) + image = image.get('src').split('base64,')[-1] + + # Finding Vendor Image + vendor_image = soup.select_one('.card-body.bg-mgray.css-selector.shadow img') + vendor_image = vendor_image.get('src').split('base64,')[-1] + # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row -# This is the method to parse the Listing Pages +# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of listing page +# return: 'row' that contains a variety of lists that each hold info on the listing page def quest_listing_parser(soup): # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "Quest" # 0 *Marketplace_Name + nm = 0 # *Total_Products (Should be Integer) + mktName = "quest" # 0 *Marketplace_Name vendor = [] # 1 *Vendor y rating_vendor = [] # 2 Vendor_Rating success = [] # 3 Vendor_Successful_Transactions name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this category = [] # 7 Product_Category y describe = [] # 8 Product_Description views = [] # 9 Product_Number_Of_Views @@ -146,87 +87,76 @@ def quest_listing_parser(soup): USD = [] # 14 Product_USD_SellingPrice y EURO = [] # 15 Product_EURO_SellingPrice sold = [] # 16 Product_QuantitySold - qLeft =[] # 17 Product_QuantityLeft + qLeft = [] # 17 Product_QuantityLeft shipFrom = [] # 18 Product_ShippedFrom shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links - - # Finding category of listing page - cat = soup.find('span', {'class': "btn btn-sm btn-outline-mgray active border-info"}).text - cat = cat.replace("Digital -", "") - cat = cat.strip() - - listing = soup.find_all('div', {"class": "col-md-2 my-md-0 col-12"}) + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links + # Extract all product listings + listing = soup.findAll('div', class_='col-md-2 my-md-0 col-12') # Populating the Number of Products nm = len(listing) for a in listing: - bae = a.find_all('a', href=True) - - # Adding the category - category.append(cat) - - # Adding the url to the list of urls - link = bae[0].get('href') - link = cleanLink(link) - href.append(link) - - # Finding the Vendor - vendor_name = bae[2].text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding the Product - product = bae[1].find('img').get('alt') - product = product.replace('\n', ' ') - product = product.replace(",", "") - product = product.strip() - name.append(product) - - # Searching for CVE and MS categories - cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if not cve: - cveValue="-1" - else: - cee = " " - for idx in cve: - cee += (idx) - cee += " " - 
cee = cee.replace(',', ' ') - cee = cee.replace('\n', '') - cveValue=cee - CVE.append(cveValue) - - ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) - if not ms: - MSValue="-1" - else: - me = " " - for im in ms: - me += (im) - me += " " - me = me.replace(',', ' ') - me = me.replace('\n', '') - MSValue=me - MS.append(MSValue) + + # Extracting Product URL & Name + product_link_tags = a.find_all('a', class_='badge-info') + if product_link_tags: + # Using the first tag as default + product_link_tag = product_link_tags[0] + href.append(product_link_tag['href']) + name.append(product_link_tag.text.strip()) + + # Extracting Product Image + img_tag = a.find('img') + if img_tag: + image_data = img_tag['src'].split('base64,')[-1] + image.append(image_data) + + # Extracting Vendor Name + vendor_tag = a.find('a', class_='badge-dark') + if vendor_tag: + vendor.append(vendor_tag.text.replace('👤', '').strip()) + + # Extracting Product Price in USD + price_tag = a.find('a', class_='text') + if price_tag: + USD.append(price_tag.text.replace("$", "").strip()) + + category_tag = soup.find('span', class_= 'btn btn-sm btn-outline-mgray active border-info') + if category_tag: + category.append(category_tag.text.strip()) + # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) +# called by the crawler to get description links on a listing page +# @param: beautifulsoup object that is using the correct html page (listing page) +# return: list of description links from a listing page def quest_links_parser(soup): + # Returning all product links - # Returning all links that should be visited by the Crawler href = [] - listing = soup.findAll('div', {"class": "col-md-2 my-md-0 col-12"}) + # Locate all divs with class 'row' + row_divs = soup.findAll('div', class_='row') + + for row_div in row_divs: + # Locate all product divs within the current 'row' div + product_divs = row_div.findAll('div', class_='col-md-2 my-md-0 col-12') + + for product_div in product_divs: + # Locate the anchor tag containing the product link within each product div + product_link_tag = product_div.find('a', class_='badge-info') - for div in listing: + if product_link_tag and product_link_tag.has_attr('href'): + href.append(product_link_tag['href']) - link = div.find('a')["href"] - href.append(link) + print(len(href)) return href \ No newline at end of file
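
For reference, the listing-page extraction pattern introduced above (product link, name, and inline base64 image pulled out of each "col-md-2 my-md-0 col-12" card inside a "row" div) can be exercised in isolation. The sketch below is a minimal, standalone illustration of that pattern, not part of the patch: the HTML fragment, product name, vendor, and price are invented placeholders rather than captured Quest markup, and only BeautifulSoup is assumed to be installed.

# sketch_listing_extraction.py -- illustration only, not part of the crawler
from bs4 import BeautifulSoup

SAMPLE_LISTING = """
<div class="row">
  <div class="col-md-2 my-md-0 col-12">
    <a class="badge badge-info" href="/product/1234">Example product</a>
    <a class="badge badge-dark" href="/vendor/abcd">&#128100; examplevendor</a>
    <a class="text" href="/product/1234">$10.00</a>
    <img class="img-fluid" src="data:image/png;base64,iVBORw0KGgo="/>
  </div>
</div>
"""

def extract_listing(html):
    soup = BeautifulSoup(html, "html.parser")
    products = []
    # walk every 'row' div, then every product card inside it
    for row in soup.find_all('div', class_='row'):
        for card in row.find_all('div', class_='col-md-2 my-md-0 col-12'):
            link_tag = card.find('a', class_='badge-info')
            if not (link_tag and link_tag.has_attr('href')):
                continue
            img_tag = card.find('img')
            products.append({
                'href': link_tag['href'],
                'name': link_tag.text.strip(),
                # keep only the base64 payload of the inline data URI
                'image': img_tag['src'].split('base64,')[-1] if img_tag else '-1',
            })
    return products

if __name__ == '__main__':
    print(extract_listing(SAMPLE_LISTING))

Matching on a single class token (class_='badge-info') still works when the real anchor carries several classes, because BeautifulSoup's class_ filter matches any one of an element's classes; that is why the parser can select 'badge-info' and 'badge-dark' without spelling out the full class attribute strings.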