diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index e4f6c5d..dfdec49 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -3,7 +3,7 @@ __author__ = 'DarkWeb' import psycopg2 import traceback from Forums.Utilities.utilities import * - +from dateutil.relativedelta import relativedelta, FR def connectDataBase(): @@ -484,6 +484,28 @@ def create_posts(cur, row, forumId, topicId): 'dateinserted_post': row[8], 'postId': postId}) +def create_status(cur, forumId, date, listings, descriptions, status): + + date = datetime.strptime(date, "%m%d%Y") + + # getting last Fridays a reference date + date_reference = date + relativedelta(weekday=FR(-1)) + + # checking if status already exists + sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s" + cur.execute(sql, {'forum_id': forumId, 'date_inserted': date}) + + recset = cur.fetchall() + if recset: + sql = "Update forums_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \ + "where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s" + recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'forum_id': forumId, 'date_inserted': date} + else: + sql = "Insert into forums_status (forum_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)" + recset = [forumId, date, listings, descriptions, status, date_reference] + + cur.execute(sql, recset) + def create_database(cur, con): try: @@ -496,6 +518,12 @@ def create_database(cur, con): sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)" cur.execute(sql) + sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, " \ + "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL " \ + "CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \ + "CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))" + cur.execute(sql) + sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \ "255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \ "interest_user character varying(5000) null, signature_user character varying(1000) null, " \ diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 91b662f..b86b5c6 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -341,6 +341,16 @@ def new_parse(forum, url, createLog): # move listing files of completed folder move_file(listingFile, createLog, logFile) + # registering the current forum status (up/down) and the number of scraped pages in the database + forumId = verifyForum(cur, forum) + if (forumId > 0): + + readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html')) + readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html')) + + create_status(cur, forumId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0') + con.commit() + if createLog: logFile.close() diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py index b257c40..c34a6cb 100644 --- 
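Note on the create_status helper added above: relativedelta(weekday=FR(-1)) snaps the scrape date back to the most recent Friday on or before it, and that value is stored as date_reference. A minimal sketch of the date math, assuming the same "%m%d%Y" format used for CURRENT_DATE (the example date is purely illustrative):

from datetime import datetime
from dateutil.relativedelta import relativedelta, FR

date = datetime.strptime("10182023", "%m%d%Y")         # Wednesday, 2023-10-18
date_reference = date + relativedelta(weekday=FR(-1))  # most recent Friday on or before the date
print(date_reference.date())                           # 2023-10-13

If the scrape date already falls on a Friday, FR(-1) leaves it unchanged, so one reference date groups a full week of runs.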
a/MarketPlaces/BlackPyramid/crawler_selenium.py +++ b/MarketPlaces/BlackPyramid/crawler_selenium.py @@ -1,9 +1,7 @@ -__author__ = 'Helium' +__author__ = 'cern' ''' -BlackPyramid Forum Crawler (Selenium) -cannot use bc no links are used -kept in case issues are solved +BlackPyramid Market Crawler (Selenium) ''' from selenium import webdriver @@ -11,29 +9,31 @@ from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By - +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver import ActionChains +import selenium.webdriver.support.ui as uiClasses from PIL import Image + import urllib.parse as urlparse import os, re, time -from datetime import date import subprocess import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.BlackPyramid.parser import blackpyramid_links_parser +from MarketPlaces.BlackPyramid.parser import BlackPyramid_links_parser from MarketPlaces.Utilities.utilities import cleanHTML +import traceback + counter = 1 -baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/' +baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/' -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later +# Opens Tor Browser, crawls the website def startCrawling(): - mktName = getMKTName() + marketName = getMKTName() driver = getAccess() if driver != 'down': @@ -42,28 +42,47 @@ def startCrawling(): crawlForum(driver) except Exception as e: print(driver.current_url, e) - closeDriver(driver) + closetor(driver) + + new_parse(marketName, baseURL, False) + + +# Login +def login(driver): + # wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "//input[@name='username_login']"))) + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value="//input[@name='username_login']") + # Username here + usernameBox.send_keys('ChipotleSteakBurrito') + passwordBox = driver.find_element(by=By.XPATH, value="//input[@name='password_login']") + # Password here + passwordBox.send_keys('BlackBeans') - new_parse(mktName, baseURL, True) + input("Press ENTER when CAPTCHA is completed and you closed the newsletter\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="form93b"]'))) # Returns the name of the website -#return: name of site in string type def getMKTName(): name = 'BlackPyramid' return name -# Return the base link of the website -#return: url of base site in string type +# Return the link of the website def getFixedURL(): - url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/' + url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1' + return url # Closes Tor Browser -#@param: current selenium driver -def 
closeDriver(driver): +def closetor(driver): # global pid # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") @@ -88,8 +107,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -111,8 +130,6 @@ def createFFDriver(): return driver -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -124,31 +141,7 @@ def getAccess(): return 'down' -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: current selenium web driver -def login(driver): - # wait for login page - login_link = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div/main/div/div/div/div[2]/div/div/div/section[1]/input[1]') - login_link.click() # open tab with url - - # entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - # Username here - usernameBox.send_keys('ChipotleSteakBurrito') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - # Password here - passwordBox.send_keys('BlackBeans') - - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[2]/form/nav/nav/ul/li[2]/div/a/span[1]'))) - - - -# Saves the crawled html page, makes the directory path for html pages if not made +# Saves the crawled html page def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) @@ -158,7 +151,6 @@ def savePage(driver, page, url): # Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE @@ -171,75 +163,92 @@ def getFullPathName(url): return fullPath -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site +# Creates the file name from passed URL def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) - if (name == ''): + if name == '': name = str(counter) counter = counter + 1 return name -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products +def page_is_fully_loaded(driver): + return driver.execute_script("return 
document.readyState") == "complete" + + +def goToPage(driver, page): + # hover over digital -> hacking tools + a = ActionChains(driver) + + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "//li[@class='dig940']/div/a"))) + + # hover + digitalB = driver.find_element(By.XPATH, "//li[@class='dig940']/div/a") + time.sleep(1) + a.move_to_element(digitalB).perform() + # print(digitalB) + + # delay for website to register hover + time.sleep(10) + + # click + xpath = "//input[@name='" + page + "']" + link = driver.find_element(By.XPATH, xpath) + time.sleep(1) + a.move_to_element(link).click().perform() + # print(link) + + # wait for website to load + time.sleep(10) + WebDriverWait(driver, 100).until(page_is_fully_loaded) + + def getInterestedLinks(): - links = [] - - # Hacking Guides - links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Exploits - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # botnets/malware - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # fraud software - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Other Tools - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Services - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') + # h11 -> Hacking Tools + # g3 -> Guides, Hacking + # se3 -> Services, Hacking + # f6 -> Fraud software + links = ['h11','g3','se3','f6'] return links -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver def crawlForum(driver): + print("Crawling the BlackPyramid market") - linksToCrawl = getInterestedLinks() + pages = getInterestedLinks() i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) + for listing in pages: + print('Crawling :', listing) try: + goToPage(driver, listing) + has_next_page = True count = 0 + currentPage = 1 while has_next_page: - try: - clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a') - clicker.click() # open tab with url - driver.get(link) - except: - driver.refresh() + html = driver.page_source - savePage(driver, html, link) + savePage(driver, html, listing + "page" + str(currentPage)) + # get a list of urls for each listing list = productPages(html) + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: driver.get(itemURL) except: - driver.refresh() + # driver.refresh() + continue savePage(driver, driver.page_source, item) - driver.back() + # can't use the back button in dark pyramid + # driver.back() # comment out break @@ -248,10 +257,31 @@ def crawlForum(driver): if count == 1: break + # go to next page of market try: - clicker = driver.find_element(by=By.XPATH, value= - '/html/body/center/div[4]/div/div[3]/div[23]/div[2]/input[1]') - if clicker == "": + goToPage(driver, listing) + nav = driver.find_element(by=By.XPATH, value="//input[@name='next_page']") + + if nav.is_enabled(): + # select next page + pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']")) + # print("pg options:", pgnum.options) + numberOfPages = 
len(pgnum.options) + + if currentPage >= numberOfPages: + raise NoSuchElementException + + pgnum.select_by_index(currentPage) + currentPage += 1 + + # click button + pgbutton = driver.find_element(by=By.XPATH, value="//input[@value='go to page']") + pgbutton.click() + + # wait for website to load + time.sleep(10) + WebDriverWait(driver, 100).until(page_is_fully_loaded) + else: raise NoSuchElementException count += 1 @@ -259,39 +289,32 @@ def crawlForum(driver): has_next_page = False except Exception as e: - print(link, e) + print(listing, e) i += 1 print("Crawling the BlackPyramid market done.") -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not +# Returns 'True' if the link is Topic link def isDescriptionLink(url): - if 'products' in url: + if 'product' in url: return True return False # Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not def isListingLink(url): - if 'search' in url: + if 'category=' in url: return True return False -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through +# calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - return blackpyramid_links_parser(soup) - + return BlackPyramid_links_parser(soup) def crawler(): startCrawling() - # print("Crawling and Parsing BlackPyramid .... DONE!") + # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/MarketPlaces/BlackPyramid/parser.py b/MarketPlaces/BlackPyramid/parser.py index 743466a..ecc1dcb 100644 --- a/MarketPlaces/BlackPyramid/parser.py +++ b/MarketPlaces/BlackPyramid/parser.py @@ -1,4 +1,4 @@ -__author__ = 'Helium' +__author__ = 'cern' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -11,7 +11,7 @@ from bs4 import BeautifulSoup #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of description page #return: 'row' that contains a variety of lists that each hold info on the description page -def darkfox_description_parser(soup): +def BlackPyramid_description_parser(soup): # Fields to be parsed @@ -40,82 +40,71 @@ def darkfox_description_parser(soup): EURO = "-1" # 22 Product_EURO_SellingPrice # Finding Product Name - name = soup.find('h1').text + name = soup.find('div', {'class': 'panel39002'}).find('span').next_sibling name = name.replace('\n', ' ') name = name.replace(",", "") name = name.strip() + # product description + describe = soup.findAll('div', {'class': 'fer048953'})[1].text + describe = describe.replace('\n', ' ') + describe = describe.replace(",", "") + describe = describe.strip() + # Finding Vendor - vendor = soup.find('h3').find('a').text.strip() + vendor = soup.find('div', {'class': 'bold03905 vstat364'}).text + vendor = vendor.split(" ") + vendor = vendor[2][:-1] + vendor = vendor.replace('\n', ' ') + vendor = vendor.replace(",", "") + vendor = vendor.strip() # Finding Vendor Rating - rating = soup.find('span', {'class': "tag is-dark"}).text.strip() + rating_span = soup.find('span', {'class': 'to3098503t'}).find_next_sibling('span') + rating_num = rating_span.find('b').text + if rating_num != 'N/A': + rating = rating_num[0:3] # 
Finding Successful Transactions - success = soup.find('h3').text - success = success.replace("Vendor: ", "") - success = success.replace(vendor, "") - success = success.replace("(", "") - success = success.replace(")", "") + success_container = soup.find('ul', {'class': 'ul3o00953'}).findAll('li')[1] + success = success_container.find('div').text + success = success.replace('"', '') + success = success.replace("\n", " ") + success = success.replace(",", "") success = success.strip() - bae = soup.find('div', {'class': "box"}).find_all('ul') - # Finding Prices - USD = bae[1].find('strong').text.strip() - - li = bae[2].find_all('li') - - # Finding Escrow - escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip() - - # Finding the Product Category - category = li[1].find('span', {'class': "tag is-dark"}).text.strip() + USD_text = soup.find('li', {'class': 'vul2994 vghul995'}).find('div').text + USD = USD_text.split(',')[1] + USD = USD.replace('\n', ' ') + USD = USD.replace(",", "") + USD = USD.strip() - # Finding the Product Quantity Available - left = li[3].find('span', {'class': "tag is-dark"}).text.strip() + container = soup.find('ul', {'class': 'bic03095'}) # Finding Number Sold - sold = li[4].find('span', {'class': "tag is-dark"}).text.strip() - - li = bae[3].find_all('li') - - # Finding Shipment Information (Origin) - if "Ships from:" in li[-2].text: - shipFrom = li[-2].text - shipFrom = shipFrom.replace("Ships from: ", "") - # shipFrom = shipFrom.replace(",", "") - shipFrom = shipFrom.strip() - - # Finding Shipment Information (Destination) - shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text - shipTo = shipTo.replace("Ships to: ", "") - shipTo = shipTo.strip() - if "certain countries" in shipTo: - countries = "" - tags = li[-1].find_all('span', {'class': "tag"}) - for tag in tags: - country = tag.text.strip() - countries += country + ", " - shipTo = countries.strip(", ") - - # Finding the Product description - describe = soup.find('div', {'class': "pre-line"}).text - describe = describe.replace("\n", " ") - describe = describe.strip() + sold_container = container.find('li') + sold_div = sold_container.findAll('div')[2] + sold = sold_div.find('b').next_sibling + sold = sold.replace('"', '') + sold = sold.replace("\n", " ") + sold = sold.replace(",", "") + sold = sold.strip() - '''# Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1"''' + # Finding the Product Quantity Available + left_container = container.find('li') + left_div = left_container.findAll('div')[3] + left = left_div.find('b').next_sibling + left = left.replace('"', '') + left = left.replace("\n", " ") + left = left.replace(",", "") + left = left.strip() + + # Finding number of reviews + positive = soup.find('span', {'class': 'ar04999324'}).text + neutral = soup.find('span', {'class': 'ti9400005 can39953'}).text + negative = soup.find('span', {'class': 'ti9400005 ti90088 can39953'}).text + review = int(positive) + int(neutral) + int(negative) # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -147,11 +136,11 @@ def darkfox_description_parser(soup): #stores info it needs in different lists, these lists are returned after being organized 
#@param: soup object looking at html page of listing page #return: 'row' that contains a variety of lists that each hold info on the listing page -def darkfox_listing_parser(soup): +def BlackPyramid_listing_parser(soup): # Fields to be parsed nm = 0 # Total_Products (Should be Integer) - mktName = "DarkFox" # 0 Marketplace_Name + mktName = "BlackPyramid" # 0 Marketplace_Name name = [] # 1 Product_Name CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) MS = [] # 3 Product_MS_Classification (Microsoft Security) @@ -169,80 +158,81 @@ def darkfox_listing_parser(soup): qLeft =[] # 15 Product_QuantityLeft shipFrom = [] # 16 Product_ShippedFrom shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions + rating_item = [] # 18 Product_Rating + vendor = [] # 19 Vendor + rating = [] # 20 Vendor_Rating + success = [] # 21 Vendor_Successful_Transactions href = [] # 23 Product_Links (Urls) - listing = soup.findAll('div', {"class": "card"}) + listing = soup.findAll('article', {"class": "product"}) + + # Some listing pages have an additional article section which is blank + if not listing[-1].findAll('a', href=True): + listing = listing[:-1] + # Populating the Number of Products nm = len(listing) - for a in listing: - bae = a.findAll('a', href=True) + for card in listing: + bae = card.findAll('a', href=True) # Adding the url to the list of urls - link = bae[0].get('href') - link = cleanLink(link) + link = bae[2].get('href') href.append(link) # Finding the Product - product = bae[1].find('p').text + product = bae[3].text product = product.replace('\n', ' ') product = product.replace(",", "") product = product.replace("...", "") product = product.strip() name.append(product) - bae = a.find('div', {'class': "media-content"}).find('div').find_all('div') - - if len(bae) >= 5: - # Finding Prices - price = bae[0].text - ud = price.replace(" USD", " ") - # u = ud.replace("$","") - u = ud.replace(",", "") - u = u.strip() - USD.append(u) - # bc = (prc[1]).strip(' BTC') - # BTC.append(bc) - - # Finding the Vendor - vendor_name = bae[1].find('a').text - vendor_name = vendor_name.replace(",", "") - vendor_name = vendor_name.strip() - vendor.append(vendor_name) - - # Finding the Category - cat = bae[2].find('small').text - cat = cat.replace("Category: ", "") - cat = cat.replace(",", "") - cat = cat.strip() - category.append(cat) - - # Finding Number Sold and Quantity Left - num = bae[3].text - num = num.replace("Sold: ", "") - num = num.strip() - sold.append(num) - - quant = bae[4].find('small').text - quant = quant.replace("In stock: ", "") - quant = quant.strip() - qLeft.append(quant) - - # Finding Successful Transactions - freq = bae[1].text - freq = freq.replace(vendor_name, "") - freq = re.sub(r'Vendor Level \d+', "", freq) - freq = freq.replace("(", "") - freq = freq.replace(")", "") - freq = freq.strip() - success.append(freq) + # Finding description + # 'recurisve = False' only searches direct children + desc = card.findChildren('div', recursive=False)[0] + desc = desc.findAll('div', recursive=False)[3].text + desc = desc.replace('\n', ' ') + desc = desc.replace(",", "") + desc = desc.strip() + describe.append(desc) + + # Finding Vendor Name + vendor_name = bae[4].find('span').text + vendor_name = vendor_name.split(' ')[1] + vendor_name = vendor_name.replace('\n', ' ') + vendor_name = vendor_name.replace(",", "") + vendor_name = vendor_name.strip() + vendor.append(vendor_name) + + # Finding the 
Category + cat = card.findAll('div', recursive=False)[0].findAll('div', recursive=False)[1].find('span').text + cat = cat.replace("\n", "") + cat = cat.replace(",", "") + cat = cat.strip() + category.append(cat) + + bae = card.findAll('div', recursive=False)[1].findAll('div', recursive=False)[1] + + # Finding amount left + left = bae.findAll('div', recursive=False)[1].text + left = left.replace("x", "") + left = left.replace('\n', ' ') + left = left.replace(",", "") + left = left.strip() + qLeft.append(left) + + # Finding amount sold + qsold = bae.findAll('div', recursive=False)[2].text + qsold = qsold.replace('\n', ' ') + qsold = qsold.replace("x", "") + qsold = qsold.replace(",", "") + qsold = qsold.strip() + sold.append(qsold) # Searching for CVE and MS categories - cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: cveValue="-1" else: @@ -255,7 +245,7 @@ def darkfox_listing_parser(soup): cveValue=cee CVE.append(cveValue) - ms = a.findAll(text=re.compile('MS\d{2}-\d{3}')) + ms = card.findAll(text=re.compile('MS\d{2}-\d{3}')) if not ms: MSValue="-1" else: @@ -269,23 +259,23 @@ def darkfox_listing_parser(soup): MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, - BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href) + return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views, reviews, rating, + addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) #called by the crawler to get description links on a listing page #@param: beautifulsoup object that is using the correct html page (listing page) #return: list of description links from a listing page -def blackpyramid_links_parser(soup): +def BlackPyramid_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] listing = soup.findAll('article', {"class": "product"}) - for div in listing: + for item in listing: - link = div.find('a', {"class": "ah39063"})['href'] + link = item.find('a', {"class": "ah39063"})['href'] href.append(link) - return href \ No newline at end of file + return href diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index 8769869..2f3341a 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -4,7 +4,7 @@ import psycopg2 import traceback import configparser from MarketPlaces.Utilities.utilities import * - +from dateutil.relativedelta import relativedelta, FR def connectDataBase(): @@ -273,6 +273,8 @@ def create_items(cur, row, marketId, vendorId): if newItem: + # decode_decrypt_image_in_base64(row[20]) + sql = "Insert into items (item_id, market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \ "views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \ "quantityleft_item, shippedfrom_item, shippedto_item, lastseen_item, image_item, href_item, dateinserted_item, " \ @@ -312,7 +314,7 @@ def create_items(cur, row, marketId, vendorId): recset = cur.fetchall() - # decode_decrypt_image_in_base64(recset[0][20]) + # decode_decrypt_image_in_base64(recset[0]['image_item']) if (str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or str(recset[0]['cve_item']) != str(row[6] if row[6] != 
'-1' else None) or @@ -401,6 +403,27 @@ def create_items(cur, row, marketId, vendorId): return itemId +def create_status(cur, marketId, date, listings, descriptions, status): + + date = datetime.strptime(date, "%m%d%Y") + + # getting last Fridays a reference date + date_reference = date + relativedelta(weekday=FR(-1)) + + # checking if status already exists + sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s" + cur.execute(sql, {'market_id': marketId, 'date_inserted': date}) + + recset = cur.fetchall() + if recset: + sql = "Update marketplaces_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \ + "where market_id = %(market_id)s and date_inserted = %(date_inserted)s" + recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'market_id': marketId, 'date_inserted': date} + else: + sql = "Insert into marketplaces_status (market_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)" + recset = [marketId, date, listings, descriptions, status, date_reference] + + cur.execute(sql, recset) def create_database(cur, con): try: @@ -413,6 +436,12 @@ def create_database(cur, con): sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)" cur.execute(sql) + sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted date NOT NULL, " \ + "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL " \ + "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \ + "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))" + cur.execute(sql) + sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \ "varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \ "null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \ diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py index fdfb640..dac91b0 100644 --- a/MarketPlaces/DarkBazar/crawler_selenium.py +++ b/MarketPlaces/DarkBazar/crawler_selenium.py @@ -216,12 +216,12 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # # comment out - # break - # - # # comment out - # if count == 1: - # break + # comment out + break + + # comment out + if count == 1: + break try: link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href') @@ -236,7 +236,7 @@ def crawlForum(driver): print(link, e) i += 1 - print("Crawling the DarkBazar market done.") + print("Crawling the DarkBazar market done.") # Returns 'True' if the link is Topic link, may need to change for every website diff --git a/MarketPlaces/DarkBazar/parser.py b/MarketPlaces/DarkBazar/parser.py index 9386d18..3d56e92 100644 --- a/MarketPlaces/DarkBazar/parser.py +++ b/MarketPlaces/DarkBazar/parser.py @@ -170,7 +170,6 @@ def darkbazar_listing_parser(soup): # Adding the url to the list of urls link = bae[0].get('href') - link = cleanLink(link) href.append(link) # Finding the Product diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index 786d58b..79a2bdc 100644 --- 
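Side note on the prepare_parser.py changes that follow: they extend the marketplace-name-to-parser dispatch with DarkBazar, Sonanza, and Kingdom branches. The growing if/elif chain could equally be a lookup table; a hypothetical sketch of that alternative (not what the diff does), assuming the parser modules are imported as in the diff:

LISTING_PARSERS = {
    "DarkBazar": darkbazar_listing_parser,
    "Sonanza": sonanza_listing_parser,
    "Kingdom": kingdom_listing_parser,
}

def dispatch_listing_parser(marketPlace, soup):
    # fall back to the same loud failure the current code uses
    parser = LISTING_PARSERS.get(marketPlace)
    if parser is None:
        print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
        raise Exception
    return parser(soup)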
a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -15,6 +15,9 @@ from MarketPlaces.M00nkeyMarket.parser import * from MarketPlaces.MikesGrandStore.parser import * from MarketPlaces.PabloEscobarMarket.parser import * from MarketPlaces.CityMarket.parser import * +from MarketPlaces.DarkBazar.parser import * +from MarketPlaces.Sonanza.parser import * +from MarketPlaces.Kingdom.parser import * from MarketPlaces.Ares.parser import * from MarketPlaces.Classifier.classify_product import predict @@ -133,6 +136,12 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): rw = city_listing_parser(soup) elif marketPlace == "Ares": rw = ares_listing_parser(soup) + elif marketPlace == "DarkBazar": + rw = darkbazar_listing_parser(soup) + elif marketPlace == "Sonanza": + rw = sonanza_listing_parser(soup) + elif marketPlace == "Kingdom": + rw = kingdom_listing_parser(soup) else: print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -169,6 +178,12 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): rmm = city_description_parser(soup) elif marketPlace == "Ares": rmm = ares_description_parser(soup) + elif marketPlace == "DarkBazar": + rmm = darkbazar_description_parser(soup) + elif marketPlace == "Sonanza": + rmm = sonanza_description_parser(soup) + elif marketPlace == "Kingdom": + rmm = kingdom_description_parser(soup) else: print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -368,6 +383,16 @@ def new_parse(marketPlace, url, createLog): # move listing files of completed folder move_file(listingFile, createLog, logFile) + # registering the current forum status (up/down) and the number of scraped pages in the database + marketId = verifyMarketPlace(cur, marketPlace) + if (marketId > 0): + + readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html')) + readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html')) + + create_status(cur, marketId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0') + con.commit() + if createLog: logFile.close() diff --git a/MarketPlaces/Kingdom/crawler_selenium.py b/MarketPlaces/Kingdom/crawler_selenium.py index e6b489f..5385150 100644 --- a/MarketPlaces/Kingdom/crawler_selenium.py +++ b/MarketPlaces/Kingdom/crawler_selenium.py @@ -1,4 +1,4 @@ -__author__ = 'DarkWeb' +__author__ = 'Helium' ''' Kingdom Market Crawler (Selenium) @@ -35,55 +35,27 @@ baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion # Opens Tor Browser, crawls the website def startCrawling(): - # marketName = getMarketName() + mktName = getMKTName() driver = getAccess() if driver != 'down': try: - captcha(driver) login(driver) crawlForum(driver) except Exception as e: print(driver.current_url, e) closeDriver(driver) - # new_parse(marketName, False) + new_parse(mktName, baseURL, True) +# Login using premade account credentials and do login captcha manually +def login(driver): -def captcha(driver): - ''' - # wait for captcha page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div[1]"))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/div/div[2]').screenshot( - r'..\Kingdom\captcha1.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Kingdom\captcha1.png') - im.show() - - iframes = 
driver.find_elements(by=By.TAG_NAME, value='iframe') - - # ask user input captcha solution in terminal - print("Enter squares from smallest to largest (squares are numbered 1-9 left to right)") - for order in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']: - id = input(f"{order}: ") - iframes[int(id)-1].click() - ''' input("Press ENTER when CAPTCHA is completed\n") # wait for login page WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - # wait for login page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) + (By.XPATH, '//*[@id="login-form"]'))) # entering username and password into input boxes usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-name"]') @@ -96,39 +68,17 @@ def login(driver): select = Select(driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]')) select.select_by_visible_text('24 hours') - ''' - # wait for captcha page show up - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="captcha"]'))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Kingdom\captcha2.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Kingdom\captcha2.png') - im.show() - - # wait until input space show up - inputBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-captcha"]') - - # ask user input captcha solution in terminal - userIn = input("Enter solution: ") - - # send user solution into the input space - inputBox.send_keys(userIn) - - # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button").click() - ''' - input("Press ENTER when CAPTCHA is completed\n") + input("Press ENTER when CAPTCHA and DDOS is completed\n") # wait for listing page show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div/div/div[3]/div[2]'))) + (By.XPATH, '/html/body/div/div/div[3]/div[1]/div/div[3]'))) + + # Returns the name of the website -def getMarketName(): +def getMKTName(): name = 'Kingdom' return name @@ -236,30 +186,17 @@ def getInterestedLinks(): links = [] # Software and Malware - links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=c298a77d9e93ad32') + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=597a56b9a0b3e0d0') # # Services - # links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=c298a77d9e93ad32') - # # Exploits - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45') - # # Tools - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46') - # # Malware - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47') - # # Cryptography - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48') - # 
# Others - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49') - # # Hacking Tutorials - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50') - # # Hacked Accounts and Database Dumps - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30') - # # Android Moded pak - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53') + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=597a56b9a0b3e0d0') + # # guides and tutorials + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=107&t=597a56b9a0b3e0d0') return links def crawlForum(driver): + print("Crawling the Kingdom market") linksToCrawl = getInterestedLinks() @@ -281,6 +218,7 @@ def crawlForum(driver): savePage(driver, html, link) list = productPages(html) + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -290,18 +228,15 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - temp = driver.find_element(by=By.XPATH, value= - '/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul') - next = temp.find_element_by_class_name("next") - link = link.find_element_by_tag_name('a').get_attribute('href') + link = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div[2]/div[2]/div[3]/div/ul/li[13]/a').get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -313,7 +248,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Kingdom Market done sucessfully. Press ENTER to continue\n") + print("Crawling the Kingdom market done.") # Returns 'True' if the link is Topic link @@ -325,7 +260,7 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link def isListingLink(url): - if 'category' in url: + if 'filter_category' in url: return True return False @@ -333,10 +268,8 @@ def isListingLink(url): # calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) return kingdom_links_parser(soup) def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") + startCrawling() \ No newline at end of file diff --git a/MarketPlaces/Kingdom/parser.py b/MarketPlaces/Kingdom/parser.py index b1e05d5..abade27 100644 --- a/MarketPlaces/Kingdom/parser.py +++ b/MarketPlaces/Kingdom/parser.py @@ -1,4 +1,4 @@ -__author__ = 'DarkWeb' +__author__ = 'Helium' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -31,6 +31,8 @@ def kingdom_description_parser(soup): left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name @@ -95,7 +97,7 @@ def kingdom_description_parser(soup): # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results @@ -126,7 +128,9 @@ def kingdom_listing_parser(soup): qLeft =[] # 17 Product_QuantityLeft shipFrom = [] # 18 Product_ShippedFrom shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links listing = soup.find('div', {"id": "p0"}).find('div').find_all('div', {"class": "row"}, recursive=False) @@ -153,12 +157,20 @@ def kingdom_listing_parser(soup): product = product.strip() name.append(product) + # Finding Product Image + product_image = a.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) + # Finding the Vendor vendor_name = a.select_one('a[href^="/user"]').text vendor_name = vendor_name.replace(",", " ").replace('/', '') vendor_name = vendor_name.strip() vendor.append(vendor_name) + image_vendor.append("-1") + # Adding the url to the list of urls link = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]')['href'] link = cleanLink(link) @@ -169,7 +181,8 @@ def kingdom_listing_parser(soup): # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, + image, image_vendor) def kingdom_links_parser(soup): diff --git a/MarketPlaces/Quest/crawler_selenium.py b/MarketPlaces/Quest/crawler_selenium.py index 69287a9..213ab24 100644 --- a/MarketPlaces/Quest/crawler_selenium.py +++ b/MarketPlaces/Quest/crawler_selenium.py @@ -1,7 +1,7 @@ __author__ = 'DarkWeb' ''' -Quest Market Crawler (Selenium) +Quest Marketplace Crawler (Selenium) ''' from selenium import webdriver @@ -9,15 +9,17 @@ from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image +from selenium.webdriver.support.ui import Select +from selenium.webdriver.support import expected_conditions as EC 
+from selenium.webdriver.common.by import By +from PIL import Image import urllib.parse as urlparse import os, re, time from datetime import date import subprocess +import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.Quest.parser import quest_links_parser @@ -27,9 +29,8 @@ counter = 1 baseURL = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion' -# Opens Tor Browser, crawls the website def startCrawling(): - marketName = getMarketName() + mktName = getMKTName() driver = getAccess() if driver != 'down': @@ -40,66 +41,18 @@ def startCrawling(): print(driver.current_url, e) closeDriver(driver) - new_parse(marketName, False) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - #wait for login page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button"))) - - #entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - #Username here - usernameBox.send_keys('blabri') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - #Password here - passwordBox.send_keys('fishowal') - - ''' - # wait for captcha page show up - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img"))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/div/img').screenshot( - r'..\Quest\captcha.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Quest\captcha.png') - - im.show() - - # wait until input space show up - inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[3]/input') - - # ask user input captcha solution in terminal - userIn = input("Enter solution: ") - - # send user solution into the input space - inputBox.send_keys(userIn) - - # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button").click() - ''' - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[5]/div/div/div/span'))) + new_parse(mktName, baseURL, True) # Returns the name of the website -def getMarketName(): +def getMKTName(): name = 'Quest' return name -# Return the link of the website +# Return the base link of the website def getFixedURL(): url = 'http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion' - return url @@ -129,8 +82,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) 
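For context on the hunk below: each crawler's createFFDriver builds a hardened Tor-Firefox profile and hands it to Selenium, and the seed URL is only fetched afterwards in getAccess. A trimmed sketch of the construction pattern; geckodriver_path is the config key visible in the diff, while the other two key names are assumptions for illustration:

from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.service import Service

def create_tor_driver(config):
    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))    # assumed key name
    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))    # assumed key name
    ff_prof.set_preference("permissions.default.image", 3)      # do not load images
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)  # session-only cookies
    service = Service(config.get('TOR', 'geckodriver_path'))
    return webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)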
@@ -146,12 +99,13 @@ def createFFDriver(): service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - + driver.maximize_window() return driver +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() @@ -163,7 +117,27 @@ def getAccess(): return 'down' -# Saves the crawled html page +def login(driver): + input("Press ENTER when CAPTCHA is complete and login page has loaded\n") + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + # Username here + usernameBox.send_keys('CashCarti') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') + # Password here + passwordBox.send_keys('Mahogany') + # Clicking the login button + login_button = driver.find_element(By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button') + login_button.click() + + input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/div[1]/nav/div/a/img'))) + + def savePage(driver, page, url): cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) @@ -172,7 +146,6 @@ def savePage(driver, page, url): return -# Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE @@ -185,7 +158,11 @@ def getFullPathName(url): return fullPath -# Creates the file name from passed URL +def getMKTName() -> str: + name = 'Quest' + return name + + def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) @@ -198,23 +175,24 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # # Digital - Services - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/8ae67900-22ed-11ec-a710-31f963ce8d35') - # # Digital - Software - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77') - # # Digital - Tutorials - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/9d1592b0-22ed-11ec-b82d-c3d2878a8716') - # # Digital - Malware - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/a35bae90-22ed-11ec-ad2e-410f5a5339b5') - # # Digital - Hacking - # links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/b4252cf0-22ed-11ec-8032-751549438ed5') - # Digital - Exploits + ## Software + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77') + ## Tutorial + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/9d1592b0-22ed-11ec-b82d-c3d2878a8716') + ## Malware + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/a35bae90-22ed-11ec-ad2e-410f5a5339b5') + ## Hacking + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/b4252cf0-22ed-11ec-8032-751549438ed5') + ## Exploits 
links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/c0c3ac60-22ed-11ec-9e97-41cd1912fdee') + ## Carding + links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/cbe06b00-22ec-11ec-ab3a-816857220dec') return links def crawlForum(driver): + print("Crawling the Quest market") linksToCrawl = getInterestedLinks() @@ -236,6 +214,7 @@ def crawlForum(driver): savePage(driver, html, link) list = productPages(html) + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -245,18 +224,16 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - nav = driver.find_element(by=By.XPATH, value='/html/body/div[6]/nav') - li = nav.find_elements(By.TAG_NAME, 'li') - a = li[-1].find_element(By.TAG_NAME, 'a') - link = a.get_attribute('href') + link_elem = driver.find_element(by=By.CSS_SELECTOR, value='a.page-link[rel="next"]') + link = link_elem.get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -268,24 +245,23 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Quest market done sucessfully. Press ENTER to continue\n") + print("Crawling the Quest market done.") -# Returns 'True' if the link is Topic link +# Returns 'True' if the link is Topic link, may need to change for every website def isDescriptionLink(url): if 'product' in url: return True return False -# Returns True if the link is a listingPage link +# Returns True if the link is a listingPage link, may need to change for every website def isListingLink(url): if 'category' in url: return True return False -# calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") return quest_links_parser(soup) @@ -293,4 +269,3 @@ def productPages(html): def crawler(): startCrawling() - # print("Crawling and Parsing BestCardingWorld .... 
DONE!") diff --git a/MarketPlaces/Quest/parser.py b/MarketPlaces/Quest/parser.py index 6761ed9..d9c96e3 100644 --- a/MarketPlaces/Quest/parser.py +++ b/MarketPlaces/Quest/parser.py @@ -7,9 +7,11 @@ from MarketPlaces.Utilities.utilities import * from bs4 import BeautifulSoup -# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) +# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of description page +# return: 'row' that contains a variety of lists that each hold info on the description page def quest_description_parser(soup): - # Fields to be parsed vendor = "-1" # 0 *Vendor_Name @@ -31,111 +33,50 @@ def quest_description_parser(soup): left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo - - row = soup.find_all('div', {'class': "row"}) + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name - name = row[1].text - name = name.replace('\n', ' ') - name = name.replace(",", "") - name = name.strip() - - small = row[3].find_all('small') - - # Finding Vendor - vendor = small[0].text - vendor = vendor.replace("Vendor:", "") - vendor = vendor.replace(",", "") - vendor = vendor.strip() - - # Finding Vendor Rating - full_stars = small[2].find_all('i', {'class': "fas fa-star"}) - half_star = small[2].find('i', {'class': "fas fa-star-half-alt"}) - rating_vendor = len(full_stars) + (0.5 if half_star is not None else 0) - - # Finding Successful Transactions - success = small[4].text - success = success.replace("Total Sales:", "") - success = success.strip() - - small = row[2].find('p', {'class': "text-left"}).find_all('small') - - # Finding Prices - USD = small[1].text - USD = USD.replace("$", "") - USD = USD.strip() - - shipping_info = row[2].find('p', {'class': "text-left"}).find('span').text.strip() - if "Digital" not in shipping_info: - shipping_info = shipping_info.split(" ") - - # Finding Shipment Information (Origin) - shipFrom = shipping_info[0].strip() - - # Finding Shipment Information (Destination) - shipTo = shipping_info[1].strip() - - textarea = row[2].find_all('textarea') - - # Finding the Product description - describe = textarea[0].text - describe = describe.replace("\n", " ") - describe = describe.replace("\r", " ") - describe = describe.strip() - - ''' - # Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1" - ''' - - # Searching for CVE and MS categories - cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) - if cve: - CVE = " " - for idx in cve: - CVE += (idx) - CVE += " " - CVE = CVE.replace(',', ' ') - CVE = CVE.replace('\n', '') - ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}')) - if ms: - MS = " " - for im in ms: - MS += (im) - MS += " " - MS = MS.replace(',', ' ') - MS = MS.replace('\n', '') + name = soup.find('div', class_='card-header bg-dark text-white rounded-0 text-center').text.strip() + + # USD Price + USD = soup.find('small', text='Product Price:').find_next('small').text.strip().replace('$', '') + + # Product 
+    # Product Description
+    describe = soup.find('textarea').text.strip()
+
+    # Finding Product Image
+    image = soup.find('img', {'class': 'img-fluid'})
+    image = image.get('src').split('base64,')[-1]
+
+    # Finding Vendor Image
+    vendor_image = soup.select_one('.card-body.bg-mgray.css-selector.shadow img')
+    vendor_image = vendor_image.get('src').split('base64,')[-1]
 
     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
-           BTC, USD, EURO, sold, left, shipFrom, shipTo)
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
 
     # Sending the results
     return row
 
 
-# This is the method to parse the Listing Pages
+# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
+# stores info it needs in different lists, these lists are returned after being organized
+# @param: soup object looking at html page of listing page
+# return: 'row' that contains a variety of lists that each hold info on the listing page
 def quest_listing_parser(soup):
 
     # Fields to be parsed
-    nm = 0                             # *Total_Products (Should be Integer)
-    mktName = "Quest"                  # 0 *Marketplace_Name
+    nm = 0                              # *Total_Products (Should be Integer)
+    mktName = "quest"                   # 0 *Marketplace_Name
     vendor = []                         # 1 *Vendor y
     rating_vendor = []                  # 2 Vendor_Rating
     success = []                        # 3 Vendor_Successful_Transactions
     name = []                           # 4 *Product_Name y
-    CVE = []                            # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
-    MS = []                             # 6 Product_MS_Classification (Microsoft Security)
+    CVE = []                            # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) don't worry about this
+    MS = []                             # 6 Product_MS_Classification (Microsoft Security) don't worry about this
     category = []                       # 7 Product_Category y
     describe = []                       # 8 Product_Description
     views = []                          # 9 Product_Number_Of_Views
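
Note: quest_description_parser now keeps only the portion of the img src after 'base64,', i.e. the raw base64 payload of an inline data URI. The sketch below shows how that stored string could be turned back into image bytes; the sample value is a placeholder, and the only assumption carried over from the patch is that Quest embeds images as data:...;base64,... URIs, which is what the split('base64,') call implies.

# decode_embedded_image.py -- illustrative sketch only, not part of this patch
import base64

def decode_image_field(src_attr):
    """Strip any data-URI prefix and decode the base64 payload into raw bytes."""
    payload = src_attr.split('base64,')[-1]      # same transformation the parser applies
    return base64.b64decode(payload)

if __name__ == '__main__':
    # Placeholder data URI (a 1x1 transparent GIF), standing in for a scraped product image
    sample = 'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7'
    print(len(decode_image_field(sample)), 'bytes decoded')
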
@@ -146,87 +87,74 @@ def quest_listing_parser(soup):
     USD = []                            # 14 Product_USD_SellingPrice y
     EURO = []                           # 15 Product_EURO_SellingPrice
     sold = []                           # 16 Product_QuantitySold
-    qLeft =[]                          # 17 Product_QuantityLeft
+    qLeft = []                          # 17 Product_QuantityLeft
     shipFrom = []                       # 18 Product_ShippedFrom
     shipTo = []                         # 19 Product_ShippedTo
-    href = []                           # 20 Product_Links
-
-    # Finding category of listing page
-    cat = soup.find('span', {'class': "btn btn-sm btn-outline-mgray active border-info"}).text
-    cat = cat.replace("Digital -", "")
-    cat = cat.strip()
-
-    listing = soup.find_all('div', {"class": "col-md-2 my-md-0 col-12"})
+    image = []                          # 20 Product_Image
+    image_vendor = []                   # 21 Vendor_Image
+    href = []                           # 22 Product_Links
+
+    # Extract all product listings
+    listing = soup.findAll('div', class_='col-md-2 my-md-0 col-12')
 
     # Populating the Number of Products
     nm = len(listing)
 
     for a in listing:
-        bae = a.find_all('a', href=True)
-
-        # Adding the category
-        category.append(cat)
-
-        # Adding the url to the list of urls
-        link = bae[0].get('href')
-        link = cleanLink(link)
-        href.append(link)
-
-        # Finding the Vendor
-        vendor_name = bae[2].text
-        vendor_name = vendor_name.replace(",", "")
-        vendor_name = vendor_name.strip()
-        vendor.append(vendor_name)
-
-        # Finding the Product
-        product = bae[1].find('img').get('alt')
-        product = product.replace('\n', ' ')
-        product = product.replace(",", "")
-        product = product.strip()
-        name.append(product)
-
-        # Searching for CVE and MS categories
-        cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-        if not cve:
-            cveValue="-1"
-        else:
-            cee = " "
-            for idx in cve:
-                cee += (idx)
-                cee += " "
-                cee = cee.replace(',', ' ')
-                cee = cee.replace('\n', '')
-            cveValue=cee
-        CVE.append(cveValue)
-
-        ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
-        if not ms:
-            MSValue="-1"
-        else:
-            me = " "
-            for im in ms:
-                me += (im)
-                me += " "
-                me = me.replace(',', ' ')
-                me = me.replace('\n', '')
-            MSValue=me
-        MS.append(MSValue)
+
+        # Extracting Product URL & Name
+        product_link_tags = a.find_all('a', class_='badge-info')
+        if product_link_tags:
+            # Using the first tag as default
+            product_link_tag = product_link_tags[0]
+            href.append(product_link_tag['href'])
+            name.append(product_link_tag.text.strip())
+
+        # Extracting Product Image
+        img_tag = a.find('img')
+        if img_tag:
+            image_data = img_tag['src'].split('base64,')[-1]
+            image.append(image_data)
+
+        # Extracting Vendor Name
+        vendor_tag = a.find('a', class_='badge-dark')
+        if vendor_tag:
+            vendor.append(vendor_tag.text.replace('👤', '').strip())
+
+        # Extracting Product Price in USD
+        price_tag = a.find('a', class_='text')
+        if price_tag:
+            USD.append(price_tag.text.replace("$", "").strip())
+
+        category_tag = soup.find('span', class_='btn btn-sm btn-outline-mgray active border-info')
+        if category_tag:
+            category.append(category_tag.text.strip())
+
     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
-                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
 
 
+# called by the crawler to get description links on a listing page
+# @param: beautifulsoup object that is using the correct html page (listing page)
+# return: list of description links from a listing page
 def quest_links_parser(soup):
+    # Returning all product links
 
-    # Returning all links that should be visited by the Crawler
     href = []
-    listing = soup.findAll('div', {"class": "col-md-2 my-md-0 col-12"})
+    # Locate all divs with class 'row'
+    row_divs = soup.findAll('div', class_='row')
+
+    for row_div in row_divs:
+        # Locate all product divs within the current 'row' div
+        product_divs = row_div.findAll('div', class_='col-md-2 my-md-0 col-12')
 
-    for div in listing:
+        for product_div in product_divs:
+            # Locate the anchor tag containing the product link within each product div
+            product_link_tag = product_div.find('a', class_='badge-info')
 
-        link = div.find('a')["href"]
-        href.append(link)
+            if product_link_tag and product_link_tag.has_attr('href'):
+                href.append(product_link_tag['href'])
 
     return href
\ No newline at end of file
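
Note: because the rewritten listing and link parsers key off specific class names (col-md-2 my-md-0 col-12, badge-info, badge-dark), a quick smoke test over an already-saved listing page can catch markup drift before a full crawl. A possible sketch follows; it assumes the script is run from the repository root with the project's dependencies installed, and the saved-page path is hypothetical.

# smoke_test_quest_links.py -- illustrative sketch only, not part of this patch
from bs4 import BeautifulSoup

from MarketPlaces.Quest.parser import quest_links_parser

if __name__ == '__main__':
    # Hypothetical path to a listing page previously written out by the crawler
    with open('quest_listing_example.html', 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    links = quest_links_parser(soup)
    print(len(links), 'product links found')
    for href in links[:5]:
        print(' ', href)
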