diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py
index e4f6c5d..dfdec49 100644
--- a/Forums/DB_Connection/db_connection.py
+++ b/Forums/DB_Connection/db_connection.py
@@ -3,7 +3,7 @@ __author__ = 'DarkWeb'
 
 import psycopg2
 import traceback
 from Forums.Utilities.utilities import *
-
+from dateutil.relativedelta import relativedelta, FR
 
 def connectDataBase():
@@ -484,6 +484,28 @@ def create_posts(cur, row, forumId, topicId):
                         'dateinserted_post': row[8],
                         'postId': postId})
 
+def create_status(cur, forumId, date, listings, descriptions, status):
+
+    date = datetime.strptime(date, "%m%d%Y")
+
+    # getting last Friday as a reference date
+    date_reference = date + relativedelta(weekday=FR(-1))
+
+    # checking if status already exists
+    sql = "select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
+    cur.execute(sql, {'forum_id': forumId, 'date_inserted': date})
+
+    recset = cur.fetchall()
+    if recset:
+        sql = "Update forums_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \
+              "where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s"
+        recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'forum_id': forumId, 'date_inserted': date}
+    else:
+        sql = "Insert into forums_status (forum_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)"
+        recset = [forumId, date, listings, descriptions, status, date_reference]
+
+    cur.execute(sql, recset)
+
 def create_database(cur, con):
 
     try:
@@ -496,6 +518,12 @@ def create_database(cur, con):
         sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)"
         cur.execute(sql)
 
+        sql = "Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, " \
+              "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL, " \
+              "CONSTRAINT forums_log_pkey PRIMARY KEY (forum_id, date_inserted), " \
+              "CONSTRAINT forums_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))"
+        cur.execute(sql)
+
         sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \
               "255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \
               "interest_user character varying(5000) null, signature_user character varying(1000) null, " \
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 91b662f..b86b5c6 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -341,6 +341,16 @@ def new_parse(forum, url, createLog):
                 # move listing files of completed folder
                 move_file(listingFile, createLog, logFile)
 
+    # registering the current forum status (up/down) and the number of scraped pages in the database
+    forumId = verifyForum(cur, forum)
+    if (forumId > 0):
+
+        readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html'))
+        readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html'))
+
+        create_status(cur, forumId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0')
+        con.commit()
+
     if createLog:
         logFile.close()
diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py
index b257c40..6f7e45a 100644
--- 
a/MarketPlaces/BlackPyramid/crawler_selenium.py +++ b/MarketPlaces/BlackPyramid/crawler_selenium.py @@ -1,9 +1,7 @@ -__author__ = 'Helium' +__author__ = 'cern' ''' -BlackPyramid Forum Crawler (Selenium) -cannot use bc no links are used -kept in case issues are solved +BlackPyramid Market Crawler (Selenium) ''' from selenium import webdriver @@ -11,64 +9,101 @@ from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.service import Service -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By - +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver import ActionChains +import selenium.webdriver.support.ui as uiClasses from PIL import Image + import urllib.parse as urlparse import os, re, time -from datetime import date import subprocess import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse -from MarketPlaces.BlackPyramid.parser import blackpyramid_links_parser +from MarketPlaces.BlackPyramid.parser import BlackPyramid_links_parser from MarketPlaces.Utilities.utilities import cleanHTML +import traceback + +config = configparser.ConfigParser() +config.read('../../setup.ini') counter = 1 -baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/' +baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1' -# Opens Tor Browser, crawls the website, then parses, then closes tor -#acts like the main method for the crawler, another function at the end of this code calls this function later +# Opens Tor Browser, crawls the website def startCrawling(): - mktName = getMKTName() + # Opening tor beforehand gives "Tor exited during startup error" + # opentor() + + marketName = getMarketName() + driver = getAccess() + # Wait for website to load + input("Press ENTER when website has loaded") + if driver != 'down': try: login(driver) crawlForum(driver) except Exception as e: print(driver.current_url, e) - closeDriver(driver) + closetor(driver) + + new_parse(marketName, baseURL, False) - new_parse(mktName, baseURL, True) + +# Opens Tor Browser +def opentor(): + global pid + print("Connecting Tor...") + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) + pid = pro.pid + time.sleep(7.5) + input('Tor Connected. 
Press ENTER to continue\n') + return + + +# Login +def login(driver): + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value="//input[@name='username_login']") + # Username here + usernameBox.send_keys('ChipotleSteakBurrito') + passwordBox = driver.find_element(by=By.XPATH, value="//input[@name='password_login']") + # Password here + passwordBox.send_keys('BlackBeans') + + input("Press ENTER when CAPTCHA is completed\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + #WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + # (By.XPATH, '/html/body/div[2]/div[3]/div[3]/div[1]/div[3]/nav/ul/li[10]/a'))) # Returns the name of the website -#return: name of site in string type -def getMKTName(): +def getMarketName(): name = 'BlackPyramid' return name -# Return the base link of the website -#return: url of base site in string type +# Return the link of the website def getFixedURL(): - url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/' + url = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/?login=1' + return url # Closes Tor Browser -#@param: current selenium driver -def closeDriver(driver): +def closetor(driver): # global pid # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") print('Closing Tor...') - driver.close() + driver.quit() time.sleep(3) return @@ -76,8 +111,6 @@ def closeDriver(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): - from MarketPlaces.Initialization.markets_mining import config - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -106,16 +139,13 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - driver.maximize_window() - return driver -#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' -#return: return the selenium driver or string 'down' def getAccess(): url = getFixedURL() driver = createFFDriver() + input('Tor Connected. 
Press ENTER to continue\n') try: driver.get(url) return driver @@ -124,33 +154,9 @@ def getAccess(): return 'down' -# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha -# then allows for manual solving of captcha in the terminal -#@param: current selenium web driver -def login(driver): - # wait for login page - login_link = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div/main/div/div/div/div[2]/div/div/div/section[1]/input[1]') - login_link.click() # open tab with url - - # entering username and password into input boxes - usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') - # Username here - usernameBox.send_keys('ChipotleSteakBurrito') - passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') - # Password here - passwordBox.send_keys('BlackBeans') - - input("Press ENTER when CAPTCHA is completed\n") - - # wait for listing page show up (This Xpath may need to change based on different seed url) - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div[2]/form/nav/nav/ul/li[2]/div/a/span[1]'))) - - - -# Saves the crawled html page, makes the directory path for html pages if not made -def savePage(driver, page, url): - cleanPage = cleanHTML(driver, page) +# Saves the crawled html page +def savePage(page, url): + cleanPage = cleanHTML(page) filePath = getFullPathName(url) os.makedirs(os.path.dirname(filePath), exist_ok=True) open(filePath, 'wb').write(cleanPage.encode('utf-8')) @@ -158,100 +164,144 @@ def savePage(driver, page, url): # Gets the full path of the page to be saved along with its appropriate file name -#@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") + global counter + from MarketPlaces.Initialization.markets_mining import CURRENT_DATE fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') + if (os.path.exists(r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html')): + fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + "(" + str(counter) + ")" + '.html' + else: + fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' else: - fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + if (os.path.exists(r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html')): + fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + "(" + str(counter) + ")" + '.html' + else: + fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' return fullPath -# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned -#@param: raw url as crawler crawls through every site +# Creates the file name from passed URL def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) - if (name == ''): + if name == '': name = str(counter) counter = counter + 1 return name +def goToPage(driver, page): + # hover over digital -> hacking tools + a = ActionChains(driver) + + # hover + digitalB = 
driver.find_element(By.XPATH, "//li[@class='dig940']/div/a") + time.sleep(1) + a.move_to_element(digitalB).perform() + print(digitalB) + + # delay for website to register hover + time.sleep(10) + + # click + xpath = "//input[@name='" + page + "']" + link = driver.find_element(By.XPATH, xpath) + time.sleep(1) + a.move_to_element(link).click().perform() + print(link) + + # wait for website to load + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '/html/body/center/div[4]/div[1]/div[3]/article/div[1]/h1/a'))) + -# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list -#in this example, there are a couple of categories some threads fall under such as -# Guides and Tutorials, Digital Products, and Software and Malware -#as you can see they are categories of products def getInterestedLinks(): - links = [] - - # Hacking Guides - links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Exploits - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # botnets/malware - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # fraud software - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Other Tools - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') - # # Services - # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') + # h11 -> Hacking Tools + # g3 -> Guides, Hacking + # se3 -> Services, Hacking + # f6 -> Fraud software + links = ['h11','g3','se3','f6'] return links -# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through -#topic and description pages are crawled through here, where both types of pages are saved -#@param: selenium driver def crawlForum(driver): print("Crawling the BlackPyramid market") - linksToCrawl = getInterestedLinks() + #linksToCrawl = getInterestedLinks() + #pages = ["Hacking Tools"] + pages = getInterestedLinks() + #visited = set(linksToCrawl) + initialTime = time.time() i = 0 - while i < len(linksToCrawl): - link = linksToCrawl[i] - print('Crawling :', link) + count = 0 + + for listing in pages: + #link = linksToCrawl[i] + print('Crawling :', listing) + try: - has_next_page = True - count = 0 + try: + goToPage(driver, listing) + except: + print("Try block 1") + driver.refresh() + time.sleep(5) + html = driver.page_source + savePage(html, listing) + has_next_page = True + currentPage = 1 + numberOfPages = 1 while has_next_page: - try: - clicker = driver.find_element(by=By.XPATH, value='/html/body/div[2]/form/nav/nav/ul/li[2]/div/a') - clicker.click() # open tab with url - driver.get(link) - except: - driver.refresh() - html = driver.page_source - savePage(driver, html, link) - + # get a list of urls for each listing list = productPages(html) for item in list: + itemURL = urlparse.urljoin(baseURL, str(item)) try: driver.get(itemURL) except: + print("Try block 2") driver.refresh() - savePage(driver, driver.page_source, item) - driver.back() + savePage(driver.page_source, item) + # can't use the back button in dark pyramid + # driver.back() # comment out - break + # break # comment out - if count == 1: - break + # if count == 1: + # count = 0 + # break + # go to next page of market try: - clicker = 
driver.find_element(by=By.XPATH, value= - '/html/body/center/div[4]/div/div[3]/div[23]/div[2]/input[1]') - if clicker == "": + goToPage(driver, listing) + nav = driver.find_element(by=By.XPATH, value="//input[@name='next_page']") + + if not nav.is_enabled(): + raise NoSuchElementException + try: + # select next page + pgnum = uiClasses.Select(driver.find_element(by=By.XPATH, value="//select[@name='pageination']")) + print("pg options:", pgnum.options) + pgnum.select_by_index(currentPage) + numberOfPages = len(pgnum.options) + + # click button + pgbutton = driver.find_element(by=By.XPATH, value="//input[@value='go to page']") + pgbutton.click() + except Exception as e: + print(e) + raise NoSuchElementException + time.sleep(10) + html = driver.page_source + savePage(html, listing) + currentPage += 1 + if currentPage > numberOfPages: raise NoSuchElementException count += 1 @@ -259,39 +309,39 @@ def crawlForum(driver): has_next_page = False except Exception as e: - print(link, e) + traceback.print_exc() + print(listing, e) i += 1 - print("Crawling the BlackPyramid market done.") + # finalTime = time.time() + # print finalTime - initialTime + + input("Crawling Dark Pyramid done successfully. Press ENTER to continue\n") -# Returns 'True' if the link is a description link -#@param: url of any url crawled -#return: true if is a description page, false if not +# Returns 'True' if the link is Topic link def isDescriptionLink(url): - if 'products' in url: + if 'product' in url: return True return False # Returns True if the link is a listingPage link -#@param: url of any url crawled -#return: true if is a Listing page, false if not def isListingLink(url): - if 'search' in url: + if 'category=' in url: return True return False -# calling the parser to define the links, the html is the url of a link from the list of interested link list -#@param: link from interested link list ie. getInterestingLinks() -#return: list of description links that should be crawled through +# calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - return blackpyramid_links_parser(soup) - + return BlackPyramid_links_parser(soup) def crawler(): startCrawling() - # print("Crawling and Parsing BlackPyramid .... DONE!") + # print("Crawling and Parsing BestCardingWorld .... 
DONE!") + +if __name__ == '__main__': + startCrawling() \ No newline at end of file diff --git a/MarketPlaces/BlackPyramid/parser.py b/MarketPlaces/BlackPyramid/parser.py index 743466a..4b45ee7 100644 --- a/MarketPlaces/BlackPyramid/parser.py +++ b/MarketPlaces/BlackPyramid/parser.py @@ -1,4 +1,4 @@ -__author__ = 'Helium' +__author__ = 'cern' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -11,7 +11,7 @@ from bs4 import BeautifulSoup #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of description page #return: 'row' that contains a variety of lists that each hold info on the description page -def darkfox_description_parser(soup): +def BlackPyramid_description_parser(soup): # Fields to be parsed @@ -40,82 +40,71 @@ def darkfox_description_parser(soup): EURO = "-1" # 22 Product_EURO_SellingPrice # Finding Product Name - name = soup.find('h1').text + name = soup.find('div', {'class': 'panel39002'}).find('span').next_sibling name = name.replace('\n', ' ') name = name.replace(",", "") name = name.strip() + # product description + describe = soup.findAll('div', {'class': 'fer048953'})[1].text + describe = describe.replace('\n', ' ') + describe = describe.replace(",", "") + describe = describe.strip() + # Finding Vendor - vendor = soup.find('h3').find('a').text.strip() + vendor = soup.find('div', {'class': 'bold03905 vstat364'}).text + vendor = vendor.split(" ") + vendor = vendor[2][:-1] + vendor = vendor.replace('\n', ' ') + vendor = vendor.replace(",", "") + vendor = vendor.strip() # Finding Vendor Rating - rating = soup.find('span', {'class': "tag is-dark"}).text.strip() + rating_span = soup.find('span', {'class': 'to3098503t'}).find_next_sibling('span') + rating_num = rating_span.find('b').text + if rating_num != 'N/A': + rating = rating_num[0:3] # Finding Successful Transactions - success = soup.find('h3').text - success = success.replace("Vendor: ", "") - success = success.replace(vendor, "") - success = success.replace("(", "") - success = success.replace(")", "") + success_container = soup.find('ul', {'class': 'ul3o00953'}).findAll('li')[1] + success = success_container.find('div').text + success = success.replace('"', '') + success = success.replace("\n", " ") + success = success.replace(",", "") success = success.strip() - bae = soup.find('div', {'class': "box"}).find_all('ul') - # Finding Prices - USD = bae[1].find('strong').text.strip() - - li = bae[2].find_all('li') + USD_text = soup.find('li', {'class': 'vul2994 vghul995'}).find('div').text + USD = USD_text.split(',')[1] + USD = USD.replace('\n', ' ') + USD = USD.replace(",", "") + USD = USD.strip() - # Finding Escrow - escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip() - - # Finding the Product Category - category = li[1].find('span', {'class': "tag is-dark"}).text.strip() - - # Finding the Product Quantity Available - left = li[3].find('span', {'class': "tag is-dark"}).text.strip() + container = soup.find('ul', {'class': 'bic03095'}) # Finding Number Sold - sold = li[4].find('span', {'class': "tag is-dark"}).text.strip() - - li = bae[3].find_all('li') - - # Finding Shipment Information (Origin) - if "Ships from:" in li[-2].text: - shipFrom = li[-2].text - shipFrom = shipFrom.replace("Ships from: ", "") - # shipFrom = shipFrom.replace(",", "") - shipFrom = shipFrom.strip() - - # Finding Shipment Information (Destination) - shipTo = li[-1].find('div', {'title': 
"List of countries is scrollable"}).text - shipTo = shipTo.replace("Ships to: ", "") - shipTo = shipTo.strip() - if "certain countries" in shipTo: - countries = "" - tags = li[-1].find_all('span', {'class': "tag"}) - for tag in tags: - country = tag.text.strip() - countries += country + ", " - shipTo = countries.strip(", ") - - # Finding the Product description - describe = soup.find('div', {'class': "pre-line"}).text - describe = describe.replace("\n", " ") - describe = describe.strip() + sold_container = container.find('li') + sold_div = sold_container.findAll('div')[2] + sold = sold_div.find('b').next_sibling + sold = sold.replace('"', '') + sold = sold.replace("\n", " ") + sold = sold.replace(",", "") + sold = sold.strip() - '''# Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1"''' + # Finding the Product Quantity Available + left_container = container.find('li') + left_div = left_container.findAll('div')[3] + left = left_div.find('b').next_sibling + left = left.replace('"', '') + left = left.replace("\n", " ") + left = left.replace(",", "") + left = left.strip() + + # Finding number of reviews + positive = soup.find('span', {'class': 'ar04999324'}).text + neutral = soup.find('span', {'class': 'ti9400005 can39953'}).text + negative = soup.find('span', {'class': 'ti9400005 ti90088 can39953'}).text + review = int(positive) + int(neutral) + int(negative) # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -147,11 +136,11 @@ def darkfox_description_parser(soup): #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of listing page #return: 'row' that contains a variety of lists that each hold info on the listing page -def darkfox_listing_parser(soup): +def BlackPyramid_listing_parser(soup): # Fields to be parsed nm = 0 # Total_Products (Should be Integer) - mktName = "DarkFox" # 0 Marketplace_Name + mktName = "BlackPyramid" # 0 Marketplace_Name name = [] # 1 Product_Name CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) MS = [] # 3 Product_MS_Classification (Microsoft Security) @@ -169,80 +158,82 @@ def darkfox_listing_parser(soup): qLeft =[] # 15 Product_QuantityLeft shipFrom = [] # 16 Product_ShippedFrom shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions + rating_item = [] # 18 Product_Rating + vendor = [] # 19 Vendor + rating = [] # 20 Vendor_Rating + success = [] # 21 Vendor_Successful_Transactions href = [] # 23 Product_Links (Urls) - listing = soup.findAll('div', {"class": "card"}) + listing = soup.findAll('article', {"class": "product"}) + + # Some listing pages have an additional article section which is blank + if not listing[-1].findAll('a', href=True): + listing = listing[:-1] + # Populating the Number of Products nm = len(listing) - for a in listing: - bae = a.findAll('a', href=True) + for card in listing: + bae = card.findAll('a', href=True) # Adding the url to the list of urls - link = bae[0].get('href') + link = bae[2].get('href') link = cleanLink(link) href.append(link) # Finding the Product - product = bae[1].find('p').text + 
product = bae[3].text
         product = product.replace('\n', ' ')
         product = product.replace(",", "")
         product = product.replace("...", "")
         product = product.strip()
         name.append(product)
 
-        bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
-
-        if len(bae) >= 5:
-            # Finding Prices
-            price = bae[0].text
-            ud = price.replace(" USD", " ")
-            # u = ud.replace("$","")
-            u = ud.replace(",", "")
-            u = u.strip()
-            USD.append(u)
-            # bc = (prc[1]).strip(' BTC')
-            # BTC.append(bc)
-
-            # Finding the Vendor
-            vendor_name = bae[1].find('a').text
-            vendor_name = vendor_name.replace(",", "")
-            vendor_name = vendor_name.strip()
-            vendor.append(vendor_name)
-
-            # Finding the Category
-            cat = bae[2].find('small').text
-            cat = cat.replace("Category: ", "")
-            cat = cat.replace(",", "")
-            cat = cat.strip()
-            category.append(cat)
-
-            # Finding Number Sold and Quantity Left
-            num = bae[3].text
-            num = num.replace("Sold: ", "")
-            num = num.strip()
-            sold.append(num)
-
-            quant = bae[4].find('small').text
-            quant = quant.replace("In stock: ", "")
-            quant = quant.strip()
-            qLeft.append(quant)
-
-            # Finding Successful Transactions
-            freq = bae[1].text
-            freq = freq.replace(vendor_name, "")
-            freq = re.sub(r'Vendor Level \d+', "", freq)
-            freq = freq.replace("(", "")
-            freq = freq.replace(")", "")
-            freq = freq.strip()
-            success.append(freq)
+        # Finding description
+        # 'recursive=False' only searches direct children
+        desc = card.findChildren('div', recursive=False)[0]
+        desc = desc.findAll('div', recursive=False)[3].text
+        desc = desc.replace('\n', ' ')
+        desc = desc.replace(",", "")
+        desc = desc.strip()
+        describe.append(desc)
+
+        # Finding Vendor Name
+        vendor_name = bae[4].find('span').text
+        vendor_name = vendor_name.split(' ')[1]
+        vendor_name = vendor_name.replace('\n', ' ')
+        vendor_name = vendor_name.replace(",", "")
+        vendor_name = vendor_name.strip()
+        vendor.append(vendor_name)
+
+        # Finding the Category
+        cat = card.findAll('div', recursive=False)[0].findAll('div', recursive=False)[1].find('span').text
+        cat = cat.replace("\n", "")
+        cat = cat.replace(",", "")
+        cat = cat.strip()
+        category.append(cat)
+
+        bae = card.findAll('div', recursive=False)[1].findAll('div', recursive=False)[1]
+
+        # Finding amount left
+        left = bae.findAll('div', recursive=False)[1].text
+        left = left.replace("x", "")
+        left = left.replace('\n', ' ')
+        left = left.replace(",", "")
+        left = left.strip()
+        qLeft.append(left)
+
+        # Finding amount sold
+        qsold = bae.findAll('div', recursive=False)[2].text
+        qsold = qsold.replace('\n', ' ')
+        qsold = qsold.replace("x", "")
+        qsold = qsold.replace(",", "")
+        qsold = qsold.strip()
+        sold.append(qsold)
 
         # Searching for CVE and MS categories
-        cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+        cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
         if not cve:
             cveValue="-1"
         else:
@@ -255,7 +246,7 @@
             cveValue=cee
         CVE.append(cveValue)
 
-        ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
+        ms = card.findAll(text=re.compile('MS\d{2}-\d{3}'))
         if not ms:
             MSValue="-1"
         else:
@@ -269,23 +260,26 @@
         MS.append(MSValue)
 
     # Populate the final variable (this should be a list with all fields scraped)
-    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
-                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
+    return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views, reviews, rating_item,
+
addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) #called by the crawler to get description links on a listing page #@param: beautifulsoup object that is using the correct html page (listing page) #return: list of description links from a listing page -def blackpyramid_links_parser(soup): +def BlackPyramid_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] listing = soup.findAll('article', {"class": "product"}) - for div in listing: + for item in listing: - link = div.find('a', {"class": "ah39063"})['href'] - href.append(link) + container = item.find('a', {"class": "ah39063"}) + + if container: + link = item.find('a', {"class": "ah39063"})['href'] + href.append(link) - return href \ No newline at end of file + return href diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index 8769869..2f3341a 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -4,7 +4,7 @@ import psycopg2 import traceback import configparser from MarketPlaces.Utilities.utilities import * - +from dateutil.relativedelta import relativedelta, FR def connectDataBase(): @@ -273,6 +273,8 @@ def create_items(cur, row, marketId, vendorId): if newItem: + # decode_decrypt_image_in_base64(row[20]) + sql = "Insert into items (item_id, market_id, vendor_id, name_item, description_item, cve_item, ms_item, category_item, " \ "views_item, reviews_item, rating_item, dateadded_item, btc_item, usd_item, euro_item, quantitysold_item, " \ "quantityleft_item, shippedfrom_item, shippedto_item, lastseen_item, image_item, href_item, dateinserted_item, " \ @@ -312,7 +314,7 @@ def create_items(cur, row, marketId, vendorId): recset = cur.fetchall() - # decode_decrypt_image_in_base64(recset[0][20]) + # decode_decrypt_image_in_base64(recset[0]['image_item']) if (str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or str(recset[0]['cve_item']) != str(row[6] if row[6] != '-1' else None) or @@ -401,6 +403,27 @@ def create_items(cur, row, marketId, vendorId): return itemId +def create_status(cur, marketId, date, listings, descriptions, status): + + date = datetime.strptime(date, "%m%d%Y") + + # getting last Fridays a reference date + date_reference = date + relativedelta(weekday=FR(-1)) + + # checking if status already exists + sql = "select * from marketplaces_status where market_id = %(market_id)s and date_inserted = %(date_inserted)s" + cur.execute(sql, {'market_id': marketId, 'date_inserted': date}) + + recset = cur.fetchall() + if recset: + sql = "Update marketplaces_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s " \ + "where market_id = %(market_id)s and date_inserted = %(date_inserted)s" + recset = {'listings': listings, 'descriptions': descriptions, 'status': status, 'date_reference': date_reference, 'market_id': marketId, 'date_inserted': date} + else: + sql = "Insert into marketplaces_status (market_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)" + recset = [marketId, date, listings, descriptions, status, date_reference] + + cur.execute(sql, recset) def create_database(cur, con): try: @@ -413,6 +436,12 @@ def create_database(cur, con): sql = "create unique index unique_market ON marketplaces USING btree (name_market ASC NULLS LAST)" cur.execute(sql) + sql = "Create table marketplaces_status (market_id integer NOT NULL, date_inserted 
date NOT NULL, " \
+              "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL, " \
+              "CONSTRAINT marketplaces_log_pkey PRIMARY KEY (market_id, date_inserted), " \
+              "CONSTRAINT marketplaces_fk FOREIGN KEY (market_id) REFERENCES marketplaces (market_id))"
+        cur.execute(sql)
+
         sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \
               "varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \
               "null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \
diff --git a/MarketPlaces/DarkBazar/crawler_selenium.py b/MarketPlaces/DarkBazar/crawler_selenium.py
index fdfb640..dac91b0 100644
--- a/MarketPlaces/DarkBazar/crawler_selenium.py
+++ b/MarketPlaces/DarkBazar/crawler_selenium.py
@@ -216,12 +216,12 @@ def crawlForum(driver):
                     savePage(driver, driver.page_source, item)
                     driver.back()
 
-                # # comment out
-                # break
-                #
-                # # comment out
-                # if count == 1:
-                #     break
+                # comment out
+                break
+
+                # comment out
+                if count == 1:
+                    break
 
                 try:
                     link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
@@ -236,7 +236,7 @@
             print(link, e)
             i += 1
 
-    print("Crawling the DarkBazar market done.")
+    print("Crawling the DarkBazar market done.")
 
 
 # Returns 'True' if the link is Topic link, may need to change for every website
diff --git a/MarketPlaces/DarkBazar/parser.py b/MarketPlaces/DarkBazar/parser.py
index 9386d18..3d56e92 100644
--- a/MarketPlaces/DarkBazar/parser.py
+++ b/MarketPlaces/DarkBazar/parser.py
@@ -170,7 +170,6 @@ def darkbazar_listing_parser(soup):
 
         # Adding the url to the list of urls
         link = bae[0].get('href')
-        link = cleanLink(link)
         href.append(link)
 
         # Finding the Product
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index e075541..de6cc79 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -15,6 +15,9 @@ from MarketPlaces.M00nkeyMarket.parser import *
 from MarketPlaces.MikesGrandStore.parser import *
 from MarketPlaces.PabloEscobarMarket.parser import *
 from MarketPlaces.CityMarket.parser import *
+from MarketPlaces.DarkBazar.parser import *
+from MarketPlaces.Sonanza.parser import *
+from MarketPlaces.Kingdom.parser import *
 
 from MarketPlaces.Classifier.classify_product import predict
 
@@ -130,6 +133,12 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         rw = pabloescobarmarket_listing_parser(soup)
     elif marketPlace == "CityMarket":
         rw = city_listing_parser(soup)
+    elif marketPlace == "DarkBazar":
+        rw = darkbazar_listing_parser(soup)
+    elif marketPlace == "Sonanza":
+        rw = sonanza_listing_parser(soup)
+    elif marketPlace == "Kingdom":
+        rw = kingdom_listing_parser(soup)
     else:
         print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -164,6 +173,12 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         rmm = pabloescobarmarket_description_parser(soup)
     elif marketPlace == "CityMarket":
         rmm = city_description_parser(soup)
+    elif marketPlace == "DarkBazar":
+        rmm = darkbazar_description_parser(soup)
+    elif marketPlace == "Sonanza":
+        rmm = sonanza_description_parser(soup)
+    elif marketPlace == "Kingdom":
+        rmm = kingdom_description_parser(soup)
     else:
         print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -363,6 +378,16 
@@ def new_parse(marketPlace, url, createLog): # move listing files of completed folder move_file(listingFile, createLog, logFile) + # registering the current forum status (up/down) and the number of scraped pages in the database + marketId = verifyMarketPlace(cur, marketPlace) + if (marketId > 0): + + readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html')) + readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html')) + + create_status(cur, marketId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0') + con.commit() + if createLog: logFile.close() diff --git a/MarketPlaces/Kingdom/crawler_selenium.py b/MarketPlaces/Kingdom/crawler_selenium.py index e6b489f..5385150 100644 --- a/MarketPlaces/Kingdom/crawler_selenium.py +++ b/MarketPlaces/Kingdom/crawler_selenium.py @@ -1,4 +1,4 @@ -__author__ = 'DarkWeb' +__author__ = 'Helium' ''' Kingdom Market Crawler (Selenium) @@ -35,55 +35,27 @@ baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion # Opens Tor Browser, crawls the website def startCrawling(): - # marketName = getMarketName() + mktName = getMKTName() driver = getAccess() if driver != 'down': try: - captcha(driver) login(driver) crawlForum(driver) except Exception as e: print(driver.current_url, e) closeDriver(driver) - # new_parse(marketName, False) + new_parse(mktName, baseURL, True) +# Login using premade account credentials and do login captcha manually +def login(driver): -def captcha(driver): - ''' - # wait for captcha page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div[1]"))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='/html/body/div/div[2]').screenshot( - r'..\Kingdom\captcha1.png') - - # This method will show image in any image viewer - im = Image.open(r'..\Kingdom\captcha1.png') - im.show() - - iframes = driver.find_elements(by=By.TAG_NAME, value='iframe') - - # ask user input captcha solution in terminal - print("Enter squares from smallest to largest (squares are numbered 1-9 left to right)") - for order in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']: - id = input(f"{order}: ") - iframes[int(id)-1].click() - ''' input("Press ENTER when CAPTCHA is completed\n") # wait for login page WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) - - -# Login using premade account credentials and do login captcha manually -def login(driver): - # wait for login page - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button"))) + (By.XPATH, '//*[@id="login-form"]'))) # entering username and password into input boxes usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-name"]') @@ -96,39 +68,17 @@ def login(driver): select = Select(driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]')) select.select_by_visible_text('24 hours') - ''' - # wait for captcha page show up - WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, '//*[@id="captcha"]'))) - - # save captcha to local - driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Kingdom\captcha2.png') - - # This method will show image in any image viewer - im = 
Image.open(r'..\Kingdom\captcha2.png') - im.show() - - # wait until input space show up - inputBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-captcha"]') - - # ask user input captcha solution in terminal - userIn = input("Enter solution: ") - - # send user solution into the input space - inputBox.send_keys(userIn) - - # click the verify(submit) button - driver.find_element(by=By.XPATH, value="/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button").click() - ''' - input("Press ENTER when CAPTCHA is completed\n") + input("Press ENTER when CAPTCHA and DDOS is completed\n") # wait for listing page show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 50).until(EC.visibility_of_element_located( - (By.XPATH, '/html/body/div/div/div[3]/div[2]'))) + (By.XPATH, '/html/body/div/div/div[3]/div[1]/div/div[3]'))) + + # Returns the name of the website -def getMarketName(): +def getMKTName(): name = 'Kingdom' return name @@ -236,30 +186,17 @@ def getInterestedLinks(): links = [] # Software and Malware - links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=c298a77d9e93ad32') + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=597a56b9a0b3e0d0') # # Services - # links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=c298a77d9e93ad32') - # # Exploits - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45') - # # Tools - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46') - # # Malware - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47') - # # Cryptography - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48') - # # Others - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49') - # # Hacking Tutorials - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50') - # # Hacked Accounts and Database Dumps - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30') - # # Android Moded pak - # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53') + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=597a56b9a0b3e0d0') + # # guides and tutorials + links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=107&t=597a56b9a0b3e0d0') return links def crawlForum(driver): + print("Crawling the Kingdom market") linksToCrawl = getInterestedLinks() @@ -281,6 +218,7 @@ def crawlForum(driver): savePage(driver, html, link) list = productPages(html) + for item in list: itemURL = urlparse.urljoin(baseURL, str(item)) try: @@ -290,18 +228,15 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - temp = driver.find_element(by=By.XPATH, value= - '/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul') - next = temp.find_element_by_class_name("next") - link = 
link.find_element_by_tag_name('a').get_attribute('href') + link = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div[2]/div[2]/div[3]/div/ul/li[13]/a').get_attribute('href') if link == "": raise NoSuchElementException count += 1 @@ -313,7 +248,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling Kingdom Market done sucessfully. Press ENTER to continue\n") + print("Crawling the Kingdom market done.") # Returns 'True' if the link is Topic link @@ -325,7 +260,7 @@ def isDescriptionLink(url): # Returns True if the link is a listingPage link def isListingLink(url): - if 'category' in url: + if 'filter_category' in url: return True return False @@ -333,10 +268,8 @@ def isListingLink(url): # calling the parser to define the links def productPages(html): soup = BeautifulSoup(html, "html.parser") - #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) return kingdom_links_parser(soup) def crawler(): - startCrawling() - # print("Crawling and Parsing BestCardingWorld .... DONE!") + startCrawling() \ No newline at end of file diff --git a/MarketPlaces/Kingdom/parser.py b/MarketPlaces/Kingdom/parser.py index b1e05d5..abade27 100644 --- a/MarketPlaces/Kingdom/parser.py +++ b/MarketPlaces/Kingdom/parser.py @@ -1,4 +1,4 @@ -__author__ = 'DarkWeb' +__author__ = 'Helium' # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -31,6 +31,8 @@ def kingdom_description_parser(soup): left = "-1" # 16 Product_QuantityLeft shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name @@ -95,7 +97,7 @@ def kingdom_description_parser(soup): # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results @@ -126,7 +128,9 @@ def kingdom_listing_parser(soup): qLeft =[] # 17 Product_QuantityLeft shipFrom = [] # 18 Product_ShippedFrom shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links listing = soup.find('div', {"id": "p0"}).find('div').find_all('div', {"class": "row"}, recursive=False) @@ -153,12 +157,20 @@ def kingdom_listing_parser(soup): product = product.strip() name.append(product) + # Finding Product Image + product_image = a.find('img') + product_image = product_image.get('src') + product_image = product_image.split('base64,')[-1] + image.append(product_image) + # Finding the Vendor vendor_name = a.select_one('a[href^="/user"]').text vendor_name = vendor_name.replace(",", " ").replace('/', '') vendor_name = vendor_name.strip() vendor.append(vendor_name) + image_vendor.append("-1") + # Adding the url to the list of urls link = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]')['href'] link = cleanLink(link) @@ -169,7 +181,8 @@ def kingdom_listing_parser(soup): # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, - reviews, rating_item, addDate, BTC, 
USD, EURO, sold, qLeft, shipFrom, shipTo, href) + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, + image, image_vendor) def kingdom_links_parser(soup):
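
The status-tracking flow added above is the same in both the Forums and MarketPlaces DB layers: one row per source per scrape date, updated in place if the scraper runs again on the same day, with the most recent Friday stored as a weekly reference date. Below is a minimal sketch of that flow, assuming a psycopg2 cursor and the forums_status schema created in create_database(); the function name record_forum_status is illustrative and not part of the codebase.

from datetime import datetime
from dateutil.relativedelta import relativedelta, FR

def record_forum_status(cur, forum_id, scrape_date, listings, descriptions, status):
    # scrape_date arrives in the same %m%d%Y form as CURRENT_DATE, e.g. "09152023"
    date_inserted = datetime.strptime(scrape_date, "%m%d%Y")

    # FR(-1) rolls back to the most recent Friday (or keeps the date if it already is a Friday),
    # which serves as the weekly reference date
    date_reference = date_inserted + relativedelta(weekday=FR(-1))

    # one row per forum per day: update if it already exists, insert otherwise
    cur.execute("select 1 from forums_status where forum_id = %s and date_inserted = %s",
                (forum_id, date_inserted))
    if cur.fetchone():
        cur.execute("update forums_status set listings = %s, descriptions = %s, status = %s, "
                    "date_reference = %s where forum_id = %s and date_inserted = %s",
                    (listings, descriptions, status, date_reference, forum_id, date_inserted))
    else:
        cur.execute("insert into forums_status (forum_id, date_inserted, listings, descriptions, "
                    "status, date_reference) values (%s, %s, %s, %s, %s, %s)",
                    (forum_id, date_inserted, listings, descriptions, status, date_reference))

The same pattern applies to marketplaces_status, with market_id in place of forum_id.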
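
BlackPyramid's paginator is driven by a page dropdown plus a "go to page" button rather than a plain next link, which is why crawlForum() re-selects the category and then advances through the dropdown on every iteration. A condensed sketch of that page-advance step follows, assuming the element names captured in the diff (next_page, pageination, value='go to page') are stable; the helper name is illustrative only.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

def go_to_next_listing_page(driver, current_page):
    # the "next" control must exist and be enabled, otherwise there is no further page
    nav = driver.find_element(By.XPATH, "//input[@name='next_page']")
    if not nav.is_enabled():
        return False

    # pick the next page number in the page <select>, then press the "go to page" button
    selector = Select(driver.find_element(By.XPATH, "//select[@name='pageination']"))
    if current_page >= len(selector.options):
        return False
    selector.select_by_index(current_page)  # options are 0-indexed, pages start at 1
    driver.find_element(By.XPATH, "//input[@value='go to page']").click()
    return True

crawlForum() performs this once per listing page and stops when the dropdown runs out of options or the next control is disabled.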