From 163ec7a36cf888d532eced7ba5e98268b732c568 Mon Sep 17 00:00:00 2001 From: Van H Date: Thu, 31 Aug 2023 22:40:36 -0700 Subject: [PATCH 1/6] comment before pr --- MarketPlaces/GoFish/crawler_selenium.py | 300 ++++++++++++++++++++++ MarketPlaces/GoFish/parser.py | 327 +++++++++++++++++++++++ MarketPlaces/Torzon/crawler_selenium.py | 308 ++++++++++++++++++++++ MarketPlaces/Torzon/parser.py | 328 ++++++++++++++++++++++++ 4 files changed, 1263 insertions(+) create mode 100644 MarketPlaces/GoFish/crawler_selenium.py create mode 100644 MarketPlaces/GoFish/parser.py create mode 100644 MarketPlaces/Torzon/crawler_selenium.py create mode 100644 MarketPlaces/Torzon/parser.py diff --git a/MarketPlaces/GoFish/crawler_selenium.py b/MarketPlaces/GoFish/crawler_selenium.py new file mode 100644 index 0000000..0f87696 --- /dev/null +++ b/MarketPlaces/GoFish/crawler_selenium.py @@ -0,0 +1,300 @@ +__author__ = 'DarkWeb' + +''' +Go Fish market Crawler (Selenium) +- this is a new marketplace and was up for only a few days, crawler has not been finished +''' + +from selenium import webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.firefox.firefox_profile import FirefoxProfile +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.service import Service +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +from PIL import Image +import urllib.parse as urlparse +import os, re, time +from datetime import date +import subprocess +import configparser +from bs4 import BeautifulSoup +from MarketPlaces.Initialization.prepare_parser import new_parse +from MarketPlaces.GoFish.parser import gofish_links_parser +from MarketPlaces.Utilities.utilities import cleanHTML + +counter = 1 +baseURL = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/' + +# Opens Tor Browser, crawls the website, then parses, then closes tor +#acts like the main method for the crawler, another function at the end of this code calls this function later +def startCrawling(): + # opentor() + mktName = getMKTName() + driver = getAccess() + + if driver != 'down': + try: + login(driver) + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closetor(driver) + + new_parse(mktName, baseURL, True) + + +# Opens Tor Browser +#prompts for ENTER input to continue +def opentor(): + from MarketPlaces.Initialization.markets_mining import config + + global pid + print("Connecting Tor...") + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) + pid = pro.pid + time.sleep(7.5) + input('Tor Connected. 
Press ENTER to continue\n') + return + +# Returns the name of the website +#return: name of site in string type +def getMKTName(): + name = 'GoFish' + return name + + +# Return the base link of the website +#return: url of base site in string type +def getFixedURL(): + url = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/' + return url + + +# Closes Tor Browser +#@param: current selenium driver +def closetor(driver): + # global pid + # os.system("taskkill /pid " + str(pro.pid)) + # os.system("taskkill /t /f /im tor.exe") + print('Closing Tor...') + driver.close() + time.sleep(3) + return + + +# Creates FireFox 'driver' and configure its 'Profile' +# to use Tor proxy and socket +def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) + + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) + ff_prof.set_preference("places.history.enabled", False) + ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) + ff_prof.set_preference("permissions.default.image", 1) + ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", False) + ff_prof.update_preferences() + + service = Service(config.get('TOR', 'geckodriver_path')) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + + # driver.maximize_window() + + return driver + + +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' +#return: return the selenium driver or string 'down' +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + try: + driver.get(url) + return driver + except: + driver.close() + return 'down' + + +# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha +# then allows for manual solving of captcha in the terminal +#@param: current selenium web driver +def login(driver): + input("Press ENTER when CAPTCHA is completed\n") + + # wait for page to show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="username"]'))) + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + # Username here + usernameBox.send_keys('itsmedio') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') + # Password here + passwordBox.send_keys('DementedBed123-') + + input("Press ENTER when CAPTCHA 
and exit pressed is completed\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div/div[2]/div/div/div/div/div/div[1]/a/img"))) + +# Saves the crawled html page, makes the directory path for html pages if not made +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +# Gets the full path of the page to be saved along with its appropriate file name +#@param: raw url as crawler crawls through every site +def getFullPathName(url): + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') + else: + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + return fullPath + + +# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned +#@param: raw url as crawler crawls through every site +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if (name == ''): + name = str(counter) + counter = counter + 1 + return name + + +# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list +#in this example, there are a couple of categories some threads fall under such as +# Guides and Tutorials, Digital Products, and Software and Malware +#as you can see they are categories of products +def getInterestedLinks(): + links = [] + + # Hacking + links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=129') + # Malware + links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=97') + # Exploits + links.append('http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion/?c=107') + + return links + + +# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through +#topic and description pages are crawled through here, where both types of pages are saved +#@param: selenium driver +def crawlForum(driver): + print("Crawling the GoFish market") + + linksToCrawl = getInterestedLinks() + + i = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + try: + has_next_page = True + count = 0 + + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(driver, html, link) + + list = productPages(html) + for item in list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver, driver.page_source, item) + time.sleep(3) + driver.back() + + # comment out + break + + # comment out + if count == 1: + break + + try: + link = driver.find_element(by=By.LINK_TEXT, value='Next ›').get_attribute('href') + if link == "": + raise NoSuchElementException + count += 1 + + except NoSuchElementException: + has_next_page = False + + except Exception as e: + print(link, e) + i += 1 + + print("Crawling the GoFishMarket done.") + + +# Returns 'True' if the link is a description link +#@param: url of any url 
crawled +#return: true if is a description page, false if not +def isDescriptionLink(url): + if 'c' in url: + return True + return False + + +# Returns True if the link is a listingPage link +#@param: url of any url crawled +#return: true if is a Listing page, false if not +def isListingLink(url): + if 'a' in url: + return True + return False + + +# calling the parser to define the links, the html is the url of a link from the list of interested link list +#@param: link from interested link list ie. getInterestingLinks() +#return: list of description links that should be crawled through +def productPages(html): + soup = BeautifulSoup(html, "html.parser") + return gofish_links_parser(soup) + + +def crawler(): + startCrawling() + # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/MarketPlaces/GoFish/parser.py b/MarketPlaces/GoFish/parser.py new file mode 100644 index 0000000..28e1a6b --- /dev/null +++ b/MarketPlaces/GoFish/parser.py @@ -0,0 +1,327 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from MarketPlaces.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + + +#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +#stores info it needs in different lists, these lists are returned after being organized +#@param: soup object looking at html page of description page +#return: 'row' that contains a variety of lists that each hold info on the description page +def gofish_description_parser(soup): + + # Fields to be parsed + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + + #vendor name + try: + temp = soup.find('div', {'class': 'box rounded mb-0'}).find('a').text + vendor = (cleanString(temp.strip())) + except: + vendor = "-1" + + #successful transaction + try: + temp = soup.findAll('div', {'class','text-center text-truncate column-flex ml-1 mr-1'}) #card sidebar-menu mb-4 card sidebar-menu mb-4 + temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'}) + temp = temp2[1].text + success = (temp.strip()) + except: + print("success") + + + #vendor rating 5 + try: + temp = soup.findAll('div', {'class', 'text-center text-truncate column-flex ml-1 mr-1'}) # card sidebar-menu mb-4 card sidebar-menu mb-4 + temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'}) + temp = temp2[5].text + rating_vendor = (cleanString(temp.strip())) + except: + print("product") + + # product name + try: + temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').text + name = (cleanString(temp.strip())) + 
except: + temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').find("div").text + name = (cleanString(temp.strip())) + + # product description + describe = soup.find('div', {'class': "box rounded flex-fill"}).find('pre').text + if "\n" in describe: + describe = describe.replace("\n", " ") + describe = describe.replace("\r", " ") + describe = cleanString(describe.strip()) + + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much + + # product category + try: + temp = soup.findAll('table', {'class', 'table table-hover'}) + temp2 = temp[1].find('tr').findAll('td') + temp = temp2[1].text + category = cleanString(temp.strip()) + except: + temp = soup.find('table', {'class', 'table table-hover'}) + temp2 = temp.find('tbody').find('tr').findAll('td') + temp = temp2[1].text + category = cleanString(temp.strip()) + + # product number of view + try: + temp = soup.find('div', {'class', 'box rounded mb-0'}) + temp2 = temp.findAll('i') + temp = temp2[2].text + views = cleanString((temp.strip())) + except: + print('Product number of view') + + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + + #BTC selling price box box-rounded mt-2 + try: + temp = soup.find('div', {'class', 'box box-rounded mt-2'}) + temp2 = temp.findAll('i', {'class', 'float-right color-prices'}) + temp = temp2[1].text + BTC = cleanString((temp.strip())) + except: + try: + temp = soup.find('div', {'class', 'box box-rounded'}) + temp2 = temp.findAll('span', {'class', 'float-right color-prices'}) + temp = temp2[1].text + BTC = cleanString((temp.strip())) + except: + print("BTC") + + + # USD selling price + try: + temp = soup.find('div', {'class', 'box box-rounded mt-2'}) + temp2 = temp.findAll('center') + temp = temp2[1].find('i').text + if "$" in temp: + temp = temp.replace("$", "") + USD = cleanString((temp.strip())) + except: + try: + temp = soup.find('div', {'class', 'box box-rounded'}) + temp2 = temp.findAll('center') + temp = temp2[1].find('span').text + if "$" in temp: + temp = temp.replace("$", "") + USD = cleanString((temp.strip())) + except: + print("USD") + + EURO = "-1" # 14 Product_EURO_SellingPrice + + + # product sold + try: + temp = soup.find('div', {'class', 'box rounded mb-0'}) # card sidebar-menu mb-4 card sidebar-menu mb-4 + temp2 = temp.find('i') + temp = temp2.text + sold = (cleanString(temp.strip())) + # sold = "-1" + except: + print("product sold") + + # product quantatiy left ###ERRROR + try: + temp = soup.findAll('table', {'class', 'table table-hover'}) + temp2 = temp[1].findAll('tr') + temp3 = temp2[1].findAll('td') + temp = temp3[1].text + left = cleanString(temp.strip()) + except: + temp = soup.find('table', {'class', 'table table-hover'}) + temp2 = temp.findAll('tr') + temp3 = temp2[1].findAll('td') + temp = temp3[1].text + left = cleanString(temp.strip()) + + + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + + # Populating the final variable (this should be a list with all fields scraped) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo) + + # Sending the results + return row + + +#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +#stores info it needs in different 
lists, these lists are returned after being organized +#@param: soup object looking at html page of listing page +#return: 'row' that contains a variety of lists that each hold info on the listing page +def gofish_listing_parser(soup): + # Fields to be parsed + nm = 0 # *Total_Products (Should be Integer) + mktName = "GoFish" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links + + listing = soup.findAll('div', {"class": "card mt-1"}) + + # Populating the Number of Products + nm = len(listing) + + for a in listing: + + # vendor + try: + temp = a.find('div', {'class','col-5 justify-content-between mx-auto'}).find('a').text + vendor.append(cleanString(temp.strip())) + except: + print('vendor') + + #vendor rating + + + #successful transactions CHECK AGAIN HERE + try: + success.append("-1") + except: + print('successful transactions') + + # product name + try: + temp = a.find('h5', {'class','card-title rounded text-truncate'}).find('a').text + name.append(cleanString(temp.strip())) + except: + print('product name') + + + CVE.append('-1') + MS.append('-1') + rating_vendor.append('-1') + + # product category + try: + temp = soup.find('div', {'class', 'card-sidebar-menu box mb-2 flex-column'}).find('h3').find('span').text + if "Search Results for: " in temp: + temp = temp.replace("Search Results for: ", "") + category.append(cleanString(temp.strip())) + + except: + print("Error in product category") + + describe.append('-1') + + # product views + try: + temp = a.find('h6',{'class', 'card-subtitle mb-1 text-muted text-truncate'}) + temp2 = temp.find('i').text + views.append(cleanString(temp2.strip())) + except: + print("Error in views") + + reviews.append('-1') # 10 Product_Number_Of_Reviews + rating_item.append('-1') # 11 Product_Rating + addDate.append('-1') # 12 Product_AddDate + + # BTC + try: + temp = a.find('div', {'class', 'col-3 justify-content-between mx-auto'}) + temp2 = temp.findAll('p') + temp = temp2[1].text + BTC.append(cleanString(temp.strip())) + except: + print("BTC") + + #USD ERROR get rid of $ + try: + temp = a.find('div', {'class', 'col-12 justify-content-between mx-auto'}).find('i').text + if '$' in temp: + temp = temp.replace("$", "") + USD.append(cleanString(temp.strip())) # 14 Product_USD_SellingPrice + except: + print("USD") + + EURO.append("-1") # 15 Product_EURO_SellingPrice + + #product sold + try: + temp = a.find('div', {'class', 'col-12 mx-auto text-truncate text-center flex-fill'}).findAll('p', {'class', 'card-text mb-0'}) + temp2 = temp[1].find('i').text + sold.append(cleanString(temp2.strip())) + except: + print("product sold") + + qLeft.append('-1') # 17 Product_QuantityLeft + shipFrom.append('-1') # 18 Product_ShippedFrom 
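+        # Shipping origin/destination are not parsed from GoFish listing cards;
+        # both fields fall back to the "-1" placeholder, matching the convention
+        # used by gofish_description_parser above.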
+ shipTo.append('-1') # 19 Product_ShippedTo + + #href + try: + temp = a.find('h5', {'class', 'card-title rounded text-truncate'}).find('a').get('href') + href.append(temp) # 20 Product_Links + except: + print("href") + + + # Populate the final variable (this should be a list with all fields scraped) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + +#called by the crawler to get description links on a listing page +#@param: beautifulsoup object that is using the correct html page (listing page) +#return: list of description links from a listing page +def gofish_links_parser(soup): + + # Returning all links that should be visited by the Crawler + + href = [] + listing = soup.findAll('div', {"class": "card mt-1"}) + + for a in listing: + bae = a.find('a', href=True)#card-title rounded text-truncate + link = bae['href'] + href.append(link) + + return href \ No newline at end of file diff --git a/MarketPlaces/Torzon/crawler_selenium.py b/MarketPlaces/Torzon/crawler_selenium.py new file mode 100644 index 0000000..6636e80 --- /dev/null +++ b/MarketPlaces/Torzon/crawler_selenium.py @@ -0,0 +1,308 @@ +__author__ = 'DarkWeb' + +''' +Torzon Market Crawler (Selenium) +- problem extracting description links from HTML structure, continue work with this crawler +by fixing torzon_links_parser() +''' + +from selenium import webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.firefox.firefox_profile import FirefoxProfile +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.service import Service +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +from PIL import Image +import urllib.parse as urlparse +import os, re, time +from datetime import date +import subprocess +import configparser +from bs4 import BeautifulSoup +from MarketPlaces.Initialization.prepare_parser import new_parse +from MarketPlaces.Torzon.parser import torzon_links_parser +from MarketPlaces.Utilities.utilities import cleanHTML + + +counter = 1 +BASE_URL = 'http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion' + + +# Opens Tor Browser, crawls the website, then parses, then closes tor +#acts like the main method for the crawler, another function at the end of this code calls this function later +def startCrawling(): + opentor() + mktName = getMKTName() + driver = getAccess() + + if driver != 'down': + try: + login(driver) + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closetor(driver) + + new_parse(mktName, BASE_URL, False) + + +# Opens Tor Browser +#prompts for ENTER input to continue +def opentor(): + from MarketPlaces.Initialization.markets_mining import config + + global pid + print("Connecting Tor...") + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) + pid = pro.pid + time.sleep(7.5) + input('Tor Connected. 
Press ENTER to continue\n') + return + + +# Returns the name of the website +#return: name of site in string type +def getMKTName(): + name = 'Torzon' + return name + + +# Return the base link of the website +#return: url of base site in string type +def getFixedURL(): + url = 'http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion' + return url + + +# Closes Tor Browser +#@param: current selenium driver +def closetor(driver): + # global pid + # os.system("taskkill /pid " + str(pro.pid)) + # os.system("taskkill /t /f /im tor.exe") + print('Closing Tor...') + driver.close() + time.sleep(3) + return + + +# Creates FireFox 'driver' and configure its 'Profile' +# to use Tor proxy and socket +def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) + + + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) + ff_prof.set_preference("places.history.enabled", False) + ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + # ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue + # ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue + ff_prof.set_preference("permissions.default.image", 1) + ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", False) + ff_prof.update_preferences() + + service = Service(config.get('TOR', 'geckodriver_path')) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + + return driver + + +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' +#return: return the selenium driver or string 'down' +def getAccess(): + # url = getFixedURL() + driver = createFFDriver() ###### may want to use BASE_URL instead ####### + try: + driver.get(BASE_URL) + return driver + except: + driver.close() + return 'down' + + +# Manual captcha solver, waits fora specific element so that the whole page loads, finds the input box, gets screenshot of captcha +# then allows for manual solving of captcha in the terminal +#@param: current selenium web driver +def login(driver): + input("Press ENTER when CAPTCHA is completed and page is loaded\n") + # wait for page to show up (This Xpath may need to change based on different seed url) + +# Saves the crawled html page, makes the directory path for html pages if not made +def savePage(page, url): + cleanPage = cleanHTML(page) + filePath = getFullPathName(url) + # filePath = getFullPathName("Hello") + os.makedirs(os.path.dirname(filePath), exist_ok=True) + with open(filePath, 'wb') as file: + file.write(cleanPage.encode('utf-8')) + # open(filePath, 
'wb').write(cleanPage.encode('utf-8')) + return + + +# Gets the full path of the page to be saved along with its appropriate file name +#@param: raw url as crawler crawls through every site +def getFullPathName(url): + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = os.path.join(mainDir, CURRENT_DATE + r'/Description/' + fileName + '.html') + else: + fullPath = os.path.join(mainDir, CURRENT_DATE + r'/Listing/' + fileName + '.html') + return fullPath + + +# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned +#@param: raw url as crawler crawls through every site +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if (name == ''): + name = str(counter) + counter = counter + 1 + return name + + +# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list +#in this example, there are a couple of categories some threads fall under such as +# Guides and Tutorials, Digital Products, and Software and Malware +#as you can see they are categories of products +def getInterestedLinks(): + links = [] + + # # services + links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870') + + # # software & malware + links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870') + + # # fraud + links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870') + + # # guides + links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089') + + return links + + +# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through +#topic and description pages are crawled through here, where both types of pages are saved +#@param: selenium driver +def crawlForum(driver): + print("Crawling the Torzon market") + + linksToCrawl = getInterestedLinks() + + i = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + try: + has_next_page = True + count = 0 + + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + list = productPages(html) + for item in list: + itemURL = urlparse.urljoin(BASE_URL, str(item)) + try: + time.sleep(1.5) # to keep from detecting click speed + driver.get(itemURL) + except: + driver.refresh() + savePage(driver.page_source, item) + time.sleep(1.5) + driver.back() + # to keep from detecting click speed + + # # comment out + # break + # + # # comment out + # if count == 1: + # break + + try: + # nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]') + # a = nav.find_element(by=By.LINK_TEXT, value=">") + link = 
driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href') + if link == "": + raise NoSuchElementException + count += 1 + + except NoSuchElementException: + has_next_page = False + + except Exception as e: + print(link, e) + # raise e + i += 1 + + input("Crawling Torzon market done sucessfully. Press ENTER to continue\n") + + +# Returns 'True' if the link is a description link +#@param: url of any url crawled +#return: true if is a description page, false if not +def isDescriptionLink(url): + if 'products/' in url and '/products/?category' not in url: + return True + return False + + +# Returns True if the link is a listingPage link +#@param: url of any url crawled +#return: true if is a Listing page, false if not +def isListingLink(url): + if '?category' in url: + return True + return False + + +# calling the parser to define the links, the html is the url of a link from the list of interested link list +#@param: link from interested link list ie. getInterestingLinks() +#return: list of description links that should be crawled through +def productPages(html): + soup = BeautifulSoup(html, "html.parser") + return torzon_links_parser(soup) + + +# Drop links that "signout" +# def isSignOut(url): +# #absURL = urlparse.urljoin(url.base_url, url.url) +# if 'signout' in url.lower() or 'logout' in url.lower(): +# return True +# +# return False + + +def crawler(): + startCrawling() + # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/MarketPlaces/Torzon/parser.py b/MarketPlaces/Torzon/parser.py new file mode 100644 index 0000000..edb8cc4 --- /dev/null +++ b/MarketPlaces/Torzon/parser.py @@ -0,0 +1,328 @@ +__author__ = 'Helium' + +# Here, we are importing the auxiliary functions to clean or convert data +from MarketPlaces.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + + +#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +#stores info it needs in different lists, these lists are returned after being organized +#@param: soup object looking at html page of description page +#return: 'row' that contains a variety of lists that each hold info on the description page +def torzon_description_parser(soup): + + # Fields to be parsed + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + + #vendor name + try: + temp = soup.find('div', {'class': 'box rounded mb-0'}).find('a').text + vendor = (cleanString(temp.strip())) + except: + vendor = "-1" + + #successful transaction + try: + temp = soup.findAll('div', {'class','text-center text-truncate column-flex ml-1 mr-1'}) #card sidebar-menu 
mb-4 card sidebar-menu mb-4 + temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'}) + temp = temp2[1].text + success = (temp.strip()) + except: + print("success") + + + #vendor rating 5 + try: + temp = soup.findAll('div', {'class', 'text-center text-truncate column-flex ml-1 mr-1'}) # card sidebar-menu mb-4 card sidebar-menu mb-4 + temp2 = temp[1].findAll('span', {'class', 'float-right font-weight-bold'}) + temp = temp2[5].text + rating_vendor = (cleanString(temp.strip())) + except: + print("product") + + # product name + try: + temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').text + name = (cleanString(temp.strip())) + except: + temp = soup.find('h3', {'class', 'h3 rounded card-title'}).find('span').find("div").text + name = (cleanString(temp.strip())) + + # product description + describe = soup.find('div', {'class': "box rounded flex-fill"}).find('pre').text + if "\n" in describe: + describe = describe.replace("\n", " ") + describe = describe.replace("\r", " ") + describe = cleanString(describe.strip()) + + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much + + # product category + try: + temp = soup.findAll('table', {'class', 'table table-hover'}) + temp2 = temp[1].find('tr').findAll('td') + temp = temp2[1].text + category = cleanString(temp.strip()) + except: + temp = soup.find('table', {'class', 'table table-hover'}) + temp2 = temp.find('tbody').find('tr').findAll('td') + temp = temp2[1].text + category = cleanString(temp.strip()) + + # product number of view + try: + temp = soup.find('div', {'class', 'box rounded mb-0'}) + temp2 = temp.findAll('i') + temp = temp2[2].text + views = cleanString((temp.strip())) + except: + print('Product number of view') + + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + + #BTC selling price box box-rounded mt-2 + try: + temp = soup.find('div', {'class', 'box box-rounded mt-2'}) + temp2 = temp.findAll('i', {'class', 'float-right color-prices'}) + temp = temp2[1].text + BTC = cleanString((temp.strip())) + except: + try: + temp = soup.find('div', {'class', 'box box-rounded'}) + temp2 = temp.findAll('span', {'class', 'float-right color-prices'}) + temp = temp2[1].text + BTC = cleanString((temp.strip())) + except: + print("BTC") + + + # USD selling price + try: + temp = soup.find('div', {'class', 'box box-rounded mt-2'}) + temp2 = temp.findAll('center') + temp = temp2[1].find('i').text + if "$" in temp: + temp = temp.replace("$", "") + USD = cleanString((temp.strip())) + except: + try: + temp = soup.find('div', {'class', 'box box-rounded'}) + temp2 = temp.findAll('center') + temp = temp2[1].find('span').text + if "$" in temp: + temp = temp.replace("$", "") + USD = cleanString((temp.strip())) + except: + print("USD") + + EURO = "-1" # 14 Product_EURO_SellingPrice + + + # product sold + try: + temp = soup.find('div', {'class', 'box rounded mb-0'}) # card sidebar-menu mb-4 card sidebar-menu mb-4 + temp2 = temp.find('i') + temp = temp2.text + sold = (cleanString(temp.strip())) + # sold = "-1" + except: + print("product sold") + + # product quantatiy left ###ERRROR + try: + temp = soup.findAll('table', {'class', 'table table-hover'}) + temp2 = temp[1].findAll('tr') + temp3 = temp2[1].findAll('td') + temp = temp3[1].text + left = cleanString(temp.strip()) + except: + temp = soup.find('table', 
{'class', 'table table-hover'}) + temp2 = temp.findAll('tr') + temp3 = temp2[1].findAll('td') + temp = temp3[1].text + left = cleanString(temp.strip()) + + + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + + # Populating the final variable (this should be a list with all fields scraped) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo) + + # Sending the results + return row + + +#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +#stores info it needs in different lists, these lists are returned after being organized +#@param: soup object looking at html page of listing page +#return: 'row' that contains a variety of lists that each hold info on the listing page +def torzon_listing_parser(soup): + # Fields to be parsed + nm = 0 # *Total_Products (Should be Integer) + mktName = "Torzon" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links + + listing = soup.findAll('div', {"class": "card mt-1"}) + + # Populating the Number of Products + nm = len(listing) + + for a in listing: + + # vendor + try: + temp = a.find('div', {'class','col-5 justify-content-between mx-auto'}).find('a').text + vendor.append(cleanString(temp.strip())) + except: + print('vendor') + + #vendor rating + + + #successful transactions CHECK AGAIN HERE + try: + success.append("-1") + except: + print('successful transactions') + + # product name + try: + temp = a.find('h5', {'class','card-title rounded text-truncate'}).find('a').text + name.append(cleanString(temp.strip())) + except: + print('product name') + + + CVE.append('-1') + MS.append('-1') + rating_vendor.append('-1') + + # product category + try: + temp = soup.find('div', {'class', 'card-sidebar-menu box mb-2 flex-column'}).find('h3').find('span').text + if "Search Results for: " in temp: + temp = temp.replace("Search Results for: ", "") + category.append(cleanString(temp.strip())) + + except: + print("Error in product category") + + describe.append('-1') + + # product views + try: + temp = a.find('h6',{'class', 'card-subtitle mb-1 text-muted text-truncate'}) + temp2 = temp.find('i').text + views.append(cleanString(temp2.strip())) + except: + print("Error in views") + + reviews.append('-1') # 10 Product_Number_Of_Reviews + rating_item.append('-1') # 11 Product_Rating + addDate.append('-1') # 12 Product_AddDate + + # BTC + try: + temp = a.find('div', {'class', 'col-3 justify-content-between mx-auto'}) + temp2 = temp.findAll('p') + temp = temp2[1].text + BTC.append(cleanString(temp.strip())) + except: + print("BTC") + + #USD 
ERROR get rid of $ + try: + temp = a.find('div', {'class', 'col-12 justify-content-between mx-auto'}).find('i').text + if '$' in temp: + temp = temp.replace("$", "") + USD.append(cleanString(temp.strip())) # 14 Product_USD_SellingPrice + except: + print("USD") + + EURO.append("-1") # 15 Product_EURO_SellingPrice + + #product sold + try: + temp = a.find('div', {'class', 'col-12 mx-auto text-truncate text-center flex-fill'}).findAll('p', {'class', 'card-text mb-0'}) + temp2 = temp[1].find('i').text + sold.append(cleanString(temp2.strip())) + except: + print("product sold") + + qLeft.append('-1') # 17 Product_QuantityLeft + shipFrom.append('-1') # 18 Product_ShippedFrom + shipTo.append('-1') # 19 Product_ShippedTo + + #href + try: + temp = a.find('h5', {'class', 'card-title rounded text-truncate'}).find('a').get('href') + href.append(temp) # 20 Product_Links + except: + print("href") + + + # Populate the final variable (this should be a list with all fields scraped) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href) + +#called by the crawler to get description links on a listing page +#@param: beautifulsoup object that is using the correct html page (listing page) +#return: list of description links from a listing page +def torzon_links_parser(soup): + + # Returning all links that should be visited by the Crawler + + href = [] + # listing = soup.findAll('div', {"class": "card mt-1"}) + listing = soup.find('td', {"valign": "top"}).find("table", {"border": "0"}).findAll('td', {'width': '50%'}) + + for a in listing: + bae = a.find('a', href=True)#card-title rounded text-truncate + link = bae['href'] + href.append(link) + + return href \ No newline at end of file From 89684724fd7f78900547707b2c1394c0401c5903 Mon Sep 17 00:00:00 2001 From: westernmeadow Date: Fri, 1 Sep 2023 14:25:26 -0700 Subject: [PATCH 2/6] debugged ThiefWorld, TorMarket, and AnonMarket --- .idea/DW_Pipeline_Test.iml | 3 +- .idea/misc.xml | 2 +- MarketPlaces/AnonMarket/crawler_selenium.py | 91 +++++------ MarketPlaces/AnonMarket/parser.py | 113 ++++++------- MarketPlaces/CityMarket/parser.py | 100 ++++++------ MarketPlaces/Initialization/marketsList.txt | 13 +- MarketPlaces/Initialization/markets_mining.py | 3 + MarketPlaces/Initialization/prepare_parser.py | 29 ++-- .../MetaVerseMarket/crawler_selenium.py | 10 +- MarketPlaces/MetaVerseMarket/parser.py | 148 ++++++++---------- .../PabloEscobarMarket/crawler_selenium.py | 2 +- MarketPlaces/PabloEscobarMarket/parser.py | 96 ++++++------ MarketPlaces/ThiefWorld/crawler_selenium.py | 12 +- MarketPlaces/ThiefWorld/parser.py | 6 +- MarketPlaces/TorMarket/crawler_selenium.py | 4 +- MarketPlaces/TorMarket/parser.py | 120 ++++++-------- 16 files changed, 351 insertions(+), 401 deletions(-) diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml index 08a5719..f27dbb9 100644 --- a/.idea/DW_Pipeline_Test.iml +++ b/.idea/DW_Pipeline_Test.iml @@ -2,7 +2,7 @@ - + @@ -27,6 +27,7 @@ diff --git a/.idea/misc.xml b/.idea/misc.xml index dc9ea49..11f1ea0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/MarketPlaces/AnonMarket/crawler_selenium.py b/MarketPlaces/AnonMarket/crawler_selenium.py index 2171d84..42d8e49 100644 --- a/MarketPlaces/AnonMarket/crawler_selenium.py +++ b/MarketPlaces/AnonMarket/crawler_selenium.py @@ -31,7 +31,6 @@ baseURL = 
'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - opentor() mktName = getMKTName() driver = getAccess() @@ -40,22 +39,10 @@ def startCrawling(): crawlForum(driver) except Exception as e: print(driver.current_url, e) - closetor(driver) + closeDriver(driver) new_parse(mktName, baseURL, True) -# Opens Tor Browser -#prompts for ENTER input to continue -def opentor(): - from MarketPlaces.Initialization.markets_mining import config - - global pid - print("Connecting Tor...") - pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) - pid = pro.pid - time.sleep(7.5) - input('Tor Connected. Press ENTER to continue\n') - return # Returns the name of the website #return: name of site in string type @@ -73,7 +60,7 @@ def getFixedURL(): # Closes Tor Browser #@param: current selenium driver -def closetor(driver): +def closeDriver(driver): # global pid # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") @@ -100,7 +87,7 @@ def createFFDriver(): ff_prof.set_preference("network.cookie.lifetimePolicy", 2) ff_prof.set_preference("network.dns.disablePrefetch", True) ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 2) + ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") @@ -216,47 +203,63 @@ def getInterestedLinks(): #topic and description pages are crawled through here, where both types of pages are saved #@param: selenium driver def crawlForum(driver): - print("Crawling Anon Market") + print("Crawling the Anon Market") linksToCrawl = getInterestedLinks() for link in linksToCrawl: print('Crawling :', link) - has_next_page = True - - while has_next_page: - try: - driver.get(link) - except: - driver.refresh() - - html = driver.page_source - savePage(driver, html, link) + try: + has_next_page = True + count = 0 - # Get all product links on the current page - products_list = productPages(html) - for item in products_list: - itemURL = urlparse.urljoin(baseURL, str(item)) + while has_next_page: try: - driver.get(itemURL) + driver.get(link) except: driver.refresh() - savePage(driver, driver.page_source, item) - driver.back() # Go back to listing after visiting each product - # Find the active page number - active_page_element = driver.find_element(By.XPATH, '//div[@class="page activepage"]') - current_page = int(active_page_element.text) + html = driver.page_source + savePage(driver, html, link) + + # Get all product links on the current page + products_list = productPages(html) + for item in products_list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver, driver.page_source, item) + driver.back() # Go back to listing after visiting each product - # Locate the next page link - try: - next_page_element = active_page_element.find_element(By.XPATH, 'following-sibling::a[1]') - link = next_page_element.get_attribute('href') - except NoSuchElementException: - has_next_page = False + # comment out + # break + + # comment out + if count == 1: + break + + # Locate the next page link + try: + # Find the active 
page number + active_page_element = driver.find_element(By.XPATH, '//div[@class="page activepage"]') + # current_page = int(active_page_element.text) + + next_page_element = active_page_element.find_element(By.XPATH, 'following-sibling::a[1]') + link = next_page_element.get_attribute('href') + if link == "": + raise NoSuchElementException + count += 1 + + except NoSuchElementException: + has_next_page = False + + except Exception as e: + print(link, e) - print("Crawling Anon Market done.") + print("Crawling the Anon Market done.") # Returns 'True' if the link is a description link #@param: url of any url crawled diff --git a/MarketPlaces/AnonMarket/parser.py b/MarketPlaces/AnonMarket/parser.py index c53283c..c5c7f6d 100644 --- a/MarketPlaces/AnonMarket/parser.py +++ b/MarketPlaces/AnonMarket/parser.py @@ -15,25 +15,27 @@ import re def AnonMarket_description_parser(soup): # Fields to be parsed - vendor = "-1" # 0 *Vendor_Name - success = "-1" # 1 Vendor_Successful_Transactions - rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 *Product_Name - describe = "-1" # 4 Product_Description - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much - category = "-1" # 7 Product_Category - views = "-1" # 8 Product_Number_Of_Views - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - BTC = "-1" # 12 Product_BTC_SellingPrice - USD = "-1" # 13 Product_USD_SellingPrice - EURO = "-1" # 14 Product_EURO_SellingPrice - sold = "-1" # 15 Product_QuantitySold - left = "-1" # 16 Product_QuantityLeft - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image name_of_product = soup.find("div", {"class": "heading"}).text name = cleanString(name_of_product.strip()) @@ -70,8 +72,7 @@ def AnonMarket_description_parser(soup): # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, - BTC, USD, EURO, sold, left, shipFrom, shipTo) - + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row @@ -85,27 +86,29 @@ def AnonMarket_listing_parser(soup): # Fields to be parsed nm = 0 # *Total_Products (Should be Integer) - mktName = "AnonMarket" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 
*Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft = [] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links + mktName = "AnonMarket" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion" products_list = soup.find_all('div', {'class': 'item'}) @@ -155,30 +158,8 @@ def AnonMarket_listing_parser(soup): continue # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts( - marketplace = "AnonMarket", - nm = nm, - vendor = vendor, - rating_vendor = rating_vendor, - success_vendor = success, - nombre = name, - CVE = CVE, - MS = MS, - category = category, - describe = describe, - views = views, - reviews = reviews, - rating_item = rating_item, - addDate = addDate, - BTC = BTC, - USD = USD, - EURO = EURO, - sold = sold, - qLeft = qLeft, - shipFrom = shipFrom, - shipTo = shipTo, - href = href - ) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) #called by the crawler to get description links on a listing page diff --git a/MarketPlaces/CityMarket/parser.py b/MarketPlaces/CityMarket/parser.py index e5f3575..75ca4fa 100644 --- a/MarketPlaces/CityMarket/parser.py +++ b/MarketPlaces/CityMarket/parser.py @@ -15,29 +15,27 @@ def city_description_parser(soup): # Fields to be parsed - name = "-1" # 0 Product_Name - describe = "-1" # 1 Product_Description - lastSeen = "-1" # 2 Product_LastViewDate - rules = "-1" # 3 NOT USED ... 
- CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 5 Product_MS_Classification (Microsoft Security) - review = "-1" # 6 Product_Number_Of_Reviews + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) category = "-1" # 7 Product_Category - shipFrom = "-1" # 8 Product_ShippedFrom - shipTo = "-1" # 9 Product_ShippedTo - left = "-1" # 10 Product_QuantityLeft - escrow = "-1" # 11 Vendor_Warranty - terms = "-1" # 12 Vendor_TermsAndConditions - vendor = "-1" # 13 Vendor_Name - sold = "-1" # 14 Product_QuantitySold - addDate = "-1" # 15 Product_AddedDate - available = "-1" # 16 NOT USED ... - endDate = "-1" # 17 NOT USED ... - BTC = "-1" # 18 Product_BTC_SellingPrice - USD = "-1" # 19 Product_USD_SellingPrice - rating = "-1" # 20 Vendor_Rating - success = "-1" # 21 Vendor_Successful_Transactions - EURO = "-1" # 22 Product_EURO_SellingPrice + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image divmd7 = soup.find('div', {'class': "col-md-7"}) ptag = soup.findAll('p') @@ -79,8 +77,7 @@ def city_description_parser(soup): # Finding the Product description describe = soup.find('div', {'class': "text-white"}).text - describe = describe.replace("\n", " ") - describe = describe.strip() + describe = cleanString(describe.strip()) '''# Finding the Number of Product Reviews tag = soup.findAll(text=re.compile('Reviews')) @@ -114,8 +111,8 @@ def city_description_parser(soup): MS = MS.replace('\n', '') # Populating the final variable (this should be a list with all fields scraped) - row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor, - sold, addDate, available, endDate, BTC, USD, rating, success, EURO) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row @@ -128,29 +125,30 @@ def city_description_parser(soup): def city_listing_parser(soup): # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "CityMarket" # 0 Marketplace_Name - name = [] # 1 Product_Name - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category - describe = [] # 5 Product_Description - escrow = [] # 6 Vendor_Warranty - views = [] # 7 Product_Number_Of_Views - reviews = [] # 8 Product_Number_Of_Reviews - addDate = [] # 9 Product_AddDate - lastSeen = [] # 10 Product_LastViewDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft =[] # 15 Product_QuantityLeft 
- shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions - href = [] # 23 Product_Links (Urls) + nm = 0 # *Total_Products (Should be Integer) + mktName = "CityMarket" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links listing = soup.findAll('div', {"class": "card"}) @@ -227,8 +225,8 @@ def city_listing_parser(soup): MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, - BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) #called by the crawler to get description links on a listing page diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt index 540b444..87f811c 100644 --- a/MarketPlaces/Initialization/marketsList.txt +++ b/MarketPlaces/Initialization/marketsList.txt @@ -1,12 +1 @@ -Apocalypse -DarkBazar -DarkMatter -DigitalThriftShop -HiddenMarket -LionMarketplace -Nexus -Robinhood -ThiefWorld -TorBay -TorMarket -ViceCity \ No newline at end of file +ThiefWorld \ No newline at end of file diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py index 62e97f8..21888fc 100644 --- a/MarketPlaces/Initialization/markets_mining.py +++ b/MarketPlaces/Initialization/markets_mining.py @@ -27,6 +27,7 @@ from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCy from MarketPlaces.DarkBazar.crawler_selenium import crawler as crawlerDarkBazar from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar from MarketPlaces.AnonMarket.crawler_selenium import crawler as crawlerAnonMarket +from MarketPlaces.MetaVerseMarket.crawler_selenium import crawler as crawlerMetaVerse import configparser import os @@ -146,5 +147,7 @@ if __name__ == '__main__': crawlerPabloEscobar() elif mkt == "AnonMarket": crawlerAnonMarket() + elif mkt == "MetaVerseMarket": + crawlerMetaVerse() print("\nScraping process completed!") diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index 60abf80..1cc5af5 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ 
b/MarketPlaces/Initialization/prepare_parser.py @@ -24,6 +24,8 @@ from MarketPlaces.MikesGrandStore.parser import * from MarketPlaces.DarkBazar.parser import * from MarketPlaces.PabloEscobarMarket.parser import * from MarketPlaces.AnonMarket.parser import * +from MarketPlaces.CityMarket.parser import * +from MarketPlaces.MetaVerseMarket.parser import * from MarketPlaces.Classifier.classify_product import predict @@ -161,6 +163,10 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): rw = pabloescobarmarket_listing_parser(soup) elif marketPlace == "AnonMarket": rw = AnonMarket_listing_parser(soup) + elif marketPlace == "CityMarket": + rw = city_listing_parser(soup) + elif marketPlace == "MetaVerseMarket": + rw = metaversemarket_listing_parser(soup) else: print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -218,6 +224,10 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): rmm = pabloescobarmarket_description_parser(soup) elif marketPlace == "AnonMarket": rmm = AnonMarket_description_parser(soup) + elif marketPlace == "CityMarket": + rmm = city_description_parser(soup) + elif marketPlace == "MetaVerseMarket": + rmm = metaversemarket_description_parser(soup) else: print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -243,18 +253,13 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript con.rollback() - trace = traceback.format_exc() - - if trace.find("already exists") == -1: - incrementError() - print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!") - traceback.print_exc() - if createLog: - logFile.write( - str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n") - return False - else: - return True + incrementError() + print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!") + traceback.print_exc() + if createLog: + logFile.write( + str(nError) + f". 
There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n") + return False def move_file(filePath, createLog, logFile): diff --git a/MarketPlaces/MetaVerseMarket/crawler_selenium.py b/MarketPlaces/MetaVerseMarket/crawler_selenium.py index 49760fc..d5783a4 100644 --- a/MarketPlaces/MetaVerseMarket/crawler_selenium.py +++ b/MarketPlaces/MetaVerseMarket/crawler_selenium.py @@ -186,10 +186,10 @@ def getInterestedLinks(): # hacking links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hacking') - # hosting - links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hosting') - # hacking guides and tutorials - links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hacking-guides-and-tutorials') + # # hosting + # links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hosting') + # # hacking guides and tutorials + # links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hacking-guides-and-tutorials') return links @@ -236,7 +236,7 @@ def crawlForum(driver): break try: - link = driver.find_element(by=By.XPATH, value='//a[@class="page-link-next"]').get_attribute('href') + link = driver.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href') if link == "": raise NoSuchElementException count += 1 diff --git a/MarketPlaces/MetaVerseMarket/parser.py b/MarketPlaces/MetaVerseMarket/parser.py index 047db35..c43b823 100644 --- a/MarketPlaces/MetaVerseMarket/parser.py +++ b/MarketPlaces/MetaVerseMarket/parser.py @@ -14,55 +14,52 @@ from bs4 import BeautifulSoup def metaversemarket_description_parser(soup): # Fields to be parsed - name = "-1" # 0 Product_Name - describe = "-1" # 1 Product_Description - lastSeen = "-1" # 2 Product_LastViewDate - CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 5 Product_MS_Classification (Microsoft Security) - review = "-1" # 6 Product_Number_Of_Reviews - category = "-1" # 7 Product_Category - shipFrom = "-1" # 8 Product_ShippedFrom - shipTo = "-1" # 9 Product_ShippedTo - left = "-1" # 10 Product_QuantityLeft - escrow = "-1" # 11 Vendor_Warranty - terms = "-1" # 12 Vendor_TermsAndConditions - vendor = "-1" # 13 Vendor_Name - sold = "-1" # 14 Product_QuantitySold - addDate = "-1" # 15 Product_AddedDate - BTC = "-1" # 18 Product_BTC_SellingPrice - USD = "-1" # 19 Product_USD_SellingPrice - rating = "-1" # 20 Vendor_Rating - success = "-1" # 21 Vendor_Successful_Transactions - EURO = "-1" # 22 Product_EURO_SellingPrice + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + 
vendor_image = "-1" # 20 Vendor_Image # Finding Product Name - name = soup.find('div', {'class': "panel-heading"}).text.strip + name = soup.find('div', {'class': "panel-heading"}).text + name = cleanString(name.strip()) # Finding Vendor temp = soup.findAll('div', {'class': "col-xs-12 col-sm-6 mt-5"}) temp = temp[1].findAll('span') - temp = temp[1].find('b').text - name = temp.replace("@", "") + vendor = temp[1].find('b').text + vendor = cleanString(vendor.strip()) # Finding Product Reviews - review = soup.find('span', {'class': "badge bg-success fs-12px"}).text.strip() + reviews = soup.find('span', {'class': "badge bg-success fs-12px"}).text.strip() # Finding Successful Transactions # NA - # Finding Prices - USD = soup.find('h3', {'class': "mb-2"}).text() + USD = soup.find('h3', {'class': "mb-2"}).text USD = USD.replace("Price: $", "").strip() - # Finding Escrow - escrow = soup.find('div', {'class': "alert alert-info text-center fw-bold"}).text - escrow = escrow.replace('You are protected by ', "").strip() - # Finding the Product Category - temp = soup.select('div[class="mt-2"]') - temp = temp[0].findAll('span') - category = temp[1].text.strip() + temp = soup.select('div[class="mt-2"]')[1].text + temp = temp.replace("Category:", "") + category = temp.strip() # Finding the Product Quantity Available # temp = soup.find('em', {'class': "icon ni ni-layers-fill"}).parent.parent.parent @@ -78,8 +75,8 @@ def metaversemarket_description_parser(soup): # Finding Shipment Information (Origin) - temp = soup.findAll('div', {'class': "alert alert-info"}) - temp = temp[1].text.split("to") + temp = soup.find('div', {'class': "alert alert-info"}).text + temp = temp.split("to") shipFrom = temp[0].replace("Shipping from ", "").strip() # Finding Shipment Information (Destination) @@ -123,8 +120,8 @@ def metaversemarket_description_parser(soup): MS = MS.replace('\n', '') # Populating the final variable (this should be a list with all fields scraped) - row = (name, describe, lastSeen, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor, - sold, addDate, BTC, USD, rating, success, EURO) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row @@ -136,29 +133,30 @@ def metaversemarket_description_parser(soup): # return: 'row' that contains a variety of lists that each hold info on the listing page def metaversemarket_listing_parser(soup): # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "DarkFox" # 0 Marketplace_Name - name = [] # 1 Product_Name - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category - describe = [] # 5 Product_Description - escrow = [] # 6 Vendor_Warranty - views = [] # 7 Product_Number_Of_Views - reviews = [] # 8 Product_Number_Of_Reviews - addDate = [] # 9 Product_AddDate - lastSeen = [] # 10 Product_LastViewDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft = [] # 15 Product_QuantityLeft - shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions - href = [] # 23 Product_Links (Urls) + nm = 0 # *Total_Products 
(Should be Integer) + mktName = "MetaVerseMarket" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"}) @@ -175,11 +173,7 @@ def metaversemarket_listing_parser(soup): # Finding the Product product = bae[1].find('span', {"class": "text-primary"}).text - product = product.replace('\n', ' ') - product = product.replace(",", "") - product = product.replace("...", "") - product = product.strip() - name.append(product) + name.append(cleanString(product.strip())) # Finding Prices price = a.find('strong').text @@ -191,20 +185,18 @@ def metaversemarket_listing_parser(soup): # Finding the Vendor temp = a.find('div', {'class': "mt-1 fs-12px"}) temp = temp.findAll('span') - temp = temp[1].find('b').text - vendor_name = temp.replace("@", "").strip() - vendor.append(vendor_name) + vendor_name = temp[1].find('b').text + vendor.append(cleanString(vendor_name.strip())) # Finding the Category cat = a.select_one('div[class="fs-12px"]') cat = cat.findAll('span')[1].text - cat = cat.text cat = cat.strip() category.append(cat) - badge = a.findAll('span', {'class': "badge bg-success"}) + badge = a.find('span', {'class': "badge bg-success"}) # Finding Number Sold and Quantity Left - temp = badge[1].text + temp = badge.text temp = temp.split("/") num = temp[0] num = num.strip() @@ -226,11 +218,7 @@ def metaversemarket_listing_parser(soup): description = a.find('p', {'class': "alert alert-light text-ssbold p-1"}).text description = description.replace("\n", " ") description = description.strip() - describe.append(description) - - # Finding Escrow - es = a.find('span', {'class': "fw-bold"}).text.strip() - escrow.append(es) + describe.append(cleanString(description)) # Finding Number of Views view = a.find('span', {'class': "badge bg-primary"}).text.strip() @@ -239,11 +227,11 @@ def metaversemarket_listing_parser(soup): # Find where ships from ships = a.find('div', {'class': "alert alert-info item_alert fs-12px p-1"}) ships = ships.findAll('b') - sFrom = ships[0].text.strips() + sFrom = ships[0].text.strip() shipFrom.append(sFrom) # Find where it ships to - sTo = ships[1].text.strips() + sTo = ships[1].text.strip() shipTo.append(sTo) # Searching for CVE and MS categories @@ -274,8 +262,8 @@ def metaversemarket_listing_parser(soup): MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, - BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href) 
+ return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) # called by the crawler to get description links on a listing page diff --git a/MarketPlaces/PabloEscobarMarket/crawler_selenium.py b/MarketPlaces/PabloEscobarMarket/crawler_selenium.py index 8dc783c..7f516ff 100644 --- a/MarketPlaces/PabloEscobarMarket/crawler_selenium.py +++ b/MarketPlaces/PabloEscobarMarket/crawler_selenium.py @@ -209,7 +209,7 @@ def crawlForum(driver): driver.back() # comment out - # break + break # comment out if count == 1: diff --git a/MarketPlaces/PabloEscobarMarket/parser.py b/MarketPlaces/PabloEscobarMarket/parser.py index ecdd086..a716581 100644 --- a/MarketPlaces/PabloEscobarMarket/parser.py +++ b/MarketPlaces/PabloEscobarMarket/parser.py @@ -14,26 +14,27 @@ from bs4 import BeautifulSoup def pabloescobarmarket_description_parser(soup): # Fields to be parsed - name = "-1" # 0 Product_Name - describe = "-1" # 1 Product_Description - lastSeen = "-1" # 2 Product_LastViewDate - CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 5 Product_MS_Classification (Microsoft Security) - review = "-1" # 6 Product_Number_Of_Reviews - category = "-1" # 7 Product_Category - shipFrom = "-1" # 8 Product_ShippedFrom - shipTo = "-1" # 9 Product_ShippedTo - left = "-1" # 10 Product_QuantityLeft - escrow = "-1" # 11 Vendor_Warranty - terms = "-1" # 12 Vendor_TermsAndConditions - vendor = "-1" # 13 Vendor_Name - sold = "-1" # 14 Product_QuantitySold - addDate = "-1" # 15 Product_AddedDate - BTC = "-1" # 18 Product_BTC_SellingPrice - USD = "-1" # 19 Product_USD_SellingPrice - rating = "-1" # 20 Vendor_Rating - success = "-1" # 21 Vendor_Successful_Transactions - EURO = "-1" # 22 Product_EURO_SellingPrice + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image # Finding Product Name # NA @@ -109,8 +110,8 @@ def pabloescobarmarket_description_parser(soup): MS = MS.replace('\n', '') # Populating the final variable (this should be a list with all fields scraped) - row = (name, describe, lastSeen, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor, - sold, addDate, BTC, USD, rating, success, EURO) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row @@ -122,29 +123,30 @@ def pabloescobarmarket_description_parser(soup): # return: 'row' that contains a variety of lists that each hold info on the listing 
page def pabloescobarmarket_listing_parser(soup): # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "PabloEscobarMarket" # 0 Marketplace_Name - name = [] # 1 Product_Name - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category - describe = [] # 5 Product_Description - escrow = [] # 6 Vendor_Warranty - views = [] # 7 Product_Number_Of_Views - reviews = [] # 8 Product_Number_Of_Reviews - addDate = [] # 9 Product_AddDate - lastSeen = [] # 10 Product_LastViewDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft = [] # 15 Product_QuantityLeft - shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 Product_ShippedTo - vendor = [] # 18 Vendor - rating = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions - href = [] # 23 Product_Links (Urls) + nm = 0 # *Total_Products (Should be Integer) + mktName = "PabloEscobarMarket" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links listing = soup.findAll('div', {"class": "p-4"}) @@ -220,8 +222,8 @@ def pabloescobarmarket_listing_parser(soup): MS.append(MSValue) # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, - BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) # called by the crawler to get description links on a listing page diff --git a/MarketPlaces/ThiefWorld/crawler_selenium.py b/MarketPlaces/ThiefWorld/crawler_selenium.py index 16f60b0..af5a456 100644 --- a/MarketPlaces/ThiefWorld/crawler_selenium.py +++ b/MarketPlaces/ThiefWorld/crawler_selenium.py @@ -87,8 +87,8 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True) - ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) 
ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) @@ -180,8 +180,8 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # Hacking and DDOS - links.append('http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/35') + # Hacking and DOSS + links.append(['Hacking and DOSS', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/35']) # # Carding Manuals # links.append('http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/20') # # Software @@ -202,7 +202,8 @@ def crawlForum(driver): i = 0 while i < len(linksToCrawl): - link = linksToCrawl[i] + cat = linksToCrawl[i][0] + link = linksToCrawl[i][1] print('Crawling :', link) try: has_next_page = True @@ -214,6 +215,7 @@ def crawlForum(driver): except: driver.refresh() html = driver.page_source + html += f"{cat}" savePage(driver, html, link) list = productPages(html) diff --git a/MarketPlaces/ThiefWorld/parser.py b/MarketPlaces/ThiefWorld/parser.py index bd6c371..ba0f51c 100644 --- a/MarketPlaces/ThiefWorld/parser.py +++ b/MarketPlaces/ThiefWorld/parser.py @@ -66,8 +66,6 @@ def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple: rating_item = rating_item.replace("rating", "") rating_item = cleanString(rating_item.strip()) - category = "Hacking, DOSS" - # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) @@ -126,7 +124,9 @@ def thiefWorld_listing_parser(soup: BeautifulSoup): CVE.append('-1') MS.append('-1') - category.append('Hacking, DOSS') + + cat = soup.find('calsys-cat').text + category.append(cat.strip()) productDescription = product.find('div', {'class': 'text'}).text productDescription = cleanString(productDescription.strip()) diff --git a/MarketPlaces/TorMarket/crawler_selenium.py b/MarketPlaces/TorMarket/crawler_selenium.py index de75f89..86fde52 100644 --- a/MarketPlaces/TorMarket/crawler_selenium.py +++ b/MarketPlaces/TorMarket/crawler_selenium.py @@ -215,14 +215,14 @@ def crawlForum(driver): driver.back() # comment out - break + # break # comment out if count == 1: break try: - link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') + link = driver.find_element(by=By.LINK_TEXT, value='NEXT').get_attribute('href') if link == "": raise NoSuchElementException count += 1 diff --git a/MarketPlaces/TorMarket/parser.py b/MarketPlaces/TorMarket/parser.py index e6e14b9..417f8ac 100644 --- a/MarketPlaces/TorMarket/parser.py +++ b/MarketPlaces/TorMarket/parser.py @@ -16,29 +16,27 @@ def tormarket_description_parser(soup): # Fields to be parsed - name = "-1" # 0 Product_Name - describe = "-1" # 1 Product_Description - lastSeen = "-1" # 2 Product_LastViewDate - rules = "-1" # 3 NOT USED ... 
- CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = "-1" # 5 Product_MS_Classification (Microsoft Security) - review = "-1" # 6 Product_Number_Of_Reviews + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) category = "-1" # 7 Product_Category - shipFrom = "-1" # 8 Product_ShippedFrom - shipTo = "-1" # 9 Product_ShippedTo - left = "-1" # 10 Product_QuantityLeft - escrow = "-1" # 11 Vendor_Warranty - terms = "-1" # 12 Vendor_TermsAndConditions - vendor = "-1" # 13 Vendor_Name - sold = "-1" # 14 Product_QuantitySold - addDate = "-1" # 15 Product_AddedDate - available = "-1" # 16 NOT USED ... - endDate = "-1" # 17 NOT USED ... - BTC = "-1" # 18 Product_BTC_SellingPrice - USD = "-1" # 19 Product_USD_SellingPrice - rating = "-1" # 20 Vendor_Rating - success = "-1" # 21 Vendor_Successful_Transactions - EURO = "-1" # 22 Product_EURO_SellingPrice + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + image = "-1" # 19 Product_Image + vendor_image = "-1" # 20 Vendor_Image #finding the name of the product name_of_product = soup.find("h1", {"class": "product_title entry-title"}).find("a").text @@ -51,7 +49,7 @@ def tormarket_description_parser(soup): if inquires_about_product == "There are no inquiries yet.": review = 0 else: - review = -1 #fix later pls + review = "-1" #fix later pls #finding the terms and conditions terms_and_conditions = soup.find("div", {"class": "woocommerce-Tabs-panel woocommerce-Tabs-panel--wcfm_enquiry_tab panel entry-content wc-tab"}).find("p").text @@ -68,8 +66,8 @@ def tormarket_description_parser(soup): #everything else gets a -1 because they are not found # Populating the final variable (this should be a list with all fields scraped) - row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor, - sold, addDate, available, endDate, BTC, USD, rating, success, EURO) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) # Sending the results return row @@ -82,28 +80,30 @@ def tormarket_description_parser(soup): def tormarket_listing_parser(soup): # Fields to be parsed - nm = 0 # *Total_Products (Should be Integer) - mktName = "TorMarket" # 0 *Marketplace_Name - vendor = [] # 1 *Vendor y - rating_vendor = [] # 2 Vendor_Rating - success = [] # 3 Vendor_Successful_Transactions - name = [] # 4 *Product_Name y - CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 6 Product_MS_Classification (Microsoft Security) - category = [] # 7 Product_Category y - describe = [] # 8 Product_Description - views = [] # 9 Product_Number_Of_Views - reviews = [] # 10 Product_Number_Of_Reviews - rating_item = [] # 11 Product_Rating - addDate = [] # 12 
Product_AddDate - BTC = [] # 13 Product_BTC_SellingPrice - USD = [] # 14 Product_USD_SellingPrice y - EURO = [] # 15 Product_EURO_SellingPrice - sold = [] # 16 Product_QuantitySold - qLeft = [] # 17 Product_QuantityLeft - shipFrom = [] # 18 Product_ShippedFrom - shipTo = [] # 19 Product_ShippedTo - href = [] # 20 Product_Links + nm = 0 # *Total_Products (Should be Integer) + mktName = "TorMarket" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this + MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + image = [] # 20 Product_Image + image_vendor = [] # 21 Vendor_Image + href = [] # 22 Product_Links products_list = soup.find('ul', {"class": "products columns-3 tablet-columns-2 mobile-columns-1"}).find_all('li') nm = len(products_list) @@ -159,30 +159,8 @@ def tormarket_listing_parser(soup): # Populate the final variable (this should be a list with all fields scraped) - return organizeProducts( - marketplace = "TorMarket", - nm = nm, - vendor = vendor, - rating_vendor = rating_vendor, - success_vendor = success, - nombre = name, - CVE = CVE, - MS = MS, - category = category, - describe = describe, - views = views, - reviews = reviews, - rating_item = rating_item, - addDate = addDate, - BTC = BTC, - USD = USD, - EURO = EURO, - sold = sold, - qLeft = qLeft, - shipFrom = shipFrom, - shipTo = shipTo, - href = href - ) + return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, + reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) #called by the crawler to get description links on a listing page From d1d53d9b239dc775b14ad8fc12be322ac784763c Mon Sep 17 00:00:00 2001 From: westernmeadow Date: Mon, 4 Sep 2023 22:39:11 -0700 Subject: [PATCH 3/6] fixed change tracking bug and image tracking for AnonMarket --- Forums/Utilities/utilities.py | 26 ++-- MarketPlaces/AnonMarket/crawler_selenium.py | 7 +- MarketPlaces/AnonMarket/parser.py | 131 +++++++++++--------- MarketPlaces/DB_Connection/db_connection.py | 4 +- MarketPlaces/Utilities/utilities.py | 25 ++-- 5 files changed, 103 insertions(+), 90 deletions(-) diff --git a/Forums/Utilities/utilities.py b/Forums/Utilities/utilities.py index 659e456..2c2d89f 100644 --- a/Forums/Utilities/utilities.py +++ b/Forums/Utilities/utilities.py @@ -195,12 +195,16 @@ def cleanLink(originalLink): def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate): - day = time.strftime("%m/%d/%Y") - ahora = time.strftime("%I:%M:%S") - rw = [] + current_time = datetime.now() + day = current_time.strftime("%m/%d/%Y") + for n in range(nm): + + current_time += timedelta(seconds=2) + ahora = current_time.strftime("%I:%M:%S") + lne = forum # 0 lne += "," lne += board # 1 @@ -400,19 +404,19 @@ def 
cleanHTML(driver, html): ] # remove images - clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) for fmat in formats: - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) # remove JavaScript - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) # image and JavaScript - clean_html = re.sub(r"]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);", "", clean_html) + clean_html = re.sub(r"]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html) return clean_html diff --git a/MarketPlaces/AnonMarket/crawler_selenium.py b/MarketPlaces/AnonMarket/crawler_selenium.py index 42d8e49..e5f5a3d 100644 --- a/MarketPlaces/AnonMarket/crawler_selenium.py +++ b/MarketPlaces/AnonMarket/crawler_selenium.py @@ -159,9 +159,8 @@ def getNameFromURL(url): #as you can see they are categories of products def getInterestedLinks(): links = [] - # # Software - # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/civil_softwares') - # # Malware + + # Malware links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/malware') # # Bootkits # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/bootkits') @@ -195,6 +194,8 @@ def getInterestedLinks(): # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/exploit_kit') # # Security # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security') + # # Ransomware + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/ransomware') return links diff --git a/MarketPlaces/AnonMarket/parser.py b/MarketPlaces/AnonMarket/parser.py index c5c7f6d..997d43e 100644 --- a/MarketPlaces/AnonMarket/parser.py +++ b/MarketPlaces/AnonMarket/parser.py @@ -49,26 +49,29 @@ def AnonMarket_description_parser(soup): info_div = soup.find('div', {'class': 'information'}) table = info_div.find('table') if info_div else None - if table: - # Find all table rows - rows = table.find_all('tr') - - # Parse each row to get relevant data - data = {} - for row in rows: - columns = row.find_all('td') - if len(columns) == 3: - key = columns[0].text.strip() - value = columns[2].text.strip() - data[key] = value - - # Extract specific data from the dictionary and assign them to individual variables - vendor = data.get('Vendor', '-1') - shipFrom = data.get('Location', '-1') - shipTo = data.get('Ships to', '-1') - category = data.get('Category', '-1') - USD = data.get('Price', '-1').split()[0] - left = data.get('Stock', '-1') + # Find all table rows + rows = table.find_all('tr') + + # Parse each row to get relevant data + data = {} + for row in rows: + columns = row.find_all('td') + if len(columns) == 3: + key = columns[0].text.strip() + value = columns[2].text.strip() + data[key] = value + + # Extract specific data from the dictionary and assign them to individual variables + vendor = data.get('Vendor', '-1') + shipFrom = data.get('Location', '-1') + shipTo = data.get('Ships to', '-1') + category = data.get('Category', '-1') 
+ USD = data.get('Price', '-1').split()[0] + left = data.get('Stock', '-1') + + # image + image = soup.find('img', {"class": "bigthumbnail"}) + image = image.get('src').split('base64,')[-1] # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, @@ -111,51 +114,55 @@ def AnonMarket_listing_parser(soup): href = [] # 22 Product_Links base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion" + cat = soup.find("div", {'class': 'heading'}).text + products_list = soup.find_all('div', {'class': 'item'}) nm = 0 for product in products_list: - try: - name_of_product = product.find("div", {"class": "title"}).text.strip() - name.append(name_of_product) - - name_of_vendor = product.find("a", {'class': 'seller'}).text.strip() - vendor.append(name_of_vendor) - - cat = soup.find("div", {'class': 'heading'}).text - category.append(cat) - - product_link_element = product.find("div", {"class": "title"}).find_parent('a') - if product_link_element: - link = product_link_element['href'] - if "/product/" in link and "/user/" not in link: - full_link = base_url + link - href.append(full_link) - else: - href.append("-1") - else: - href.append("-1") - - # Append '-1' for unavailable data - rating_vendor.append("-1") - success.append("-1") - CVE.append("-1") - MS.append("-1") - describe.append("-1") - views.append("-1") - reviews.append("-1") - addDate.append("-1") - BTC.append("-1") - EURO.append("-1") - sold.append("-1") - qLeft.append("-1") - shipFrom.append("-1") - shipTo.append("-1") - - nm += 1 - - except AttributeError as e: - print("I'm somewhere I don't belong. I'm going to leave") - continue + name_of_product = product.find("div", {"class": "title"}).text.strip() + name.append(name_of_product) + + name_of_vendor = product.find("a", {'class': 'seller'}).text.strip() + vendor.append(name_of_vendor) + + category.append(cat) + + tbody = product.find('div', {"class": "info"}).find('tbody') + + # rating_item + width = tbody.find('div', {"class": "stars2"}).get('style') + rating_item.append(cleanNumbers(width.strip())) + + tr = tbody.findAll('tr', recursive=False) + td = tr[2].findAll('td') + + # sold + sold.append(td[0].text.strip()) + + # reviews + reviews.append(td[1].text.strip()) + + product_link_element = product.find("div", {"class": "title"}).find_parent('a') + link = product_link_element['href'] + full_link = base_url + link + href.append(full_link) + + # Append '-1' for unavailable data + rating_vendor.append("-1") + success.append("-1") + CVE.append("-1") + MS.append("-1") + describe.append("-1") + views.append("-1") + addDate.append("-1") + BTC.append("-1") + USD.append("-1") + EURO.append("-1") + qLeft.append("-1") + shipFrom.append("-1") + shipTo.append("-1") + + nm += 1 # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index eb4d996..74b1be5 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -266,7 +266,7 @@ def create_items(cur, row, marketId, vendorId): recset = cur.fetchall() - #decode_decrypt_image_in_base64(recset[0][20]) + # decode_decrypt_image_in_base64(recset[0][20]) if (str(recset[0][4]) != str(row[5] if row[5] != '-1' else None) or 
str(recset[0][5]) != str(row[6] if row[6] != '-1' else None) or str(recset[0][6]) != str(row[7] if row[7] != '-1' else None) or str(recset[0][7]) != str(row[8] if row[8] != '-1' else None) or @@ -332,7 +332,7 @@ def create_items(cur, row, marketId, vendorId): 'shippedto_item': row[19] if row[19] != '-1' else None, 'dateinserted_item': row[23], 'lastseen_item': row[23], - 'image_item': row[20], + 'image_item': row[20] if row[20] != '-1' else None, 'itemId': itemId}) diff --git a/MarketPlaces/Utilities/utilities.py b/MarketPlaces/Utilities/utilities.py index fb9b122..77312f6 100644 --- a/MarketPlaces/Utilities/utilities.py +++ b/MarketPlaces/Utilities/utilities.py @@ -8,8 +8,6 @@ import base64 import io import configparser from datetime import datetime, timedelta -import datetime as fulldatetime -from bs4 import BeautifulSoup from lxml import html as lxml from selenium.webdriver.common.by import By from Crypto.Cipher import AES @@ -246,11 +244,14 @@ def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nom rw = [] - day = time.strftime("%m/%d/%Y") - ahora = time.strftime("%I:%M:%S") + current_time = datetime.now() + day = current_time.strftime("%m/%d/%Y") for n in range(nm): + current_time += timedelta(seconds=2) + ahora = current_time.strftime("%I:%M:%S") + lne = marketplace # 0 lne += "," lne += vendor[n] # 1 @@ -422,19 +423,19 @@ def cleanHTML(driver, html): ] # remove images - clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) for fmat in formats: - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) # remove JavaScript - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) - clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) # image and JavaScript - clean_html = re.sub(r"]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);", "", clean_html) + clean_html = re.sub(r"]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html) return clean_html From 0345836e20edacb80377d28781bb29a55e7dcb82 Mon Sep 17 00:00:00 2001 From: westernmeadow Date: Tue, 5 Sep 2023 17:59:33 -0700 Subject: [PATCH 4/6] user image tracking ONLY (missing post image) for some forums --- .idea/DW_Pipeline_Test.iml | 1 + Forums/Altenens/parser.py | 8 ++++ Forums/BestCardingWorld/parser.py | 5 ++ Forums/Cardingleaks/parser.py | 6 ++- Forums/CryptBB/parser.py | 5 ++ Forums/HiddenAnswers/parser.py | 17 ++++++- Forums/OnniForums/parser.py | 7 ++- Forums/Utilities/utilities.py | 8 ++++ MarketPlaces/Apocalypse/parser.py | 7 +-- MarketPlaces/GoFish/crawler_selenium.py | 28 +++-------- MarketPlaces/Torzon/crawler_selenium.py | 62 +++++++++---------------- 11 files changed, 82 insertions(+), 72 deletions(-) diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml index f27dbb9..9ee2f4c 100644 --- a/.idea/DW_Pipeline_Test.iml +++ b/.idea/DW_Pipeline_Test.iml @@ -28,6 +28,7 @@ diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py index 19155d5..bdad19d 100644 --- a/Forums/Altenens/parser.py +++ b/Forums/Altenens/parser.py @@ -22,6 +22,7 @@ def altenens_description_parser(soup): post = [] # 6 all messages of each post feedback = [] # 7 all feedbacks of each vendor (this was found in just 
one Forum and with a number format) addDate = [] # 8 all dates of each post + image_user = [] topic = soup.find("h1", {"class": "p-title-value"}).text topic = cleanString(topic.strip()) @@ -66,6 +67,13 @@ def altenens_description_parser(soup): date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z') addDate.append(date_time_obj) + img = ipost.find('div', {"class": "message-avatar-wrapper"}).find('img') + if img is not None: + img = img.get('src').split('base64,')[-1] + else: + img = "-1" + image_user.append(img) + # Populate the final variable (this should be a list with all fields scraped) row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) diff --git a/Forums/BestCardingWorld/parser.py b/Forums/BestCardingWorld/parser.py index c4ca6e0..5a294c6 100644 --- a/Forums/BestCardingWorld/parser.py +++ b/Forums/BestCardingWorld/parser.py @@ -25,6 +25,7 @@ def bestcardingworld_description_parser(soup): sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post) post = [] # 7 all messages of each post interest = [] # 8 all user's interest in each post + image_user = [] # Finding the topic (should be just one coming from the Listing Page) @@ -150,6 +151,10 @@ def bestcardingworld_description_parser(soup): feedback.append("-1") + img = ipost.find('div', {"class": "avatar-container"}).find('img', {"class": "avatar"}) + img = img.get('src').split('base64,')[-1] + image_user.append(img) + # Populate the final variable (this should be a list with all fields scraped) row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) diff --git a/Forums/Cardingleaks/parser.py b/Forums/Cardingleaks/parser.py index 98ddf3a..7ab139d 100644 --- a/Forums/Cardingleaks/parser.py +++ b/Forums/Cardingleaks/parser.py @@ -25,6 +25,7 @@ def cardingleaks_description_parser(soup: Tag): post = [] # 6 all messages of each post feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) addDate = [] # 8 all dates of each post + image_user = [] li = soup.find("h1", {"class": "p-title-value"}) topic = cleanString(li.text.strip()) @@ -62,7 +63,10 @@ def cardingleaks_description_parser(soup: Tag): datetime_text = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime") datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z") addDate.append(datetime_obj) - + + img = ipost.find('div', {"class": "message-avatar"}).find('img') + img = img.get('src').split('base64,')[-1] + image_user.append(img) # Populate the final variable (this should be a list with all fields scraped) diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py index bcef5f8..bfe4403 100644 --- a/Forums/CryptBB/parser.py +++ b/Forums/CryptBB/parser.py @@ -25,6 +25,7 @@ def cryptBB_description_parser(soup): post = [] # 6 all messages of each post feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) addDate = [] # 8 all dates of each post + image_user = [] # Finding the topic (should be just one coming from the Listing Page) @@ -155,6 +156,10 @@ def cryptBB_description_parser(soup): feedback.append("-1") + img = ipost.find('div', {"class": "author_avatar"}).find('img') + img = img.get('src').split('base64,')[-1] + image_user.append(img) + # Populate the final variable (this should be a list with all fields scraped) row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) diff --git 
a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py index 16b56cb..e42ace8 100644 --- a/Forums/HiddenAnswers/parser.py +++ b/Forums/HiddenAnswers/parser.py @@ -22,7 +22,7 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): sign: List[str] = [] # all user's signature in each post (usually a standard message after the content of the post) post: List[str] = [] # all messages of each post interest: List[str] = [] # all user's interest in each post - + image_user = [] # Finding the topic (should be just one coming from the Listing Page) li = soup.find("h1").find("span", {"itemprop": "name"}) @@ -53,7 +53,13 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): feedback.append("-1") sign.append("-1") interest.append("-1") - + + img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img') + if img is not None: + img = img.get('src').split('base64,')[-1] + else: + img = "-1" + image_user.append(img) answer_list: ResultSet[Tag] = soup.find("div", {"class": "qa-a-list"}).find_all("div", {"class": "qa-a-list-item"}) @@ -84,6 +90,13 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): sign.append("-1") interest.append("-1") + img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img') + if img is not None: + img = img.get('src').split('base64,')[-1] + else: + img = "-1" + image_user.append(img) + # Populate the final variable (this should be a list with all fields scraped) row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) diff --git a/Forums/OnniForums/parser.py b/Forums/OnniForums/parser.py index 3854141..e0c780a 100644 --- a/Forums/OnniForums/parser.py +++ b/Forums/OnniForums/parser.py @@ -143,8 +143,7 @@ def onniForums_listing_parser(soup: BeautifulSoup): body = thread.find("span",{"class": "subject_new"}) try: post_subject: str = body.text #getting the topic - - except AttributeError: + except: body = thread.find("span",{"class": "subject_old"}) post_subject: str = body.text @@ -153,10 +152,10 @@ def onniForums_listing_parser(soup: BeautifulSoup): reply_count = thread.find_all("td", {"align": "center"})[2].text - post.append(reply_count) + post.append(cleanNumbers(reply_count)) views = thread.find_all("td", {"align": "center"})[3].text - view.append(views) + view.append(cleanNumbers(views)) # dates_added: str = thread.find("span",{"class" : "thread_start_datetime smalltext"}).text # dates_added_cleaned = dates_added.split(',')[0] diff --git a/Forums/Utilities/utilities.py b/Forums/Utilities/utilities.py index 2c2d89f..e7afcb8 100644 --- a/Forums/Utilities/utilities.py +++ b/Forums/Utilities/utilities.py @@ -306,6 +306,14 @@ def convertFromLongDate(longDate, crawlerdate): return correct_date +def cleanNumbers(inputString): + + reg_ex = re.compile(r'[^\d.]+') + updated_string = reg_ex.sub('', inputString) + + return updated_string + + def aes_encryption(item): to_bytes = bytes(item) diff --git a/MarketPlaces/Apocalypse/parser.py b/MarketPlaces/Apocalypse/parser.py index b7a4f63..8cd3a5b 100644 --- a/MarketPlaces/Apocalypse/parser.py +++ b/MarketPlaces/Apocalypse/parser.py @@ -141,8 +141,6 @@ def apocalypse_listing_parser(soup: Tag): product_price = prod.find("span", {"class": "priceP"}).text USD.append(cleanString(product_price.strip())) - - product_sold = prod.find("span", {"class": "badge badge-success"}).text sold.append(cleanString(product_sold.strip())) @@ -168,7 +166,6 @@ def apocalypse_listing_parser(soup: Tag): # When split by the star (★), it should return a 2-value array product_vendor, 
product_vendor_rating = product_vendor_tag.text.split("★") - try: vendor.append(cleanString(product_vendor.strip())) rating.append(cleanString(product_vendor_rating.strip())) @@ -179,8 +176,7 @@ def apocalypse_listing_parser(soup: Tag): href.append(product_href) nm += 1 - - + return organizeProducts( marketplace=mktName, nm=nm, @@ -208,6 +204,7 @@ def apocalypse_listing_parser(soup: Tag): image_vendor=image_vendor ) + #called by the crawler to get description links on a listing page #@param: beautifulsoup object that is using the correct html page (listing page) #return: list of description links from a listing page diff --git a/MarketPlaces/GoFish/crawler_selenium.py b/MarketPlaces/GoFish/crawler_selenium.py index 0f87696..e5af35b 100644 --- a/MarketPlaces/GoFish/crawler_selenium.py +++ b/MarketPlaces/GoFish/crawler_selenium.py @@ -31,7 +31,6 @@ baseURL = 'http://gofishbybookb4a2kvviuygmwjqfxx7nqsovweogs2cxvqvexhe7edyd.onion # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - # opentor() mktName = getMKTName() driver = getAccess() @@ -41,24 +40,11 @@ def startCrawling(): crawlForum(driver) except Exception as e: print(driver.current_url, e) - closetor(driver) + closeDriver(driver) new_parse(mktName, baseURL, True) -# Opens Tor Browser -#prompts for ENTER input to continue -def opentor(): - from MarketPlaces.Initialization.markets_mining import config - - global pid - print("Connecting Tor...") - pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) - pid = pro.pid - time.sleep(7.5) - input('Tor Connected. Press ENTER to continue\n') - return - # Returns the name of the website #return: name of site in string type def getMKTName(): @@ -75,7 +61,7 @@ def getFixedURL(): # Closes Tor Browser #@param: current selenium driver -def closetor(driver): +def closeDriver(driver): # global pid # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") @@ -102,7 +88,7 @@ def createFFDriver(): ff_prof.set_preference("network.cookie.lifetimePolicy", 2) # ff_prof.set_preference("network.dns.disablePrefetch", True) # ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 1) + ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") @@ -118,7 +104,7 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) - # driver.maximize_window() + driver.maximize_window() return driver @@ -140,7 +126,6 @@ def getAccess(): # then allows for manual solving of captcha in the terminal #@param: current selenium web driver def login(driver): - input("Press ENTER when CAPTCHA is completed\n") # wait for page to show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 100).until(EC.visibility_of_element_located( @@ -154,11 +139,12 @@ def login(driver): # Password here passwordBox.send_keys('DementedBed123-') - input("Press ENTER when CAPTCHA and exit pressed is completed\n") + input("Press ENTER when CAPTCHA is completed and logged in\n") # wait for listing page show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 
100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div/div[2]/div/div/div/div/div/div[1]/a/img"))) + (By.XPATH, "/html/body/div[1]/div[3]/div[1]/div[3]/ul/div"))) + # Saves the crawled html page, makes the directory path for html pages if not made def savePage(driver, page, url): diff --git a/MarketPlaces/Torzon/crawler_selenium.py b/MarketPlaces/Torzon/crawler_selenium.py index 6636e80..8560c57 100644 --- a/MarketPlaces/Torzon/crawler_selenium.py +++ b/MarketPlaces/Torzon/crawler_selenium.py @@ -34,7 +34,6 @@ BASE_URL = 'http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onio # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - opentor() mktName = getMKTName() driver = getAccess() @@ -44,25 +43,11 @@ def startCrawling(): crawlForum(driver) except Exception as e: print(driver.current_url, e) - closetor(driver) + closeDriver(driver) new_parse(mktName, BASE_URL, False) -# Opens Tor Browser -#prompts for ENTER input to continue -def opentor(): - from MarketPlaces.Initialization.markets_mining import config - - global pid - print("Connecting Tor...") - pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) - pid = pro.pid - time.sleep(7.5) - input('Tor Connected. Press ENTER to continue\n') - return - - # Returns the name of the website #return: name of site in string type def getMKTName(): @@ -79,7 +64,7 @@ def getFixedURL(): # Closes Tor Browser #@param: current selenium driver -def closetor(driver): +def closeDriver(driver): # global pid # os.system("taskkill /pid " + str(pro.pid)) # os.system("taskkill /t /f /im tor.exe") @@ -96,7 +81,6 @@ def createFFDriver(): ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof.set_preference("places.history.enabled", False) ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) @@ -107,7 +91,7 @@ def createFFDriver(): ff_prof.set_preference("network.cookie.lifetimePolicy", 2) # ff_prof.set_preference("network.dns.disablePrefetch", True)#connection issue # ff_prof.set_preference("network.http.sendRefererHeader", 0)#connection issue - ff_prof.set_preference("permissions.default.image", 1) + ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") @@ -123,6 +107,8 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + driver.maximize_window() + return driver @@ -146,15 +132,13 @@ def login(driver): input("Press ENTER when CAPTCHA is completed and page is loaded\n") # wait for page to show up (This Xpath may need to change based on different seed url) + # Saves the crawled html page, makes the directory path for html pages if not made -def savePage(page, url): - cleanPage = cleanHTML(page) +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) filePath = getFullPathName(url) - # filePath = getFullPathName("Hello") os.makedirs(os.path.dirname(filePath), exist_ok=True) - with open(filePath, 'wb') as file: - file.write(cleanPage.encode('utf-8')) - # open(filePath, 'wb').write(cleanPage.encode('utf-8')) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) return @@ -191,16 
+175,16 @@ def getInterestedLinks(): links = [] # # services - links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870') + # links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870') - # # software & malware + # software & malware links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870') # # fraud - links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870') + # links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Services&small=0&big=5000000&id=1995441210213618738586452129269668912607120977870') # # guides - links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089') + # links.append('http://torzon4kv5swfazrziqvel2imhxcckc4otcvopiv5lnxzpqu4v4m5iyd.onion/products.php?sta=1&shipsto=All&shipsfrom=All&category=Guides and Tutorials&small=0&big=5000000&id=75026212163304997524932260388151806190538071909089') return links @@ -227,27 +211,27 @@ def crawlForum(driver): except: driver.refresh() html = driver.page_source - savePage(html, link) + savePage(driver, html, link) list = productPages(html) for item in list: itemURL = urlparse.urljoin(BASE_URL, str(item)) try: - time.sleep(1.5) # to keep from detecting click speed + # time.sleep(1.5) # to keep from detecting click speed driver.get(itemURL) except: driver.refresh() - savePage(driver.page_source, item) - time.sleep(1.5) + savePage(driver, driver.page_source, item) + # time.sleep(1.5) driver.back() # to keep from detecting click speed - # # comment out - # break - # - # # comment out - # if count == 1: - # break + # comment out + break + + # comment out + if count == 1: + break try: # nav = driver.find_element(by=By.XPATH, value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]') From eb85eeb1021c74f65f910352faa34edcfb160069 Mon Sep 17 00:00:00 2001 From: westernmeadow Date: Thu, 7 Sep 2023 16:30:41 -0700 Subject: [PATCH 5/6] added Nulled forum and Kerberos market (both unfinished) --- Forums/Nulled/crawler_selenium.py | 281 +++++++++++++++++ Forums/Nulled/parser.py | 355 ++++++++++++++++++++++ MarketPlaces/Kerberos/crawler_selenium.py | 337 ++++++++++++++++++++ MarketPlaces/Kerberos/parser.py | 249 +++++++++++++++ 4 files changed, 1222 insertions(+) create mode 100644 Forums/Nulled/crawler_selenium.py create mode 100644 Forums/Nulled/parser.py create mode 100644 MarketPlaces/Kerberos/crawler_selenium.py create mode 100644 MarketPlaces/Kerberos/parser.py diff --git a/Forums/Nulled/crawler_selenium.py b/Forums/Nulled/crawler_selenium.py new file mode 100644 index 0000000..01ded13 --- /dev/null +++ b/Forums/Nulled/crawler_selenium.py @@ -0,0 +1,281 @@ +__author__ = 'DarkWeb' + +''' +nulled Forum Crawler (Selenium) +''' + +from selenium import 
webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.firefox.firefox_profile import FirefoxProfile +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.service import Service +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait +from PIL import Image +import base64 +from io import BytesIO + + +import urllib.parse as urlparse +import os, re, time +from datetime import date +import subprocess +from bs4 import BeautifulSoup +from Forums.Initialization.prepare_parser import new_parse +from Forums.Nulled.parser import nulled_links_parser +from Forums.Utilities.utilities import cleanHTML + +counter = 1 +baseURL = 'https://www.nulled.to' + + +# Opens Tor Browser, crawls the website +def startCrawling(): + # opentor() + forumName = getForumName() + # driver = getAccess() + # + # if driver != 'down': + # login(driver) + # crawlForum(driver) + # closetor(driver) + + new_parse(forumName, False) + + +# Opens Tor Browser +def opentor(): + global pid + print("Connecting Tor...") + path = open('../../path.txt').readline().strip() + pro = subprocess.Popen(path) + pid = pro.pid + time.sleep(7.5) + input('Tor Connected. Press ENTER to continue\n') + return + + +# Login using premade account credentials and do login captcha manually +def login(driver): + time.sleep(3) + + +# Returns the name of the website +def getForumName(): + name = 'Nulled' + return name + + +# Return the link of the website +def getFixedURL(): + url = 'https://www.nulled.to' + return url + + +# Closes Tor Browser +def closetor(driver): + global pid + # os.system("taskkill /pid " + str(pro.pid)) + os.system("taskkill /t /f /im tor.exe") + print('Closing Tor...') + driver.close() + time.sleep(3) + return + + +# Creates FireFox 'driver' and configure its 'Profile' +# to use Tor proxy and socket +def createFFDriver(): + file = open('../../path.txt', 'r') + lines = file.readlines() + + ff_binary = FirefoxBinary(lines[0].strip()) + + ff_prof = FirefoxProfile(lines[1].strip()) + # ff_prof.set_preference("places.history.enabled", False) + # ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + # ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + # ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + # ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + # ff_prof.set_preference("signon.rememberSignons", False) + # ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + # ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) + # ff_prof.set_preference("permissions.default.image", 3) + # ff_prof.set_preference("browser.download.folderList", 2) + # ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + # ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", True) + ff_prof.update_preferences() + + service = Service(lines[2].strip()) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, 
service=service) + + return driver + + +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + + try: + driver.get(url) + # time.sleep(3) + return driver + + except: + + return 'down' + + +# Saves the crawled html page +def savePage(page, url): + cleanPage = cleanHTML(page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +# Gets the full path of the page to be saved along with its appropriate file name +def getFullPathName(url): + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = r'..\Nulled\HTML_Pages\\' + str( + "%02d" % date.today().month) + str("%02d" % date.today().day) + str( + "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + else: + fullPath = r'..\Nulled\HTML_Pages\\' + str( + "%02d" % date.today().month) + str("%02d" % date.today().day) + str( + "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + return fullPath + + +# Creates the file name from passed URL +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if (name == ''): + name = str(counter) + counter = counter + 1 + return name + + +def getInterestedLinks(): + links = [] + + # Cracking Tools + links.append('https://www.nulled.to/forum/90-cracking-tools/') + # # Cracking Tutorials + # links.append('https://www.nulled.to/forum/98-cracking-tutorials/') + # # Releases + # links.append('https://www.nulled.to/forum/209-releases/') + # # .Net Framework + # links.append('https://www.nulled.to/forum/51-net-framework/') + # # html css js php + # links.append('https://www.nulled.to/forum/54-html-css-js-php/') + # # C C++ + # links.append('https://www.nulled.to/forum/52-cc/') + # # other languages + # links.append('https://www.nulled.to/forum/135-other-languages/') + + return links + + +def crawlForum(driver): + print("Crawling the Nulled forum") + + linksToCrawl = getInterestedLinks() + visited = set(linksToCrawl) + initialTime = time.time() + + i = 0 + count = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + try: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + has_next_page = True + while has_next_page: + list = topicPages(html) + for item in list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver.page_source, item) + driver.back() + break + + if count == 1: + count = 0 + break + + try: + temp = driver.find_element(by=By.XPATH, value='/html/body/div[4]/div[3]/div/div[3]/div[4]') + temp = temp.find_element(by=By.CLASS_NAME, value='pagination') + link = temp.find_element(by=By.CLASS_NAME, value='next') + link = link.find_element(by=By.TAG_NAME, value='a').get_attribute('href') + + if link == "": + raise NoSuchElementException + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + count += 1 + + except NoSuchElementException: + has_next_page = False + + except Exception as e: + print(link, e) + i += 1 + + # finalTime = time.time() + # print finalTime - initialTime + + input("Crawling Nulled forum done successfully. 
Press ENTER to continue\n") + + +# Returns 'True' if the link is a Topic link +def isDescriptionLink(url): + if 'topic/' in url: + return True + return False + + +# Returns True if the link is a listingPage link +def isListingLink(url): + if 'forum/' in url: + return True + return False + + +# calling the parser to define the links +def topicPages(html): + soup = BeautifulSoup(html, "html.parser") + #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) + return nulled_links_parser(soup) + + +def crawler(): + startCrawling() + # print("Crawling and Parsing BestCardingWorld .... DONE!") diff --git a/Forums/Nulled/parser.py b/Forums/Nulled/parser.py new file mode 100644 index 0000000..aa40365 --- /dev/null +++ b/Forums/Nulled/parser.py @@ -0,0 +1,355 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from Forums.Utilities.utilities import * +from datetime import date +from datetime import timedelta +import re + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + +# This is the method to parse the Description Pages (one page to each topic in the Listing Pages) + +def nulled_description_parser(soup): + + # Fields to be parsed + + topic = "-1" # topic name + user = [] # all users of each post + addDate = [] # all dates of each post + feedback = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format) + status = [] # all user's authority in each post such as (adm, member, dangerous) + reputation = [] # all users' karma in each post (usually found as a number) + sign = [] # all user's signature in each post (usually a standard message after the content of the post) + post = [] # all messages of each post + interest = [] # all user's interest in each post + + # Finding the topic (should be just one coming from the Listing Page) + + li = soup.find("td", {"class": "thead"}).find('strong') + topic = li.text + topic = re.sub("\[\w*\]", '', topic) + + topic = topic.replace(",","") + topic = topic.replace("\n","") + topic = cleanString(topic.strip()) + print(topic) + # Finding the repeated tag that corresponds to the listing of posts + + # posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \ + # soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"}) + + try: + posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all( + 'div', {"class": "post"}) + # print(len(posts)) + + # For each message (post), get all the fields we are interested in: + + for ipost in posts: + + # Finding a first level of the HTML page + + # post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"}) + post_wrapper = ipost.find('span', {"class": "largetext"}) + # Finding the author (user) of the post + + # author = post_wrapper.find('h4') + author = post_wrapper.text.strip() + # print("author " + author) + user.append(cleanString(author)) # Remember to clean the problematic characters + + # Finding the status of the author + + smalltext = ipost.find('div', {"class": "post_author"}) + + # Testing here two possibilities to find this status and combine them + if ipost.find('div', {"class": "deleted_post_author"}): + status.append(-1) + interest.append(-1) + reputation.append(-1) + addDate.append(-1) + post.append("THIS POST 
HAS BEEN REMOVED!") + sign.append(-1) + feedback.append(-1) + continue + + # nulled does have membergroup and postgroup + + membergroup = smalltext.find('div', {"class": "profile-rank"}) + postgroup = smalltext.find('div', {"class": "postgroup"}) + if membergroup != None: + membergroup = membergroup.text.strip() + if postgroup != None: + postgroup = postgroup.text.strip() + membergroup = membergroup + " - " + postgroup + else: + if postgroup != None: + membergroup = postgroup.text.strip() + else: + membergroup = "-1" + + status.append(cleanString(membergroup)) + # print("status " + cleanString(membergroup)) + # Finding the interest of the author + # CryptBB does not have blurb + blurb = smalltext.find('li', {"class": "blurb"}) + if blurb != None: + blurb = blurb.text.strip() + else: + blurb = "-1" + interest.append(cleanString(blurb)) + + # Finding the reputation of the user + # CryptBB does have reputation + author_stats = smalltext.find('div', {"class": "author_statistics"}) + karma = author_stats.find('strong') + if karma != None: + karma = karma.text + karma = karma.replace("Community Rating: ", "") + karma = karma.replace("Karma: ", "") + karma = karma.strip() + else: + karma = "-1" + reputation.append(cleanString(karma)) + # print("karma " + cleanString(karma)) + # Getting here another good tag to find the post date, post content and users' signature + + postarea = ipost.find('div', {"class": "post_content"}) + + dt = postarea.find('span', {"class": "post_date"}).text + # dt = dt.strip().split() + dt = dt.strip() + day=date.today() + if "Yesterday" in dt: + yesterday = day - timedelta(days=1) + yesterday = yesterday.strftime('%m-%d-%Y') + stime = dt.replace('Yesterday,','').strip() + date_time_obj = yesterday+ ', '+stime + date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p') + elif "hours ago" in dt: + day = day.strftime('%m-%d-%Y') + date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title'] + date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p') + else: + date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p') + stime = date_time_obj.strftime('%b %d, %Y') + sdate = date_time_obj.strftime('%I:%M %p') + + + addDate.append(date_time_obj) + # print("date " + str(date_time_obj)) + # Finding the date of the post + # date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p') + # smalltext = postarea.find('div', {"class": "flow_hidden"}).find('div', {"class": "keyinfo"})\ + # .find('div', {"class": "smalltext"}) + # sdatetime = smalltext.text + # sdatetime = sdatetime.replace(u"\xab","") # Removing unnecessary characters + # sdatetime = sdatetime.replace(u"\xbb","") # Removing unnecessary characters + # sdatetime = sdatetime.split("on: ") # Removing unnecessary characters + # sdatetime = sdatetime[1].strip() + # stime = sdatetime[:-12:-1] # Finding the time of the post + # stime = stime[::-1] + # sdate = sdatetime.replace(stime,"") # Finding the date of the post + # sdate = sdate.replace(",","") + # sdate = sdate.strip() + + # Covert the date of the post that can be informed as: "12 February 2016", "today", "yesterday". 
We need + # a date format here as "mm/dd/yyyy" + + # addDate.append(convertDate(sdate,"english", crawlerDate) + " " + stime) + + # Finding the post + + inner = postarea.find('div', {"class": "post_body scaleimages"}) + inner = inner.text.strip() + # print(inner) + post.append(cleanString(inner)) + + # Finding the users's signature + + # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) + signature = ipost.find('div', {"class": "signature scaleimages"}) + if signature != None: + signature = signature.text.strip() + # print(signature) + else: + signature = "-1" + sign.append(cleanString(signature)) + + # As no information about users's feedback was found, just assign "-1" to the variable + + feedback.append("-1") + except: + if soup.find('td', {"class": "trow1"}).text == " You do not have permission to access this page. ": + user.append("-1") + status.append(-1) + interest.append(-1) + reputation.append(-1) + addDate.append(-1) + post.append("NO ACCESS TO THIS PAGE!") + sign.append(-1) + feedback.append(-1) + + + # Populate the final variable (this should be a list with all fields scraped) + + row = (topic, post, user, addDate, feedback, status, reputation, sign, interest) + + # Sending the results + + return row + +# This is the method to parse the Listing Pages (one page with many posts) + +def nulled_listing_parser(soup): + + board = "-1" # board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) + + nm = 0 # this variable should receive the number of topics + topic = [] # all topics + user = [] # all users of each topic + post = [] # number of posts of each topic + view = [] # number of views of each topic + addDate = [] # when the topic was created (difficult to find) + href = [] # this variable should receive all cleaned urls (we will use this to do the marge between + # Listing and Description pages) + + # Finding the board (should be just one) + + board = soup.find('span', {"class": "active"}).text + board = cleanString(board.strip()) + + # Finding the repeated tag that corresponds to the listing of topics + + itopics = soup.find_all('tr', {"class": "inline_row"}) + index = 0 + for itopic in itopics: + + # For each topic found, the structure to get the rest of the information can be of two types. 
Testing all of them + # to don't miss any topic + + + # Adding the topic to the topic list + try: + topics = itopic.find('span', {"class": "subject_old"}).find('a').text + except: + topics = itopic.find('span', {"class": "subject_new"}).find('a').text + topics = re.sub("\[\w*\]", '', topics) + topic.append(cleanString(topics)) + + # Counting how many topics we have found so far + + nm = len(topic) + + # Adding the url to the list of urls + try: + link = itopic.find('span', {"class": "subject_old"}).find('a').get('href') + except: + link = itopic.find('span',{"class": "subject_new"}).find('a').get('href') + link = cleanLink(link) + href.append(link) + + # Finding the author of the topic + ps = itopic.find('div', {"class":"author smalltext"}).find('a').text + author = ps.strip() + user.append(cleanString(author)) + + # Finding the number of replies + columns = itopic.findChildren('td',recursive=False) + posts = columns[3].text + + post.append(cleanString(posts)) + + # Finding the number of Views + tview = columns[4].text + view.append(cleanString(tview)) + + + + # If no information about when the topic was added, just assign "-1" to the variable + #dt = itopic.find('div', {"class": "responsive-hide"}).text.split('»')[1] + #dt = dt.strip() + #date_time_obj = datetime.strptime(dt,'%a %b %d, %Y %I:%M %p') + #addDate.append(date_time_obj) + addDate.append("-1") + + + + index += 1 + return organizeTopics("Nulled", nm, topic, board, view, post, user, addDate, href) + + # if len(tag) > 0: + # + # # Finding the topic + # + # tds = tds[0].find(tag[0]) + # topics = tds.text + # topics = topics.replace(u"\xbb","") + # topics = topics.strip() + # topic.append(cleanString(topics)) + # + # # Counting how many topics we have found so far + # + # nm = len(topic) + # + # # Adding the url to the list of urls + # + # link = tds.findAll('a', href=True) + # link = link[0].get('href') + # link = cleanLink(link) + # href.append(link) + # + # # Finding the author of the topic + # + # ps = itopic.find('td', {"class": tag[1]}).find('p').find('a') + # if ps == None: + # ps = itopic.find('td', {"class": tag[1]}).find('p') + # ps = ps.text.replace("Started by ","") + # else: + # ps = ps.text + # author = ps.strip() + # user.append(cleanString(author)) + # + # # Finding the number of replies + # + # statistics = itopic.find('td', {"class": tag[2]}) + # statistics = statistics.text + # statistics = statistics.split("Replies") + # posts = statistics[0].strip() + # post.append(cleanString(posts)) + # + # # Finding the number of Views + # + # views = statistics[1] + # views = views.replace("Views","") + # views = views.strip() + # view.append(cleanString(views)) + # + # # As no information about when the topic was added, just assign "-1" to the variable + # + # addDate.append("-1") + + #return organizeTopics("TheMajesticGarden", nm, topic, board, view, post, user, addDate, href) + +def nulled_links_parser(soup): + + # Returning all links that should be visited by the Crawler + + href = [] + #print(soup.find('table', {"class": "tborder clear"}).find( + # 'tbody').find_all('tr', {"class": "inline_row"})) + listing = soup.find('tbody') + listing=listing.find_all('tr',id=True) + + for a in listing: + listing_rows = a.find_all('td') + link = listing_rows[1] + link = link.find('a',{'class':'topic_title'}) + link = link['href'] + + href.append(link) + + return href \ No newline at end of file diff --git a/MarketPlaces/Kerberos/crawler_selenium.py b/MarketPlaces/Kerberos/crawler_selenium.py new file mode 100644 index 
0000000..fe8f1c0 --- /dev/null +++ b/MarketPlaces/Kerberos/crawler_selenium.py @@ -0,0 +1,337 @@ +__author__ = 'DarkWeb' + +''' +Kerberos Market Crawler (Selenium) +''' + +from selenium import webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.firefox.firefox_profile import FirefoxProfile +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.service import Service +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait +from PIL import Image + +import urllib.parse as urlparse +import os, time +from datetime import date +import subprocess +from bs4 import BeautifulSoup +from MarketPlaces.Initialization.prepare_parser import new_parse +from MarketPlaces.Kerberos.parser import kerberos_links_parser +from MarketPlaces.Utilities.utilities import cleanHTML + +counter = 1 +baseURL = 'http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion' + + +# Opens Tor Browser, crawls the website +def startCrawling(): + opentor() + # marketName = getMarketName() + driver = getAccess() + + if driver != 'down': + captcha(driver) + login(driver) + crawlForum(driver) + + # new_parse(marketName, False) + closetor(driver) + + +# Opens Tor Browser +def opentor(): + global pid + print("Connecting Tor...") + path = open('../../path.txt').readline().strip() + pro = subprocess.Popen(path) + pid = pro.pid + time.sleep(7.5) + input('Tor Connected. Press ENTER to continue\n') + return + + +def captcha(driver): + # wait for captcha page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div/div/img"))) + + # too hard to code, requires manual completion + + # wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[1]/div[2]/div/form/div[10]/button"))) + + +# Login using premade account credentials and do login captcha manually +def login(driver): + #wait for login page + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[1]/div[2]/div/form/div[10]/button"))) + + #entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/form/input[1]') + #Username here + usernameBox.send_keys('blabri') + passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/form/input[2]') + #Password here + passwordBox.send_keys('fishowal') + + # wait for captcha page show up + # WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + # (By.XPATH, "/html/body/div/img[24]"))) + time.sleep(10) + + # save captcha to local + driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/form/div[6]').screenshot( + r'..\Kerberos\captcha.png') + + # This method will show image in any image viewer + im = Image.open(r'..\Kerberos\captcha.png') + + im.show() + + # wait until input space show up + inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/form/input[3]') + + # ask user input captcha solution in terminal + userIn = input("Enter solution: ") + + # send user solution into the input space + inputBox.send_keys(userIn) + + # click the verify(submit) button + driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") + driver.find_element(by=By.XPATH, value="/html/body/div[1]/div[2]/div/form/div[10]/button").click() + + # wait for listing 
page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="breadcrumb"]'))) + + +# Returns the name of the website +def getMarketName(): + name = 'Kerberos' + return name + + +# Return the link of the website +def getFixedURL(): + url = 'http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion' + + return url + + +# Closes Tor Browser +def closetor(driver): + # global pid + # os.system("taskkill /pid " + str(pro.pid)) + # os.system("taskkill /t /f /im tor.exe") + print('Closing Tor...') + driver.quit() + time.sleep(3) + return + + +# Creates FireFox 'driver' and configure its 'Profile' +# to use Tor proxy and socket +def createFFDriver(): + file = open('../../path.txt', 'r') + lines = file.readlines() + + ff_binary = FirefoxBinary(lines[0].strip()) + + ff_prof = FirefoxProfile(lines[1].strip()) + ff_prof.set_preference("places.history.enabled", False) + ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + ff_prof.set_preference("network.dns.disablePrefetch", True) + # ff_prof.set_preference("network.http.sendRefererHeader", 0) + ff_prof.set_preference("permissions.default.image", 3) + ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", False) + ff_prof.update_preferences() + + service = Service(executable_path=lines[2].strip()) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + + return driver + + +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + + try: + + driver.get(url) + return driver + + except: + + return 'down' + + +# Saves the crawled html page +def savePage(page, url): + cleanPage = cleanHTML(page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +# Gets the full path of the page to be saved along with its appropriate file name +def getFullPathName(url): + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = r'..\Kerberos\HTML_Pages\\' + str( + "%02d" % date.today().month) + str("%02d" % date.today().day) + str( + "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + else: + fullPath = r'..\Kerberos\HTML_Pages\\' + str( + "%02d" % date.today().month) + str("%02d" % date.today().day) + str( + "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + return fullPath + + +# Creates the file name from passed URL +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if name == '': + name = str(counter) + counter = counter + 1 + return name + + +def 
getInterestedLinks(): + links = [] + + # Services - Hacking + links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/99/block/price-none/ww/ww/1/') + # Tutorials - Hacking + links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/122/block/price-none/ww/ww/1/') + # Tutorials - Guides + links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/124/block/price-none/ww/ww/1/') + # Tutorials - Other + links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/126/block/price-none/ww/ww/1/') + # Software and Malware - Botnets + links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/129/block/price-none/ww/ww/1/') + # Software and Malware - Malware + links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/130/block/price-none/ww/ww/1/') + # Software and Malware - Trojans + links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/131/block/price-none/ww/ww/1/') + # Software and Malware - Exploits / Kits + links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/133/block/price-none/ww/ww/1/') + # Software and Malware - Other + links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/136/block/price-none/ww/ww/1/') + + return links + + +def crawlForum(driver): + print("Crawling the Kerberos market") + + linksToCrawl = getInterestedLinks() + visited = set(linksToCrawl) + initialTime = time.time() + + i = 0 + count = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + + try: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + + has_next_page = True + while has_next_page: + list = productPages(html) + for item in list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver.page_source, item) + driver.back() + # break + + if count == 1: + count = 0 + break + + try: + nav = driver.find_element(by=By.XPATH, value= + '/html/body/div[3]/div[4]/div[4]/div/div[1]/div[28]') + a = nav.find_element(by=By.LINK_TEXT, value="Next") + link = a.get_attribute('href') + + if link == "": + raise NoSuchElementException + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(html, link) + count += 1 + + except NoSuchElementException: + has_next_page = False + + except Exception as e: + print(link, e) + i += 1 + + # finalTime = time.time() + # print finalTime - initialTime + + input("Crawling Kerberos market done sucessfully. Press ENTER to continue\n") + + +# Returns 'True' if the link is Topic link +def isDescriptionLink(url): + if 'item' in url: + return True + return False + + +# Returns True if the link is a listingPage link +def isListingLink(url): + if 'categories' in url: + return True + return False + + +# calling the parser to define the links +def productPages(html): + soup = BeautifulSoup(html, "html.parser") + return kerberos_links_parser(soup) + + +def crawler(): + startCrawling() + # print("Crawling and Parsing BestCardingWorld .... 
DONE!") diff --git a/MarketPlaces/Kerberos/parser.py b/MarketPlaces/Kerberos/parser.py new file mode 100644 index 0000000..9bd37f7 --- /dev/null +++ b/MarketPlaces/Kerberos/parser.py @@ -0,0 +1,249 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from MarketPlaces.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + + +# This is the method to parse the Description Pages (one page to each Product in the Listing Pages) +def kerberos_description_parser(soup): + + # Fields to be parsed + + name = "-1" # 0 Product_Name y + describe = "-1" # 1 Product_Description y + lastSeen = "-1" # 2 Product_LastViewDate + rules = "-1" # 3 NOT USED ... + CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 5 Product_MS_Classification (Microsoft Security) + review = "-1" # 6 Product_Number_Of_Reviews + category = "-1" # 7 Product_Category + shipFrom = "-1" # 8 Product_ShippedFrom + shipTo = "-1" # 9 Product_ShippedTo + left = "-1" # 10 Product_QuantityLeft y + escrow = "-1" # 11 Vendor_Warranty y + terms = "-1" # 12 Vendor_TermsAndConditions + vendor = "-1" # 13 Vendor_Name y + sold = "-1" # 14 Product_QuantitySold y + addDate = "-1" # 15 Product_AddedDate + available = "-1" # 16 NOT USED ... + endDate = "-1" # 17 NOT USED ... + BTC = "-1" # 18 Product_BTC_SellingPrice y + USD = "-1" # 19 Product_USD_SellingPrice y + rating = "-1" # 20 Vendor_Rating + success = "-1" # 21 Vendor_Successful_Transactions + EURO = "-1" # 22 Product_EURO_SellingPrice + + bae = soup.find('div', {'class': "col-9"}) + + # Finding Product Name + name = bae.find('h2').text + name = name.replace('\n', ' ') + name = name.replace(",", "") + name = name.strip() + + mb = bae.findAll('div', {"class": "mb-1"}) + + # Finding Vendor + vendor = mb[0].text + vendor = vendor.replace(",", "") + vendor = vendor.replace("Sold by:", "") + vendor = vendor.strip() + + # # Finding Vendor Rating + # full_stars = bae[2].find_all('i', {'class': "fas fa-star"}) + # half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"}) + # rating = len(full_stars) + (0.5 if half_star is not None else 0) + + # Finding Warranty + escrow = mb[2].text + escrow = escrow.replace("Payment:", "") + escrow = escrow.strip() + + # Finding Quantity Sold and Left + temp = mb[4].text.split(',') + + sold = temp[0].replace("sold", "") + sold = sold.strip() + + left = temp[1].replace("in stock", "") + left = left.strip() + + # Finding USD + USD = bae.find('div', {"class": "h3 text-secondary"}).text + USD = USD.replace("$", "") + USD = USD.strip() + + # Finding BTC + temp = bae.find('div', {"class": "small"}).text.split("BTC") + + BTC = temp[0].strip() + + # shipping_info = bae[4].text + # if "Digital" not in shipping_info: + # shipping_info = shipping_info.split(" ") + # + # # Finding Shipment Information (Origin) + # shipFrom = shipping_info[0].strip() + # + # # Finding Shipment Information (Destination) + # shipTo = shipping_info[1].strip() + + # Finding the Product description + describe = bae.find('div', {"class": "card border-top-0"}).text + describe = describe.replace("\n", " ") + describe = describe.replace("\r", " ") + describe = describe.strip() + + # Searching for CVE and MS categories + cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + if cve: + CVE = " " + for idx in cve: + CVE += (idx) + CVE += " " + CVE = CVE.replace(',', ' ') + CVE = CVE.replace('\n', '') + ms = 
soup.findAll(text=re.compile('MS\d{2}-\d{3}')) + if ms: + MS = " " + for im in ms: + MS += (im) + MS += " " + MS = MS.replace(',', ' ') + MS = MS.replace('\n', '') + + # Populating the final variable (this should be a list with all fields scraped) + row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor, + sold, addDate, available, endDate, BTC, USD, rating, success, EURO) + + # Sending the results + return row + + +# This is the method to parse the Listing Pages +def kerberos_listing_parser(soup): + + # Fields to be parsed + nm = 0 # Total_Products (Should be Integer) + mktName = "Kerberos" # 0 Marketplace_Name + name = [] # 1 Product_Name y + CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = [] # 3 Product_MS_Classification (Microsoft Security) + category = [] # 4 Product_Category y + describe = [] # 5 Product_Description + escrow = [] # 6 Vendor_Warranty + views = [] # 7 Product_Number_Of_Views + reviews = [] # 8 Product_Number_Of_Reviews y + addDate = [] # 9 Product_AddDate + lastSeen = [] # 10 Product_LastViewDate + BTC = [] # 11 Product_BTC_SellingPrice + USD = [] # 12 Product_USD_SellingPrice y + EURO = [] # 13 Product_EURO_SellingPrice + sold = [] # 14 Product_QuantitySold + qLeft =[] # 15 Product_QuantityLeft + shipFrom = [] # 16 Product_ShippedFrom + shipTo = [] # 17 Product_ShippedTo + vendor = [] # 18 Vendor y + rating = [] # 19 Vendor_Rating + success = [] # 20 Vendor_Successful_Transactions + href = [] # 24 Product_Links (Urls) + + listing = soup.findAll('div', {"class": "card product-card mb-3"}) + + # Populating the Number of Products + nm = len(listing) + + # Finding Category + cat = soup.find("div", {"class": "col-9"}) + cat = cat.find("h2").text + cat = cat.replace("Category: ", "") + cat = cat.replace(",", "") + cat = cat.strip() + + for card in listing: + category.append(cat) + + bae = card.findAll('a') + + # Adding the url to the list of urls + link = bae[0].get('href') + link = cleanLink(link) + href.append(link) + + # Finding Product Name + product = bae[1].text + product = product.replace('\n', ' ') + product = product.replace(",", "") + product = product.strip() + name.append(product) + + # Finding Vendor + vendor_name = bae[2].text + vendor_name = vendor_name.replace(",", "") + vendor_name = vendor_name.strip() + vendor.append(vendor_name) + + # Finding USD + usd = card.find('div', {"class": "mb-1"}).text + usd = usd.replace("$", "") + usd = usd.strip() + USD.append(usd) + + # Finding Reviews + num = card.find("span", {"class": "rate-count"}).text + num = num.replace("(", "") + num = num.replace("review)", "") + num = num.replace("reviews)", "") + num = num.strip() + reviews.append(num) + + # Searching for CVE and MS categories + cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}')) + if not cve: + cveValue="-1" + else: + cee = " " + for idx in cve: + cee += (idx) + cee += " " + cee = cee.replace(',', ' ') + cee = cee.replace('\n', '') + cveValue=cee + CVE.append(cveValue) + + ms = card.findAll(text=re.compile('MS\d{2}-\d{3}')) + if not ms: + MSValue="-1" + else: + me = " " + for im in ms: + me += (im) + me += " " + me = me.replace(',', ' ') + me = me.replace('\n', '') + MSValue=me + MS.append(MSValue) + + # Populate the final variable (this should be a list with all fields scraped) + return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, + BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, 
href) + + +def kerberos_links_parser(soup): + + # Returning all links that should be visited by the Crawler + href = [] + + content = soup.find('div', {"id": "content-pos"}) + listing = content.findAll('div', {"class": "item-block"}) + + for div in listing: + + ae = div.find('div', {"ae zx300"}) + links = ae.findAll('a') + href.append(links[1]['href']) + + return href \ No newline at end of file From 344c654b3386863e8c5e49f514414302166249b1 Mon Sep 17 00:00:00 2001 From: westernmeadow Date: Thu, 7 Sep 2023 17:42:01 -0700 Subject: [PATCH 6/6] added iframe tag to cleanHTML in utilities.py --- Forums/Utilities/utilities.py | 1 + MarketPlaces/Utilities/utilities.py | 1 + 2 files changed, 2 insertions(+) diff --git a/Forums/Utilities/utilities.py b/Forums/Utilities/utilities.py index e7afcb8..a9165c8 100644 --- a/Forums/Utilities/utilities.py +++ b/Forums/Utilities/utilities.py @@ -419,6 +419,7 @@ def cleanHTML(driver, html): # remove JavaScript clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) clean_html = re.sub(r"", "", clean_html) clean_html = re.sub(r"", "", clean_html) clean_html = re.sub(r"", "", clean_html) diff --git a/MarketPlaces/Utilities/utilities.py b/MarketPlaces/Utilities/utilities.py index 77312f6..563ffe3 100644 --- a/MarketPlaces/Utilities/utilities.py +++ b/MarketPlaces/Utilities/utilities.py @@ -430,6 +430,7 @@ def cleanHTML(driver, html): # remove JavaScript clean_html = re.sub(r"", "", clean_html) + clean_html = re.sub(r"", "", clean_html) clean_html = re.sub(r"", "", clean_html) clean_html = re.sub(r"", "", clean_html) clean_html = re.sub(r"", "", clean_html)
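
For reference, the relative-timestamp handling used in nulled_description_parser above can be condensed into the following sketch. parse_post_date is a hypothetical helper name, and the "x hours ago" branch (which the parser resolves through the span's title attribute) is left out; only the "Yesterday" and absolute-date cases are shown.

from datetime import datetime, date, timedelta

# Condensed sketch of the date handling in nulled_description_parser.
# Assumes the forum prints either "Yesterday, 10:32 PM" or an absolute
# timestamp such as "09-07-2023, 04:15 PM".
def parse_post_date(dt_text):
    dt_text = dt_text.strip()
    if "Yesterday" in dt_text:
        day = (date.today() - timedelta(days=1)).strftime('%m-%d-%Y')
        stime = dt_text.replace('Yesterday,', '').strip()
        return datetime.strptime(day + ', ' + stime, '%m-%d-%Y, %I:%M %p')
    return datetime.strptime(dt_text, '%m-%d-%Y, %I:%M %p')

print(parse_post_date("Yesterday, 10:32 PM"))
print(parse_post_date("09-07-2023, 04:15 PM"))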
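
The Kerberos parsers gather CVE and MS identifiers by regex-matching text nodes and concatenating the hits, falling back to "-1" when nothing matches. The helper below is a hypothetical condensation of that pattern rather than code from the patch.

import re
from bs4 import BeautifulSoup

# Collect every text node matching a pattern (e.g. CVE-####-#### or MS##-###)
# into one space-separated string, mirroring the "-1" default used in the parsers.
def extract_ids(soup, pattern):
    hits = soup.find_all(string=re.compile(pattern))
    if not hits:
        return "-1"
    return " ".join(hit.strip() for hit in hits)

soup = BeautifulSoup("<div>Exploit for CVE-2023-1234 and MS17-010</div>", "html.parser")
print(extract_ids(soup, r"CVE-\d{4}-\d{4}"))
print(extract_ids(soup, r"MS\d{2}-\d{3}"))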
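
The regular expression patterns inside the cleanHTML hunks of the last commit are not legible above, so only the commit message ("added iframe tag to cleanHTML") indicates what changed. As a rough illustration of that kind of tag stripping, the sketch below assumes whole script/style/iframe elements are removed; the tag list and patterns are illustrative, not the project's actual expressions.

import re

# Illustrative tag stripping in the spirit of utilities.cleanHTML; the real
# patterns are not visible in this copy of the patch.
TAGS_TO_STRIP = ["script", "style", "iframe"]

def strip_tags(html):
    clean_html = html
    for tag in TAGS_TO_STRIP:
        # drop the opening tag, its contents, and the closing tag
        pattern = r"<{0}\b[^>]*>.*?</{0}>".format(tag)
        clean_html = re.sub(pattern, "", clean_html, flags=re.DOTALL | re.IGNORECASE)
    return clean_html

print(strip_tags("<p>ok</p><iframe src='x'>tracked</iframe><script>alert(1)</script>"))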