diff --git a/MarketPlaces/DarkDock/crawler_selenium.py b/MarketPlaces/DarkDock/crawler_selenium.py
new file mode 100644
index 0000000..c5b98b7
--- /dev/null
+++ b/MarketPlaces/DarkDock/crawler_selenium.py
@@ -0,0 +1,356 @@
+__author__ = 'Helium'
+
+"""
+DarkDock Marketplace Crawler (Selenium)
+"""
+
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+
+import urllib.parse as urlparse
+import os, time
+from bs4 import BeautifulSoup
+from MarketPlaces.Initialization.prepare_parser import new_parse
+from MarketPlaces.DarkDock.parser import darkdock_links_parser
+from MarketPlaces.Utilities.utilities import cleanHTML
+
+counter = 1
+baseURL = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'
+
+def startCrawling():
+    """Main method for the crawler.
+
+    Opens Tor Browser, crawls the website, parses the saved pages, then closes Tor.
+    """
+    mktName = getMKTName()
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            crawlMarket(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closeDriver(driver)
+
+    new_parse(mktName, baseURL, True)
+
+def getMKTName():
+    """Returns the name of the website.
+    """
+    name = 'DarkDock'
+    return name
+
+def getFixedURL():
+    """Returns the base link of the site.
+    """
+    url = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'
+    return url
+
+
+def closeDriver(driver):
+    """Closes Tor Browser.
+
+    Args:
+        driver: The selected Selenium driver.
+    """
+    # global pid
+    # os.system("taskkill /pid " + str(pro.pid))
+    # os.system("taskkill /t /f /im tor.exe")
+    print('Closing Tor...')
+    driver.close()
+    time.sleep(3)
+    return
+
+
+def createFFDriver():
+    """Creates a Firefox driver and configures its profile to use the Tor proxy and socket.
+    """
+    from MarketPlaces.Initialization.markets_mining import config
+
+    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
+
+    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
+    ff_prof.set_preference("places.history.enabled", False)
+    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
+    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
+    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
+    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
+    ff_prof.set_preference("signon.rememberSignons", False)
+    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
+    ff_prof.set_preference("network.dns.disablePrefetch", True)
+    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    ff_prof.set_preference("permissions.default.image", 3)
+    ff_prof.set_preference("browser.download.folderList", 2)
+    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
+    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
+    ff_prof.set_preference('network.proxy.type', 1)
+    ff_prof.set_preference("network.proxy.socks_version", 5)
+    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
+    ff_prof.set_preference('network.proxy.socks_port', 9150)
+    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
+    ff_prof.set_preference("javascript.enabled", False)
+    ff_prof.update_preferences()
+
+    service = Service(config.get('TOR', 'geckodriver_path'))
+
+    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+
+    driver.maximize_window()
+
+    return driver
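+
+# Illustrative sanity check (not called anywhere in this module): the profile above assumes
+# Tor Browser's SOCKS proxy is listening on 127.0.0.1:9150, and loading the Tor Project's
+# check page is a quick way to confirm traffic is actually being routed through Tor.
+#
+#   driver = createFFDriver()
+#   driver.get('https://check.torproject.org')
+#   assert 'Congratulations' in driver.page_source
+#   driver.close()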
+
+def getAccess():
+    """The driver 'gets' the url and attempts to access the site.
+
+    Returns:
+        A Selenium driver currently on the site, or the string 'down' if it can't access the site.
+    """
+    url = getFixedURL()
+    driver = createFFDriver()
+    try:
+        driver.get(url)
+        return driver
+    except:
+        driver.close()
+        return 'down'
+
+def savePage(driver, page, url):
+    """Saves the crawled html page.
+
+    Cleans the html of the page the driver is currently on, then saves it under its full
+    path name (special characters removed) inside the marketplace's directory. If the
+    directory path doesn't exist, it is created.
+
+    Args:
+        driver: The Selenium driver accessing the page.
+        page: The html of the saved page.
+        url: The URL of the saved page.
+    """
+    cleanPage = cleanHTML(driver, page)
+    filePath = getFullPathName(url)
+    os.makedirs(os.path.dirname(filePath), exist_ok=True)
+    with open(filePath, 'wb') as file:
+        file.write(cleanPage.encode('utf-8'))
+    return
+
+def getFullPathName(url):
+    """Gets the full path name.
+
+    Gets the full path of the page to be saved along with its appropriate file name.
+    Determines which subdirectory to save the page in, based on whether it is a
+    description or a listing page.
+
+    Args:
+        url: The URL of the page.
+    """
+    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
+
+    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
+    fileName = getNameFromURL(url)
+    if isDescriptionLink(url):
+        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
+    else:
+        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
+    return fullPath
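+
+# Illustrative example of the paths produced above (the root comes from the project's
+# 'shared_folder' configuration value, so it is shown here as a placeholder):
+#
+#   <shared_folder>/MarketPlaces/DarkDock/HTML_Pages/<CURRENT_DATE>\\Listing\\<fileName>.html
+#   <shared_folder>/MarketPlaces/DarkDock/HTML_Pages/<CURRENT_DATE>\\Description\\<fileName>.html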
+
+def getNameFromURL(url):
+    """Creates the file name from the passed URL.
+
+    Generates a file name containing only the URL's alphanumeric characters.
+    If the URL has no alphanumeric characters, a unique counter value is used as the name instead.
+
+    Args:
+        url: The URL of the selected page from the crawler as it crawls through the site.
+    """
+    global counter
+    name = ''.join(e for e in url if e.isalnum())
+    if name == '':
+        name = str(counter)
+        counter = counter + 1
+    return name
+
+def getInterestedLinks():
+    """Returns the list of URLs the crawler runs through.
+
+    Returns a list of the URLs of interest that the crawler visits. For this market these
+    are the hacking-related categories, such as Software and Malware, Guides and Tutorials,
+    or Digital Products.
+    """
+    links = []
+    categories = [
+        'civil_softwares',
+        'carding',
+        'theft',
+        'mining',
+        'worms',
+        'dump',
+        'viruses',
+        'trojans',
+        'botnets',
+        'security_technology',
+        'computers',
+        'confidential_info',
+        'network_services',
+        'database',
+        'surveillance',
+        'digital_forensics',
+        '0day',
+        'intelligence',
+        'private_security'
+    ]
+    for category in categories:
+        links.append(baseURL + "category/" + category)
+
+    return links
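+
+# Illustrative examples of the URLs built above and of the page URLs requested by
+# crawlMarket() below, which appends a page counter to the category URL:
+#
+#   http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/category/carding
+#   http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/category/carding/1/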
+
+def crawlMarket(driver):
+    """Crawls and saves each page of a link of interest.
+
+    Accesses, saves, and crawls through each link of interest. For DarkDock, each
+    link of interest is a category, so we crawl through all numbered pages of the
+    category. We find the URLs of all descriptions/products on the category page and
+    save each individual description/product page.
+
+    Args:
+        driver: The Selenium driver accessing the site.
+    """
+    print("Crawling the DarkDock market")
+
+    linksToCrawl = getInterestedLinks()
+
+    i = 0
+    while i < len(linksToCrawl):
+        baseCategoryLink = linksToCrawl[i]
+        link = linksToCrawl[i]
+        print('Crawling :', link)
+
+        try:
+            has_next_page = True
+            count = 1      # Number of pages traversed
+            maxPages = ''  # Total number of pages in the category, read from the pagination element
+
+            while has_next_page:
+
+                # Try to access current link and reload if it fails
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+
+                # Save html page
+                html = driver.page_source
+                savePage(driver, html, linksToCrawl[i] + f"page{count}")
+
+                # Get the number of maxPages if maxPages isn't fetched yet
+                if maxPages == '':
+                    try:
+                        # Wait 30 seconds or until element loads
+                        WebDriverWait(driver, 30).until(
+                            EC.presence_of_element_located((By.XPATH, '//div[@class="pages"]//a[last()]'))
+                        )
+                        # Fetches the element that gives the total number of pages in a category
+                        maxPages = int(driver.find_element(By.XPATH, '//div[@class="pages"]//a[last()]').text)
+                        print(f"Total number of Pages: {maxPages}")
+                    except Exception as e:
+                        # Without a page count we cannot paginate safely, so stop after this page
+                        print(f"Element not found: {str(e)}")
+                        has_next_page = False
+
+                # Parse the product/description pages
+                productLinks = descriptionPages(html)
+                for item in productLinks:
+                    # Fetches the item URL by concatenating the base url with the item sub url
+                    itemURL = urlparse.urljoin(baseURL, str(item))
+                    try:
+                        driver.get(itemURL)
+                    except:
+                        driver.refresh()
+                    savePage(driver, driver.page_source, item)
+                    # Go back to the previous category page
+                    driver.back()
+
+                    # # Add a break for testing if we are checking only the first description/product page
+                    # break
+
+                # # Add a break for testing based on how many pages to test
+                # if count == 3:
+                #     break
+
+                # Try to find the next page
+                try:
+                    link = f"{baseCategoryLink}/{count}/"
+                    print("\tCurrent Page :", f"{link}")
+                    if link == "":
+                        raise NoSuchElementException
+                    count += 1
+
+                except NoSuchElementException:
+                    has_next_page = False
+
+                # If the number of maxPages is known and reached, stop crawling the current category
+                if maxPages != '' and count > maxPages:
+                    print("Max Pages reached")
+                    has_next_page = False
+
+        except Exception as e:
+            print(link, e)
+        i += 1
+
+    print("Crawling the DarkDock market done.")
+
+
+def isDescriptionLink(url):
+    """Returns whether the url is for a description page.
+
+    Args:
+        url: The url of a crawled page.
+
+    Returns:
+        Returns 'True' if the url is for a description page. Returns 'False' if the
+        url is not for a description page.
+    """
+    if 'product' in url:
+        return True
+    return False
+
+
+def isListingLink(url):
+    """Returns whether the url is for a listing page.
+
+    Args:
+        url: The url of a crawled page.
+
+    Returns:
+        Returns 'True' if the url is for a listing page. Returns 'False' if the
+        url is not for a listing page.
+    """
+    if 'category' in url:
+        return True
+    return False
+
+def descriptionPages(html):
+    """Returns all product/description links on the current page.
+
+    Passes the html of the category/listing page and parses it for
+    any description/product links.
+
+    Args:
+        html: The html of the selected category/listing page.
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    return darkdock_links_parser(soup)
+
+
+def crawler():
+    """Starts the crawler.
+    """
+    startCrawling()
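+
+
+# Minimal usage sketch (illustrative): this module is normally driven by
+# MarketPlaces/Initialization/markets_mining.py, which imports and calls crawler().
+# Assuming the TOR paths in the project configuration are valid, the equivalent manual run is:
+#
+#   from MarketPlaces.DarkDock.crawler_selenium import crawler
+#   crawler()   # opens Tor Browser, crawls DarkDock, then parses the saved pages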
diff --git a/MarketPlaces/DarkDock/parser.py b/MarketPlaces/DarkDock/parser.py
new file mode 100644
index 0000000..52eaafa
--- /dev/null
+++ b/MarketPlaces/DarkDock/parser.py
@@ -0,0 +1,232 @@
+__author__ = 'DarkWeb'
+
+import re
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from MarketPlaces.Utilities.utilities import *
+
+def darkdock_description_parser(soup):
+    """Parses the description pages of a DarkDock marketplace.
+
+    It takes a BeautifulSoup object that represents the HTML page of a description page, and
+    extracts various information such as vendor name, product name, etc.
+
+    Args:
+        soup: A BeautifulSoup object that represents the HTML page of a description page.
+
+    Returns:
+        The row of a description item as a tuple containing the information fields extracted from the description page.
+    """
+    vendor = "-1"           # 0 Vendor_Name
+    success = "-1"          # 1 Vendor_Successful_Transactions
+    rating_vendor = "-1"    # 2 Vendor_Rating
+    name = "-1"             # 3 Product_Name
+    describe = "-1"         # 4 Product_Description
+    CVE = "-1"              # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+    MS = "-1"               # 6 Product_MS_Classification (Microsoft Security)
+    category = "-1"         # 7 Product_Category
+    views = "-1"            # 8 Product_Number_Of_Views
+    reviews = "-1"          # 9 Product_Number_Of_Reviews
+    rating_item = "-1"      # 10 Product_Rating
+    addDate = "-1"          # 11 Product_AddedDate
+    BTC = "-1"              # 12 Product_BTC_SellingPrice
+    USD = "-1"              # 13 Product_USD_SellingPrice
+    EURO = "-1"             # 14 Product_EURO_SellingPrice
+    sold = "-1"             # 15 Product_QuantitySold
+    left = "-1"             # 16 Product_QuantityLeft
+    shipFrom = "-1"         # 17 Product_ShippedFrom
+    shipTo = "-1"           # 18 Product_ShippedTo
+    image = "-1"            # 19 Product_Image
+    vendor_image = "-1"     # 20 Vendor_Image
+
+    # Finding Vendor
+    vendor = soup.select_one('table tr:nth-of-type(2) td:nth-of-type(3) a u').text
+    vendor = cleanString(vendor)
+    vendor = vendor.strip()
+
+    # Finding Product Name
+    headings = soup.find('div', {'class': 'main'}).find_all('div', {'class': 'heading'})
+    name = headings[0].text
+    name = cleanString(name)
+    name = name.strip()
+
+    # Finding the Product description
+    describe = soup.find('div', {'class': 'tab1'}).text
+    describe = cleanString(describe)
+    describe = describe.strip()
+
+    # Finding the Product category
+    category = soup.select_one('table tr:nth-of-type(6) td:nth-of-type(3)').text
+    category = cleanString(category)
+    category = category.strip()
+
+    # Finding Number of Product Reviews (the count in parentheses in the reviews heading)
+    reviews = headings[1].text
+    match = re.search(r'\((\d+)\)', reviews)
+    reviews = match.group(1) if match else cleanNumbers(reviews)
+    reviews = reviews.strip()
+
+    # Finding Prices
+    USD = soup.select_one('table tr:nth-of-type(1) td:nth-of-type(3)').text
+    USD = cleanNumbers(USD)
+    USD = USD.strip()
+
+    # Finding the Product Quantity Available
+    left = soup.select_one('table tr:nth-of-type(7) td:nth-of-type(3)').text
+    left = cleanNumbers(left)
+    left = left.strip()
+
+    # Finding Product Shipped From
+    shipFrom = soup.select_one('table tr:nth-of-type(3) td:nth-of-type(3)').text
+    shipFrom = cleanString(shipFrom)
+    shipFrom = shipFrom.strip()
+
+    # Finding Product Shipped To
+    shipTo = soup.select_one('table tr:nth-of-type(5) td:nth-of-type(3)').text
+    shipTo = cleanString(shipTo)
+    shipTo = shipTo.strip()
+
+    # Finding Product Image (keep only the base64 payload of the data URI)
+    image = soup.find('img', {'class': 'bigthumbnail'}).get('src')
+    image = image.split('base64,')[-1]
+
+    # Populating the final variable (this should be a list with all fields scraped)
+    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
+
+    # Sending the results
+    return row
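+
+# Note on consumption (illustrative): the tuple above is read positionally downstream, so for
+# example row[0] is the vendor name, row[13] the USD price, and row[19] the base64 product
+# image, matching the order of the row assignment above.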
+
+def darkdock_listing_parser(soup):
+    """Parses the listing pages of a DarkDock marketplace.
+
+    It takes a BeautifulSoup object that represents the HTML page of a listing page,
+    and extracts various information such as vendor name, product name, etc. It then
+    removes and cleans the extracted information by passing it to the organizeProducts
+    function.
+
+    Args:
+        soup: A BeautifulSoup object that represents the HTML page of a listing page.
+
+    Returns:
+        The rows of the listing items, as organized by the organizeProducts function, containing the information fields extracted from the listing page.
+    """
+    # Fields to be parsed
+    nm = 0                  # Total_Products (Should be Integer)
+    mktName = "DarkDock"    # 0 Marketplace_Name
+    vendor = []             # 1 Vendor
+    rating_vendor = []      # 2 Vendor_Rating
+    success = []            # 3 Vendor_Successful_Transactions
+    name = []               # 4 Product_Name
+    CVE = []                # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
+    MS = []                 # 6 Product_MS_Classification (Microsoft Security) dont worry about this
+    category = []           # 7 Product_Category
+    describe = []           # 8 Product_Description
+    views = []              # 9 Product_Number_Of_Views
+    reviews = []            # 10 Product_Number_Of_Reviews
+    rating_item = []        # 11 Product_Rating
+    addDate = []            # 12 Product_AddDate
+    BTC = []                # 13 Product_BTC_SellingPrice
+    USD = []                # 14 Product_USD_SellingPrice
+    EURO = []               # 15 Product_EURO_SellingPrice
+    sold = []               # 16 Product_QuantitySold
+    qLeft = []              # 17 Product_QuantityLeft
+    shipFrom = []           # 18 Product_ShippedFrom
+    shipTo = []             # 19 Product_ShippedTo
+    image = []              # 20 Product_Image
+    image_vendor = []       # 21 Vendor_Image
+    href = []               # 22 Product_Links
+
+    listings = soup.findAll('div', {'class': 'item'})
+
+    # Populating the Number of Products
+    nm = len(listings)
+
+    # The category is shared by every listing on the page, so it is read once from the page heading
+    cat = soup.find('div', {'class': 'heading'}).text
+    cat = cleanString(cat)
+    cat = cat.strip()
+
+    for listing in listings:
+        # Finding the Vendor
+        vendor_name = listing.find('div', {'class': 'seller'}).text
+        vendor_name = cleanString(vendor_name)
+        vendor_name = vendor_name.strip()
+        vendor.append(vendor_name)
+
+        # Finding the Product
+        product = listing.find('div', {'class': 'title'}).text
+        product = cleanString(product)
+        product = product.strip()
+        name.append(product)
+
+        # Finding the Category
+        category.append(cat)
+
+        # Finding description
+        description = listing.find('div', {'class': 'description'}).text
+        description = cleanString(description)
+        description = description.strip()
+        describe.append(description)
+
+        # Finding product views
+        num_view = listing.select_one('.stats table tr:nth-of-type(3) td:nth-of-type(1)').text
+        num_view = cleanNumbers(num_view)
+        num_view = num_view.strip()
+        views.append(num_view)
+
+        # Finding product reviews
+        num_reviews = listing.select_one('.stats table tr:nth-of-type(3) td:nth-of-type(3)').text
+        num_reviews = cleanNumbers(num_reviews)
+        num_reviews = num_reviews.strip()
+        reviews.append(num_reviews)
+
+        # Finding product rating based on width style
+        rating = listing.find('div', {'class': 'stars2'}).get('style')
+        rating = re.findall(r"\d+\.\d+|\d+", rating)[0]
+        rating = cleanNumbers(rating)
+        rating = rating.strip()
+        rating_item.append(rating)
+
+        # Finding Prices
+        price = listing.find('div', {'class': 'price'}).text
+        price = price.strip()
+        USD.append(price)
+
+        # Finding number of times product is sold
+        num_sold = listing.select_one('.stats table tr:nth-of-type(3) td:nth-of-type(2)').text
+        num_sold = cleanNumbers(num_sold)
+        num_sold = num_sold.strip()
+        sold.append(num_sold)
+
+        # Finding shipping locations ("<origin> > <destination>")
+        shipping = listing.find('div', {'class': 'shipping'}).text
+        shippedFrom, shippedTo = cleanString(shipping).split(' > ')
+        shipTo.append(shippedTo)
+        shipFrom.append(shippedFrom)
+
+        # Adding the url to the list of urls
+        link = listing.find('a', recursive=False).get('href')
+        href.append(link)
+
+        image_vendor.append("-1")
+
+    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image,
+                            image_vendor)
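+
+# Minimal usage sketch (illustrative; the file name is a placeholder). In the pipeline the
+# saved pages are opened and dispatched by MarketPlaces/Initialization/prepare_parser.py,
+# but the parsers can also be exercised directly on one saved listing page:
+#
+#   from bs4 import BeautifulSoup
+#   with open('sample_listing.html', 'r', encoding='utf-8') as f:
+#       soup = BeautifulSoup(f.read(), 'html.parser')
+#   rows = darkdock_listing_parser(soup)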
+
+def darkdock_links_parser(soup):
+    """Returns a list of description links from a listing page.
+
+    It takes a BeautifulSoup object that represents the HTML page of a listing page, and
+    extracts all the description links from the page.
+
+    Args:
+        soup: A BeautifulSoup object that represents the HTML page of a listing page.
+
+    Returns:
+        A list of description links from a listing page.
+    """
+    # Returning all links that should be visited by the Crawler
+    href = []
+    listing = soup.find_all('a', href=lambda href: href and '/product/' in href)
+
+    for a in listing:
+        href.append(a['href'])
+
+    return href
\ No newline at end of file
diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt
index 73f7d06..6b4d5d6 100644
--- a/MarketPlaces/Initialization/marketsList.txt
+++ b/MarketPlaces/Initialization/marketsList.txt
@@ -1 +1 @@
-DarkMarket
\ No newline at end of file
+DarkDock
\ No newline at end of file
diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py
index 4656341..d931122 100644
--- a/MarketPlaces/Initialization/markets_mining.py
+++ b/MarketPlaces/Initialization/markets_mining.py
@@ -26,6 +26,7 @@ from MarketPlaces.GoFish.crawler_selenium import crawler as crawlerGoFish
 from MarketPlaces.ZeroDay.crawler_selenium import crawler as crawlerZeroDay
 from MarketPlaces.Torzon.crawler_selenium import crawler as crawlerTorzon
 from MarketPlaces.DarkMarket.crawler_selenium import crawler as crawlerDarkMarket
+from MarketPlaces.DarkDock.crawler_selenium import crawler as crawlerDarkDock
 
 import configparser
 import os
@@ -141,5 +142,7 @@ if __name__ == '__main__':
             crawlerTorzon()
         elif mkt == "DarkMarket":
             crawlerDarkMarket()
+        elif mkt == "DarkDock":
+            crawlerDarkDock()
 
     print("\nScraping process completed!")
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index b00e606..3900f03 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -28,6 +28,7 @@ from MarketPlaces.Torzon.parser import *
 from MarketPlaces.GoFish.parser import *
 from MarketPlaces.ZeroDay.parser import *
 from MarketPlaces.DarkMarket.parser import *
+from MarketPlaces.DarkDock.parser import *
 
 from MarketPlaces.Classifier.classify_product import predict
 from Translator.translate import translate
@@ -170,6 +171,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
         rw = torzon_listing_parser(soup)
     elif marketPlace == "DarkMarket":
         rw = darkmarket_listing_parser(soup)
+    elif marketPlace == "DarkDock":
+        rw = darkdock_listing_parser(soup)
     else:
         print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
         raise Exception
@@ -230,6 +233,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
         rmm = torzon_description_parser(soup)
     elif marketPlace == "DarkMarket":
         rmm = darkmarket_description_parser(soup)
+    elif marketPlace == "DarkDock":
+        rmm = darkdock_description_parser(soup)
     else:
         print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
         raise Exception