From bb00cde8c3422e58fe04f21f9e4fce7763be6da1 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 30 Aug 2023 23:42:19 -0700 Subject: [PATCH 1/3] AnonMarket crawler and parser completed --- MarketPlaces/AnonMarket/crawler_selenium.py | 290 ++++++++++++++++++++ MarketPlaces/AnonMarket/parser.py | 207 ++++++++++++++ 2 files changed, 497 insertions(+) create mode 100644 MarketPlaces/AnonMarket/crawler_selenium.py create mode 100644 MarketPlaces/AnonMarket/parser.py diff --git a/MarketPlaces/AnonMarket/crawler_selenium.py b/MarketPlaces/AnonMarket/crawler_selenium.py new file mode 100644 index 0000000..410cbdc --- /dev/null +++ b/MarketPlaces/AnonMarket/crawler_selenium.py @@ -0,0 +1,290 @@ +__author__ = 'Helium' + +''' +Anon Market Crawler (Selenium) +''' + +from selenium import webdriver +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver.firefox.firefox_profile import FirefoxProfile +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.service import Service +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By + +from PIL import Image +import urllib.parse as urlparse +import os, re, time +from datetime import date +import subprocess +import configparser +from bs4 import BeautifulSoup +from MarketPlaces.Initialization.prepare_parser import new_parse +from MarketPlaces.AnonMarket.parser import AnonMarket_links_parser +from MarketPlaces.Utilities.utilities import cleanHTML + +counter = 1 +baseURL = 'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion' + + +# Opens Tor Browser, crawls the website, then parses, then closes tor +#acts like the main method for the crawler, another function at the end of this code calls this function later +def startCrawling(): + opentor() + mktName = getMKTName() + driver = getAccess() + + if driver != 'down': + try: + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closetor(driver) + + new_parse(mktName, baseURL, True) + +# Opens Tor Browser +#prompts for ENTER input to continue +def opentor(): + from MarketPlaces.Initialization.markets_mining import config + + global pid + print("Connecting Tor...") + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) + pid = pro.pid + time.sleep(7.5) + input('Tor Connected. 
Press ENTER to continue\n') + return + +# Returns the name of the website +#return: name of site in string type +def getMKTName(): + name = 'AnonMarket' + return name + + +# Return the base link of the website +#return: url of base site in string type +def getFixedURL(): + url = 'http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion' + return url + + +# Closes Tor Browser +#@param: current selenium driver +def closetor(driver): + # global pid + # os.system("taskkill /pid " + str(pro.pid)) + # os.system("taskkill /t /f /im tor.exe") + print('Closing Tor...') + driver.close() + time.sleep(3) + return + + +# Creates FireFox 'driver' and configure its 'Profile' +# to use Tor proxy and socket +def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) + + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) + ff_prof.set_preference("places.history.enabled", False) + ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) + ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + ff_prof.set_preference("network.dns.disablePrefetch", True) + ff_prof.set_preference("network.http.sendRefererHeader", 0) + ff_prof.set_preference("permissions.default.image", 2) + ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", False) + ff_prof.update_preferences() + + service = Service(config.get('TOR', 'geckodriver_path')) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + + driver.maximize_window() + + return driver + + +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' +#return: return the selenium driver or string 'down' +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + try: + driver.get(url) + return driver + except: + driver.close() + return 'down' + +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +# Gets the full path of the page to be saved along with its appropriate file name +#@param: raw url as crawler crawls through every site +def getFullPathName(url): + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') + else: + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + return fullPath + + +# 
Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned +#@param: raw url as crawler crawls through every site +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if (name == ''): + name = str(counter) + counter = counter + 1 + return name + + +# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list +#in this example, there are a couple of categories some threads fall under such as +# Guides and Tutorials, Digital Products, and Software and Malware +#as you can see they are categories of products +def getInterestedLinks(): + links = [] + # # Software + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/civil_softwares') + # # Malware + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/malware') + # # Bootkits + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/bootkits') + # # Backdoors + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/backdoors') + # # Keyloggers + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/keyloggers') + # # Wireless Trackers + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wireless_trackers') + # # Screen Scrapers + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/screen_scrapers') + # # Mobile Forensic Tools + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/mobile_forensics_tools') + # # Wifi Jammers + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wifi_jammers') + # # Carding + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/carding') + # # Worms + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/worms') + # # Viruses + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/viruses') + # # Trojans + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/trojans') + # # Botnets + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/botnets') + # # Security Technology + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security_technology') + # # Hacks + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/hacks') + # # Exploit kits + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/exploit_kit') + # # Security + # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security') + + return links + + +# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through +#topic and description pages are crawled through here, where both types of pages are saved +#@param: selenium driver +def crawlForum(driver): + print("Crawling Anon Market") + + linksToCrawl = getInterestedLinks() + + for link in linksToCrawl: + print('Crawling :', link) + + has_next_page = True + + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + + html = driver.page_source + savePage(driver, html, 
link) + + # Get all product links on the current page + products_list = productPages(html) + for item in products_list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver, driver.page_source, item) + driver.back() # Go back to listing after visiting each product + + # Find the active page number + active_page_element = driver.find_element(By.XPATH, '//div[@class="page activepage"]') + current_page = int(active_page_element.text) + + # Locate the next page link + try: + next_page_element = active_page_element.find_element(By.XPATH, 'following-sibling::a[1]') + link = next_page_element.get_attribute('href') + except NoSuchElementException: + has_next_page = False + + print("Crawling Anon Market done.") + + +# Returns 'True' if the link is a description link +#@param: url of any url crawled +#return: true if is a description page, false if not +def isDescriptionLink(url): + if 'product' in url: + return True + return False + + +# Returns True if the link is a listingPage link +#@param: url of any url crawled +#return: true if is a Listing page, false if not +def isListingLink(url): + if 'category' in url: + return True + return False + + +# calling the parser to define the links, the html is the url of a link from the list of interested link list +#@param: link from interested link list ie. getInterestingLinks() +#return: list of description links that should be crawled through +def productPages(html): + soup = BeautifulSoup(html, "html.parser") + return AnonMarket_links_parser(soup) + +def crawler(): + startCrawling() + # print("Crawling and Parsing Nexus .... DONE!") + diff --git a/MarketPlaces/AnonMarket/parser.py b/MarketPlaces/AnonMarket/parser.py new file mode 100644 index 0000000..a488962 --- /dev/null +++ b/MarketPlaces/AnonMarket/parser.py @@ -0,0 +1,207 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from MarketPlaces.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + +import re + +#parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +#stores info it needs in different lists, these lists are returned after being organized +#@param: soup object looking at html page of description page +#return: 'row' that contains a variety of lists that each hold info on the description page +def AnonMarket_description_parser(soup): + + # Fields to be parsed + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 Product_BTC_SellingPrice + USD = "-1" # 13 Product_USD_SellingPrice + EURO = "-1" # 14 Product_EURO_SellingPrice + sold = "-1" # 15 Product_QuantitySold + left = "-1" # 16 Product_QuantityLeft + shipFrom = "-1" # 17 Product_ShippedFrom + shipTo = "-1" # 18 Product_ShippedTo + + name_of_product = soup.find("div", {"class": "heading"}).text + name = 
cleanString(name_of_product.strip()) + + description_div = soup.find("div", {"class": "tab1"}) + if description_div is None: + describe = "-1" + else: + describe = cleanString(description_div.text.strip()) + + info_div = soup.find('div', {'class': 'information'}) + table = info_div.find('table') if info_div else None + + if table: + # Find all table rows + rows = table.find_all('tr') + + # Parse each row to get relevant data + data = {} + for row in rows: + columns = row.find_all('td') + if len(columns) == 3: + key = columns[0].text.strip() + value = columns[2].text.strip() + data[key] = value + + # Extract specific data from the dictionary and assign them to individual variables + vendor = data.get('Vendor', '-1') + shipFrom = data.get('Location', '-1') + shipTo = data.get('Ships to', '-1') + category = data.get('Category', '-1') + USD = data.get('Price', '-1').split()[0] + left = data.get('Stock', '-1') + + # Populating the final variable (this should be a list with all fields scraped) + row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, + BTC, USD, EURO, sold, left, shipFrom, shipTo) + + + # Sending the results + return row + + +#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs +#stores info it needs in different lists, these lists are returned after being organized +#@param: soup object looking at html page of listing page +#return: 'row' that contains a variety of lists that each hold info on the listing page +def AnonMarket_listing_parser(soup): + + # Fields to be parsed + nm = 0 # *Total_Products (Should be Integer) + mktName = "AnonMarket" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = [] # 6 Product_MS_Classification (Microsoft Security) + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft = [] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links + base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion" + + products_list = soup.find_all('div', {'class': 'item'}) + nm = 0 + for product in products_list: + try: + name_of_product = product.find("div", {"class": "title"}).text.strip() + name.append(name_of_product) + + name_of_vendor = product.find("a", {'class': 'seller'}).text.strip() + vendor.append(name_of_vendor) + + cat = soup.find("div", {'class': 'heading'}).text + category.append(cat) + + product_link_element = product.find("div", {"class": "title"}).find_parent('a') + if product_link_element: + link = product_link_element['href'] + if "/product/" in link and "/user/" not in link: + full_link = base_url + link + href.append(full_link) + else: + href.append("-1") + else: + href.append("-1") + + # Append '-1' for unavailable data + rating_vendor.append("-1") + success.append("-1") + CVE.append("-1") + MS.append("-1") + describe.append("-1") + views.append("-1") + reviews.append("-1") + 
addDate.append("-1") + BTC.append("-1") + EURO.append("-1") + sold.append("-1") + qLeft.append("-1") + shipFrom.append("-1") + shipTo.append("-1") + + nm += 1 + + except AttributeError as e: + print("I'm somewhere I don't belong. I'm going to leave") + continue + + # Populate the final variable (this should be a list with all fields scraped) + return organizeProducts( + marketplace = "AnonMarket", + nm = nm, + vendor = vendor, + rating_vendor = rating_vendor, + success_vendor = success, + nombre = name, + CVE = CVE, + MS = MS, + category = category, + describe = describe, + views = views, + reviews = reviews, + rating_item = rating_item, + addDate = addDate, + BTC = BTC, + USD = USD, + EURO = EURO, + sold = sold, + qLeft = qLeft, + shipFrom = shipFrom, + shipTo = shipTo, + href = href + ) + + +#called by the crawler to get description links on a listing page +#@param: beautifulsoup object that is using the correct html page (listing page) +#return: list of description links from a listing page +def AnonMarket_links_parser(soup): + # Base URL to prepend to each product link + base_url = "http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion" + + # Returning all links that should be visited by the Crawler + href = [] + + # Using a shorter, but still unique, class name + listing = soup.find_all('a', href=True, attrs={'href': lambda x: "/product/" in x}) + + for a in listing: + link = a.get('href') + if link: # Checks if 'href' attribute is not None + # Prepending the base URL to the scraped link + full_link = base_url + link + href.append(full_link) + + # Filtering out any links that might not have '/product/' in them + product_links = [link for link in href if '/product/' in link] + + return product_links From 033764cf714b0f71c114172e7731976e60015802 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 31 Aug 2023 15:08:59 -0700 Subject: [PATCH 2/3] AnonMarket crawler and parser completed --- MarketPlaces/AnonMarket/crawler_selenium.py | 1 - MarketPlaces/AnonMarket/parser.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/MarketPlaces/AnonMarket/crawler_selenium.py b/MarketPlaces/AnonMarket/crawler_selenium.py index 410cbdc..2171d84 100644 --- a/MarketPlaces/AnonMarket/crawler_selenium.py +++ b/MarketPlaces/AnonMarket/crawler_selenium.py @@ -258,7 +258,6 @@ def crawlForum(driver): print("Crawling Anon Market done.") - # Returns 'True' if the link is a description link #@param: url of any url crawled #return: true if is a description page, false if not diff --git a/MarketPlaces/AnonMarket/parser.py b/MarketPlaces/AnonMarket/parser.py index a488962..c53283c 100644 --- a/MarketPlaces/AnonMarket/parser.py +++ b/MarketPlaces/AnonMarket/parser.py @@ -192,7 +192,7 @@ def AnonMarket_links_parser(soup): href = [] # Using a shorter, but still unique, class name - listing = soup.find_all('a', href=True, attrs={'href': lambda x: "/product/" in x}) + listing = soup.find('div', {'class': 'items'}).find_all('a', href=True, attrs={'href': lambda x: "/product/" in x}) for a in listing: link = a.get('href') From dfcd67a6f0c12ace4d18d5806b9a97310a9856ae Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 31 Aug 2023 15:23:25 -0700 Subject: [PATCH 3/3] added AnonMarket to mining and parser if statments --- MarketPlaces/Initialization/markets_mining.py | 3 +++ MarketPlaces/Initialization/prepare_parser.py | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py index 7779f10..62e97f8 100644 
--- a/MarketPlaces/Initialization/markets_mining.py +++ b/MarketPlaces/Initialization/markets_mining.py @@ -26,6 +26,7 @@ from MarketPlaces.Nexus.crawler_selenium import crawler as crawlerNexus from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher from MarketPlaces.DarkBazar.crawler_selenium import crawler as crawlerDarkBazar from MarketPlaces.PabloEscobarMarket.crawler_selenium import crawler as crawlerPabloEscobar +from MarketPlaces.AnonMarket.crawler_selenium import crawler as crawlerAnonMarket import configparser import os @@ -143,5 +144,7 @@ if __name__ == '__main__': crawlerDarkBazar() elif mkt == "PabloEscobarMarket": crawlerPabloEscobar() + elif mkt == "AnonMarket": + crawlerAnonMarket() print("\nScraping process completed!") diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index c7699bd..7f162be 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -23,6 +23,7 @@ from MarketPlaces.Nexus.parser import * from MarketPlaces.MikesGrandStore.parser import * from MarketPlaces.DarkBazar.parser import * from MarketPlaces.PabloEscobarMarket.parser import * +from MarketPlaces.AnonMarket.parser import * from MarketPlaces.Classifier.classify_product import predict @@ -158,6 +159,8 @@ def parse_listing(marketPlace, listingFile, soup, createLog, logFile): rw = darkbazar_listing_parser(soup) elif marketPlace == "PabloEscobarMarket": rw = pabloescobarmarket_listing_parser(soup) + elif marketPlace == "AnonMarket": + rw = AnonMarket_listing_parser(soup) else: print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") raise Exception @@ -213,6 +216,8 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): rmm = darkbazar_description_parser(soup) elif marketPlace == "PabloEscobarMarket": rmm = pabloescobarmarket_description_parser(soup) + elif marketPlace == "AnonMarket": + rmm = AnonMarket_description_parser(soup) else: print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") raise Exception
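A minimal way to exercise the new AnonMarket parser outside the full markets_mining.py pipeline is sketched below. This is a sketch only: the local file name 'anon_listing.html' is an illustrative assumption (any category page saved by the crawler would do), and the commented end-to-end call assumes the TOR paths in the project config (firefox_binary_path, firefox_profile_path, geckodriver_path) are already filled in, since createFFDriver() reads those keys.

    # Offline check of the listing-page link extraction added in parser.py.
    # 'anon_listing.html' is a hypothetical saved copy of an AnonMarket category page.
    from bs4 import BeautifulSoup
    from MarketPlaces.AnonMarket.parser import AnonMarket_links_parser

    with open('anon_listing.html', 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # AnonMarket_links_parser prepends the .onion base URL, so these print as
    # absolute links containing '/product/'.
    for link in AnonMarket_links_parser(soup):
        print(link)

    # Full end-to-end run (opens Tor, crawls the enabled category links, saves the
    # pages, then hands them to new_parse), equivalent to selecting "AnonMarket"
    # in markets_mining.py:
    # from MarketPlaces.AnonMarket.crawler_selenium import crawler
    # crawler()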