From bd5a05ffb46afded758c48989bcb8e5e631c17a6 Mon Sep 17 00:00:00 2001
From: Joshua
Date: Thu, 13 Jul 2023 20:16:48 -0700
Subject: [PATCH] finished crawler and parser

---
 MarketPlaces/Initialization/markets_mining.py |   3 +
 MarketPlaces/Initialization/prepare_parser.py |   7 +-
 MarketPlaces/ViceCity/crawler_selenium.py     | 333 ++++++++++++++++++
 MarketPlaces/ViceCity/parser.py               | 166 +++++----
 4 files changed, 435 insertions(+), 74 deletions(-)

diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py
index 664454a..1c533ad 100644
--- a/MarketPlaces/Initialization/markets_mining.py
+++ b/MarketPlaces/Initialization/markets_mining.py
@@ -20,6 +20,7 @@ from MarketPlaces.Apocalypse.crawler_selenium import crawler as crawlerApocalyps
 from MarketPlaces.CityMarket.crawler_selenium import crawler as crawlerCityMarket
 from MarketPlaces.DarkMatter.crawler_selenium import crawler as crawlerDarkMatter
 from MarketPlaces.M00nkeyMarket.crawler_selenium import crawler as crawlerM00nkeyMarket
+from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity
 
 import configparser
 import time
@@ -106,6 +107,8 @@ if __name__ == '__main__':
             crawlerDarkMatter()
         elif mkt == "M00nkeyMarket":
             crawlerM00nkeyMarket()
+        elif mkt == "ViceCity":
+            crawlerViceCity()
 
 
 
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index 4d5508b..5739f30 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -10,6 +10,7 @@ from MarketPlaces.Tor2door.parser import *
 from MarketPlaces.Apocalypse.parser import *
 from MarketPlaces.ThiefWorld.parser import *
 from MarketPlaces.AnonymousMarketplace.parser import *
+from MarketPlaces.ViceCity.parser import *
 
 from MarketPlaces.Classifier.classify_product import predict
 
@@ -148,6 +149,8 @@ def new_parse(marketPlace, url, createLog):
                         rmm = thiefWorld_description_parser(soup)
                     elif marketPlace =="AnonymousMarketplace":
                         rmm = anonymousMarketplace_description_parser(soup)
+                    elif marketPlace == "ViceCity":
+                        rmm = vicecity_description_parser(soup)
 
                     # key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2]
                     key = u"Url:" + os.path.basename(line2).replace(".html", "")
@@ -200,7 +203,9 @@ def new_parse(marketPlace, url, createLog):
                     elif marketPlace == "ThiefWorld":
                         rw = thiefWorld_listing_parser(soup)
                     elif marketPlace == "AnonymousMarketplace":
-                        rw = anonymousMarketplace_listing_parser(soup)
+                        rw = anonymousMarketplace_listing_parser(soup)
+                    elif marketPlace == "ViceCity":
+                        rw = vicecity_listing_parser(soup)
                     else:
                         parseError = True
 
diff --git a/MarketPlaces/ViceCity/crawler_selenium.py b/MarketPlaces/ViceCity/crawler_selenium.py
index e69de29..0b22082 100644
--- a/MarketPlaces/ViceCity/crawler_selenium.py
+++ b/MarketPlaces/ViceCity/crawler_selenium.py
@@ -0,0 +1,333 @@
+__author__ = 'DarkWeb'
+
+'''
+ViceCity Market Forum Crawler (Selenium)
+'''
+
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+
+from PIL import Image
+import urllib.parse as urlparse
+import os, time
+from datetime import date
+import subprocess
+import configparser
+
+from bs4 import BeautifulSoup
+from MarketPlaces.Initialization.prepare_parser import new_parse
+from MarketPlaces.ViceCity.parser import vicecity_links_parser
+from MarketPlaces.Utilities.utilities import cleanHTML
+
+config = configparser.ConfigParser()
+config.read('../../setup.ini')
+counter = 1
+baseURL = 'http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/'
+
+
+# Opens Tor Browser, crawls the website, then parses, then closes Tor
+#acts as the main method for the crawler; the crawler() function at the end of this file calls it
+def startCrawling():
+    # opentor()
+    mktName = getMKTName()
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
+
+    new_parse(mktName, baseURL, True)
+
+
+# Opens Tor Browser
+#prompts for ENTER input to continue
+def opentor():
+    global pid
+    print("Connecting Tor...")
+    pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
+    pid = pro.pid
+    time.sleep(7.5)
+    input('Tor Connected. Press ENTER to continue\n')
+    return
+
+
+# Returns the name of the website
+#return: name of site in string type
+def getMKTName():
+    name = 'ViceCity'
+    return name
+
+
+# Returns the base link of the website
+#return: url of base site in string type
+def getFixedURL():
+    url = 'http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/'
+    return url
+
+
+# Closes Tor Browser
+#@param: current selenium driver
+def closetor(driver):
+    # global pid
+    # os.system("taskkill /pid " + str(pro.pid))
+    # os.system("taskkill /t /f /im tor.exe")
+    print('Closing Tor...')
+    driver.close()
+    time.sleep(3)
+    return
+
+
+# Creates FireFox 'driver' and configures its 'Profile'
+# to use Tor proxy and socket
+def createFFDriver():
+    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
+
+    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
+    # ff_prof.set_preference("places.history.enabled", False)
+    # ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
+    # ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
+    # ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
+    # ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
+    # ff_prof.set_preference("signon.rememberSignons", False)
+    # ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
+    # ff_prof.set_preference("network.dns.disablePrefetch", True)
+    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    # ff_prof.set_preference("permissions.default.image", 3)
+    # ff_prof.set_preference("browser.download.folderList", 2)
+    # ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
+    # ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
+    ff_prof.set_preference('network.proxy.type', 1)
+    ff_prof.set_preference("network.proxy.socks_version", 5)
+    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
+    ff_prof.set_preference('network.proxy.socks_port', 9150)
+    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
+    ff_prof.set_preference("javascript.enabled", False)
+    ff_prof.update_preferences()
+
+    service = Service(config.get('TOR', 'geckodriver_path'))
+
+    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+
+    return driver
+
+
+#the driver 'gets' the url, attempting to access the site; returns 'down' if it can't
+#return: return the selenium driver or string 'down'
+def getAccess():
+    url = getFixedURL()
+    driver = createFFDriver()
+    try:
+        driver.get(url)
+        return driver
+    except:
+        driver.close()
+        return 'down'
+
+
+# Manual captcha solver, waits for a specific element so that the whole page loads, finds the input box, gets screenshot of captcha
+# then allows for manual solving of captcha in the terminal
+#@param: current selenium web driver
+def login(driver):
+    # wait for first captcha page to show up (This Xpath may need to change based on different seed url)
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, "/html/body/div/div/form/div/div[1]")))
+    input("Press Enter once captcha done (don't press done)")
+    #clicks button after captcha is entered
+    driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button').click()
+
+    #wait for login page to show up
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div/div/div/form')))
+    #puts username into box
+    userBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
+    userBox.send_keys('ct1234')
+
+    #waits for second captcha to be entered by the user
+    input("Press Enter once captcha done (don't press continue)")
+    #clicks on continue
+    driver.find_element(by=By.XPATH, value='/html/body/div/div/div/form/input[2]').click()
+
+    #waits for password to show
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div/div/div/form/div[3]/input')))
+    time.sleep(10)  # give time for site to catch up
+    # puts password into box
+    passBox = driver.find_element(by=By.XPATH, value='/html/body/div/div/div/form/div[2]/input')
+    passBox.send_keys('DementedBed123-')
+    driver.find_element(by=By.XPATH, value='/html/body/div/div/div/form/div[3]/input').click()
+
+    # wait for pin input to show
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div[1]/div/form/span')))
+    pinBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/form/input[1]')
+    pinBox.send_keys('12345')
+    driver.find_element(by=By.XPATH, value='/html/body/div[1]/div/form/input[2]').click()
+
+    # waits for main listing page before crawling to ensure everything goes well
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div[1]/div/div[2]')))
+
+
+# Saves the crawled html page, makes the directory path for html pages if not made
+def savePage(page, url):
+    cleanPage = cleanHTML(page)
+    filePath = getFullPathName(url)
+    os.makedirs(os.path.dirname(filePath), exist_ok=True)
+    open(filePath, 'wb').write(cleanPage.encode('utf-8'))
+    return
+
+
+# Gets the full path of the page to be saved along with its appropriate file name
+#@param: raw url as crawler crawls through every site
+def getFullPathName(url):
+    fileName = getNameFromURL(url)
+    if isDescriptionLink(url):
+        fullPath = r'..\ViceCity\HTML_Pages\\' + str(
+            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
+            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
+    else:
+        fullPath = r'..\ViceCity\HTML_Pages\\' + str(
+            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
+            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
+    return fullPath
+
+
+# Creates the file name from the passed URL; falls back to a distinct counter-based name if the cleaned URL is empty
+#@param: raw url as crawler crawls through every site
+def getNameFromURL(url):
+    global counter
+    name = ''.join(e for e in url if e.isalnum())
+    if (name == ''):
+        name = str(counter)
+        counter = counter + 1
+    return name
+
+
+# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
+#in this example, there are a couple of categories some products fall under such as
+# Guides and Tutorials, Digital Products, and Software and Malware
+#as you can see they are categories of products
+def getInterestedLinks():
+    links = []
+
+    # Digital - Fraud Software, Has Hacking and Guides
+    links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=150')
+    # Digital - Guides and Tutorials
+    # links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=94')
+    # Carding Services
+    # links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=155')
+    # Digital - Other (half junk half random stuff like: bots, rats, viruses, and guides)
+    # links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=153')
+
+    return links
+
+
+# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
+#topic and description pages are crawled through here, where both types of pages are saved
+#@param: selenium driver
+def crawlForum(driver):
+    print("Crawling the ViceCity Market")
+
+    linksToCrawl = getInterestedLinks()
+    visited = set(linksToCrawl)
+    initialTime = time.time()
+
+    count = 0
+    i = 0
+    while i < len(linksToCrawl):
+        link = linksToCrawl[i]
+        print('Crawling :', link)
+        try:
+            try:
+                driver.get(link)
+            except:
+                driver.refresh()
+            html = driver.page_source
+            savePage(html, link)
+
+            has_next_page = True
+            while has_next_page:
+                list = productPages(html)
+                j = 0
+                for item in list:
+                    itemURL = urlparse.urljoin(baseURL, str(item))
+                    try:
+                        driver.get(itemURL)
+                    except:
+                        driver.refresh()
+                    time.sleep(2.5)  # to let page catch up
+                    savePage(driver.page_source, item)
+                    driver.back()
+
+                    #comment out
+                    break
+
+                # # comment out
+                # if count == 1:
+                #     count = 0
+                #     break
+
+                try:
+                    temp = driver.find_element(by=By.CLASS_NAME, value='pagination')
+                    link = temp.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
+                    if link == "":
+                        raise NoSuchElementException
+                    try:
+                        driver.get(link)
+                    except:
+                        driver.refresh()
+                    html = driver.page_source
+                    savePage(html, link)
+                    count += 1
+
+                except NoSuchElementException:
+                    has_next_page = False
+
+        except Exception as e:
+            print(link, e)
+        i += 1
+
+    # finalTime = time.time()
+    # print(finalTime - initialTime)
+
+    input("Crawling ViceCity done successfully. Press ENTER to continue\n")
+
+
+# Returns 'True' if the link is a description link
+#@param: url of any url crawled
+#return: true if is a description page, false if not
+def isDescriptionLink(url):
+    if 'listing' in url:
+        return True
+    return False
+
+
+# Returns True if the link is a listingPage link
+#@param: url of any url crawled
+#return: true if is a Listing page, false if not
+def isListingLink(url):
+    if 'category' in url:
+        return True
+    return False
+
+
+# calling the parser to define the links, the html is the url of a link from the list of interested link list
+#@param: link from interested link list, i.e. getInterestedLinks()
+#return: list of description links that should be crawled through
+def productPages(html):
+    soup = BeautifulSoup(html, "html.parser")
+    return vicecity_links_parser(soup)
+
+
+def crawler():
+    startCrawling()
+    # print("Crawling and Parsing ViceCity .... DONE!")
diff --git a/MarketPlaces/ViceCity/parser.py b/MarketPlaces/ViceCity/parser.py
index 2508b2d..65d6b8f 100644
--- a/MarketPlaces/ViceCity/parser.py
+++ b/MarketPlaces/ViceCity/parser.py
@@ -8,7 +8,7 @@ from bs4 import BeautifulSoup
 
 
 # This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
-def tor2door_description_parser(soup):
+def vicecity_description_parser(soup):
 
     # Fields to be parsed
     vendor = "-1"  # 0 *Vendor_Name
@@ -31,62 +31,68 @@ def tor2door_description_parser(soup):
     shipFrom = "-1"  # 17 Product_ShippedFrom
     shipTo = "-1"  # 18 Product_ShippedTo
 
-    bae = soup.find('div', {'class': "col-9"})
-
     # Finding Product Name
-    name = bae.find('h2').text
+    name = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"})
+    name = name.find('span', {'style': "font-size:18px;font-weight: bold;color: #fff"}).text
     name = name.replace('\n', ' ')
     name = name.replace(",", "")
     name = name.strip()
 
-    mb = bae.findAll('div', {"class": "mb-1"})
-
     # Finding Vendor
-    vendor = mb[0].text
-    vendor = vendor.replace(",", "")
-    vendor = vendor.replace("Sold by:", "")
-    vendor = vendor.strip()
+    vendor = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}).find('a').text.strip()
 
-    # # Finding Vendor Rating
-    # full_stars = bae[2].find_all('i', {'class': "fas fa-star"})
-    # half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"})
-    # rating = len(full_stars) + (0.5 if half_star is not None else 0)
+    # Finding Vendor Rating (percentage at the start of the vendor link's title attribute)
+    rating = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}).find('a').get('title')
+    rating = re.match(r"\d+%", rating).group(0).strip()
 
     # Finding Quantity Sold and Left
-    temp = mb[4].text.split(',')
+    # temp = mb[4].text.split(',')
+    #
+    # sold = temp[0].replace("sold", "")
+    # sold = sold.strip()
+    #
+    # left = temp[1].replace("in stock", "")
+    # left = left.strip()
 
-    sold = temp[0].replace("sold", "")
-    sold = sold.strip()
+    # Finding Successful Transactions (the "... sales" count in the vendor link's title attribute)
+    success = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}).find('a').get('title')
+    success = re.search(r"\d+(?= sales)", success).group(0).strip()
 
-    left = temp[1].replace("in stock", "")
-    left = left.strip()
+    bae = soup.find('pre')
 
     # Finding USD
-    USD = bae.find('div', {"class": "h3 text-secondary"}).text
-    USD = USD.replace("$", "")
-    USD = USD.strip()
+    USD = bae.find('span').text
+    USD = re.search(r"\$\d+(?:\.\d+)?", USD).group(0)
+    USD = USD.replace("$", "").strip()
 
     # Finding BTC
-    temp = bae.find('div', {"class": "small"}).text.split("BTC")
+    BTC = bae.find_all('span')
+    BTC = re.search(r"\d+(?:\.\d+)?", BTC[1].text).group(0).strip()
 
-    BTC = temp[0].strip()
+    # Finding the Product Category
+    category = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"})
+    category = category.find('span', {'style': "font-size:15px;color: #a1a1a1"}).text
+    category = category.replace("Category:", "").strip()
 
-    # shipping_info = bae[4].text
-    # if "Digital" not in shipping_info:
-    #     shipping_info = shipping_info.split(" ")
-    #
-    #     # Finding Shipment Information (Origin)
-    #     shipFrom = shipping_info[0].strip()
-    #
-    #     # Finding Shipment Information (Destination)
-    #     shipTo = shipping_info[1].strip()
+    li = bae.find('span', {'style': "float:right"}).find_all('span')
+
+    # Finding Shipment Information (Origin)
+    shipFrom = li[1].text.strip()
+
+    # Finding Shipment Information (Destination)
+    shipTo = li[-2].text.strip()
 
     # Finding the Product description
-    describe = bae.find('div', {"class": "card border-top-0"}).text
+    describe = soup.find('p', {
+        'style': "width:705px;margin-left:-305px;background-color: #242424;border-radius: 3px;border: 1px solid #373737;padding: 5px;"}).text
     describe = describe.replace("\n", " ")
-    describe = describe.replace("\r", " ")
     describe = describe.strip()
 
+    # Finding the Number of Product Reviews
+    li = soup.find_all('label', {'class': "tc_label threetabs"})
+    review = li[1].text
+    review = re.search(r"\d+", review).group(0).strip()
+
     # Searching for CVE and MS categories
     cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
     if cve:
@@ -114,10 +120,10 @@ def tor2door_description_parser(soup):
 
 
 # This is the method to parse the Listing Pages
-def tor2door_listing_parser(soup):
+def vicecity_listing_parser(soup):
 
     # Fields to be parsed
     nm = 0  # *Total_Products (Should be Integer)
-    mktName = "Tor2door"  # 0 *Marketplace_Name
+    mktName = "ViceCity"  # 0 *Marketplace_Name
     vendor = []  # 1 *Vendor y
     rating_vendor = []  # 2 Vendor_Rating
     success = []  # 3 Vendor_Successful_Transactions
@@ -139,56 +145,69 @@ def tor2door_listing_parser(soup):
     shipTo = []  # 19 Product_ShippedTo
     href = []  # 20 Product_Links
 
-    listing = soup.findAll('div', {"class": "card product-card mb-3"})
+    listing = soup.find('div', {"class": "frontpage"}).findAll('div', {"class": "wLf"})
 
     # Populating the Number of Products
    nm = len(listing)
 
-    # Finding Category
-    cat = soup.find("div", {"class": "col-9"})
-    cat = cat.find("h2").text
-    cat = cat.replace("Category: ", "")
-    cat = cat.replace(",", "")
-    cat = cat.strip()
+    # # Finding Category
+    # cat = soup.find("div", {"class": "col-9"})
+    # cat = cat.find("h2").text
+    # cat = cat.replace("Category: ", "")
+    # cat = cat.replace(",", "")
+    # cat = cat.strip()
 
-    for card in listing:
-        category.append(cat)
+    for a in listing:
+        # category.append(cat)
 
-        bae = card.findAll('a')
+        # bae = card.findAll('a')
 
         # Adding the url to the list of urls
-        link = bae[0].get('href')
+        link = a.find('div', {"class": "wLfLeft"}).find('a', href=True).get('href')
+        link = cleanLink(link)
         href.append(link)
 
-        # Finding Product Name
-        product = bae[1].text
+        # Finding the Product Name
+        product = a.find('div', {"class": "wLfName"}).find('a').text
         product = product.replace('\n', ' ')
         product = product.replace(",", "")
+        product = product.replace("...", "")
         product = product.strip()
         name.append(product)
 
-        # Finding Vendor
-        vendor_name = bae[2].text
+        # Finding the Vendor
+        vendor_name = a.find('div', {"class": "wLfVendor"}).find('a').text
         vendor_name = vendor_name.replace(",", "")
         vendor_name = vendor_name.strip()
         vendor.append(vendor_name)
 
-        # Finding USD
-        usd = card.find('div', {"class": "mb-1"}).text
-        usd = usd.replace("$", "")
-        usd = usd.strip()
-        USD.append(usd)
-
-        # Finding Reviews
-        num = card.find("span", {"class": "rate-count"}).text
-        num = num.replace("(", "")
-        num = num.replace("review)", "")
-        num = num.replace("reviews)", "")
-        num = num.strip()
-        reviews.append(num)
+        # Finding Prices
+        price = a.find('div', {"class": "wLfPrice"}).find_all('span')
+        ud = price[0].text.replace(" USD", " ")
+        # u = ud.replace("$","")
+        u = ud.replace(",", "")
+        u = u.strip()
+        USD.append(u)
+        bc = price[1].text
+        bc = re.search(r"\d+(?:\.\d+)?", bc).group(0)
+        BTC.append(bc)
+
+        # # Finding Reviews
+        # num = card.find("span", {"class": "rate-count"}).text
+        # num = num.replace("(", "")
+        # num = num.replace("review)", "")
+        # num = num.replace("reviews)", "")
+        # num = num.strip()
+        # reviews.append(num)
+
+        # Finding Successful Transactions (the "... sales" count in the vendor link's title attribute)
+        freq = a.find('div', {"class": "wLfVendor"}).find('a').get('title')
+        freq = re.search(r'\d+(?= sales)', freq).group(0)
+        freq = freq.strip()
+        success.append(freq)
 
         # Searching for CVE and MS categories
-        cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+        cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
         if not cve:
             cveValue = "-1"
         else:
@@ -201,7 +220,7 @@ def tor2door_listing_parser(soup):
                 cveValue = cee
             CVE.append(cveValue)
 
-        ms = card.findAll(text=re.compile('MS\d{2}-\d{3}'))
+        ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
         if not ms:
             MSValue = "-1"
         else:
@@ -219,14 +238,15 @@ def tor2door_listing_parser(soup):
                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
 
 
-def tor2door_links_parser(soup):
+def vicecity_links_parser(soup):
 
     # Returning all links that should be visited by the Crawler
-    href = []
-    listing = soup.findAll('div', {"class": "card product-card mb-3"})
+    href = []
+    listing = soup.find('div', {"class": "frontpage"}).findAll('div', {"class": "wLf"})
 
-    for div in listing:
-        link = div.find('a')['href']
+    for a in listing:
+        bae = a.find('div', {"class": "wLfLeft"}).find('a', href=True)
+        link = bae['href']
         href.append(link)
 
-    return href
\ No newline at end of file
+    return href
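
Reviewer note (outside the patch): the parser fixes above all rely on the same idiom -- re.search(...).group(0) to pull the first numeric token out of a free-text attribute, with "-1" as the sentinel when nothing matches. A minimal sketch of that idiom follows; the extract_number() helper and the sample title string are illustrative assumptions, not part of the ViceCity code.

    # Illustrative sketch only -- helper name and sample string are hypothetical.
    import re

    def extract_number(pattern, text, default="-1"):
        # Return the first substring of `text` matching `pattern`,
        # or `default` when absent, mirroring the "-1" convention of the parsers.
        match = re.search(pattern, text)
        return match.group(0) if match else default

    # Example: a vendor title of the assumed form "98% positive, 1234 sales"
    title = "98% positive, 1234 sales"
    rating = extract_number(r"\d+%", title)          # -> "98%"
    sales = extract_number(r"\d+(?= sales)", title)  # -> "1234"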