__author__ = '91Shadows'

'''
DarkFox marketplace Crawler
'''

import codecs
import socks, socket, time
from datetime import date
import urllib.parse as urlparse
import http.client as httplib
import mechanize
import os
import subprocess
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkFox.parser import darkfox_links_parser

counter = 1
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
baseURL = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)


# Opens Tor Browser and crawls the marketplace
def startCrawling():
    opentor()
    getUrl()
    url = getFixedURL()
    mktName = getMKTName()
    credentials = getCredentials()
    br = getAccess(url, credentials)

    if br != 'down':
        crawlMkt(url, br)
        # new_parse(mktName, False)

    closetor()


# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    path = open('../../path.txt').readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(5)
    input("Tor Connected. Press ENTER to continue\n")
    return


# Creates a connection through the Tor port
def getUrl(timeout=None):
    socket.socket = socks.socksocket
    socket.create_connection = create_connection
    return


# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
    sock = socks.socksocket()
    sock.connect(address)
    return sock


# Returns the name of the marketplace (DarkFox)
def getMKTName():
    name = 'DarkFox'
    return name


# Returns credentials needed for the marketplace
def getCredentials():
    credentials = 'blank blank blank blank cap 0'
    return credentials


# Returns the link of the marketplace (DarkFox link)
def getFixedURL():
    url = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
    return url


# Closes Tor Browser
def closetor():
    global pid
    os.system("taskkill /pid " + str(pid))
    print('Closing Tor...')
    time.sleep(3)
    return


# Creates a Mechanize browser and initializes its options
def createBrowser():
    br = mechanize.Browser()
    cj = mechanize.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
                     ('Accept', '*/*')]

    return br


# Opens the login page and solves the captcha with manual input
def getAccess(loginPage, credentials):
    logInName = credentials.split()[0]
    userName = credentials.split()[1]
    logInPass = credentials.split()[2]
    password = credentials.split()[3]
    captchaName = credentials.split()[4]
    formId = credentials.split()[5]

    br = createBrowser()

    try:
        keepTrying = True
        while keepTrying:
            br.open(loginPage)
            time.sleep(7)
            html = br.response()
            soup = BeautifulSoup(html, "html.parser")

            # download the captcha image referenced in the page's style attribute
            image_tags = soup.findAll('div', {"class": "imgWrap"})
            captchaLink = image_tags[0]
            imagelink = captchaLink['style'].split('url(')[1][:-1]
            data = br.open(imagelink).read()
            br.back()
            open('captcha.png', "wb").write(data)

            '''
            subprocess.Popen("python capt.py", shell=False)
            time.sleep(61)
            captchaAnswerFile = open("answer.txt", "r")
            captchaAnswer = captchaAnswerFile.read().__str__()
            '''

            captchaAnswer = input('Please provide the captcha answer: ')

            formIndex = int(formId)
            br.select_form(nr=formIndex)
            # br[logInName] = userName
            # br[logInPass] = password
            br[captchaName] = captchaAnswer
            br.submit()

            # a redirect away from the login page means the captcha was accepted
            if br.geturl() != baseURL:
                keepTrying = False

        return br
    except:
        return 'down'


# Saves the crawled html page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    a = page.read()
    open(filePath, "wb").write(a)
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
    else:
        fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
    return fullPath


# Creates the name of the file based on the URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


# Hacking and market related topics (listing categories of interest)
def getInterestedLinks():
    links = []

    # Guides and Tutorials
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
    # Digital Products
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
    # Software and Malware
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
    # Services
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
    # Miscellaneous
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
    # Hosting and Security
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')
    # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing1.html')
    # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing2.html')

    return links


# Crawls the listing pages of interest and the product pages they link to
def crawlMkt(url, br):
    print("Crawling the DarkFox marketplace")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            savePage(page, link)

            for l in br.links():
                absURL = urlparse.urljoin(l.base_url, l.url)
                if absURL not in visited and not isSignOut(absURL) and isListingLink(absURL):
                    visited.add(absURL)
                    # disabled: queueing of other discovered listing links
                    # linksToCrawl.append(absURL)

            # crawler asks parser to get links of ALL products on ALL listing pages
            productList = productPages(link)
            j = 0
            for item in productList:
                # temporary limit: only the first two product pages per listing
                if j == 2:
                    break
                # itemURL = baseURL + str(item)
                try:
                    # itemPage = br.open(itemURL)
                    itemPage = br.open(item)
                    savePage(itemPage, item)
                except:
                    # print('Error in page: ', itemURL)
                    print('Error in page: ', item)
                j += 1

        except Exception as e:
            print(link, e)
        i += 1

    # finalTime = time.time()
    # print(finalTime - initialTime)

    input("Crawling DarkFox marketplace done successfully. Press ENTER to continue\n")
    return


# Returns True if the link is a description (product) page link
def isDescriptionLink(url):
    if 'product' in url:
        return True
    return False


# Returns True if the link is a listing page link
def isListingLink(url):
    if 'category' in url:
        return True
    return False


# Calls the parser to extract the product links from a saved listing page
def productPages(url):
    soup = ""
    error = False
    try:
        html = codecs.open(
            r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
                "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
                "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html', encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
    except:
        try:
            html = open(
                r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
                    "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
                    "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html')
            soup = BeautifulSoup(html, "html.parser")
        except:
            error = True
            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")

    if error:
        return []
    else:
        return darkfox_links_parser(soup)


# Drops links that sign the session out
def isSignOut(url):
    # absURL = urlparse.urljoin(url.base_url, url.url)
    if 'signout' in url.lower() or 'logout' in url.lower():
        return True
    return False


def crawler():
    startCrawling()
    # print("Crawling and Parsing DarkFox .... DONE!")
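
# A minimal entry-point sketch (an assumption, not part of the original module): the crawler
# is normally driven by the MarketPlaces initialization code, but for ad-hoc testing it can be
# launched directly, provided Tor is reachable on the SOCKS port configured above.
if __name__ == '__main__':
    crawler()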