__author__ = '91Shadows'

'''
DarkFox marketplace Crawler
'''

import codecs
import os
import socket
import subprocess
import time

import socks
import urllib.parse as urlparse
import http.client as httplib
import mechanize

from datetime import date
from bs4 import BeautifulSoup

from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkFox.parser import darkfox_links_parser

counter = 1

httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

baseURL = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'

socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)

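# All outgoing sockets are routed through the Tor SOCKS5 proxy listening on
# 127.0.0.1:9150 (the port exposed by a running Tor Browser); getUrl() and
# create_connection() below monkey-patch the socket module so that mechanize
# traffic goes through that proxy. HTTP/1.0 is forced on http.client,
# presumably to avoid keep-alive/chunked-transfer issues over the proxied
# connection.

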
# Opens Tor Browser, crawls the mkt
def startCrawling():

    opentor()
    getUrl()
    url = getFixedURL()
    mktName = getMKTName()
    credentials = getCredentials()
    br = getAccess(url, credentials)

    if br != 'down':
        crawlMkt(url, br)
        #new_parse(mktName, False)

    #new_parse(mktName, False)

    closetor()


# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    # path.txt is expected to hold the command used to launch the Tor Browser
    with open('../../path.txt') as pathFile:
        path = pathFile.readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(5)
    input("Tor Connected. Press ENTER to continue\n")
    return


# Creates a connection through the Tor port
def getUrl(timeout=None):
    # Route all sockets through the SOCKS proxy configured above
    socket.socket = socks.socksocket
    socket.create_connection = create_connection
    return


# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
    # timeout and source_address are accepted only to match the signature of
    # socket.create_connection; they are ignored here
    sock = socks.socksocket()
    sock.connect(address)
    return sock


# Returns the name of the mkt (DarkFox)
def getMKTName():
    name = 'DarkFox'
    return name


# Returns credentials needed for the mkt
def getCredentials():
    credentials = 'blank blank blank blank cap 0'
    return credentials


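# The credential string above is whitespace-separated and consumed by
# getAccess() as: login-field name, username, password-field name, password,
# captcha-field name, form index. 'blank' marks fields that are not used for
# DarkFox, where only the captcha field ('cap') of form 0 is filled in.

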
# Returns the link of the mkt (DarkFox link)
def getFixedURL():
    url = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
    return url


# Closes Tor Browser
def closetor():
    global pid
    os.system("taskkill /pid " + str(pid))
    print('Closing Tor...')
    time.sleep(3)
    return


# Creates a mechanize browser and initializes its options
def createBrowser():
    br = mechanize.Browser()
    cj = mechanize.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
                     ('Accept', '*/*')]

    return br


def getAccess(loginPage, credentials):

    logInName = credentials.split()[0]
    userName = credentials.split()[1]
    logInPass = credentials.split()[2]
    password = credentials.split()[3]
    captchaName = credentials.split()[4]
    formId = credentials.split()[5]

    br = createBrowser()

    try:
        keepTrying = True
        while keepTrying:

            br.open(loginPage)
            time.sleep(7)
            html = br.response()
            soup = BeautifulSoup(html, "html.parser")

            # Locate the captcha image, download it and save it locally
            image_tags = soup.findAll('div', {"class": "imgWrap"})
            captchaLink = image_tags[0]
            imagelink = captchaLink['style'].split('url(')[1][:-1]
            data = br.open(imagelink).read()
            br.back()
            with open('captcha.png', "wb") as captchaFile:
                captchaFile.write(data)
            '''
            subprocess.Popen("python capt.py", shell=False)
            time.sleep(61)
            captchaAnswerFile = open("answer.txt", "r")
            captchaAnswer = captchaAnswerFile.read().__str__()
            '''
            captchaAnswer = input('Please provide me with captcha : ')

            # Fill in and submit the access form
            formIndex = int(formId)
            br.select_form(nr=formIndex)
            #br[logInName] = userName
            #br[logInPass] = password
            br[captchaName] = str(captchaAnswer)
            br.submit()

            # If the browser was redirected away from the entry page, the
            # captcha was accepted and we can stop retrying
            if br.geturl() != 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/':
                keepTrying = False

        return br

    except Exception:
        return 'down'


# Saves the crawled html page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, "wb") as outFile:
        outFile.write(page.read())
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    today = date.today()
    dateFolder = "%02d%02d%04d" % (today.month, today.day, today.year)
    baseDir = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages'
    if isDescriptionLink(url):
        fullPath = os.path.join(baseDir, dateFolder, 'Description', fileName + '.html')
    else:
        fullPath = os.path.join(baseDir, dateFolder, 'Listing', fileName + '.html')
    return fullPath


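# For a crawl run on 02/16/2016 (example date), a product-description URL is
# saved under ...\HTML_Pages\02162016\Description\<fileName>.html and every
# other page under ...\HTML_Pages\02162016\Listing\<fileName>.html.

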
# Creates the name of the file based on URL
def getNameFromURL(url):
    global counter
    # Keep only the alphanumeric characters of the URL; fall back to a
    # running counter when nothing is left
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


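# Example (hypothetical URL):
# getNameFromURL('http://example.onion/category/ab-12') returns
# 'httpexampleonioncategoryab12'.

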
# Hacking and Markets related topics
def getInterestedLinks():
    links = []

    # Guides and Tutorials
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
    # Digital Products
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
    # Software and Malware
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
    # Services
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
    # Miscellaneous
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
    # Hosting and Security
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')

    # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing1.html')
    # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing2.html')

    return links


def crawlMkt(url, br):

    print("Crawling the DarkFox marketplace")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            savePage(page, link)

            # Record the listing links found on the page; the crawl queue is
            # currently not extended with them (see the commented line below)
            for l in br.links():
                absURL = urlparse.urljoin(l.base_url, l.url)
                if absURL not in visited and not isSignOut(absURL) and isListingLink(absURL):
                    visited.add(absURL)

                    #disabling the process of finding other links
                    #linksToCrawl.append(absURL)

            # crawler asks parser to get links of ALL products on ALL listing pages
            productList = productPages(link)
            j = 0
            for item in productList:
                # only the first two product pages per listing are fetched
                if j == 2:
                    break
                #itemURL = baseURL + str(item)
                try:
                    #itemPage = br.open(itemURL)
                    itemPage = br.open(item)
                    savePage(itemPage, item)
                except:
                    #print 'Error in page: ', itemURL
                    print('Error in page: ', item)
                j += 1

        except Exception as e:
            print(link, e)
        i += 1

    #finalTime = time.time()
    #print finalTime - initialTime

    input("Crawling DarkFox marketplace done successfully. Press ENTER to continue\n")

    return


# Returns True if the link is a product description link
def isDescriptionLink(url):
    if 'product' in url:
        return True
    return False


# Returns True if the link is a listingPage link
def isListingLink(url):
    if 'category' in url:
        return True
    return False


# Calls the parser to extract the product links from a saved listing page
def productPages(url):

    soup = ""

    # Re-read the listing page that savePage() stored on disk for today's date
    today = date.today()
    dateFolder = "%02d%02d%04d" % (today.month, today.day, today.year)
    listingFile = os.path.join(
        r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages',
        dateFolder, 'Listing', getNameFromURL(url) + '.html')

    error = False
    try:
        with codecs.open(listingFile, encoding='utf8') as html:
            soup = BeautifulSoup(html, "html.parser")
    except:
        try:
            with open(listingFile) as html:
                soup = BeautifulSoup(html, "html.parser")
        except:
            error = True
            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")

    if error:
        return []
    else:
        return darkfox_links_parser(soup)


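# Note that productPages() does not re-fetch the listing page; it relies on
# the copy that savePage() wrote for the same URL earlier in crawlMkt().
# darkfox_links_parser() is expected to return the product-page URLs found in
# that listing soup, which crawlMkt() then opens one by one.

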
# Drop links that sign out
def isSignOut(url):
    #absURL = urlparse.urljoin(url.base_url, url.url)
    if 'signout' in url.lower() or 'logout' in url.lower():
        return True

    return False


def crawler():
    startCrawling()
    #print "Crawling and Parsing Crypto .... DONE!"