__author__ = '91Shadows'
'''
DarkFox Marketplace Crawler (based on the CALSysLab project)
'''
import codecs
import socks, socket, time
from datetime import date
import urllib.parse as urlparse
import http.client as httplib
import mechanize
import os
import subprocess
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkFox.parser import darkfox_links_parser
counter = 1
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
baseURL = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)

# Opens Tor Browser, crawls the mkt
def startCrawling():
    opentor()
    getUrl()
    url = getFixedURL()
    mktName = getMKTName()
    credentials = getCredentials()
    br = getAccess(url, credentials)

    if br != 'down':
        crawlMkt(url, br)
        # new_parse(mktName, False)

    closetor()

# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    path = open('../../path.txt').readline()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(5)
    input("Tor Connected. Press ENTER to continue\n")
    return

# Creates a connection through Tor Port
def getUrl(timeout=None):
    socket.socket = socks.socksocket
    socket.create_connection = create_connection
    return

# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
    sock = socks.socksocket()
    sock.connect(address)
    return sock
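
# Note: getUrl() monkey-patches socket.socket and socket.create_connection above, so every
# connection opened afterwards (including those made by mechanize) is tunneled through the
# Tor SOCKS5 proxy configured at 127.0.0.1:9150.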

# Returns the name of the mkt (DarkFox)
def getMKTName():
    name = 'DarkFox'
    return name

# Returns credentials needed for the mkt
def getCredentials():
    credentials = 'blank blank blank blank cap 0'
    return credentials
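
# The credential string above is space-separated and is unpacked by getAccess() in this order:
# login field name, username, password field name, password, captcha field name, form index.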

# Return the link of the mkt (DarkFox Link)
def getFixedURL():
    url = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
    return url

# Closes Tor Browser
def closetor():
    global pid
    os.system("taskkill /pid " + str(pid))
    print('Closing Tor...')
    time.sleep(3)
    return

# Creates a Mechanize browser and initializes its options
def createBrowser():
    br = mechanize.Browser()
    cj = mechanize.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
                     ('Accept', '*/*')]

    return br

# Opens the login page, downloads the captcha image, and submits the login form
def getAccess(loginPage, credentials):
    logInName = credentials.split()[0]
    userName = credentials.split()[1]
    logInPass = credentials.split()[2]
    password = credentials.split()[3]
    captchaName = credentials.split()[4]
    formId = credentials.split()[5]

    br = createBrowser()

    try:
        keepTrying = True
        while keepTrying:
            br.open(loginPage)
            time.sleep(7)

            html = br.response()
            soup = BeautifulSoup(html, "html.parser")

            # Locate the captcha image embedded in the page style and download it
            image_tags = soup.findAll('div', {"class": "imgWrap"})
            captchaLink = image_tags[0]
            imagelink = captchaLink['style'].split('url(')[1][:-1]
            data = br.open(imagelink).read()
            br.back()
            open('captcha.png', "wb").write(data)

            '''
            subprocess.Popen("python capt.py", shell=False)
            time.sleep(61)
            captchaAnswerFile = open("answer.txt", "r")
            captchaAnswer = captchaAnswerFile.read().__str__()
            '''

            captchaAnswer = input('Please provide me with captcha : ')

            formIndex = int(formId)
            br.select_form(nr=formIndex)
            # br[logInName] = userName
            # br[logInPass] = password
            br[captchaName] = str(captchaAnswer)
            br.submit()

            # Stop once the browser has navigated away from the login page
            if br.geturl() != 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/':
                keepTrying = False

        return br

    except:
        return 'down'

# Saves the crawled html page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    a = page.read()
    open(filePath, "wb").write(a)
    return

# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
    else:
        fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
    return fullPath

# Creates the name of the file based on URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
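
# Example (illustrative URL): getNameFromURL('http://site.onion/category/abc-123')
# returns 'httpsiteonioncategoryabc123'; every non-alphanumeric character is stripped.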

# Hacking and Markets related topics
def getInterestedLinks():
    links = []

    # Guides and Tutorials
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
    # Digital Products
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
    # Software and Malware
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
    # Services
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
    # Miscellaneous
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
    # Hosting and Security
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')

    # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing1.html')
    # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing2.html')

    return links

# Crawls the listing pages of the marketplace and saves them along with their product pages
def crawlMkt(url, br):
    print("Crawling the DarkFox marketplace")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            savePage(page, link)

            for l in br.links():
                absURL = urlparse.urljoin(l.base_url, l.url)
                if absURL not in visited and not isSignOut(absURL) and isListingLink(absURL):
                    visited.add(absURL)
                    # disabling the process of finding other links
                    # linksToCrawl.append(absURL)

            # crawler asks parser to get links of ALL products on ALL listing pages
            productLinks = productPages(link)
            j = 0
            for item in productLinks:
                if j == 2:
                    break
                # itemURL = baseURL + str(item)
                try:
                    # itemPage = br.open(itemURL)
                    itemPage = br.open(item)
                    savePage(itemPage, item)
                except:
                    # print('Error in page: ', itemURL)
                    print('Error in page: ', item)
                j += 1

        except Exception as e:
            print(link, str(e))
        i += 1

    # finalTime = time.time()
    # print(finalTime - initialTime)

    input("Crawling DarkFox marketplace done successfully. Press ENTER to continue\n")
    return

# Returns True if the link is a descriptionPage link
def isDescriptionLink(url):
    if 'product' in url:
        return True
    return False

# Returns True if the link is a listingPage link
def isListingLink(url):
    if 'category' in url:
        return True
    return False

# Calls the parser to extract the product links from a saved listing page
def productPages(url):
    soup = ""
    error = False

    try:
        html = codecs.open(
            r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
                "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
                "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html', encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
    except:
        try:
            html = open(
                r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
                    "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
                    "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html')
            soup = BeautifulSoup(html, "html.parser")
        except:
            error = True
            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")

    if error:
        return []
    else:
        return darkfox_links_parser(soup)

# Drops links that "signout"
def isSignOut(url):
    # absURL = urlparse.urljoin(url.base_url, url.url)
    if 'signout' in url.lower() or 'logout' in url.lower():
        return True
    return False

def crawler():
    startCrawling()
    # print("Crawling and Parsing DarkFox .... DONE!")
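
# Optional entry point (an addition, not part of the original script): allows running
# this module directly instead of importing it and calling crawler().
if __name__ == '__main__':
    crawler()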