__author__ = '91Shadows'

'''
DarkFox marketplace Crawler
'''

import codecs
import os
import socket
import subprocess
import time

import socks
import urllib.parse as urlparse
import http.client as httplib
import mechanize

from datetime import date
from bs4 import BeautifulSoup

from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkFox.parser import darkfox_links_parser

counter = 1

httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

baseURL = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'

socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)

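# All outgoing sockets are routed through the Tor SOCKS5 proxy listening on
# 127.0.0.1:9150 (the port exposed by a running Tor Browser); getUrl() and
# create_connection() below monkey-patch the socket module so that mechanize
# traffic goes through that proxy. HTTP/1.0 is forced on http.client,
# presumably to avoid keep-alive/chunked-transfer issues over the proxied
# connection.

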
# Opens Tor Browser, crawls the mkt
def startCrawling():

    opentor()
    getUrl()
    url = getFixedURL()
    mktName = getMKTName()
    credentials = getCredentials()
    br = getAccess(url, credentials)

    if br != 'down':
        crawlMkt(url, br)
        #new_parse(mktName, False)

    #new_parse(mktName, False)

    closetor()


# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    # path.txt is expected to hold the command used to launch the Tor Browser
    with open('../../path.txt') as pathFile:
        path = pathFile.readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(5)
    input("Tor Connected. Press ENTER to continue\n")
    return


# Creates a connection through the Tor port
def getUrl(timeout=None):
    # Route all sockets through the SOCKS proxy configured above
    socket.socket = socks.socksocket
    socket.create_connection = create_connection
    return


# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
    # timeout and source_address are accepted only to match the signature of
    # socket.create_connection; they are ignored here
    sock = socks.socksocket()
    sock.connect(address)
    return sock


# Returns the name of the mkt (DarkFox)
def getMKTName():
    name = 'DarkFox'
    return name


# Returns credentials needed for the mkt
def getCredentials():
    credentials = 'blank blank blank blank cap 0'
    return credentials


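# The credential string above is whitespace-separated and consumed by
# getAccess() as: login-field name, username, password-field name, password,
# captcha-field name, form index. 'blank' marks fields that are not used for
# DarkFox, where only the captcha field ('cap') of form 0 is filled in.

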
# Returns the link of the mkt (DarkFox link)
def getFixedURL():
    url = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
    return url


# Closes Tor Browser
def closetor():
    global pid
    os.system("taskkill /pid " + str(pid))
    print('Closing Tor...')
    time.sleep(3)
    return


# Creates a mechanize browser and initializes its options
def createBrowser():
    br = mechanize.Browser()
    cj = mechanize.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
                     ('Accept', '*/*')]

    return br


def getAccess(loginPage, credentials):

    logInName = credentials.split()[0]
    userName = credentials.split()[1]
    logInPass = credentials.split()[2]
    password = credentials.split()[3]
    captchaName = credentials.split()[4]
    formId = credentials.split()[5]

    br = createBrowser()

    try:
        keepTrying = True
        while keepTrying:

            br.open(loginPage)
            time.sleep(7)
            html = br.response()
            soup = BeautifulSoup(html, "html.parser")

            # Locate the captcha image, download it and save it locally
            image_tags = soup.findAll('div', {"class": "imgWrap"})
            captchaLink = image_tags[0]
            imagelink = captchaLink['style'].split('url(')[1][:-1]
            data = br.open(imagelink).read()
            br.back()
            with open('captcha.png', "wb") as captchaFile:
                captchaFile.write(data)
            '''
            subprocess.Popen("python capt.py", shell=False)
            time.sleep(61)
            captchaAnswerFile = open("answer.txt", "r")
            captchaAnswer = captchaAnswerFile.read().__str__()
            '''
            captchaAnswer = input('Please provide me with captcha : ')

            # Fill in and submit the access form
            formIndex = int(formId)
            br.select_form(nr=formIndex)
            #br[logInName] = userName
            #br[logInPass] = password
            br[captchaName] = str(captchaAnswer)
            br.submit()

            # If the browser was redirected away from the entry page, the
            # captcha was accepted and we can stop retrying
            if br.geturl() != 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/':
                keepTrying = False

        return br

    except Exception:
        return 'down'


# Saves the crawled html page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, "wb") as outFile:
        outFile.write(page.read())
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    today = date.today()
    dateFolder = "%02d%02d%04d" % (today.month, today.day, today.year)
    baseDir = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages'
    if isDescriptionLink(url):
        fullPath = os.path.join(baseDir, dateFolder, 'Description', fileName + '.html')
    else:
        fullPath = os.path.join(baseDir, dateFolder, 'Listing', fileName + '.html')
    return fullPath


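# For a crawl run on 02/16/2016 (example date), a product-description URL is
# saved under ...\HTML_Pages\02162016\Description\<fileName>.html and every
# other page under ...\HTML_Pages\02162016\Listing\<fileName>.html.

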
# Creates the name of the file based on URL
def getNameFromURL(url):
    global counter
    # Keep only the alphanumeric characters of the URL; fall back to a
    # running counter when nothing is left
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


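# Example (hypothetical URL):
# getNameFromURL('http://example.onion/category/ab-12') returns
# 'httpexampleonioncategoryab12'.

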
# Hacking and Markets related topics
def getInterestedLinks():
    links = []

    # Guides and Tutorials
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
    # Digital Products
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
    # Software and Malware
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
    # Services
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
    # Miscellaneous
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
    # Hosting and Security
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')

    # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing1.html')
    # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing2.html')

    return links


def crawlMkt(url, br):

    print("Crawling the DarkFox marketplace")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            savePage(page, link)

            # Record the listing links found on the page; the crawl queue is
            # currently not extended with them (see the commented line below)
            for l in br.links():
                absURL = urlparse.urljoin(l.base_url, l.url)
                if absURL not in visited and not isSignOut(absURL) and isListingLink(absURL):
                    visited.add(absURL)

                    #disabling the process of finding other links
                    #linksToCrawl.append(absURL)

            # crawler asks parser to get links of ALL products on ALL listing pages
            productList = productPages(link)
            j = 0
            for item in productList:
                # only the first two product pages per listing are fetched
                if j == 2:
                    break
                #itemURL = baseURL + str(item)
                try:
                    #itemPage = br.open(itemURL)
                    itemPage = br.open(item)
                    savePage(itemPage, item)
                except:
                    #print 'Error in page: ', itemURL
                    print('Error in page: ', item)
                j += 1

        except Exception as e:
            print(link, e)
        i += 1

    #finalTime = time.time()
    #print finalTime - initialTime

    input("Crawling DarkFox marketplace done successfully. Press ENTER to continue\n")

    return


# Returns True if the link is a product description link
def isDescriptionLink(url):
    if 'product' in url:
        return True
    return False


# Returns True if the link is a listingPage link
def isListingLink(url):
    if 'category' in url:
        return True
    return False


# Calls the parser to extract the product links from a saved listing page
def productPages(url):

    soup = ""

    # Re-read the listing page that savePage() stored on disk for today's date
    today = date.today()
    dateFolder = "%02d%02d%04d" % (today.month, today.day, today.year)
    listingFile = os.path.join(
        r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages',
        dateFolder, 'Listing', getNameFromURL(url) + '.html')

    error = False
    try:
        with codecs.open(listingFile, encoding='utf8') as html:
            soup = BeautifulSoup(html, "html.parser")
    except:
        try:
            with open(listingFile) as html:
                soup = BeautifulSoup(html, "html.parser")
        except:
            error = True
            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")

    if error:
        return []
    else:
        return darkfox_links_parser(soup)


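# Note that productPages() does not re-fetch the listing page; it relies on
# the copy that savePage() wrote for the same URL earlier in crawlMkt().
# darkfox_links_parser() is expected to return the product-page URLs found in
# that listing soup, which crawlMkt() then opens one by one.

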
# Drop links that sign out
def isSignOut(url):
    #absURL = urlparse.urljoin(url.base_url, url.url)
    if 'signout' in url.lower() or 'logout' in url.lower():
        return True

    return False


def crawler():
    startCrawling()
    #print "Crawling and Parsing Crypto .... DONE!"