- __author__ = '91Shadows'
-
- '''
- DarkFox marketplace Crawler
- '''
-
- import codecs
- import socks, socket, time
- from datetime import date
- import urllib.parse as urlparse
- import http.client as httplib
- import mechanize
- import os
- import subprocess
- from bs4 import BeautifulSoup
- from MarketPlaces.Initialization.prepare_parser import new_parse
- from MarketPlaces.DarkFox.parser import darkfox_links_parser
-
- counter = 1
- httplib.HTTPConnection._http_vsn = 10
- httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
- baseURL = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
- socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)
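- # Note: the settings above force HTTP/1.0 for requests made through http.client and
- # send all SOCKS traffic to 127.0.0.1:9150, the default SOCKS port of the Tor Browser bundle.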
-
-
- # Opens Tor Browser, crawls the mkt
- def startCrawling():
-
- opentor()
- getUrl()
- url = getFixedURL()
- mktName = getMKTName()
- credentials = getCredentials()
- br = getAccess(url, credentials)
-
- if br != 'down':
- crawlMkt(url, br)
- #new_parse(mktName, False)
-
- closetor()
-
-
- # Opens Tor Browser
- def opentor():
-     global pid
-     print("Connecting Tor...")
-     # ../../path.txt is expected to hold the command that launches the Tor Browser executable
-     with open('../../path.txt') as f:
-         path = f.readline().strip()
-     pro = subprocess.Popen(path)
-     pid = pro.pid
-     time.sleep(5)
-     input("Tor Connected. Press ENTER to continue\n")
-     return
-
-
- # Creates a connection through Tor Port
- def getUrl(timeout=None):
- socket.socket = socks.socksocket
- socket.create_connection = create_connection
- return
-
-
- # Makes the onion address request
- def create_connection(address, timeout=None, source_address=None):
- sock = socks.socksocket()
- sock.connect(address)
- return sock
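-
- # Note: getUrl() swaps socks.socksocket and create_connection in for the standard library
- # versions, so connections opened by mechanize/http.client are routed through the Tor SOCKS
- # proxy and .onion hostnames can be resolved by Tor rather than by the local resolver.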
-
-
- # Returns the name of the marketplace (DarkFox)
- def getMKTName():
- name = 'DarkFox'
- return name
-
-
- # Returns credentials needed for the mkt
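- # The string is space-separated: login field name, username, password field name,
- # password, captcha field name, and login form index (consumed in that order by getAccess)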
- def getCredentials():
- credentials = 'blank blank blank blank cap 0'
- return credentials
-
-
- # Returns the onion URL of the marketplace (DarkFox)
- def getFixedURL():
- url = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
- return url
-
-
- # Closes Tor Browser
- def closetor():
- global pid
-     print('Closing Tor...')
-     os.system("taskkill /pid " + str(pid))
- time.sleep(3)
- return
-
-
- # Creates a Mechanize browser and initializes its options
- def createBrowser():
- br = mechanize.Browser()
- cj = mechanize.CookieJar()
- br.set_cookiejar(cj)
-
- # Browser options
-     br.set_handle_equiv(True)
-     br.set_handle_redirect(True)
-     br.set_handle_referer(True)
-     br.set_handle_robots(False)
-     br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
-
- br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
- ('Accept', '*/*')]
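-     # The User-agent above mimics Internet Explorer 11 on Windows 7 (NT 6.1)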
-
- return br
-
-
- # Logs into the marketplace: loads the login page, saves the captcha image, and submits the form
- def getAccess(loginPage, credentials):
-
- logInName = credentials.split()[0]
- userName = credentials.split()[1]
- logInPass = credentials.split()[2]
- password = credentials.split()[3]
- captchaName = credentials.split()[4]
- formId = credentials.split()[5]
-
- br = createBrowser()
-
- try:
- keepTrying = True
- while (keepTrying):
-
- br.open(loginPage)
- time.sleep(7)
- html = br.response()
-             soup = BeautifulSoup(html, "html.parser")
- image_tags = soup.findAll('div', {"class": "imgWrap"})
- captchaLink = image_tags[0]
- imagelink = captchaLink['style'].split('url(')[1][:-1]
- data = br.open(imagelink).read()
- br.back()
-             with open('captcha.png', "wb") as f:
-                 f.write(data)
- '''
- subprocess.Popen("python capt.py", shell=False)
- time.sleep(61)
- captchaAnswerFile = open("answer.txt", "r")
- captchaAnswer = captchaAnswerFile.read().__str__()
- '''
-             captchaAnswer = input('Please enter the captcha: ')
- formIndex = int(formId)
- br.select_form(nr=formIndex)
- #br[logInName] = userName
- #br[logInPass] = password
-             br[captchaName] = captchaAnswer
- br.submit()
-             if br.geturl() != baseURL:
- keepTrying = False
-
- return br
-
-     except Exception:
-
- return 'down'
-
-
- # Saves the crawled html page
- def savePage(page, url):
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
-     with open(filePath, "wb") as f:
-         f.write(page.read())
- return
-
-
- # Gets the full path of the page to be saved along with its appropriate file name
- def getFullPathName(url):
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
- "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
- "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
- else:
- fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
- "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
- "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
- return fullPath
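-
-
- # A minimal alternative sketch (hypothetical helper, not called anywhere in this module):
- # the same path can be built with os.path.join and strftime instead of manual concatenation.
- def getFullPathNameJoined(url):
-     baseDir = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages'
-     subDir = 'Description' if isDescriptionLink(url) else 'Listing'
-     fileName = getNameFromURL(url) + '.html'
-     return os.path.join(baseDir, date.today().strftime("%m%d%Y"), subDir, fileName)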
-
-
- # Creates the name of the file based on URL
- def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
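-     # e.g. a category URL collapses to one long alphanumeric string (scheme, host and
-     # category id run together); URLs with no alphanumeric characters fall back to the counter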
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
- # Hacking and Markets related topics
- def getInterestedLinks():
- links = []
-
- # Guides and Tutorials
- links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
- # Digital Products
- links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
- # Software and Malware
- links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
- # Services
- links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
- # Miscellaneous
- links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
- # Hosting and Security
- links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')
-
- # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing1.html')
- # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing2.html')
-
- return links
-
-
- def crawlMkt(url, br):
-
- print("Crawling the DarkFox marketplace")
-
- linksToCrawl = getInterestedLinks()
- visited = set(linksToCrawl)
- initialTime = time.time()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
-         try:
- page = br.open(link)
- savePage(page, link)
- for l in br.links():
- absURL = urlparse.urljoin(l.base_url, l.url)
- if absURL not in visited and not isSignOut(absURL) and isListingLink(absURL):
- visited.add(absURL)
-
- #disabling the process of finding other links
- #linksToCrawl.append(absURL)
-
-             # the crawler asks the parser for the links of ALL products on ALL listing pages
-             productList = productPages(link)
-             j = 0
-             for item in productList:
- if j == 2:
- break
- #itemURL = baseURL + str(item)
- try:
- #itemPage = br.open(itemURL)
- itemPage = br.open(item)
- savePage(itemPage, item)
- except:
- #print 'Error in page: ', itemURL
- print('Error in page: ', item)
- j+=1
-
-         except Exception as e:
-             print(link, e)
- i += 1
-
- #finalTime = time.time()
- #print finalTime - initialTime
-
- input("Crawling DarkFox marketplace done successfully. Press ENTER to continue\n")
-
- return
-
-
- # Returns True if the link is a product description page link
- def isDescriptionLink(url):
- if 'product' in url:
- return True
- return False
-
-
- # Returns True if the link is a listingPage link
- def isListingLink(url):
- if 'category' in url:
- return True
- return False
-
-
- # Asks the parser to extract the product links from the saved listing page
- def productPages(url):
-
- soup = ""
-
- error = False
- try:
- html = codecs.open(
- r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
- "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
- "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html', encoding='utf8')
- soup = BeautifulSoup(html, "html.parser")
- except:
- try:
- html = open(
- r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
- "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
- "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html')
- soup = BeautifulSoup(html, "html.parser")
- except:
- error = True
-             print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")
-
- if error:
- return []
- else:
- return darkfox_links_parser(soup)
-
-
- # Drop links that sign out ("signout"/"logout")
- def isSignOut(url):
- #absURL = urlparse.urljoin(url.base_url, url.url)
- if 'signout' in url.lower() or 'logout' in url.lower():
- return True
-
- return False
-
-
- def crawler():
- startCrawling()
-     #print("Crawling and Parsing DarkFox .... DONE!")
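-
-
- # A minimal, assumed entry point (not part of the original module): allows the module to be
- # run directly as a script; the project may call crawler() from elsewhere instead.
- if __name__ == '__main__':
-     crawler()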