__author__ = '91Shadows'

'''
DarkFox marketplace Crawler
'''

import codecs
import socks, socket, time
from datetime import date
import urllib.parse as urlparse
import http.client as httplib
import mechanize
import os
import subprocess
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkFox.parser import darkfox_links_parser

counter = 1
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
baseURL = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)


# Opens Tor Browser and crawls the marketplace
def startCrawling():
    opentor()
    getUrl()
    url = getFixedURL()
    mktName = getMKTName()
    credentials = getCredentials()
    br = getAccess(url, credentials)

    if br != 'down':
        crawlMkt(url, br)
        # new_parse(mktName, False)

    closetor()


# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    path = open('../../path.txt').readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(5)
    input("Tor Connected. Press ENTER to continue\n")
    return


# Creates a connection through the Tor port
def getUrl(timeout=None):
    socket.socket = socks.socksocket
    socket.create_connection = create_connection
    return


# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
    sock = socks.socksocket()
    sock.connect(address)
    return sock


# Returns the name of the marketplace (DarkFox)
def getMKTName():
    name = 'DarkFox'
    return name


# Returns credentials needed for the marketplace
def getCredentials():
    credentials = 'blank blank blank blank cap 0'
    return credentials


# Returns the link of the marketplace (DarkFox link)
def getFixedURL():
    url = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
    return url


# Closes Tor Browser
def closetor():
    global pid
    os.system("taskkill /pid " + str(pid))
    print('Closing Tor...')
    time.sleep(3)
    return


# Creates a Mechanize browser and initializes its options
def createBrowser():
    br = mechanize.Browser()
    cj = mechanize.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
                     ('Accept', '*/*')]

    return br


# Opens the login page and solves the captcha with manual input
def getAccess(loginPage, credentials):
    logInName = credentials.split()[0]
    userName = credentials.split()[1]
    logInPass = credentials.split()[2]
    password = credentials.split()[3]
    captchaName = credentials.split()[4]
    formId = credentials.split()[5]

    br = createBrowser()

    try:
        keepTrying = True
        while keepTrying:
            br.open(loginPage)
            time.sleep(7)
            html = br.response()
            soup = BeautifulSoup(html, "html.parser")

            # download the captcha image referenced in the page's style attribute
            image_tags = soup.findAll('div', {"class": "imgWrap"})
            captchaLink = image_tags[0]
            imagelink = captchaLink['style'].split('url(')[1][:-1]
            data = br.open(imagelink).read()
            br.back()
            open('captcha.png', "wb").write(data)

            '''
            subprocess.Popen("python capt.py", shell=False)
            time.sleep(61)
            captchaAnswerFile = open("answer.txt", "r")
            captchaAnswer = captchaAnswerFile.read().__str__()
            '''

            captchaAnswer = input('Please provide the captcha answer: ')

            formIndex = int(formId)
            br.select_form(nr=formIndex)
            # br[logInName] = userName
            # br[logInPass] = password
            br[captchaName] = captchaAnswer
            br.submit()

            # a redirect away from the login page means the captcha was accepted
            if br.geturl() != baseURL:
                keepTrying = False

        return br
    except:
        return 'down'


# Saves the crawled html page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    a = page.read()
    open(filePath, "wb").write(a)
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
    else:
        fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
    return fullPath


# Creates the name of the file based on the URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


# Hacking and market related topics (listing categories of interest)
def getInterestedLinks():
    links = []

    # Guides and Tutorials
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
    # Digital Products
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
    # Software and Malware
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
    # Services
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
    # Miscellaneous
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
    # Hosting and Security
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')
    # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing1.html')
    # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing2.html')

    return links


# Crawls the listing pages of interest and the product pages they link to
def crawlMkt(url, br):
    print("Crawling the DarkFox marketplace")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            savePage(page, link)

            for l in br.links():
                absURL = urlparse.urljoin(l.base_url, l.url)
                if absURL not in visited and not isSignOut(absURL) and isListingLink(absURL):
                    visited.add(absURL)
                    # disabled: queueing of other discovered listing links
                    # linksToCrawl.append(absURL)

            # crawler asks parser to get links of ALL products on ALL listing pages
            productList = productPages(link)
            j = 0
            for item in productList:
                # temporary limit: only the first two product pages per listing
                if j == 2:
                    break
                # itemURL = baseURL + str(item)
                try:
                    # itemPage = br.open(itemURL)
                    itemPage = br.open(item)
                    savePage(itemPage, item)
                except:
                    # print('Error in page: ', itemURL)
                    print('Error in page: ', item)
                j += 1

        except Exception as e:
            print(link, e)
        i += 1

    # finalTime = time.time()
    # print(finalTime - initialTime)

    input("Crawling DarkFox marketplace done successfully. Press ENTER to continue\n")
    return


# Returns True if the link is a description (product) page link
def isDescriptionLink(url):
    if 'product' in url:
        return True
    return False


# Returns True if the link is a listing page link
def isListingLink(url):
    if 'category' in url:
        return True
    return False


# Calls the parser to extract the product links from a saved listing page
def productPages(url):
    soup = ""
    error = False
    try:
        html = codecs.open(
            r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
                "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
                "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html', encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
    except:
        try:
            html = open(
                r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
                    "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
                    "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html')
            soup = BeautifulSoup(html, "html.parser")
        except:
            error = True
            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")

    if error:
        return []
    else:
        return darkfox_links_parser(soup)


# Drops links that sign the session out
def isSignOut(url):
    # absURL = urlparse.urljoin(url.base_url, url.url)
    if 'signout' in url.lower() or 'logout' in url.lower():
        return True
    return False


def crawler():
    startCrawling()
    # print("Crawling and Parsing DarkFox .... DONE!")
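
# A minimal entry-point sketch (an assumption, not part of the original module): the crawler
# is normally driven by the MarketPlaces initialization code, but for ad-hoc testing it can be
# launched directly, provided Tor is reachable on the SOCKS port configured above.
if __name__ == '__main__':
    crawler()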