__author__ = '91Shadows'

'''
OnniForums Crawler (Mechanize)
'''

import codecs, os
import socks, socket, time
from datetime import date

import urllib.parse as urlparse
import http.client as httplib
import mechanize
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.OnniForums.parser import onniForums_listing_parser

counter = 1
# Force HTTP/1.0 requests (a common workaround for servers that mishandle
# HTTP/1.1 keep-alive/chunked responses)
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'
# Route all sockets through the local Tor SOCKS5 proxy (Tor Browser listens on 9150 by default)
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)


# Opens Tor Browser, crawls the website
def startCrawling():
    opentor()
    getUrl()
    forumName = getForumName()
    br = getAccess()

    if br != 'down':
        crawlForum(br)
        new_parse(forumName, False)

    closetor()


# Opens Tor Browser (executable path read from path.txt) and waits for it to bootstrap
def opentor():
    global pid
    print("Connecting Tor...")
    with open('../../path.txt') as f:
        path = f.readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(7.5)
    input("Tor Connected. Press ENTER to continue\n")
    return


# Monkey-patches the socket module so every new connection (including the
# ones mechanize opens) is tunneled through the Tor SOCKS proxy
def getUrl(timeout=None):
    socket.socket = socks.socksocket
    socket.create_connection = create_connection
    return


# Replacement for socket.create_connection that dials through the SOCKS proxy
def create_connection(address, timeout=None, source_address=None):
    sock = socks.socksocket()
    if timeout is not None:
        sock.settimeout(timeout)
    sock.connect(address)
    return sock
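

# Optional sanity check, not wired into the crawl (the helper name and the
# b'Congratulations' marker are assumptions, not part of the original
# pipeline): once getUrl() has patched the socket module, a request to
# check.torproject.org should report that it arrived over Tor.
def isTorRouted():
    br = createBrowser()
    response = br.open('https://check.torproject.org')
    return b'Congratulations' in response.read()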


# Returns the name of website
def getForumName():
    name = 'OnniForums'
    return name


# Returns the starting URL of the website
def getFixedURL():
    return baseURL


# Closes Tor Browser
def closetor():
    global pid
    print('Closing Tor...')
    os.system("taskkill /pid " + str(pid))
    time.sleep(3)
    return


# Creates a Mechanize browser and initializes its options
def createBrowser():
    br = mechanize.Browser()
    cj = mechanize.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
                     ('Accept', '*/*')]

    return br


def getAccess():
    url = getFixedURL()
    br = createBrowser()

    try:
        br.open(url)
        return br

    except Exception:
        return 'down'


# Saves the crawled html page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, "wb") as f:
        f.write(page.read())
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    dateStr = date.today().strftime("%m%d%Y")
    subDir = 'Description' if isDescriptionLink(url) else 'Listing'
    fullPath = ('C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/'
                'Forums/OnniForums/HTML_Pages/' + dateStr + '/' + subDir + '/' + fileName + '.html')
    return fullPath
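
# For illustration (the date is hypothetical): a listing page crawled on
# July 4th, 2023 would be saved as
#   .../Forums/OnniForums/HTML_Pages/07042023/Listing/<sanitizedName>.html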


# Creates the name of the file based on URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
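
# Example of the sanitization above (illustrative URL): the string
# 'http://example.onion/viewforum.php?f=42' collapses to
# 'httpexampleonionviewforumphpf42'; URLs differing only in punctuation
# therefore collide to the same file name.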


# Hacking and Markets related topics
def getInterestedLinks():
    links = []

    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43&sid=e12864ffccc5df877b03b573534955be')

    return links


# Start crawling Forum pages
def crawlForum(br):
    print("Crawling OnniForums")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            savePage(page, link)

            res = br.response().read()
            soup = BeautifulSoup(res, 'html.parser')

            # Queue the next listing page (rel="next"), skipping pages already queued
            next_link = soup.find("a", {"rel": "next"})
            if next_link is not None:
                full_url = urlparse.urljoin(link, next_link['href'])
                if full_url not in visited:
                    visited.add(full_url)
                    linksToCrawl.insert(i + 1, full_url)

            # Open and save every topic (description) page found on this listing page
            listOfTopics = findDescriptionPages(link)
            for topic in listOfTopics:
                itemPage = br.open(str(topic))
                savePage(itemPage, topic)

        except Exception as e:
            print('Error getting link: ', link, e)
        i += 1

    print('Crawling took %.2f seconds' % (time.time() - initialTime))

    input("Crawling OnniForums done successfully. Press ENTER to continue\n")

    return


# Returns True if the link points to a topic (description) page
def isDescriptionLink(url):
    # substring match kept case-insensitive so 'viewtopic' and 'Topic' both match
    return 'topic' in url.lower()


# Returns True if the link is a listing page link
def isListingLink(url):
    # substring match kept case-insensitive so 'viewforum' and 'Forum' both match
    return 'forum' in url.lower()
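
# Illustrative behaviour of the two predicates above (hypothetical
# phpBB-style URLs, assuming the case-insensitive matching sketched here):
#   isDescriptionLink('http://x.onion/viewtopic.php?t=123')  ->  True
#   isListingLink('http://x.onion/viewforum.php?f=42')       ->  True
#   isListingLink('http://x.onion/viewtopic.php?t=123')      ->  False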


# Opens the saved listing page and calls the parser to extract the topic links
def findDescriptionPages(url):
    listingPath = ("C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\"
                   "Forums\\OnniForums\\HTML_Pages\\" + date.today().strftime("%m%d%Y") +
                   "\\Listing\\" + getNameFromURL(url) + ".html")

    try:
        html = codecs.open(listingPath, encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
    except Exception:
        try:
            html = open(listingPath)
            soup = BeautifulSoup(html, "html.parser")
        except Exception:
            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")
            return []

    return onniForums_listing_parser(soup)


def crawler():
    startCrawling()
    print("Crawling and Parsing OnniForums .... DONE!")