__author__ = 'DarkWeb'

'''
BestCardingWorld Forum Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By

import urllib.parse as urlparse
import os, time
from datetime import date
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.BestCardingWorld.parser import bestcardingworld_links_parser
from Forums.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/'


# Opens Tor Browser, crawls the website, then parses, then closes tor
#acts like the main method for the crawler, another function at the end of this code calls this function later
def startCrawling():
    # opentor()
    forumName = getForumName()
    # driver = getAccess()

    # if driver != 'down':
    #     crawlForum(driver)
    #     new_parse(forumName, False)
    new_parse(forumName, False)

    # closetor(driver)


# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
    global pid
    print("Connecting Tor...")
    path = open('../../path.txt').readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(7.5)
    input('Tor Connected. Press ENTER to continue\n')
    return


# Returns the name of the website
#return: name of site in string type
def getForumName():
    name = 'BestCardingWorld'
    return name


# Return the base link of the website
#return: url of base site in string type
def getFixedURL():
    url = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/'
    return url


# Closes Tor Browser
#@param: current selenium driver
def closetor(driver):
    global pid
    # os.system("taskkill /pid " + str(pro.pid))
    os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return


# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
    file = open('../../path.txt', 'r')
    lines = file.readlines()

    ff_binary = FirefoxBinary(lines[0].strip())

    ff_prof = FirefoxProfile(lines[1].strip())
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 2)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", True)
    ff_prof.update_preferences()

    service = Service(lines[2].strip())

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

    return driver


#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
#return: return the selenium driver or string 'down'
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()

    try:

        driver.get(url)
        return driver

    except:

        return 'down'


# Saves the crawled html page, makes the directory path for html pages if not made
def savePage(page, url):
    cleanPage = cleanHTML(page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    open(filePath, 'wb').write(cleanPage.encode('utf-8'))
    return


# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = r'C:\Users\fakeguy\Documents\threatIntelligence-main\DarkWebMining_Working\Forums\BestCardingWorld\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
    else:
        fullPath = r'C:\Users\fakeguy\Documents\threatIntelligence-main\DarkWebMining_Working\Forums\BestCardingWorld\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
    return fullPath


# Creates the file name from passed URL, gives distinct name if can't be made unique after cleaned
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if (name == ''):
        name = str(counter)
        counter = counter + 1
    return name


# returns list of urls, here is where you can list the different urls of interest, the crawler runs through this list
#in this example, there are a couple of categories some threads fall under such as
#exploits, malware, and hacking tutorials
def getInterestedLinks():
    links = []

    # Penetration Tests
    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43')
    # # Social Engineering Tests
    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=44')
    # # Exploits
    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45')
    # # Tools
    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46')
    # # Malware
    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47')
    # # Cryptography
    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48')
    # # Others
    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49')
    # # Hacking Tutorials
    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50')
    # # Hacked Accounts and Database Dumps
    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30')
    # # Android Moded pak
    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53')


    #General Discussion
    # links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=16&sid=6a4959d49be41e72944e5aa5684c187a')

    return links


# gets links of interest to crawl through, iterates through list, where each link is clicked and crawled through
#topic and description pages are crawled through here, where both types of pages are saved
#@param: selenium driver
def crawlForum(driver):
    print("Crawling the BestCardingWorld forum")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            try:
                driver.get(link)
            except:
                driver.refresh()
            html = driver.page_source
            savePage(html, link)

            has_next_page = True
            while has_next_page:
                list = topicPages(html)
                for item in list:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver.page_source, item)
                    driver.back()

                try:
                    bar = driver.find_element(by=By.XPATH, value=
                        '/html/body/div[1]/div[2]/div[2]/div[3]/ul')
                    next = bar.find_element_by_class_name('next')
                    link = next.find_element_by_tag_name('a').get_attribute('href')

                    try:
                        driver.get(link)
                    except:
                        driver.refresh()
                    html = driver.page_source
                    savePage(html, link)

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e.message)
        i += 1

    # finalTime = time.time()
    # print finalTime - initialTime

    input("Crawling BestCardingWorld forum done sucessfully. Press ENTER to continue\n")


# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
    if 'topic' in url:
        return True
    return False


# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
    if 'forum' in url:
        return True
    return False


# calling the parser to define the links, the html is the url of a link from the list of interested link list
#@param: link from interested link list
#return: list of description links that should be crawled through
def topicPages(html):
    soup = BeautifulSoup(html, "html.parser")
    #print(soup.find('div', {"class": "forumbg"}).find('ul', {"class": "topiclist topics"}).find('li', {"class": "row bg1"}).find('a', {"class": "topictitle"}, href=True))
    return bestcardingworld_links_parser(soup)


def crawler():
    startCrawling()
    # print("Crawling and Parsing BestCardingWorld .... DONE!")