__author__ = 'Helium'

'''
Procrax Forum Crawler (Selenium)
Rechecked and confirmed working.
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import urllib.parse as urlparse
import os, time
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.Procrax.parser import procrax_links_parser
from Forums.Utilities.utilities import cleanHTML

counter = 1
BASE_URL = 'https://procrax.cx/'
FORUM_NAME = 'Procrax'
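
# Typical usage (a sketch; in this project the crawler is normally driven by
# Forums/Initialization/forums_mining.py rather than run directly, and the
# module path below is assumed from the sibling crawlers):
#   from Forums.Procrax.crawler_selenium import crawler
#   crawler()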


# Opens Tor Browser, crawls the website, then parses the saved pages
def startCrawling():
    driver = getAccess()

    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(
        forum=FORUM_NAME,
        url=BASE_URL,
        createLog=True
    )


# Login using premade account credentials; the login captcha must be solved manually
def login(driver):
    # wait up to 50 seconds for the login button to render before using the form
    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span')))

    # enter the premade username and password into their input boxes
    usernameBox = driver.find_element(by=By.NAME, value='login')
    usernameBox.send_keys('cheese_pizza_man')
    passwordBox = driver.find_element(by=By.NAME, value='password')
    passwordBox.send_keys('Gr33nSp@m&3ggs')

    # submit the login form
    clicker = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span')
    clicker.click()

    # wait up to 50 seconds for the landing page to show up after login
    # (this XPath may need to change for a different seed URL)
    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[1]/div/div[1]/div')))
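
    # Note: the absolute XPaths above are brittle. A less fragile wait (a
    # sketch only; the CSS class is assumed from XenForo defaults and is not
    # verified against the live site) might be:
    #   WebDriverWait(driver, 50).until(EC.element_to_be_clickable(
    #       (By.CSS_SELECTOR, 'button.button--icon--login')))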


# Returns the name of the website
def getForumName():
    return FORUM_NAME


# Returns the link of the website
def getFixedURL():
    return BASE_URL


# Closes Tor Browser
def closeDriver(driver):
    print('Closing Tor...')
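    # driver.close() only closes the current tab; driver.quit() would also end
    # the browser session and the geckodriver process. close() is kept here to
    # match the sibling forum crawlers.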
    driver.close()  # close the current tab
    time.sleep(3)
    return


# Creates the Firefox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver():
    from Forums.Initialization.forums_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", True)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

    driver.maximize_window()

    return driver
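
# Note: Selenium 4.10+ removed the firefox_binary/firefox_profile keyword
# arguments used above. A minimal equivalent for newer Selenium versions
# (a sketch, assuming the same config keys) would be:
#   from selenium.webdriver.firefox.options import Options
#   opts = Options()
#   opts.binary_location = config.get('TOR', 'firefox_binary_path')
#   opts.profile = ff_prof
#   driver = webdriver.Firefox(service=service, options=opts)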

# Opens the forum's seed URL through Tor; returns the driver, or 'down' on failure
def getAccess():
    driver = createFFDriver()
    try:
        driver.get(BASE_URL)  # open the seed URL in the browser
        return driver
    except Exception:
        driver.close()  # close the tab
        return 'down'


# Saves the crawled html page
def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as f:
        f.write(cleanPage.encode('utf-8'))
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
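    # Example (hypothetical values): with shared_folder='/data' and
    # CURRENT_DATE='01012023', a thread URL is saved under
    #   /data/Forums/Procrax/HTML_Pages/01012023/Description/<alnum-name>.html
    # and a listing URL under the matching .../Listing/ path.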
    from Forums.Initialization.forums_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + FORUM_NAME + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Description', fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Listing', fileName + '.html')
    return fullPath


# Creates the file name from the passed URL, keeping only alphanumeric characters
def getNameFromURL(url):
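    # e.g. 'https://procrax.cx/forums/general-hacking.24/' -> 'httpsprocraxcxforumsgeneralhacking24'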
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


def getInterestedLinks():
    links = []

    # general hacking
    links.append('https://procrax.cx/forums/general-hacking.24/')
    # hacking security tools
    # links.append('https://procrax.cx/forums/hacking-security-tools.20/')
    # hacktube
    # links.append('https://procrax.cx/forums/hacktube.22/')
    # cardable websites
    # links.append('https://procrax.cx/forums/cardable-websites.28/')
    # tools, bots, validators
    # links.append('https://procrax.cx/forums/tools-bots-validators.73/')
    # forum discussions and updates
    # links.append('https://procrax.cx/forums/forum-discussions-updates.7/')

    return links


def crawlForum(driver):
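    # Loop structure:
    #   for each seed board link:
    #       for each listing page (follow 'Next'):
    #           save the listing page
    #           for each topic on it:
    #               for each topic page (follow 'Next'): save the page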
    print("Crawling the Procrax forum")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    driver.get(link)
                except Exception:
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                topics = topicPages(html)
                for topic in topics:
                    has_next_topic_page = True
                    counter = 1
                    page = topic

                    while has_next_topic_page:
                        itemURL = urlparse.urljoin(BASE_URL, str(page))
                        try:
                            driver.get(itemURL)
                        except Exception:
                            driver.refresh()
                        # append the page number so every topic page gets a unique file name
                        savePage(driver, driver.page_source, topic + f"page{counter}")

                        # for testing: stop after the first two pages of each topic
                        # (comment this out for a full crawl)
                        if counter == 2:
                            break

                        try:
                            page = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')

                            if page == "":
                                raise NoSuchElementException
                            counter += 1

                        except NoSuchElementException:
                            has_next_topic_page = False

                    # step back through the visited topic pages to return to the listing
                    # (note: the loop variable must not shadow the outer link index i)
                    for j in range(counter):
                        driver.back()

                    # for testing: uncomment to stop after the first topic
                    # break

                # for testing: stop after the first two listing pages
                # (comment this out for a full crawl)
                if count == 1:
                    break

                try:
                    link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')

                    if link == "":
                        raise NoSuchElementException
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the Procrax forum done.")


# Returns True if the link is a topic link; may need to change for every website
def isDescriptionLink(url):
    return 'threads' in url


# Returns True if the link is a listing page link; may need to change for every website
def isListingLink(url):
    return 'forums' in url
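
# Examples:
#   isDescriptionLink('https://procrax.cx/threads/some-topic.123/') -> True
#   isListingLink('https://procrax.cx/forums/general-hacking.24/')  -> True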


# Calls the parser to extract the topic links from a listing page
def topicPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return procrax_links_parser(soup)


def crawler():
    startCrawling()
    # print("Crawling and Parsing Procrax .... DONE!")