khangtran
/
dark_web_forums


								__author__ = 'DarkWeb'


								'''

								CrackingPro Crawler (Selenium)

								'''


								from selenium import webdriver

								from selenium.common.exceptions import NoSuchElementException

								from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

								from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

								from selenium.webdriver.firefox.service import Service

								from selenium.webdriver.common.by import By

								from selenium.webdriver.support import expected_conditions as EC

								from selenium.webdriver.support.ui import WebDriverWait

								from PIL import Image


								import urllib.parse as urlparse

								import os, re, time

								from datetime import date

								import subprocess

								from bs4 import BeautifulSoup

								from Forums.Initialization.prepare_parser import new_parse

								from Forums.CrackingPro.parser import crackingPro_links_parser

								from Forums.Utilities.utilities import cleanHTML


								# Fallback file-name sequence: getNameFromURL() uses and then increments
								# this value when a URL contains no alphanumeric characters.
								counter = 1

								# Site root; crawlForum() joins the topic links returned by the parser onto it.
								baseURL = 'https://www.crackingpro.com/'


								# Opens Tor Browser, crawls the website

								def startCrawling():
								    """Run one complete crawl session of the CrackingPro forum.

								    Launches Tor through opentor(), obtains a driver from getAccess() and,
								    unless getAccess() reported the site as 'down', waits for the manual
								    login and crawls the forum. Any error raised while logging in or
								    crawling is printed together with the URL the driver is currently on.
								    The driver is always handed to closetor() afterwards.
								    """
								    opentor()
								    # forumName = getForumName()
								    driver = getAccess()

								    if driver != 'down':
								        try:
								            login(driver)
								            crawlForum(driver)
								        except Exception as e:
								            print(driver.current_url, e)
								        finally:
								            # Runs even if reporting the error above fails (for example
								            # because the browser session is already gone).
								            closetor(driver)

								    # new_parse(forumName, False)


								# Opens Tor Browser

								def opentor():
								    """Launch the Tor Browser and wait for the operator to confirm it is connected.

								    The executable to start is read from the first line of '../../path.txt'.
								    The process id of the launched program is stored in the module-level
								    ``pid`` variable.
								    """
								    global pid
								    print("Connecting Tor...")
								    # Context manager so the configuration file handle is closed promptly.
								    with open('../../path.txt') as path_file:
								        path = path_file.readline().strip()
								    pro = subprocess.Popen(path)
								    pid = pro.pid
								    # Pause before prompting the operator for confirmation.
								    time.sleep(7.5)
								    input('Tor Connected. Press ENTER to continue\n')


								# Login using premade account credentials and do login captcha manually

								def login(driver):
								    """Wait for the operator to log in manually, then for the listing page.

								    The function blocks on a console prompt until the operator confirms the
								    login (including any captcha) is complete, then waits up to 50 seconds
								    for the listing element to become visible.

								    Args:
								        driver: Selenium WebDriver showing the forum.
								    """
								    # Automated login is disabled and kept only for reference. Credentials
								    # must be supplied at run time and never committed to the source:
								    #   login_link = driver.find_element(by=By.ID, value='elUserSignIn').get_attribute('href')
								    #   driver.get(login_link)
								    #   driver.find_element(by=By.ID, value='auth').send_keys(<username>)
								    #   driver.find_element(by=By.ID, value='password').send_keys(<password>)

								    input("Press ENTER when log in is completed\n")

								    # wait for listing page show up (This Xpath may need to change based on different seed url)
								    listing_marker = (By.XPATH, '/html/body/main/div/div/div[1]/section/ol/li[8]')
								    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(listing_marker))


								# Returns the name of the website

								def getForumName():
								    """Return the name under which this forum is identified."""
								    return 'CrackingPro'


								# Return the link of the website

								def getFixedURL():
								    """Return the forum's start URL, the first page the driver opens."""
								    return 'https://www.crackingpro.com/'


								# Closes Tor Browser

								def closetor(driver):
								    """Close the driver's current tab and pause for three seconds.

								    Args:
								        driver: Selenium WebDriver whose current tab should be closed.
								    """
								    # Killing the Tor process itself is disabled; only the tab is closed.
								    # global pid
								    # os.system("taskkill /pid " + str(pro.pid))
								    # os.system("taskkill /t /f /im tor.exe")
								    print('Closing Tor...')
								    driver.close()
								    time.sleep(3)


								# Creates FireFox 'driver' and configure its 'Profile'

								# to use Tor proxy and socket

								def createFFDriver():
								    """Build a Firefox WebDriver configured to route traffic through Tor.

								    '../../path.txt' supplies, one per line: the browser binary path, the
								    Firefox profile path and the driver service executable path.

								    Returns:
								        The configured ``webdriver.Firefox`` instance.

								    Raises:
								        IndexError: if the path file has fewer than three lines.
								    """
								    # Context manager so the configuration file handle is not leaked.
								    with open('../../path.txt', 'r') as path_file:
								        lines = path_file.readlines()

								    ff_binary = FirefoxBinary(lines[0].strip())
								    ff_prof = FirefoxProfile(lines[1].strip())

								    # Applied in this order, exactly as originally written.
								    preferences = (
								        ("places.history.enabled", False),
								        ("privacy.clearOnShutdown.offlineApps", True),
								        ("privacy.clearOnShutdown.passwords", True),
								        ("privacy.clearOnShutdown.siteSettings", True),
								        ("privacy.sanitize.sanitizeOnShutdown", True),
								        ("signon.rememberSignons", False),
								        ("network.cookie.lifetimePolicy", 2),
								        ("network.dns.disablePrefetch", True),
								        ("network.http.sendRefererHeader", 0),
								        ("permissions.default.image", 3),
								        ("browser.download.folderList", 2),
								        ("browser.download.manager.showWhenStarting", False),
								        ("browser.helperApps.neverAsk.saveToDisk", "text/plain"),
								        # SOCKS5 proxy on 127.0.0.1:9150 with DNS resolved through the proxy
								        ('network.proxy.type', 1),
								        ("network.proxy.socks_version", 5),
								        ('network.proxy.socks', '127.0.0.1'),
								        ('network.proxy.socks_port', 9150),
								        ('network.proxy.socks_remote_dns', True),
								        ("javascript.enabled", True),
								    )
								    for pref_name, pref_value in preferences:
								        ff_prof.set_preference(pref_name, pref_value)
								    ff_prof.update_preferences()

								    service = Service(lines[2].strip())

								    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

								    return driver


								def getAccess():
								    """Create a driver and open the forum's start page.

								    Returns:
								        The driver on success, or the string 'down' when the page could not
								        be loaded (the driver's tab is closed first).
								    """
								    url = getFixedURL()
								    driver = createFFDriver()
								    try:
								        driver.get(url)  # open given url
								        return driver
								    except Exception:
								        # Exception rather than a bare except, so Ctrl-C and SystemExit
								        # are not silently reported as the site being down.
								        driver.close()  # close the current tab
								        return 'down'


								# Saves the crawled html page

								def savePage(page, url):
								    """Clean a crawled HTML page and write it to disk as UTF-8.

								    Args:
								        page: HTML source of the page.
								        url: link the page was loaded from; getFullPathName() derives the
								            destination file from it.
								    """
								    cleanPage = cleanHTML(page)
								    filePath = getFullPathName(url)
								    os.makedirs(os.path.dirname(filePath), exist_ok=True)
								    # Context manager so the data is flushed and the handle closed
								    # deterministically instead of at garbage collection.
								    with open(filePath, 'wb') as out_file:
								        out_file.write(cleanPage.encode('utf-8'))


								# Gets the full path of the page to be saved along with its appropriate file name

								def getFullPathName(url):
								    """Return the file path under which the page for *url* is saved.

								    The path is made of the CrackingPro HTML_Pages folder, a folder named
								    after today's date as MMDDYYYY, a 'Description' or 'Listing' folder
								    chosen by isDescriptionLink(), and the file name from getNameFromURL()
								    with an '.html' extension.

								    Args:
								        url: link of the page being saved.
								    """
								    fileName = getNameFromURL(url)
								    # Read the date once so month, day and year cannot disagree if the
								    # call straddles midnight.
								    today = date.today()
								    dateFolder = "%02d%02d%04d" % (today.month, today.day, today.year)
								    pageFolder = r'Description\\' if isDescriptionLink(url) else r'Listing\\'
								    return r'..\CrackingPro\HTML_Pages\\' + dateFolder + r'\\' + pageFolder + fileName + '.html'


								# Creates the file name from passed URL

								def getNameFromURL(url):
								    """Derive a file name from *url* by keeping only its alphanumeric characters.

								    When nothing is left, the current value of the module-level ``counter``
								    is used as the name and the counter is incremented.
								    """
								    global counter
								    alnum_only = ''.join(filter(str.isalnum, url))
								    if alnum_only != '':
								        return alnum_only
								    fallback = str(counter)
								    counter += 1
								    return fallback


								def getInterestedLinks():
								    """Return the list of board URLs the crawler should visit."""
								    # Only the "exploiting tutorials" board is enabled. The disabled
								    # entries that used to follow pointed at other forums' .onion hosts
								    # and were never appended.
								    return [
								        # exploiting tutorials
								        'https://www.crackingpro.com/forum/38-exploiting-tutorials/',
								    ]


								def crawlForum(driver):
								    """Crawl every board returned by getInterestedLinks() and save the pages.

								    For each board the listing page is saved, the topics reported by
								    topicPages() are opened and saved (following each topic's "next"
								    pagination link), and then the listing's own "next" link is followed
								    until none is found. An error while crawling a board is printed and the
								    crawl continues with the next board. When all boards are done the
								    function waits for the operator to press ENTER.

								    Args:
								        driver: Selenium WebDriver, as returned by getAccess().
								    """
								    print("Crawling the CrackingPro forum")

								    linksToCrawl = getInterestedLinks()

								    # A plain for-loop: the original index variable was overwritten by the
								    # inner "go back" loop, which corrupted iteration over several boards.
								    for link in linksToCrawl:
								        print('Crawling :', link)
								        try:
								            try:
								                driver.get(link)
								            except Exception:
								                driver.refresh()
								            html = driver.page_source
								            savePage(html, link)

								            has_next_page = True
								            while has_next_page:
								                topic_links = topicPages(html)  # for multiple pages
								                for item in topic_links:
								                    # variable to check if there is a next page for the topic
								                    has_next_topic_page = True
								                    # number of topic pages loaded, i.e. how many times to go back
								                    back_counter = 1

								                    while has_next_topic_page:
								                        itemURL = urlparse.urljoin(baseURL, str(item))
								                        try:
								                            driver.get(itemURL)
								                        except Exception:
								                            driver.refresh()
								                        savePage(driver.page_source, item)

								                        # next page in the topic?
								                        try:
								                            comments = driver.find_element(by=By.ID, value='comments')
								                            # find_element (singular): find_elements returns a list,
								                            # which has no find_element method to chain on.
								                            pager_area = comments.find_element(
								                                by=By.XPATH, value='/html/body/main/div/div/div/div[4]/div[1]')
								                            pagination = pager_area.find_element(by=By.CLASS_NAME, value='ipsPagination')
								                            next_item = pagination.find_element(
								                                by=By.CLASS_NAME, value='ipsPagination_next').get_attribute('href')

								                            # get_attribute returns None when the element has no href
								                            if not next_item:
								                                raise NoSuchElementException
								                            item = next_item
								                            back_counter += 1
								                        except NoSuchElementException:
								                            has_next_topic_page = False

								                    # return to the listing page the topic was opened from
								                    for _ in range(back_counter):
								                        driver.back()

								                    # Limiter kept from the original (marked "comment out"):
								                    # only the first topic of each listing page is crawled.
								                    # Remove this break to crawl every topic.
								                    break

								                try:  # change depending on web page, #next page
								                    pager_area = driver.find_element(
								                        by=By.XPATH, value='/html/body/main/div/div/div/div[4]/div/div[1]/div')
								                    pagination = pager_area.find_element(by=By.CLASS_NAME, value='ipsPagination')
								                    next_link = pagination.find_element(
								                        by=By.CLASS_NAME, value='ipsPagination_next').get_attribute('href')

								                    if not next_link:
								                        raise NoSuchElementException
								                    link = next_link
								                    try:
								                        driver.get(link)
								                    except Exception:
								                        driver.refresh()
								                    html = driver.page_source
								                    savePage(html, link)

								                except NoSuchElementException:
								                    has_next_page = False

								        except Exception as e:
								            print(link, e)

								    input("Crawling CrackingPro forum done sucessfully. Press ENTER to continue\n")


								# Returns 'True' if the link is Topic link

								def isDescriptionLink(url):
								    """Return True when *url* contains 'topic', i.e. it is a topic link."""
								    return 'topic' in url


								# Returns True if the link is a listingPage link

								def isListingLink(url):
								    """Return True when *url* contains 'forum', i.e. it is a listing-page link."""
								    return 'forum' in url


								# calling the parser to define the links

								def topicPages(html):
								    """Parse a listing page and return what crackingPro_links_parser extracts.

								    Args:
								        html: HTML source of a listing page.
								    """
								    parsed_page = BeautifulSoup(html, "html.parser")
								    topic_links = crackingPro_links_parser(parsed_page)
								    return topic_links


								def crawler():
								    """Entry point: run a full CrackingPro crawl via startCrawling()."""
								    startCrawling()