# Repository: khangtran/dark_web_forums

__author__ = 'DarkWeb'
'''
Kingdom Market Crawler (Selenium)'''

import base64
import os
import re
import subprocess
import time
import urllib.parse as urlparse
from datetime import date
from io import BytesIO

from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.Kingdom.parser import kingdom_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
# Running number that getNameFromURL() falls back to when a URL contains no
# alphanumeric characters; it is incremented each time it is used.
counter = 1
# Root address of the market; crawlForum() joins product links against it.
baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion'

# Opens Tor Browser, crawls the website
def startCrawling():
    """Open the browser, pass the CAPTCHA and login pages, then crawl.

    The driver comes from ``getAccess()``; when that returns the string
    ``'down'`` nothing is crawled. Any ``Exception`` raised while solving the
    CAPTCHA, logging in or crawling is printed together with the current URL
    instead of being propagated.

    The browser is closed in a ``finally`` clause so it is released even when
    the error handler itself fails or the run is interrupted (for example
    Ctrl-C while waiting at one of the ``input()`` prompts).
    """
    # marketName = getMarketName()
    driver = getAccess()

    if driver == 'down':
        return

    try:
        captcha(driver)
        login(driver)
        crawlForum(driver)
    except Exception as e:
        print(driver.current_url, e)
    finally:
        closeDriver(driver)

    # new_parse(marketName, False)

def captcha(driver):
    """Pause until the operator has solved the entry CAPTCHA by hand.

    The function blocks on a terminal prompt and, once ENTER is pressed,
    waits up to 100 seconds for the login form's submit button to become
    visible.

    :param driver: Selenium WebDriver positioned on the CAPTCHA page.
    :raises selenium.common.exceptions.TimeoutException: if the login button
        does not become visible within the wait period.
    """
    # Disabled semi-automatic flow, kept for reference only. It used to live
    # in a string literal at the top of the function, which made it the
    # function's docstring.
    #
    # # wait for captcha page
    # WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
    #     (By.XPATH, "/html/body/div/div[1]")))
    #
    # # save captcha to local
    # driver.find_element(by=By.XPATH, value='/html/body/div/div[2]').screenshot(
    #     r'..\Kingdom\captcha1.png')
    #
    # # This method will show image in any image viewer
    # im = Image.open(r'..\Kingdom\captcha1.png')
    # im.show()
    #
    # iframes = driver.find_elements(by=By.TAG_NAME, value='iframe')
    #
    # # ask user input captcha solution in terminal
    # print("Enter squares from smallest to largest (squares are numbered 1-9 left to right)")
    # for order in ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']:
    #     id = input(f"{order}: ")
    #     iframes[int(id)-1].click()

    input("Press ENTER when CAPTCHA is completed\n")

    # wait for login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button")))

# Login using premade account credentials and do login captcha manually
def login(driver):
    """Fill in the login form and wait for the operator to finish the CAPTCHA.

    Steps, in order: wait (up to 100 s) for the submit button, type the
    username and password, choose the '24 hours' session length, prompt the
    operator to solve the login CAPTCHA, then wait (up to 50 s) for the
    listing container to become visible.

    :param driver: Selenium WebDriver positioned on the login page.
    """
    # NOTE(review): the account credentials are hard-coded below.
    submit_button = (By.XPATH, "/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button")
    listing_area = (By.XPATH, '/html/body/div/div/div[3]/div[2]')

    # wait for login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(submit_button))

    # entering username and password into input boxes
    form_values = (
        ('//*[@id="loginformwithcaptcha-name"]', 'blabri'),      # Username here
        ('//*[@id="loginformwithcaptcha-passwd"]', 'fishowal'),  # Password here
    )
    for field_xpath, text in form_values:
        field = driver.find_element(by=By.XPATH, value=field_xpath)
        field.send_keys(text)

    session_dropdown = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]')
    Select(session_dropdown).select_by_visible_text('24 hours')

    # Disabled terminal-driven CAPTCHA flow, kept for reference only:
    #
    # # wait for captcha page show up
    # WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
    #     (By.XPATH, '//*[@id="captcha"]')))
    #
    # # save captcha to local
    # driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Kingdom\captcha2.png')
    #
    # # This method will show image in any image viewer
    # im = Image.open(r'..\Kingdom\captcha2.png')
    # im.show()
    #
    # # wait until input space show up
    # inputBox = driver.find_element(by=By.XPATH, value='//*[@id="loginformwithcaptcha-captcha"]')
    #
    # # ask user input captcha solution in terminal
    # userIn = input("Enter solution: ")
    #
    # # send user solution into the input space
    # inputBox.send_keys(userIn)
    #
    # # click the verify(submit) button
    # driver.find_element(by=By.XPATH, value="/html/body/div/div/div[3]/div[1]/div/div/form/div[3]/div/div[1]/button").click()

    input("Press ENTER when CAPTCHA is completed\n")

    # wait for listing page show up (This Xpath may need to change based on different seed url)
    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(listing_area))

# Returns the name of the website
def getMarketName():
    """Return the marketplace name, ``'Kingdom'``."""
    return 'Kingdom'

# Return the link of the website
def getFixedURL():
    """Return the address that getAccess() opens first."""
    return 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion'

# Closes Tor Browser
def closeDriver(driver):
    """Shut the browser session down and pause briefly.

    ``driver.quit()`` is used rather than ``driver.close()``: ``close()``
    only closes the current window, whereas ``quit()`` ends the WebDriver
    session and stops the driver service process, so nothing is left running
    after the crawl.

    :param driver: Selenium WebDriver to shut down.
    """
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.quit()
    # Pause for three seconds before returning to the caller.
    time.sleep(3)

# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
    """Build a maximised Firefox WebDriver routed through a local SOCKS proxy.

    The Firefox binary, profile directory and geckodriver paths are read from
    the ``TOR`` section of the project configuration. The profile is set to
    use a SOCKS5 proxy at 127.0.0.1:9150 with remote DNS, and JavaScript is
    turned off; the remaining preferences are applied exactly as listed.

    :return: the configured ``webdriver.Firefox`` instance.
    """
    from MarketPlaces.Initialization.markets_mining import config

    binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
    profile = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))

    # Applied top to bottom, in this order.
    preferences = (
        ("places.history.enabled", False),
        ("privacy.clearOnShutdown.offlineApps", True),
        ("privacy.clearOnShutdown.passwords", True),
        ("privacy.clearOnShutdown.siteSettings", True),
        ("privacy.sanitize.sanitizeOnShutdown", True),
        ("signon.rememberSignons", False),
        ("network.cookie.lifetimePolicy", 2),
        ("network.dns.disablePrefetch", True),
        ("network.http.sendRefererHeader", 0),
        ("permissions.default.image", 3),
        ("browser.download.folderList", 2),
        ("browser.download.manager.showWhenStarting", False),
        ("browser.helperApps.neverAsk.saveToDisk", "text/plain"),
        ('network.proxy.type', 1),
        ("network.proxy.socks_version", 5),
        ('network.proxy.socks', '127.0.0.1'),
        ('network.proxy.socks_port', 9150),
        ('network.proxy.socks_remote_dns', True),
        ("javascript.enabled", False),
    )
    for pref_name, pref_value in preferences:
        profile.set_preference(pref_name, pref_value)
    profile.update_preferences()

    gecko_service = Service(config.get('TOR', 'geckodriver_path'))

    browser = webdriver.Firefox(firefox_binary=binary, firefox_profile=profile, service=gecko_service)
    browser.maximize_window()

    return browser

def getAccess():
    """Start the browser and open the market's front page.

    :return: the WebDriver on success, or the string ``'down'`` when loading
        the page raised an error (the browser is shut down first).
    """
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
    except Exception:
        # Only ordinary errors mean "site down"; KeyboardInterrupt and
        # SystemExit are allowed to propagate instead of being swallowed.
        # quit() is used so the driver service process is stopped as well.
        driver.quit()
        return 'down'
    return driver

# Saves the crawled html page
def savePage(driver, page, url):
    """Clean the HTML of a crawled page and write it to disk as UTF-8.

    The destination comes from ``getFullPathName(url)``; missing parent
    directories are created first.

    :param driver: WebDriver, passed through to ``cleanHTML``.
    :param page: HTML source of the page.
    :param url: address (or relative link) the page was fetched from; it
        determines the output file name.
    """
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    # The context manager closes the handle even if the write fails.
    with open(filePath, 'wb') as out_file:
        out_file.write(cleanPage.encode('utf-8'))

# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    """Return the file path under which the page for ``url`` is stored.

    Pages go to ``<shared_folder>/MarketPlaces/Kingdom/HTML_Pages/<date>/``
    followed by ``Description`` when ``isDescriptionLink(url)`` is true and
    ``Listing`` otherwise; the file name is ``getNameFromURL(url)`` plus
    ``.html``.

    :param url: address (or relative link) of the page.
    :return: the full path as a string.
    """
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    # getMarketName() is the helper defined in this module; the previous call
    # to getMKTName() referred to a name that does not exist here.
    mainDir = os.path.join(config.get('Project', 'shared_folder'),
                           "MarketPlaces/" + getMarketName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    subDir = 'Description' if isDescriptionLink(url) else 'Listing'
    # Separate components let os.path.join pick the platform's separator
    # instead of relying on hard-coded backslashes.
    return os.path.join(mainDir, CURRENT_DATE, subDir, fileName + '.html')

# Creates the file name from passed URL
def getNameFromURL(url):
    """Derive a file name from ``url`` by keeping only alphanumeric characters.

    If nothing remains, the module-level ``counter`` is used as the name and
    then incremented.

    :param url: address to convert.
    :return: the file name (without extension).
    """
    global counter
    stripped = ''.join(filter(str.isalnum, url))
    if not stripped:
        stripped = str(counter)
        counter += 1
    return stripped

def getInterestedLinks():
    """Return the list of category listing URLs to crawl.

    Only the 'Software and Malware' category is active; the other entries
    are left commented out.
    """
    return [
        # Software and Malware
        'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=c298a77d9e93ad32',
        # # Services
        # 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=c298a77d9e93ad32',
        #
        # NOTE(review): the entries below point at a different onion host
        # than baseURL.
        # # Exploits
        # 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=45',
        # # Tools
        # 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=46',
        # # Malware
        # 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=47',
        # # Cryptography
        # 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=48',
        # # Others
        # 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=49',
        # # Hacking Tutorials
        # 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=50',
        # # Hacked Accounts and Database Dumps
        # 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=30',
        # # Android Moded pak
        # 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=53',
    ]

def crawlForum(driver):
    """Crawl every seed category: save each listing page and its products.

    For each URL from ``getInterestedLinks()`` the listing page is loaded and
    saved, the product links found by ``productPages()`` are visited and
    saved, and the pager's "next" link is followed until there is none. Any
    ``Exception`` while crawling a category is printed and the crawl moves on
    to the next category. When everything is done the operator is prompted to
    press ENTER.

    The two statements marked ``# comment out`` limit the crawl to the first
    product on each page and to two listing pages per category.

    :param driver: logged-in Selenium WebDriver.
    """
    print("Crawling the Kingdom market")

    linksToCrawl = getInterestedLinks()

    for link in linksToCrawl:
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    driver.get(link)
                except Exception:
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                for item in productPages(html):
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except Exception:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    driver.back()

                    # comment out
                    break

                # comment out
                if count == 1:
                    break

                try:
                    pager = driver.find_element(
                        by=By.XPATH,
                        value='/html/body/div/div/div[3]/div[2]/div[2]/div/div/ul')
                    # Selenium 4 locator API; the find_element_by_* helpers
                    # no longer exist there.
                    next_item = pager.find_element(by=By.CLASS_NAME, value="next")
                    # The anchor is looked up on the "next" element; the URL
                    # string in `link` has no find_element method.
                    link = next_item.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
                    # get_attribute returns None when the attribute is
                    # absent, so test for any empty value.
                    if not link:
                        raise NoSuchElementException
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)

    input("Crawling Kingdom Market done sucessfully. Press ENTER to continue\n")

# Returns 'True' if the link is a description (product) link
def isDescriptionLink(url):
    """Return True when ``url`` contains the substring ``'view'``."""
    return 'view' in url

# Returns True if the link is a listingPage link
def isListingLink(url):
    """Return True when ``url`` contains the substring ``'category'``."""
    return 'category' in url

# calling the parser to define the links
def productPages(html):
    """Parse ``html`` and return what ``kingdom_links_parser`` extracts from it.

    :param html: HTML source of a listing page.
    :return: the result of ``kingdom_links_parser``; crawlForum() iterates
        over it and treats each element as a product link.
    """
    parsed_page = BeautifulSoup(html, "html.parser")
    product_links = kingdom_links_parser(parsed_page)
    return product_links

def crawler():
    """Entry point for the crawler: delegates to ``startCrawling()``."""
    startCrawling()