# khangtran/dark_web_forums

__author__ = 'Helium'
'''
Kingdom Market Crawler (Selenium)'''

import base64
import os
import re
import subprocess
import time
import urllib.parse as urlparse
from datetime import date
from io import BytesIO

from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.Kingdom.parser import kingdom_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
# Fallback sequence number: getNameFromURL uses (and increments) it when a URL
# contains no alphanumeric characters to build a file name from.
counter = 1
# Root address of the Kingdom market; relative product links are joined onto it
# in crawlForum and it is passed to new_parse in startCrawling.
baseURL = 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion'

# Opens Tor Browser, crawls the website
def startCrawling():
    """Open the market, log in, crawl it, then call new_parse().

    When getAccess() returns the sentinel 'down', login and crawling are
    skipped and only new_parse() is called. Exceptions raised by login() or
    crawlForum() are printed together with the current URL instead of being
    propagated.
    """
    mktName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        finally:
            # In a finally clause so the browser is still closed when the
            # handler above fails itself (e.g. current_url on a dead browser)
            # or when a non-Exception such as KeyboardInterrupt is raised.
            closeDriver(driver)

    new_parse(mktName, baseURL, True)

# Login using premade account credentials and do login captcha manually
def login(driver):
    """Fill in the login form, pausing twice for the operator.

    The function blocks on two console prompts (input()): once before the
    login form is expected to be visible, and once after the form fields
    have been filled. It then waits for an element of the landing page.

    driver: the Selenium driver returned by getAccess().
    """
    input("Press ENTER when CAPTCHA is completed\n")

    # wait (up to 100 s) for the login form to become visible
    login_form = (By.XPATH, '//*[@id="login-form"]')
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(login_form))

    # NOTE(review): account credentials are hard-coded in source.
    # Username
    driver.find_element(
        by=By.XPATH, value='//*[@id="loginformwithcaptcha-name"]'
    ).send_keys('blabri')
    # Password
    driver.find_element(
        by=By.XPATH, value='//*[@id="loginformwithcaptcha-passwd"]'
    ).send_keys('fishowal')

    # choose the '24 hours' option of the session-time drop-down
    session_dropdown = driver.find_element(
        by=By.XPATH, value='//*[@id="loginformwithcaptcha-sessiontime"]'
    )
    Select(session_dropdown).select_by_visible_text('24 hours')

    input("Press ENTER when CAPTCHA and DDOS is completed\n")

    # wait (up to 50 s) for the listing page to show up
    # (this XPath may need to change for a different seed url)
    listing_area = (By.XPATH, '/html/body/div/div/div[3]/div[1]/div/div[3]')
    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(listing_area))

# Returns the name of the website
def getMKTName():
    """Return the market name, 'Kingdom'."""
    return 'Kingdom'

# Return the link of the website
def getFixedURL():
    """Return the URL that getAccess() opens first."""
    return 'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion'

# Closes Tor Browser
def closeDriver(driver):
    """Print a notice, close the driver's window and pause for three seconds.

    driver: the Selenium driver to close.
    """
    print('Closing Tor...')
    driver.close()
    time.sleep(3)

# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
    """Build and return a maximized Firefox driver configured for the Tor proxy.

    The Firefox binary, the profile directory and the geckodriver path are
    read from the 'TOR' section of the project config. The profile is pointed
    at a SOCKS5 proxy on 127.0.0.1:9150 with remote DNS, and JavaScript is
    switched off.
    """
    from MarketPlaces.Initialization.markets_mining import config

    binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
    profile = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))

    # Applied in this exact order.
    preferences = (
        ("places.history.enabled", False),
        ("privacy.clearOnShutdown.offlineApps", True),
        ("privacy.clearOnShutdown.passwords", True),
        ("privacy.clearOnShutdown.siteSettings", True),
        ("privacy.sanitize.sanitizeOnShutdown", True),
        ("signon.rememberSignons", False),
        ("network.cookie.lifetimePolicy", 2),
        # ("network.dns.disablePrefetch", True),
        # ("network.http.sendRefererHeader", 0),
        ("permissions.default.image", 3),
        ("browser.download.folderList", 2),
        ("browser.download.manager.showWhenStarting", False),
        ("browser.helperApps.neverAsk.saveToDisk", "text/plain"),
        ('network.proxy.type', 1),
        ("network.proxy.socks_version", 5),
        ('network.proxy.socks', '127.0.0.1'),
        ('network.proxy.socks_port', 9150),
        ('network.proxy.socks_remote_dns', True),
        ("javascript.enabled", False),
    )
    for pref_name, pref_value in preferences:
        profile.set_preference(pref_name, pref_value)
    profile.update_preferences()

    gecko_service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=binary, firefox_profile=profile, service=gecko_service)
    driver.maximize_window()

    return driver

# Opens the website in a new driver, or reports it as down
def getAccess():
    """Create a driver and load the market's URL.

    Returns the driver when the page loads. If loading raises, the driver is
    closed and the string 'down' is returned instead.
    """
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
    except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt / SystemExit
        # are not swallowed and misreported as the site being down.
        driver.close()
        return 'down'
    return driver

# Saves the crawled html page
def savePage(driver, page, url):
    """Run the page through cleanHTML and write the result to disk as UTF-8.

    driver: passed through to cleanHTML.
    page: HTML source of the page.
    url: link the page came from; getFullPathName maps it to the target file.
    """
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    # The context manager closes the handle as soon as the write finishes;
    # previously the file object was never closed explicitly.
    with open(filePath, 'wb') as pageFile:
        pageFile.write(cleanPage.encode('utf-8'))

# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    """Return the file path under which the page for url is stored.

    The path is <shared_folder>/MarketPlaces/<market>/HTML_Pages/<CURRENT_DATE>/
    followed by 'Description' or 'Listing' (chosen by isDescriptionLink) and
    '<name>.html', where <name> comes from getNameFromURL.
    """
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    subFolder = 'Description' if isDescriptionLink(url) else 'Listing'
    # Separate join components instead of concatenating r'\\Description\\':
    # the doubled backslashes only act as directory separators on Windows.
    return os.path.join(mainDir, CURRENT_DATE, subFolder, fileName + '.html')

# Creates the file name from passed URL
def getNameFromURL(url):
    """Return url reduced to its alphanumeric characters.

    If nothing is left, the current value of the module-level counter is
    returned as a string and the counter is incremented.
    """
    global counter
    name = ''.join(filter(str.isalnum, url))
    if not name:
        name = str(counter)
        counter += 1
    return name

# Returns the category listing pages to crawl
def getInterestedLinks():
    """Return the list of category listing URLs that crawlForum starts from."""
    return [
        # Software and Malware
        'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127',
        # Services
        'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45',
        # Guides and tutorials
        'http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=107',
    ]

# Crawls every listing page of the interested categories and the products on them
def crawlForum(driver):
    """Save each category's listing pages and every product page linked from them.

    For each URL from getInterestedLinks(): load the listing page, save it,
    visit and save every link returned by productPages(), then follow the
    "»" anchor to the next listing page until there is none. An exception
    while crawling one category is printed and the next category is tried.

    driver: logged-in Selenium driver.
    """
    print("Crawling the Kingdom market")

    for link in getInterestedLinks():
        print('Crawling :', link)
        try:
            while True:
                try:
                    driver.get(link)
                except Exception:
                    # Best effort: retry the load with a refresh.
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                for item in productPages(html):
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except Exception:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    driver.back()

                # Look for the pagination anchor on the current page.
                try:
                    nextLink = driver.find_element(
                        by=By.XPATH, value='//a[contains(text(), "»")]'
                    ).get_attribute('href')
                except NoSuchElementException:
                    nextLink = None

                # A missing anchor, a missing href (None) and an empty href
                # all mean there is no further page.
                if not nextLink:
                    break
                link = nextLink
        except Exception as e:
            print(link, e)

    print("Crawling the Kingdom market done.")

# Returns True if the link is a description page link (contains 'view')
def isDescriptionLink(url):
    """Return True when 'view' occurs in url, otherwise False."""
    return 'view' in url

# Returns True if the link is a listingPage link (contains 'filter_category')
def isListingLink(url):
    """Return True when 'filter_category' occurs in url, otherwise False."""
    return 'filter_category' in url

# calling the parser to define the links
def productPages(html):
    """Parse html with BeautifulSoup and return what kingdom_links_parser yields for it.

    html: page source of a listing page.
    """
    return kingdom_links_parser(BeautifulSoup(html, "html.parser"))

# Entry point of this crawler module
def crawler():
    """Run the whole crawl by delegating to startCrawling()."""
    startCrawling()