
added to gitignore

main
westernmeadow committed 1 year ago
commit 19e8d4aa79
1 changed file with 0 additions and 192 deletions

+0 −192  MarketPlaces/DarkFox/crawler_seleniumtest.py

@@ -1,192 +0,0 @@
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from PIL import Image
import codecs
import time
from datetime import date
import urllib.parse as urlparse
from bs4 import BeautifulSoup
from MarketPlaces.DarkFox.parser import darkfox_links_parser

with open('../../path.txt', 'r') as file:
    lines = file.readlines()
# torexe = os.popen(lines[0].strip()) # path for tor.exe
binary = FirefoxBinary(lines[0].strip()) # full path for firefox.exe
# options = Options()
profile = FirefoxProfile(lines[1].strip()) # full path for profile.default
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.socks', '127.0.0.1')
profile.set_preference('network.proxy.socks_port', 9150)
profile.set_preference("network.proxy.socks_remote_dns", True)
profile.update_preferences()
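
# Note on the proxy settings above: 9150 is the SOCKS port exposed by the Tor
# Browser bundle (a standalone tor daemon listens on 9050 by default), and
# socks_remote_dns makes Firefox resolve the .onion hostname through the Tor
# proxy itself rather than leaking the lookup to the local DNS resolver.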
service = Service(lines[2].strip()) # full path for geckodriver.exe

driver = webdriver.Firefox(firefox_binary=binary, firefox_profile=profile,
                           service=service)
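# Note: the firefox_binary/firefox_profile keyword arguments are the older
# Selenium style of configuring Firefox; Selenium 4 deprecates them in favor
# of an Options object, so this call assumes a release that still accepts them.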


# Manual captcha solver
def captcha(driver):
    # wait for the captcha page to show up
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div/form/button[1]")))

    # save the captcha image locally
    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div[1]/div[1]").screenshot("captcha.png")

    # open the saved image and display it in the default image viewer
    im = Image.open(r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\captcha.png')
    im.show()

    # locate the captcha input box
    inputBox = driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div[1]/div[2]/input")

    # ask the user to type the captcha solution in the terminal
    userIn = input("Enter solution: ")

    # send the user's solution to the input box
    inputBox.send_keys(userIn)

    # click the verify (submit) button
    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()

    # wait for the listing page to show up (this XPath may need to change for a different seed url)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/main/div/div/div[2]/div[1]/div[1]/form/div[1]/h1")))


# Saves the crawled html page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as f:
        f.write(page)


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    today = date.today().strftime('%m%d%Y')
    baseDir = r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\HTML_Pages'
    subDir = 'Description' if isDescriptionLink(url) else 'Listing'
    return os.path.join(baseDir, today, subDir, fileName + '.html')


counter = 0  # fallback name for URLs that contain no alphanumeric characters


# Creates the name of the file based on URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
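# For example, a (hypothetical) URL like 'http://example.onion/category/abc-123'
# is reduced to 'httpexampleonioncategoryabc123', giving every page a stable,
# filesystem-safe file name.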


def getInterestedLinks():
    links = []

    # Guides and Tutorials
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
    # Digital Products
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
    # Software and Malware
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
    # Services
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
    # Miscellaneous
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
    # Hosting and Security
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')

    return links


# A description (product) page URL contains 'product'
def isDescriptionLink(url):
    if 'product' in url:
        return True
    return False


# A listing (category) page URL contains 'category'
def isListingLink(url):
    if 'category' in url:
        return True
    return False


# Extracts the product links from a listing page via the DarkFox parser
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return darkfox_links_parser(soup)


def isSignOut(url):
    # absURL = urlparse.urljoin(url.base_url, url.url)
    if 'signout' in url.lower() or 'logout' in url.lower():
        return True
    return False
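

# Main crawl: open the seed URL, let a human solve the captcha, then fetch each
# category listing page and save its raw HTML under today's date folder.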

# dark fox seed url
baseurl = 'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion'
driver.get(baseurl)
captcha(driver)

# visited = set()
# visited.add(br.geturl())

linksToCrawl = getInterestedLinks()
initialTime = time.time()

i = 0
while i < len(linksToCrawl):
    link = linksToCrawl[i]
    print('Crawling :', link)
    try:
        driver.get(link)
        html = driver.page_source.encode('utf-8')
        savePage(html, link)
        '''
        has_next_page = True
        while has_next_page:
            j = 0
            productList = productPages(html)
            for item in productList:
                if j == 1:
                    break
                itemURL = str(item)
                driver.get(itemURL)
                savePage(driver.page_source.encode('utf-8'), item)
                driver.back()
                j += 1
            try:
                link = driver.find_element(by=By.XPATH,
                    value='/html/body/main/div/div[2]/div/div[2]/div/div/div/nav/a[2]').get_attribute('href')
                driver.get(link)
                html = driver.page_source.encode('utf-8')
                savePage(html, link)
            except NoSuchElementException:
                has_next_page = False
        '''
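        # The quoted block above is a disabled draft of the per-product crawl and
        # 'next page' pagination; while it stays commented out, only the first
        # listing page of each category is saved.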
    except Exception as e:
        print(link, e)

    i += 1

# finalTime = time.time()
# print(finalTime - initialTime)

input("Crawling DarkFox marketplace done successfully. Press ENTER to continue\n")
