From 19e8d4aa7967caeba5dbfa785269a72f642e68b0 Mon Sep 17 00:00:00 2001
From: westernmeadow <43891839+westernmeadow@users.noreply.github.com>
Date: Wed, 7 Jun 2023 12:33:54 -0700
Subject: [PATCH] added to gitignore

---
 MarketPlaces/DarkFox/crawler_seleniumtest.py | 192 -------------------
 1 file changed, 192 deletions(-)
 delete mode 100644 MarketPlaces/DarkFox/crawler_seleniumtest.py

diff --git a/MarketPlaces/DarkFox/crawler_seleniumtest.py b/MarketPlaces/DarkFox/crawler_seleniumtest.py
deleted file mode 100644
index e183b53..0000000
--- a/MarketPlaces/DarkFox/crawler_seleniumtest.py
+++ /dev/null
@@ -1,192 +0,0 @@
-from selenium import webdriver
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
-from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
-from selenium.webdriver.firefox.service import Service
-import os
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.common.by import By
-from selenium.webdriver.firefox.options import Options
-from PIL import Image
-
-import codecs
-import time
-from datetime import date
-import urllib.parse as urlparse
-import os
-from bs4 import BeautifulSoup
-from MarketPlaces.DarkFox.parser import darkfox_links_parser
-
-file = open('../../path.txt', 'r')
-lines = file.readlines()
-
-# torexe = os.popen(lines[0].strip()) # path for tor.exe
-binary = FirefoxBinary(lines[0].strip()) # full path for firefox.exe
-# options = Options()
-profile = FirefoxProfile(lines[1].strip()) # full path for profile.default
-profile.set_preference('network.proxy.type', 1)
-profile.set_preference('network.proxy.socks', '127.0.0.1')
-profile.set_preference('network.proxy.socks_port', 9150)
-profile.set_preference("network.proxy.socks_remote_dns", True)
-profile.update_preferences()
-service = Service(lines[2].strip()) # full path for geckodriver.exe
-driver = webdriver.Firefox(firefox_binary=binary, firefox_profile=profile,
-                           service=service)
-
-
-# Manual captcha solver
-def captcha(driver):
-    # wait for captcha page show up
-    WebDriverWait(driver, 100).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div/div/form/button[1]")))
-
-    # save captcha to local
-    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div[1]/div[1]").screenshot("captcha.png")
-
-    # open method used to open different extension image file
-    im = Image.open(r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\captcha.png')
-
-    # This method will show image in any image viewer
-    im.show()
-
-    # wait until input space show up
-    inputBox = driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div[1]/div[2]/input")
-
-    # ask user input captha solution in terminal
-    userIn = input("Enter solution: ")
-
-    # send user solution into the input space
-    inputBox.send_keys(userIn)
-
-    # click the verify(submit) button
-    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()
-
-    # wait for listing page show up (This Xpath may need to change based on different seed url)
-    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
-        (By.XPATH, "/html/body/main/div/div/div[2]/div[1]/div[1]/form/div[1]/h1")))
-
-
-# Saves the crawled html page
-def savePage(page, url):
-    filePath = getFullPathName(url)
-    os.makedirs(os.path.dirname(filePath), exist_ok=True)
-    open(filePath, 'wb').write(page)
-    return
-
-
-# Gets the full path of the page to be saved along with its appropriate file name
-def getFullPathName(url):
-    fileName = getNameFromURL(url)
-    if isDescriptionLink(url):
-        fullPath = r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\HTML_Pages\\' + str(
-            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
-            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
-    else:
-        fullPath = r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\HTML_Pages\\' + str(
-            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
-            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
-    return fullPath
-
-
-# Creates the name of the file based on URL
-def getNameFromURL(url):
-    global counter
-    name = ''.join(e for e in url if e.isalnum())
-    if (name == ''):
-        name = str(counter)
-        counter = counter + 1
-    return name
-
-
-def getInterestedLinks():
-    links = []
-
-    # Guides and Tutorials
-    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
-    # Digital Products
-    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
-    # Software and Malware
-    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
-    # Services
-    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
-    # Miscellaneous
-    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
-    # Hosting and Security
-    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')
-
-    return links
-
-
-def isDescriptionLink(url):
-    if 'product' in url:
-        return True
-    return False
-
-
-def isListingLink(url):
-    if 'category' in url:
-        return True
-    return False
-
-
-def productPages(html):
-    soup = BeautifulSoup(html, "html.parser")
-    return darkfox_links_parser(soup)
-
-
-def isSignOut(url):
-    #absURL = urlparse.urljoin(url.base_url, url.url)
-    if 'signout' in url.lower() or 'logout' in url.lower():
-        return True
-
-    return False
-
-
-# dark fox seed url
-baseurl = 'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion'
-driver.get(baseurl)
-captcha(driver)
-
-# visited = set()
-# visited.add(br.geturl())
-linksToCrawl = getInterestedLinks()
-initialTime = time.time()
-
-i = 0
-while i < len(linksToCrawl):
-    link = linksToCrawl[i]
-    print('Crawling :', link)
-    try:
-        driver.get(link)
-        html = driver.page_source.encode('utf-8', link)
-        savePage(html, link)
-        '''
-        has_next_page = True
-        while has_next_page:
-            j = 0
-            list = productPages(html)
-            for item in list:
-                if j == 1:
-                    break
-                itemURL = str(item)
-                driver.get(itemURL)
-                savePage(driver.page_source.encode('utf-8'), item)
-                driver.back()
-                j += 1
-
-            try:
-                link = driver.find_element(by=By.XPATH, value=
-                    '/html/body/main/div/div[2]/div/div[2]/div/div/div/nav/a[2]').get_attribute('href')
-                driver.get(link)
-                html = driver.page_source.encode('utf-8', link)
-                savePage(html, link)
-            except NoSuchElementException:
-                has_next_page = False
-        '''
-    except Exception as e:
-        print(link, e.message)
-    i += 1
-
-# finalTime = time.time()
-# print finalTime - initialTime
-
-input("Crawling DarkFox marketplace done sucessfully. Press ENTER to continue\n")
\ No newline at end of file
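
The deleted script drives Firefox through Tor by passing `firefox_binary=` and `firefox_profile=` keyword arguments built from paths read out of path.txt. Those keyword arguments are deprecated in Selenium 4, which expects the binary, profile, and proxy preferences to be carried on an Options object instead. A minimal sketch of the equivalent setup, assuming Selenium 4 and the same three-line path.txt layout (firefox.exe, profile.default, geckodriver.exe) as the deleted file:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

# Read the three local paths the deleted script kept in path.txt.
with open('../../path.txt', 'r') as f:
    lines = f.readlines()

options = Options()
options.binary_location = lines[0].strip()  # full path for firefox.exe
options.profile = lines[1].strip()          # full path for profile.default; Selenium wraps it in a FirefoxProfile
# Route all traffic through the local Tor SOCKS proxy, as the deleted script did.
options.set_preference('network.proxy.type', 1)
options.set_preference('network.proxy.socks', '127.0.0.1')
options.set_preference('network.proxy.socks_port', 9150)
options.set_preference('network.proxy.socks_remote_dns', True)

service = Service(lines[2].strip())         # full path for geckodriver.exe
driver = webdriver.Firefox(service=service, options=options)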
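
Two lines in the deleted crawl loop are worth flagging if the file is ever restored: `driver.page_source.encode('utf-8', link)` passes the URL as `str.encode`'s second positional parameter, which is the errors-handler name rather than extra data, and `e.message` does not exist on Python 3 exceptions, so the `except` block would itself raise `AttributeError`. A corrected sketch of the loop body, reusing the deleted file's own `savePage` and `getInterestedLinks`:

linksToCrawl = getInterestedLinks()

i = 0
while i < len(linksToCrawl):
    link = linksToCrawl[i]
    print('Crawling :', link)
    try:
        driver.get(link)
        # str.encode takes (encoding, errors); the URL does not belong here.
        html = driver.page_source.encode('utf-8')
        savePage(html, link)
    except Exception as e:
        print(link, e)  # Python 3: print the exception itself, not e.message
    i += 1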
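
`savePage` also opens its output file without ever closing it, leaking one file handle per saved page; a context manager is the idiomatic fix. A sketch with the same signature, relying on the deleted file's `getFullPathName`:

import os

# Saves the crawled html page, closing the handle promptly.
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as f:
        f.write(page)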
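
Finally, `getNameFromURL` declares `global counter`, but the deleted module never defines a module-level `counter`, so a URL with no alphanumeric characters would raise `NameError`. A guarded sketch (the initial value of 1 is an assumption, not from the deleted file):

counter = 1  # module-level fallback; missing from the deleted script

def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter += 1
    return name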