
added to gitignore

main
westernmeadow committed 1 year ago
commit 19e8d4aa79
1 changed file with 0 additions and 192 deletions

+0 −192  MarketPlaces/DarkFox/crawler_seleniumtest.py

@@ -1,192 +0,0 @@
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from PIL import Image
import codecs
import time
from datetime import date
import urllib.parse as urlparse
from bs4 import BeautifulSoup
from MarketPlaces.DarkFox.parser import darkfox_links_parser

with open('../../path.txt', 'r') as file:
    lines = file.readlines()
# torexe = os.popen(lines[0].strip()) # path for tor.exe
binary = FirefoxBinary(lines[0].strip()) # full path for firefox.exe
# options = Options()
profile = FirefoxProfile(lines[1].strip()) # full path for profile.default
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.socks', '127.0.0.1')
profile.set_preference('network.proxy.socks_port', 9150)
profile.set_preference("network.proxy.socks_remote_dns", True)
profile.update_preferences()
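
# Note on the proxy settings above: 9150 is the SOCKS port exposed by the Tor
# Browser bundle (a standalone tor daemon listens on 9050 by default), and
# socks_remote_dns makes Firefox resolve the .onion hostname through the Tor
# proxy itself rather than leaking the lookup to the local DNS resolver.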
service = Service(lines[2].strip()) # full path for geckodriver.exe

driver = webdriver.Firefox(firefox_binary=binary, firefox_profile=profile,
                           service=service)
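# Note: the firefox_binary/firefox_profile keyword arguments are the older
# Selenium style of configuring Firefox; Selenium 4 deprecates them in favor
# of an Options object, so this call assumes a release that still accepts them.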


# Manual captcha solver
def captcha(driver):
    # wait for the captcha page to show up
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div/form/button[1]")))

    # save the captcha image locally
    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div[1]/div[1]").screenshot("captcha.png")

    # open the saved image and display it in the default image viewer
    im = Image.open(r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\captcha.png')
    im.show()

    # locate the captcha input box
    inputBox = driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div[1]/div[2]/input")

    # ask the user to type the captcha solution in the terminal
    userIn = input("Enter solution: ")

    # send the user's solution to the input box
    inputBox.send_keys(userIn)

    # click the verify (submit) button
    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()

    # wait for the listing page to show up (this XPath may need to change for a different seed url)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/main/div/div/div[2]/div[1]/div[1]/form/div[1]/h1")))


# Saves the crawled html page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as f:
        f.write(page)


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    today = date.today().strftime('%m%d%Y')
    baseDir = r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\HTML_Pages'
    subDir = 'Description' if isDescriptionLink(url) else 'Listing'
    return os.path.join(baseDir, today, subDir, fileName + '.html')


counter = 0  # fallback name for URLs that contain no alphanumeric characters


# Creates the name of the file based on URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
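# For example, a (hypothetical) URL like 'http://example.onion/category/abc-123'
# is reduced to 'httpexampleonioncategoryabc123', giving every page a stable,
# filesystem-safe file name.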


def getInterestedLinks():
    links = []

    # Guides and Tutorials
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
    # Digital Products
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
    # Software and Malware
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
    # Services
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
    # Miscellaneous
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
    # Hosting and Security
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')

    return links


# A description (product) page URL contains 'product'
def isDescriptionLink(url):
    if 'product' in url:
        return True
    return False


# A listing (category) page URL contains 'category'
def isListingLink(url):
    if 'category' in url:
        return True
    return False


# Extracts the product links from a listing page via the DarkFox parser
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return darkfox_links_parser(soup)


def isSignOut(url):
    # absURL = urlparse.urljoin(url.base_url, url.url)
    if 'signout' in url.lower() or 'logout' in url.lower():
        return True
    return False
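

# Main crawl: open the seed URL, let a human solve the captcha, then fetch each
# category listing page and save its raw HTML under today's date folder.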

# dark fox seed url
baseurl = 'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion'
driver.get(baseurl)
captcha(driver)

# visited = set()
# visited.add(br.geturl())

linksToCrawl = getInterestedLinks()
initialTime = time.time()

i = 0
while i < len(linksToCrawl):
    link = linksToCrawl[i]
    print('Crawling :', link)
    try:
        driver.get(link)
        html = driver.page_source.encode('utf-8')
        savePage(html, link)
        '''
        has_next_page = True
        while has_next_page:
            j = 0
            productList = productPages(html)
            for item in productList:
                if j == 1:
                    break
                itemURL = str(item)
                driver.get(itemURL)
                savePage(driver.page_source.encode('utf-8'), item)
                driver.back()
                j += 1
            try:
                link = driver.find_element(by=By.XPATH,
                    value='/html/body/main/div/div[2]/div/div[2]/div/div/div/nav/a[2]').get_attribute('href')
                driver.get(link)
                html = driver.page_source.encode('utf-8')
                savePage(html, link)
            except NoSuchElementException:
                has_next_page = False
        '''
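        # The quoted block above is a disabled draft of the per-product crawl and
        # 'next page' pagination; while it stays commented out, only the first
        # listing page of each category is saved.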
    except Exception as e:
        print(link, e)

    i += 1

# finalTime = time.time()
# print(finalTime - initialTime)

input("Crawling DarkFox marketplace done successfully. Press ENTER to continue\n")
