From 98a2a6b70f076b8ee385e22b4c52f0336efd8dfa Mon Sep 17 00:00:00 2001
From: westernmeadow <43891839+westernmeadow@users.noreply.github.com>
Date: Wed, 7 Jun 2023 12:33:06 -0700
Subject: [PATCH] added to gitignore

---
 .idea/.gitignore                             |   5 +
 MarketPlaces/DarkFox/crawler_seleniumtest.py | 192 +++++++++++++++++++
 2 files changed, 197 insertions(+)
 create mode 100644 MarketPlaces/DarkFox/crawler_seleniumtest.py

diff --git a/.idea/.gitignore b/.idea/.gitignore
index 26d3352..7b08725 100644
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -1,3 +1,8 @@
 # Default ignored files
 /shelf/
 /workspace.xml
+/selenium/geckodriver.exe
+*.html
+*.log
+*.png
+*.txt
\ No newline at end of file
diff --git a/MarketPlaces/DarkFox/crawler_seleniumtest.py b/MarketPlaces/DarkFox/crawler_seleniumtest.py
new file mode 100644
index 0000000..e183b53
--- /dev/null
+++ b/MarketPlaces/DarkFox/crawler_seleniumtest.py
@@ -0,0 +1,192 @@
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+from selenium.webdriver.firefox.service import Service
+import os
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox.options import Options
+from PIL import Image
+
+import codecs
+import time
+from datetime import date
+import urllib.parse as urlparse
+import os
+from bs4 import BeautifulSoup
+from MarketPlaces.DarkFox.parser import darkfox_links_parser
+
+file = open('../../path.txt', 'r')
+lines = file.readlines()
+
+# torexe = os.popen(lines[0].strip())  # path for tor.exe
+binary = FirefoxBinary(lines[0].strip())  # full path for firefox.exe
+# options = Options()
+profile = FirefoxProfile(lines[1].strip())  # full path for profile.default
+profile.set_preference('network.proxy.type', 1)
+profile.set_preference('network.proxy.socks', '127.0.0.1')
+profile.set_preference('network.proxy.socks_port', 9150)
+profile.set_preference("network.proxy.socks_remote_dns", True)
+profile.update_preferences()
+service = Service(lines[2].strip())  # full path for geckodriver.exe
+driver = webdriver.Firefox(firefox_binary=binary, firefox_profile=profile,
+                           service=service)
+
+# Manual captcha solver
+def captcha(driver):
+    # wait for captcha page show up
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div/div/form/button[1]")))
+
+    # save captcha to local
+    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div[1]/div[1]").screenshot("captcha.png")
+
+    # open method used to open different extension image file
+    im = Image.open(r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\captcha.png')
+
+    # This method will show image in any image viewer
+    im.show()
+
+    # wait until input space show up
+    inputBox = driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div[1]/div[2]/input")
+
+    # ask user input captcha solution in terminal
+    userIn = input("Enter solution: ")
+
+    # send user solution into the input space
+    inputBox.send_keys(userIn)
+
+    # click the verify(submit) button
+    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()
+
+    # wait for listing page show up (This Xpath may need to change based on different seed url)
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, "/html/body/main/div/div/div[2]/div[1]/div[1]/form/div[1]/h1")))
+
+
+# Saves the crawled html page
+def savePage(page, url):
+    filePath = getFullPathName(url)
+    os.makedirs(os.path.dirname(filePath), exist_ok=True)
+    open(filePath, 'wb').write(page)
+    return
+
+
+# Gets the full path of the page to be saved along with its appropriate file name
+def getFullPathName(url):
+    fileName = getNameFromURL(url)
+    if isDescriptionLink(url):
+        fullPath = r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\HTML_Pages\\' + str(
+            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
+            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
+    else:
+        fullPath = r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\HTML_Pages\\' + str(
+            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
+            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
+    return fullPath
+
+
+# Creates the name of the file based on URL
+def getNameFromURL(url):
+    global counter
+    name = ''.join(e for e in url if e.isalnum())
+    if (name == ''):
+        name = str(counter)
+        counter = counter + 1
+    return name
+
+
+def getInterestedLinks():
+    links = []
+
+    # Guides and Tutorials
+    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
+    # Digital Products
+    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
+    # Software and Malware
+    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
+    # Services
+    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
+    # Miscellaneous
+    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
+    # Hosting and Security
+    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')
+
+    return links
+
+
+def isDescriptionLink(url):
+    if 'product' in url:
+        return True
+    return False
+
+
+def isListingLink(url):
+    if 'category' in url:
+        return True
+    return False
+
+
+def productPages(html):
+    soup = BeautifulSoup(html, "html.parser")
+    return darkfox_links_parser(soup)
+
+
+def isSignOut(url):
+    #absURL = urlparse.urljoin(url.base_url, url.url)
+    if 'signout' in url.lower() or 'logout' in url.lower():
+        return True
+
+    return False
+
+
+# dark fox seed url
+baseurl = 'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion'
+driver.get(baseurl)
+captcha(driver)
+
+# visited = set()
+# visited.add(br.geturl())
+linksToCrawl = getInterestedLinks()
+initialTime = time.time()
+
+i = 0
+while i < len(linksToCrawl):
+    link = linksToCrawl[i]
+    print('Crawling :', link)
+    try:
+        driver.get(link)
+        html = driver.page_source.encode('utf-8')
+        savePage(html, link)
+        '''
+        has_next_page = True
+        while has_next_page:
+            j = 0
+            list = productPages(html)
+            for item in list:
+                if j == 1:
+                    break
+                itemURL = str(item)
+                driver.get(itemURL)
+                savePage(driver.page_source.encode('utf-8'), item)
+                driver.back()
+                j += 1
+
+            try:
+                link = driver.find_element(by=By.XPATH, value=
+                    '/html/body/main/div/div[2]/div/div[2]/div/div/div/nav/a[2]').get_attribute('href')
+                driver.get(link)
+                html = driver.page_source.encode('utf-8')
+                savePage(html, link)
+            except NoSuchElementException:
+                has_next_page = False
+        '''
+    except Exception as e:
+        print(link, e)
+    i += 1
+
+# finalTime = time.time()
+# print finalTime - initialTime
+
+input("Crawling DarkFox marketplace done successfully. Press ENTER to continue\n")
\ No newline at end of file