# Based on the CALSysLab project.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from PIL import Image

import os
import time
from datetime import date
import urllib.parse as urlparse

from bs4 import BeautifulSoup
from MarketPlaces.DarkFox.parser import darkfox_links_parser

# Read the local configuration file: line 1 is the path to firefox.exe,
# line 2 the path to the Firefox profile, line 3 the path to geckodriver.exe.
with open('../../path.txt', 'r') as file:
    lines = file.readlines()

# torexe = os.popen(lines[0].strip())  # path for tor.exe
binary = FirefoxBinary(lines[0].strip())  # full path for firefox.exe
# options = Options()
profile = FirefoxProfile(lines[1].strip())  # full path for profile.default

# Route all traffic, including DNS lookups, through the local Tor SOCKS proxy.
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.socks', '127.0.0.1')
profile.set_preference('network.proxy.socks_port', 9150)
profile.set_preference('network.proxy.socks_remote_dns', True)
profile.update_preferences()

service = Service(lines[2].strip())  # full path for geckodriver.exe
driver = webdriver.Firefox(firefox_binary=binary, firefox_profile=profile,
                           service=service)
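
# Note: the firefox_binary/firefox_profile keyword arguments are Selenium 3
# style; they are deprecated in Selenium 4 and removed in recent 4.x releases.
# A minimal Selenium 4 equivalent, left here only as an untested sketch:
#
#   options = Options()
#   options.binary_location = lines[0].strip()
#   options.profile = FirefoxProfile(lines[1].strip())
#   options.set_preference('network.proxy.type', 1)
#   options.set_preference('network.proxy.socks', '127.0.0.1')
#   options.set_preference('network.proxy.socks_port', 9150)
#   options.set_preference('network.proxy.socks_remote_dns', True)
#   driver = webdriver.Firefox(service=service, options=options)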

# Manual captcha solver
def captcha(driver):
    # wait for the captcha page to show up
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div/form/button[1]")))

    # save the captcha image locally
    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div[1]/div[1]").screenshot("captcha.png")

    # open the saved image and display it in the default image viewer
    im = Image.open(r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\captcha.png')
    im.show()

    # locate the captcha input box
    inputBox = driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div[1]/div[2]/input")

    # ask the user to type the captcha solution in the terminal
    userIn = input("Enter solution: ")

    # send the user's solution into the input box
    inputBox.send_keys(userIn)

    # click the verify (submit) button
    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()

    # wait for the listing page to show up (this XPath may need to change for a different seed URL)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/main/div/div/div[2]/div[1]/div[1]/form/div[1]/h1")))
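
# Assumption: if the solution is wrong, DarkFox serves the captcha form again
# and the final WebDriverWait above raises TimeoutException. A retry wrapper is
# one option (a sketch only; it would also need
# `from selenium.common.exceptions import TimeoutException`):
#
#   def captchaWithRetries(driver, attempts=3):
#       for _ in range(attempts):
#           try:
#               captcha(driver)
#               return
#           except TimeoutException:
#               pass
#       raise RuntimeError('captcha was not solved')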

# Saves the crawled HTML page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as f:
        f.write(page)

# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
    else:
        fullPath = r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
    return fullPath
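
# A more portable way to build the same path (a sketch; assumes the same base
# folder and the same MMDDYYYY date layout, with os.path.join replacing the
# hand-built backslash concatenation):
#
#   baseDir = r'C:\Users\CALSysLab\Documents\threatIntelligence-main\DarkWebMining_Working\MarketPlaces\DarkFox\HTML_Pages'
#   subDir = 'Description' if isDescriptionLink(url) else 'Listing'
#   fullPath = os.path.join(baseDir, date.today().strftime('%m%d%Y'), subDir, fileName + '.html')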

# Counter used to name pages whose URL contains no alphanumeric characters;
# it must be initialized before getNameFromURL() first uses it.
counter = 1


# Creates the name of the file based on the URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
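
# Example: getNameFromURL('http://example.onion/category/a1') returns
# 'httpexampleonioncategorya1' (a hypothetical URL; only alphanumeric
# characters survive). The counter fallback fires only when none do.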

def getInterestedLinks():
    links = []

    # Guides and Tutorials
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
    # Digital Products
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
    # Software and Malware
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
    # Services
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
    # Miscellaneous
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
    # Hosting and Security
    links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')

    return links

# A description (product detail) page has 'product' in its URL
def isDescriptionLink(url):
    return 'product' in url


# A listing (category) page has 'category' in its URL
def isListingLink(url):
    return 'category' in url
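
# Example classification under DarkFox's URL scheme:
#   .../product/<id>  -> isDescriptionLink(url) is True
#   .../category/<id> -> isListingLink(url) is True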

def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return darkfox_links_parser(soup)


def isSignOut(url):
    # absURL = urlparse.urljoin(url.base_url, url.url)
    return 'signout' in url.lower() or 'logout' in url.lower()

# DarkFox seed URL
baseurl = 'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion'
driver.get(baseurl)
captcha(driver)

# visited = set()
# visited.add(br.geturl())

linksToCrawl = getInterestedLinks()
initialTime = time.time()

i = 0
while i < len(linksToCrawl):
    link = linksToCrawl[i]
    print('Crawling :', link)
    try:
        driver.get(link)
        html = driver.page_source.encode('utf-8')
        savePage(html, link)

        # The product-page walk and pagination below are currently disabled.
        '''
        has_next_page = True
        while has_next_page:
            j = 0
            list = productPages(html)
            for item in list:
                if j == 1:
                    break
                itemURL = str(item)
                driver.get(itemURL)
                savePage(driver.page_source.encode('utf-8'), item)
                driver.back()
                j += 1
            try:
                link = driver.find_element(by=By.XPATH, value=
                    '/html/body/main/div/div[2]/div/div[2]/div/div/div/nav/a[2]').get_attribute('href')
                driver.get(link)
                html = driver.page_source.encode('utf-8')
                savePage(html, link)
            except NoSuchElementException:
                has_next_page = False
        '''
    except Exception as e:
        print(link, e)
    i += 1

# finalTime = time.time()
# print(finalTime - initialTime)

input("Crawling DarkFox marketplace done successfully. Press ENTER to continue\n")