This is based on the calsyslab project.
__author__ = 'Helium'

'''
Silk Road 4 Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.SilkRoad4.parser import silkroad4_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/'
# Opens Tor Browser, crawls the website, parses the pages, then closes Tor.
# Acts as the main method for the crawler; the crawler() function at the end of this file calls it.
def startCrawling():
    mktName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(mktName, baseURL, True)
# Returns the name of the website
# return: name of the site as a string
def getMKTName():
    name = 'SilkRoad4'
    return name


# Returns the base link of the website
# return: URL of the base site as a string
def getFixedURL():
    url = 'http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/'
    return url
# Closes Tor Browser
# @param: current selenium driver
def closeDriver(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return
# Creates the Firefox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver():
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
    driver.maximize_window()

    return driver
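# Illustrative smoke test (not part of the original module): with the profile above
# routing traffic through Tor's SOCKS proxy on port 9150, the driver can be
# sanity-checked against the Tor Project's check page before crawling, e.g.:
#     d = createFFDriver()
#     d.get('https://check.torproject.org')
#     d.quit()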
# The driver 'gets' the URL, attempting to reach the site; if it can't access it, returns 'down'
# return: the selenium driver, or the string 'down'
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'
# Manual CAPTCHA solver: waits for a specific element so that the whole page loads, finds the input boxes,
# then allows for manual solving of the CAPTCHA in the browser.
# If the CAPTCHA is incorrect the first time, the username and password are automatically re-entered after pressing 'Enter'.
# @param: current selenium web driver
def login(driver):
    # login urls
    login_page1 = driver.current_url
    login_page2 = 'http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?login'

    # Will automatically re-enter the user/pw if the CAPTCHA is entered incorrectly the first time.
    while driver.current_url == login_page1 or driver.current_url == login_page2:
        # entering username and password into input boxes
        usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div/form/div/input[1]')
        # Username here
        usernameBox.send_keys('SamarIsland')
        passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div/form/div/input[2]')
        # Password here
        passwordBox.send_keys('Underthe5e@')

        input("Press ENTER if: \n- username and password need to be re-entered\n- the CAPTCHA is completed and the home page is visible \n")

    # wait for the listing page to show up (this XPath may need to change based on different seed URLs)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located((By.XPATH, '/html/body')))
# Saves the crawled html page, creating the directory path for html pages if it does not exist
def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as f:
        f.write(cleanPage.encode('utf-8'))
    return
# Gets the full path of the page to be saved along with its appropriate file name
# @param: raw url as the crawler crawls through every site
def getFullPathName(url):
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
    return fullPath
# Creates the file name from the passed URL; falls back to a distinct counter-based name if the cleaned URL is empty
# @param: raw url as the crawler crawls through every site
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
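# Worked example (illustrative): for the listing URL
# 'http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=12'
# getNameFromURL keeps only alphanumeric characters and returns
# 'httpsilkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptidonioncat12'.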
# Returns a list of URLs of interest; this is where the different URLs to crawl are listed,
# and the crawler runs through this list.
# In this example the links are product categories, such as Hacking and Spam, Fraud,
# Guides & Tutorials, Malware, Software, Services, and Security and Hosting.
def getInterestedLinks():
    links = []

    # Hacking and Spam Category
    # Databases, Access, & Data
    links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=12')
    # Exploit Kits
    links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=13')
    # Exploits, Kits, & Vulnerabilities
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=11')
    # Hacking Tools & Scripts
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=14')
    # Other Hacking & Spam
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=21')
    # Phishing Tools & Utilities
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=19')
    # RDPs/VNCs
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=18')
    # Social Media Boosting
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=20')
    # Spam Tools & Scripts
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=15')
    # Traffic
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=16')

    # Fraud
    # Accounts & Bank Drops
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=2')
    # CVV & Cards
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=3')
    # Carding/Fraud Software
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=8')
    # Other Fraud
    links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=9')
    # Personal Info (SSN/DOB/Fullz/Scans)
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=5')
    # Physical Fake IDs
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=7')
    # Dumps
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=4')
    # Skimmers
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=6')

    # Guides & Tutorials
    # Fraud Guides
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=137')
    # Hacking Guides
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=136')
    # Other Guides and Tutorials
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=143')
    # Security and Anonymity Guides
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=140')
    # Social Engineering Guides
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=141')

    # Malware
    # Keyloggers
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=30')
    # Botnets
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=23')
    # RATs
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=24')

    # Software
    # Other Software
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=150')
    # Security Software
    # Security Software
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=145')
    # Cracked Products
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=149')

    # Services
    # Hacking
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=112')
    # Social Engineering
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=116')
    # Security
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=113')
    # Programming
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=115')
    # Carding
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=114')

    # Security and Hosting
    # VPN
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=129')
    # VPN Accounts
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=130')
    # Proxies
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=132')
    # Bulletproof Hosting
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=126')
    # Hosting
    # links.append('http://silkroadm5oqzuz3yq7pvwvinvyzeg2wnpnw3t4cyrsggrcgqfy2ptid.onion/?cat=125')

    return links
# Gets the links of interest to crawl; iterates through the list, where each link is visited and crawled.
# Both listing and description pages are crawled and saved here.
# @param: selenium driver
def crawlForum(driver):
    print("Crawling the SilkRoad4 market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    driver.get(link)
                except:
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, linksToCrawl[i] + f"page{count+1}")

                productList = productPages(html)

                countItem = 0
                for item in productList:
                    countItem += 1
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    driver.back()

                    # comment out to crawl every item on a listing page
                    if countItem == 10:
                        break

                # comment out to crawl more than the first few listing pages
                if count == 2:
                    break

                try:
                    link = driver.find_element(by=By.XPATH, value='/html/body/div/div/table/tbody/tr/td[1]/center/form/input[3]').get_attribute('href')
                    if link == "":
                        raise NoSuchElementException
                    count += 1
                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the SilkRoad4 market done.")
# Returns 'True' if the link is a description link
# @param: url of any url crawled
# return: True if it is a description page, False if not
def isDescriptionLink(url):
    if '?listing=' in url:
        return True
    return False


# Returns 'True' if the link is a listing page link
# @param: url of any url crawled
# return: True if it is a listing page, False if not
def isListingLink(url):
    if '?cat=' in url:
        return True
    return False
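# Illustrative check (hypothetical URLs): a page such as baseURL + '?listing=123'
# would be classified as a description page, while baseURL + '?cat=12'
# would be classified as a listing page.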
# Calls the parser to extract the description links; the html argument is the page source
# of a listing page reached from the interested-links list.
# @param: html of a listing page from getInterestedLinks()
# return: list of description links that should be crawled through
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return silkroad4_links_parser(soup)
def crawler():
    startCrawling()
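# Minimal entry-point sketch (an assumption, not part of the original project flow):
# the market-mining scripts normally import and call crawler(), but the module could
# also be run directly.
if __name__ == '__main__':
    crawler()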