__author__ = 'Helium'

"""
DarkDock Marketplace Crawler (Selenium)

Based on the calsyslab project.
"""

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import urllib.parse as urlparse
import os, time
from bs4 import BeautifulSoup

from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkDock.parser import darkdock_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'


def startCrawling():
    """Main method for the crawler.

    Opens Tor Browser, crawls the website, parses, then closes Tor.
    """
    mktName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            crawlMarket(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(mktName, baseURL, True)


def getMKTName():
    """Returns the name of the website.
    """
    name = 'DarkDock'
    return name


def getFixedURL():
    """Returns the base link of the site.
    """
    url = 'http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/'
    return url


def closeDriver(driver):
    """Closes Tor Browser.

    Args:
        driver: The selected Selenium driver.
    """
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return


def createFFDriver():
    """Creates a Firefox driver and configures its profile to use the Tor proxy and socket.
    """
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
    driver.maximize_window()

    return driver
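
# Note: port 9150 is the default SOCKS port exposed by the Tor Browser bundle; a
# standalone tor daemon typically listens on 9050 instead. If the local Tor setup
# differs, adjust the 'network.proxy.socks_port' preference above (this is an
# assumption about the environment, not something the crawler detects).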


def getAccess():
    """The driver 'gets' the url and attempts to access the site.

    Return:
        A Selenium driver currently on the site or the string 'down' if it can't access the site.
    """
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'


def savePage(driver, page, url):
    """Saves the crawled html page.

    Cleans the html of the current page the driver is on. Then saves the current
    crawled html page with its full path name without special characters into the
    marketplace's directory. If the directory path doesn't exist it will make it.

    Args:
        driver: The Selenium driver accessing the page.
        page: The html of the saved page.
        url: The URL of the saved page.
    """
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as file:
        file.write(cleanPage.encode('utf-8'))
    return


def getFullPathName(url):
    """Gets the full path name.

    Gets the full path of the page to be saved along with its appropriate file name.
    Determines which subdirectory to save the page in, based on whether it is a description
    or listing page.

    Args:
        url: The URL of the page.
    """
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
    return fullPath
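
# Example layout (illustrative paths only): a description/product page is written to
#   <shared_folder>/MarketPlaces/DarkDock/HTML_Pages/<CURRENT_DATE>/Description/<fileName>.html
# and a category/listing page to the corresponding .../Listing/ folder, where
# <fileName> comes from getNameFromURL().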


def getNameFromURL(url):
    """Creates the file name from the passed URL.

    Generates a file name using only the alphanumeric characters of the URL.
    If no such characters are left, the file name falls back to a unique counter value.

    Args:
        url: The URL of the selected page from the crawler as it crawls through the site.
    """
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
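
# For example (hypothetical URL), 'http://example.onion/product/abc-123' becomes
# 'httpexampleonionproductabc123'; a URL with no alphanumeric characters at all
# would instead be named '1', '2', ... from the global counter.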


def getInterestedLinks():
    """Returns the list of urls the crawler runs through.

    Returns a list of the different urls of interest that the crawler runs through.
    An example of this can be different categories of a market related to hacking,
    such as Software and Malware, Guides and Tutorials, Digital Products.
    """
    links = []

    categories = [
        'civil_softwares',
        'carding',
        'theft',
        'mining',
        'worms',
        'dump',
        'viruses',
        'trojans',
        'botnets',
        'security_technology',
        'computers',
        'confidential_info',
        'network_services',
        'database',
        'surveillance',
        'digital_forensics',
        '0day',
        'intelligence',
        'private_security'
    ]
    for category in categories:
        links.append(baseURL + "category/" + category)

    return links
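
# Each entry resolves to a category URL on the market, e.g.
#   http://oirolrkrppy6sei6x6bvkkdolc4cjqzqfhxisfzu6exqblahwrrvktyd.onion/category/carding
# (constructed as baseURL + "category/" + <category name>).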


def crawlMarket(driver):
    """Crawls and saves each page of a link of interest.

    Accesses, saves, and crawls through each link of interest. For DarkDock, each
    link of interest is a category, so we crawl through all numbered pages of the
    category. We find the URLs of all descriptions/products on the category page and
    save each individual description/product page.

    Args:
        driver: The Selenium driver accessing the site.
    """
    print("Crawling the DarkDock market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        baseCategoryLink = linksToCrawl[i]
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 1     # Number of pages traversed
            maxPages = 0  # Total number of pages in the category, read from the pagination element

            while has_next_page:
                # Try to access the current link and reload if it fails
                try:
                    driver.get(link)
                except:
                    driver.refresh()

                # Save html page
                html = driver.page_source
                savePage(driver, html, linksToCrawl[i] + f"page{count}")

                # Get the number of maxPages if maxPages isn't fetched yet
                if maxPages == 0:
                    try:
                        # Wait 30 seconds or until the element loads
                        WebDriverWait(driver, 30).until(
                            EC.presence_of_element_located((By.XPATH, '//div[@class="pages"]//a[last()]'))
                        )
                        # Fetches the element that gives the total number of pages in a category
                        maxPages = int(driver.find_element(By.XPATH, '//div[@class="pages"]//a[last()]').text)
                        print(f"Total number of Pages: {maxPages}")
                    except Exception as e:
                        print(f"Element not found: {str(e)}")

                # Parse the product/description pages
                productLinks = descriptionPages(html)
                for item in productLinks:
                    # Fetches the item URL by joining the base url with the item sub url
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    # Go back to the previous category page
                    driver.back()

                    # # Add a break for testing if we are checking only the first description/product page
                    # break

                # # Add a break for testing based on how many pages to test
                # if count == 3:
                #     break

                # Build the link for the next page
                try:
                    link = f"{baseCategoryLink}/{count}/"
                    print("\tCurrent Page :", f"{link}")
                    if link == "":
                        raise NoSuchElementException
                    count += 1
                except NoSuchElementException:
                    has_next_page = False

                # If the number of maxPages is reached, stop crawling the current category
                if count > maxPages:
                    print("Max Pages reached")
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the DarkDock market done.")
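
# Per category (as implemented above): the base category URL is saved with a
# 'page1' suffix, then each following request goes to '<category URL>/<n>/'
# (n = 1, 2, ...) and is saved with an incrementing 'page<count>' suffix, until
# the total page count read from the pagination element is exceeded.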


def isDescriptionLink(url):
    """Returns whether the url is for a description page.

    Args:
        url: The url of a crawled page.

    Returns:
        Returns 'True' if the url is for a description page. Returns 'False' if the
        url is not for a description page.
    """
    if 'product' in url:
        return True
    return False


def isListingLink(url):
    """Returns whether the url is for a listing page.

    Args:
        url: The url of a crawled page.

    Returns:
        Returns 'True' if the url is for a listing page. Returns 'False' if the
        url is not for a listing page.
    """
    if 'category' in url:
        return True
    return False
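
# Classification examples (hypothetical URLs):
#   isDescriptionLink('.../product/some-item') -> True  ('product' appears in the URL)
#   isListingLink('.../category/carding')      -> True  ('category' appears in the URL)
# URLs containing neither substring return False from both helpers.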


def descriptionPages(html):
    """Returns all product/description links on the current page.

    Passes the html of the category/listing page and parses it for
    any description/product links.

    Args:
        html: The html of the selected category/listing page.
    """
    soup = BeautifulSoup(html, "html.parser")
    return darkdock_links_parser(soup)


def crawler():
    """Starts the crawler.
    """
    startCrawling()
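

# This module is normally driven by the marketplace mining framework; the guard
# below is only a convenience sketch for running the crawler directly and is an
# addition, not part of the original control flow.
if __name__ == '__main__':
    crawler()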