# This is based on the calsyslab project.

__author__ = 'Helium'

'''
NemesisMarket Crawler (Selenium)
Website is very slow
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.NemesisMarket.parser import nemesis_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://nemesis55gdxo6emcigofp26nmjokadvmvsbnauloweoa47v2aap2ead.onion/market'

def startCrawling():
    mktName = getMKTName()

    # Crawling is currently disabled; uncomment the block below to collect
    # fresh pages before parsing.
    # driver = getAccess()
    #
    # if driver != 'down':
    #     try:
    #         login(driver)
    #         crawlForum(driver)
    #     except Exception as e:
    #         print(driver.current_url, e)
    #     closeDriver(driver)

    new_parse(mktName, baseURL, True)

# Returns the name of the website
def getMKTName() -> str:
    name = 'NemesisMarket'
    return name


# Returns the base link of the website
def getFixedURL():
    url = 'http://nemesis55gdxo6emcigofp26nmjokadvmvsbnauloweoa47v2aap2ead.onion/market'
    return url

# Closes Tor Browser
def closeDriver(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return
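
# Note: driver.close() closes only the current window; driver.quit() would also
# end the WebDriver session and stop the geckodriver process.
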
# Creates a FireFox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver():
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)  # block third-party images to speed up the slow site
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
    driver.maximize_window()

    return driver
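
# Note: Selenium 4 deprecated (and newer releases removed) the firefox_binary /
# firefox_profile keyword arguments used above in favour of Options. A minimal
# equivalent sketch, assuming the same config keys (untested here):
#
#     from selenium.webdriver.firefox.options import Options
#
#     opts = Options()
#     opts.binary_location = config.get('TOR', 'firefox_binary_path')
#     opts.profile = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
#     driver = webdriver.Firefox(service=service, options=opts)
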
# The driver 'gets' the url and attempts to load the site; returns 'down' if it can't access it
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'
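
# The site is very slow (see module docstring), so a simple retry loop is one
# way to harden access. Sketch only, not part of the original flow:
#
#     driver = 'down'
#     for _ in range(3):
#         driver = getAccess()
#         if driver != 'down':
#             break
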
def login(driver):
    # Despite the name, no credentials are entered here; this only waits for the
    # listing page to show up (this XPath may need to change for a different seed URL).
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div/nav[2]/div/div')))

def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    # Use a context manager so the file handle is closed instead of leaked.
    with open(filePath, 'wb') as f:
        f.write(cleanPage.encode('utf-8'))
    return

def getFullPathName(url):
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    # Build the path with os.path.join rather than hard-coded '\\' separators,
    # so the result is also valid on non-Windows hosts.
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Description', fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Listing', fileName + '.html')
    return fullPath
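
# Example (hypothetical values): with shared_folder '/data' and CURRENT_DATE
# '02252024', a description link would be saved under
# '/data/MarketPlaces/NemesisMarket/HTML_Pages/02252024/Description/<fileName>.html'
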
def getNameFromURL(url):
    # Derive a file name from the URL by keeping only alphanumeric characters;
    # fall back to a global counter when nothing is left.
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
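
# Example (hypothetical URL):
#   getNameFromURL('http://example.onion/items/hacking/ransomware')
#   returns 'httpexampleonionitemshackingransomware'
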
def getInterestedLinks():
    links = []

    # ransomware
    links.append('http://nemesis55gdxo6emcigofp26nmjokadvmvsbnauloweoa47v2aap2ead.onion/items/hacking/ransomware')
    # # malware/botnets
    # links.append('http://nemesis55gdxo6emcigofp26nmjokadvmvsbnauloweoa47v2aap2ead.onion/items/hacking/malware-botnets')
    # # exploits
    # links.append('http://nemesis55gdxo6emcigofp26nmjokadvmvsbnauloweoa47v2aap2ead.onion/items/hacking/exploits')
    # # DDoS
    # links.append('http://nemesis55gdxo6emcigofp26nmjokadvmvsbnauloweoa47v2aap2ead.onion/items/hacking/ddos')
    # # spam/anti-captcha
    # links.append('http://nemesis55gdxo6emcigofp26nmjokadvmvsbnauloweoa47v2aap2ead.onion/items/hacking/spamming-anti-captcha')
    # # phishing/social engineering
    # links.append('http://nemesis55gdxo6emcigofp26nmjokadvmvsbnauloweoa47v2aap2ead.onion/items/hacking/phishing-social-engineering')
    # # hackers for hire
    # links.append('http://nemesis55gdxo6emcigofp26nmjokadvmvsbnauloweoa47v2aap2ead.onion/items/hacking/hackers-for-hire')
    # # scripts and applications
    # links.append('http://nemesis55gdxo6emcigofp26nmjokadvmvsbnauloweoa47v2aap2ead.onion/items/hacking/scripts-applications')
    # # other
    # links.append('http://nemesis55gdxo6emcigofp26nmjokadvmvsbnauloweoa47v2aap2ead.onion/items/hacking/other')

    return links

def crawlForum(driver):
    print("Crawling the Nemesis Market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    driver.get(link)
                except:
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                # Visit and save every product page found on this listing page.
                # ('productLinks' replaces the original name 'list', which
                # shadowed the built-in.)
                productLinks = productPages(html)
                for item in productLinks:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    driver.back()

                    # Uncomment to stop after the first product (testing only):
                    # break

                # Testing guard: stop after two listing pages; comment out for a full crawl.
                if count == 1:
                    break

                # Follow the '>' pagination link, if any.
                try:
                    link = driver.find_element(by=By.XPATH, value='//a[contains(text(), ">")]').get_attribute('href')
                    if link == "":
                        raise NoSuchElementException
                    count += 1
                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the Nemesis Market done.")

# Returns 'True' if the link is a description (product) link; may need to change for every website
def isDescriptionLink(url):
    if 'item' in url and 'items' not in url:
        return True
    return False


# Returns 'True' if the link is a listing-page link; may need to change for every website
def isListingLink(url):
    if 'items' in url:
        return True
    return False
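
# Example classification (hypothetical URLs; note these are plain substring checks):
#   isDescriptionLink('http://example.onion/item/abc123')          -> True
#   isListingLink('http://example.onion/items/hacking/ransomware') -> True
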
# Parses a listing page and returns the product links found on it
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return nemesis_links_parser(soup)


def crawler():
    startCrawling()
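
# Minimal usage sketch (assumes this module is run directly rather than through
# the MarketPlaces initialization pipeline):
if __name__ == '__main__':
    crawler()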