__author__ = 'DarkWeb'

'''
DarkNetArmy Forum Crawler (Selenium)
Based on the calsyslab project.
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By

import urllib.parse as urlparse
import os, time
from datetime import date
import subprocess
from bs4 import BeautifulSoup

from Forums.Initialization.prepare_parser import new_parse
from Forums.DarkNetArmy.parser import darknetarmy_links_parser
from Forums.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://darknet77vonbqeatfsnawm5jtnoci5z22mxay6cizmoucgmz52mwyad.onion/'
# Opens Tor Browser, crawls the website, then parses the pages, then closes Tor.
# Acts as the main method for the crawler; crawler() at the end of this file calls it.
def startCrawling():
    forumName = getForumName()

    # driver = getAccess()
    #
    # if driver != 'down':
    #     try:
    #         crawlForum(driver)
    #     except Exception as e:
    #         print(driver.current_url, e)
    #     closeDriver(driver)

    new_parse(forumName, baseURL, True)
# Returns the name of the website
# return: name of the site as a string
def getForumName():
    name = 'DarkNetArmy'
    return name


# Returns the base link of the website
# return: URL of the base site as a string
def getFixedURL():
    url = 'http://darknet77vonbqeatfsnawm5jtnoci5z22mxay6cizmoucgmz52mwyad.onion/'
    return url
# Closes Tor Browser
# @param: current Selenium driver
def closeDriver(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return
# Creates a Firefox 'driver' and configures its 'profile'
# to use the Tor proxy and socket
def createFFDriver():
    from Forums.Initialization.forums_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))

    # Privacy hardening: no history, no saved logins, sanitize on shutdown
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)  # might need to turn off
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)  # restrict image loading

    # Download handling
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")

    # Route all traffic through the Tor SOCKS5 proxy (9150 is the Tor Browser
    # bundle's default SOCKS port; the standalone tor daemon listens on 9050)
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", True)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))
    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
    driver.maximize_window()

    return driver
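
# A minimal smoke-test sketch (assumptions: Tor Browser is already running and the
# [TOR] paths in the project config are valid; check.torproject.org is only an
# illustrative target, not part of this project):
#
#   driver = createFFDriver()
#   driver.get('https://check.torproject.org/')  # confirm traffic leaves via Tor
#   print(driver.title)
#   closeDriver(driver)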
# The driver 'gets' the URL, attempting to reach the site; if it can't access it, returns 'down'
# return: the Selenium driver, or the string 'down'
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'
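
# Onion services are often flaky, and getAccess() makes only a single attempt.
# A hedged retry wrapper could look like this (hypothetical helper, not part of
# the original project; the attempt count and back-off delay are arbitrary):
def getAccessWithRetry(attempts=3, delay=5):
    for _ in range(attempts):
        driver = getAccess()
        if driver != 'down':
            return driver
        time.sleep(delay)  # give the Tor circuit time to recover before retrying
    return 'down'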
# Saves the crawled html page, creating the directory path for html pages if it doesn't exist
def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as file:
        file.write(cleanPage.encode('utf-8'))
    return
# Gets the full path of the page to be saved along with its appropriate file name
# @param: raw url as the crawler crawls through every site
def getFullPathName(url):
    from Forums.Initialization.forums_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Description', fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Listing', fileName + '.html')
    return fullPath
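
# Illustrative result (path shape only; the root comes from the 'shared_folder'
# entry in the project config, shown here as <shared>):
#   <shared>/Forums/DarkNetArmy/HTML_Pages/<CURRENT_DATE>/Listing/<fileName>.html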
# Creates the file name from the passed URL; falls back to a distinct counter-based name
# if nothing remains after cleaning
# @param: raw url as the crawler crawls through every site
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
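
# Example (illustrative URL, not from the crawl list): every non-alphanumeric
# character is stripped, so
#   getNameFromURL('http://example.onion/threads/foo.12/')
# returns 'httpexampleonionthreadsfoo12'; the global counter only kicks in when
# nothing survives the cleaning.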
# Returns the list of URLs of interest; the crawler runs through this list.
# In this example the threads fall under categories such as
# rats/malware/ransomware and hacking tutorials.
def getInterestedLinks():
    links = []

    # rats, malware, ransomware
    links.append('http://darknet77vonbqeatfsnawm5jtnoci5z22mxay6cizmoucgmz52mwyad.onion/forums/rats-malwares-key-loggers-ransomware-tools.24/')
    # hacking tutorials
    links.append('http://darknet77vonbqeatfsnawm5jtnoci5z22mxay6cizmoucgmz52mwyad.onion/forums/hacking-cracking-tutorials-courses-methods.45/')
    # # hacking
    # links.append('http://darknet77vonbqeatfsnawm5jtnoci5z22mxay6cizmoucgmz52mwyad.onion/forums/hacking-cracking-tools-apps.21/')

    return links
# Gets the links of interest to crawl through, iterating through the list; each link is visited and crawled.
# Both listing and topic (description) pages are crawled here, and both types of pages are saved.
# @param: selenium driver
def crawlForum(driver):
    print("Crawling the DarkNetArmy forum")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    driver.get(link)
                except:
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                topics = topicPages(html)
                for topic in topics:
                    has_next_topic_page = True
                    counter = 1
                    page = topic

                    while has_next_topic_page:
                        itemURL = urlparse.urljoin(baseURL, str(page))
                        try:
                            driver.get(itemURL)
                        except:
                            driver.refresh()

                        if isListingLink(driver.current_url):
                            break

                        savePage(driver, driver.page_source, topic + f"page{counter}")  # very important

                        # comment out
                        # if counter == 2:
                        #     break

                        try:
                            # note: assign only to 'page' here; overwriting 'link' as well
                            # would break the return to the listing page below
                            page = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                            if page == "":
                                raise NoSuchElementException
                            counter += 1
                        except NoSuchElementException:
                            has_next_topic_page = False

                    # making sure we go back to the listing page (browser back button simulation)
                    try:
                        driver.get(link)
                    except:
                        driver.refresh()

                # comment out
                # break

                # comment out
                # if count == 1:
                #     break

                try:
                    link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                    if link == "":
                        raise NoSuchElementException
                    count += 1
                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the DarkNetArmy forum done.")
# Returns 'True' if the link is a description link
# @param: url of any url crawled
# return: true if it is a description page, false if not
def isDescriptionLink(url):
    if 'threads' in url:
        return True
    return False


# Returns True if the link is a listing page link
# @param: url of any url crawled
# return: true if it is a listing page, false if not
def isListingLink(url):
    if 'forums' in url:
        return True
    return False
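
# Illustrative classification (the '/forums/...' and '/threads/...' shapes mirror
# the XenForo-style paths this forum uses; the thread slug is hypothetical):
#   isDescriptionLink(baseURL + 'threads/some-topic.42/')                              -> True
#   isListingLink(baseURL + 'forums/hacking-cracking-tutorials-courses-methods.45/')   -> True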
# Calls the parser to extract the links; the html is the page source of a link from the interested-links list
# @param: html of a page from the interested-links list
# return: list of description links that should be crawled through
def topicPages(html):
    soup = BeautifulSoup(html, "html.parser")
    #print(soup.find('div', {"class": "forumbg"}).find('ul', {"class": "topiclist topics"}).find('li', {"class": "row bg1"}).find('a', {"class": "topictitle"}, href=True))
    return darknetarmy_links_parser(soup)


def crawler():
    startCrawling()
    # print("Crawling and Parsing DarkNetArmy .... DONE!")
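
# A minimal entry-point sketch (assumption: in the project, crawler() is invoked
# from the Initialization layer rather than run directly; this guard just makes
# the module runnable on its own for testing):
if __name__ == '__main__':
    crawler()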