__author__ = 'DarkWeb'

'''
Nulled Forum Crawler (Selenium)
Based on the CalSysLab project.
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from PIL import Image
import base64
from io import BytesIO
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.Nulled.parser import nulled_links_parser
from Forums.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'https://www.nulled.to'

# Opens Tor Browser, crawls the website
def startCrawling():
    # opentor()
    forumName = getForumName()
    # driver = getAccess()
    #
    # if driver != 'down':
    #     login(driver)
    #     crawlForum(driver)
    #     closetor(driver)

    new_parse(forumName, False)

# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    with open('../../path.txt') as f:
        path = f.readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(7.5)
    input('Tor Connected. Press ENTER to continue\n')
    return
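
# Assumed layout of path.txt, inferred from opentor() and createFFDriver()
# (not documented in the source):
#   line 1: path to the Tor Browser executable (launched above; also used as
#           the Firefox binary)
#   line 2: path to the Tor Browser Firefox profile directory
#   line 3: path to geckodriver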

# Login using premade account credentials and do login captcha manually
def login(driver):
    # stub: just waits; the login form and captcha are completed by hand in the browser
    time.sleep(3)

# Returns the name of the website
def getForumName():
    name = 'Nulled'
    return name


# Returns the link of the website
def getFixedURL():
    url = 'https://www.nulled.to'
    return url

# Closes Tor Browser
def closetor(driver):
    global pid
    # os.system("taskkill /pid " + str(pro.pid))
    os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return

# Creates a FireFox 'driver' and configures its 'profile'
# to use the Tor proxy and socket
def createFFDriver():
    with open('../../path.txt', 'r') as file:
        lines = file.readlines()

    ff_binary = FirefoxBinary(lines[0].strip())

    ff_prof = FirefoxProfile(lines[1].strip())
    # ff_prof.set_preference("places.history.enabled", False)
    # ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    # ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    # ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    # ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    # ff_prof.set_preference("signon.rememberSignons", False)
    # ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
    # ff_prof.set_preference("permissions.default.image", 3)
    # ff_prof.set_preference("browser.download.folderList", 2)
    # ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    # ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", True)
    ff_prof.update_preferences()

    service = Service(lines[2].strip())

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

    return driver
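
# NOTE: a sketch for newer environments (assumes Selenium 4.x, where the
# firefox_binary/firefox_profile keyword arguments above are deprecated):
#
#   from selenium.webdriver.firefox.options import Options
#   opts = Options()
#   opts.binary_location = lines[0].strip()
#   opts.profile = ff_prof
#   driver = webdriver.Firefox(service=Service(lines[2].strip()), options=opts)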

def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        # time.sleep(3)
        return driver
    except Exception:
        driver.close()  # avoid leaking the browser when the site is unreachable
        return 'down'

# Saves the crawled html page
def savePage(page, url):
    cleanPage = cleanHTML(page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as f:
        f.write(cleanPage.encode('utf-8'))
    return

# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    today = date.today()
    dateStr = "%02d%02d%04d" % (today.month, today.day, today.year)
    subdir = 'Description' if isDescriptionLink(url) else 'Listing'
    fullPath = os.path.join(r'..\Nulled\HTML_Pages', dateStr, subdir, fileName + '.html')
    return fullPath
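
# Example (on Windows): a topic page saved on June 1, 2023 would land at
#   ..\Nulled\HTML_Pages\06012023\Description\<fileName>.html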

# Creates the file name from the passed URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
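
# Example: getNameFromURL('https://www.nulled.to/topic/123-example/')
# returns 'httpswwwnulledtotopic123example'; the counter is only used when
# a URL contains no alphanumeric characters at all.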

def getInterestedLinks():
    links = []

    # Cracking Tools
    links.append('https://www.nulled.to/forum/90-cracking-tools/')
    # # Cracking Tutorials
    # links.append('https://www.nulled.to/forum/98-cracking-tutorials/')
    # # Releases
    # links.append('https://www.nulled.to/forum/209-releases/')
    # # .NET Framework
    # links.append('https://www.nulled.to/forum/51-net-framework/')
    # # HTML, CSS, JS, PHP
    # links.append('https://www.nulled.to/forum/54-html-css-js-php/')
    # # C / C++
    # links.append('https://www.nulled.to/forum/52-cc/')
    # # Other languages
    # links.append('https://www.nulled.to/forum/135-other-languages/')

    return links
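
# Only the Cracking Tools board is currently enabled; uncomment the entries
# above to widen the crawl to the other boards.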

def crawlForum(driver):
    print("Crawling the Nulled forum")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)  # note: currently unused
    initialTime = time.time()

    i = 0
    count = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            try:
                driver.get(link)
            except:
                driver.refresh()
            html = driver.page_source
            savePage(html, link)

            has_next_page = True
            while has_next_page:
                topics = topicPages(html)
                for topic in topics:
                    itemURL = urlparse.urljoin(baseURL, str(topic))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver.page_source, topic)
                    driver.back()
                    break  # NOTE: only the first topic of each listing page is visited

                if count == 1:  # NOTE: stops after two listing pages per board
                    count = 0
                    break

                try:
                    temp = driver.find_element(by=By.XPATH, value='/html/body/div[4]/div[3]/div/div[3]/div[4]')
                    temp = temp.find_element(by=By.CLASS_NAME, value='pagination')
                    link = temp.find_element(by=By.CLASS_NAME, value='next')
                    link = link.find_element(by=By.TAG_NAME, value='a').get_attribute('href')

                    if link == "":
                        raise NoSuchElementException
                    try:
                        driver.get(link)
                    except:
                        driver.refresh()
                    html = driver.page_source
                    savePage(html, link)
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, str(e))
        i += 1

    # finalTime = time.time()
    # print(finalTime - initialTime)

    input("Crawling Nulled forum done successfully. Press ENTER to continue\n")

# Returns 'True' if the link is a topic link
def isDescriptionLink(url):
    if 'topic/' in url:
        return True
    return False


# Returns 'True' if the link is a listing page link
def isListingLink(url):
    if 'forum/' in url:
        return True
    return False
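
# Examples (assuming Nulled's URL scheme):
#   isDescriptionLink('https://www.nulled.to/topic/123-example/')    -> True
#   isListingLink('https://www.nulled.to/forum/90-cracking-tools/')  -> True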

# calling the parser to define the links
def topicPages(html):
    soup = BeautifulSoup(html, "html.parser")
    # print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr', {"class": "inline_row"}).find('strong').text)
    return nulled_links_parser(soup)
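
# nulled_links_parser (imported above) is assumed to return the topic links
# found on a listing page; crawlForum() joins each link against baseURL
# before visiting it.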

def crawler():
    startCrawling()
    # print("Crawling and Parsing Nulled .... DONE!")