This crawler is based on the calsyslab project.

__author__ = 'DarkWeb'

'''
Bohemia Market Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.Bohemia.parser import bohemia_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/'

# Opens Tor Browser, crawls the website, then parses, then closes Tor.
# Acts as the main method for the crawler; the crawler() function at the end of this file calls it.
def startCrawling():
    mktName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            captcha(driver)
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(mktName, False)

def login(driver):
    # wait for the login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div[4]/div/div/form/input[1]")))

    # click on the login page confirmation
    driver.find_element(by=By.XPATH, value="/html/body/div/div[4]/div/div/form/input[1]").click()

    # wait until the next page shows up
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[1]/input")))

    # enter username and password into the input boxes
    usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[1]/input')
    # username here
    usernameBox.send_keys('ct-1234')
    passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[2]/input')
    # password here
    passwordBox.send_keys('DementedBed123-')

    # session time
    session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[3]/select'))
    session_select.select_by_visible_text('300 Minutes')

    '''
    # wait for the captcha page to show up
    inputBox = driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[4]/div/input')

    # save the captcha locally
    driver.find_element(by=By.XPATH, value='//*[@id="captcha"]').screenshot(r'..\Bohemia\captcha2.png')
    im = Image.open(r'..\Bohemia\captcha2.png')
    im.show()

    # ask the user to input the captcha solution in the terminal
    userIn = input("Enter Solution: ")

    # send the user's solution into the input field
    inputBox.send_keys(userIn)

    # click the submit button
    driver.find_element(by=By.XPATH, value='/html/body/div/div[2]/div/div[2]/div/div[2]/form/div[5]/button').click()
    '''
    input("Press ENTER when CAPTCHA is completed\n")

    # wait for the listing page to show up
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div[2]/div[2]/div[1]/div")))

# Returns the name of the website
# return: name of site in string type
def getMKTName():
    name = 'Bohemia'
    return name


# Returns credentials needed for the mkt
def getCredentials():
    credentials = 'blank blank blank blank cap 0'
    return credentials


# Returns the base link of the website
# return: url of base site in string type
def getFixedURL():
    url = 'http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/'
    return url

# Closes Tor Browser
# @param: current selenium driver
def closeDriver(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return

# Creates a Firefox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver():
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    # ff_prof.set_preference("places.history.enabled", False)
    # ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    # ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    # ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    # ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    # ff_prof.set_preference("signon.rememberSignons", False)
    # ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", True)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
    driver.maximize_window()

    return driver

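# The values read above come from the project's configuration file, loaded by
# MarketPlaces.Initialization.markets_mining. A minimal sketch of the section this function
# assumes (the key names are taken from the config.get() calls; the paths are placeholders
# that depend on the local Tor Browser installation):
#
#   [TOR]
#   firefox_binary_path = <path to Tor Browser's firefox binary>
#   firefox_profile_path = <path to the Tor Browser profile directory>
#   geckodriver_path = <path to geckodriver>
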
# The driver 'gets' the url, attempting to reach the site; if it can't, 'down' is returned
# return: the selenium driver, or the string 'down'
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'

# Manual captcha solver: waits for a specific element so that the whole page loads, finds the input box,
# takes a screenshot of the captcha, then allows for manual solving of the captcha in the terminal
# @param: current selenium web driver
def captcha(driver):
    '''
    # wait for the captcha page to show up (for Bohemia it takes A WHILE)
    print("Connecting Bohemia...")
    time.sleep(7.5)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div/div/form/div")))
    input('Bohemia Connected. Press ENTER to continue\n')

    # save the captcha locally
    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div/div").screenshot(r'..\Bohemia\captcha.png')

    # open the captcha image file
    im = Image.open(r'..\Bohemia\captcha.png')

    # show the image in the default image viewer
    im.show()

    # print the link to the console since the captcha requires the link
    print(getFixedURL())

    # locate the captcha input box
    inputBox = driver.find_element(by=By.XPATH, value="/html/body/div/div/form/div/div/input")

    # ask the user to input the captcha solution in the terminal
    userIn = input("Enter solution: ")

    # send the user's solution into the input box
    inputBox.send_keys(userIn)

    # click the verify (submit) button
    driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button[1]').click()
    # im.close()
    '''
    input("Press ENTER when CAPTCHA is completed\n")

    # wait for the next captcha to show up
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div/form")))

    '''
    for square in range(1, 7):
        inputBox = driver.find_element(by=By.XPATH, value=f"/html/body/div/div/form/div[1]/input[{square}]")
        inputBox.click()
        time.sleep(.5)
        # userIn = input("Enter Solution: ")
        # inputBox.send_keys(userIn)

        # take a screenshot every iteration because the captcha changes after each input
        driver.find_element(by=By.XPATH, value="/html/body/div/div/form").screenshot(r'..\Bohemia\captcha1.png')

        # open and crop the image
        im = Image.open(r'..\Bohemia\captcha1.png')
        im = im.crop(((im.width // 2 - 80), (im.height // 2 - 100), (im.width // 2 + 80), (im.height // 2 + 60)))
        im.show()
        # im.close()

        userIn = input("Enter Solution: ")
        inputBox.send_keys(userIn)

    # locate and press the submit button
    driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click()
    # driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button[2]')
    '''
    input("Press ENTER when CAPTCHA is completed\n")

    # wait for the next page to show up
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div[4]/div/div/form/input[1]")))

# Saves the crawled html page, creating the directory path for html pages if it does not exist yet
def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as file:
        file.write(cleanPage.encode('utf-8'))
    return

# Gets the full path of the page to be saved along with its appropriate file name
# @param: raw url as the crawler crawls through every site
def getFullPathName(url):
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
    return fullPath

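# For illustration (with shared_folder and CURRENT_DATE as provided by markets_mining, both of
# which depend on the local configuration), a listing page is written roughly to
#   <shared_folder>/MarketPlaces/Bohemia/HTML_Pages/<CURRENT_DATE>\\Listing\\<fileName>.html
# and a description page goes under the matching \\Description\\ subfolder instead.
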
# Creates the file name from the passed URL by keeping only its alphanumeric characters;
# falls back to a distinct counter value if nothing usable remains
# @param: raw url as the crawler crawls through every site
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name

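# For example, the first category URL from getInterestedLinks() below,
# 'http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?page=1&type=all&catid=95',
# becomes 'httpbohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqdonionlistingspage1typeallcatid95'.
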
# Returns the list of urls of interest; the crawler runs through this list.
# In this example the urls are product-category listing pages, such as
# Malware and Botnets, Exploits, Methods, Exploit kits, and Hacking Software
# (all but the first category are commented out below).
def getInterestedLinks():
    links = []

    # Malware and Botnets
    links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?page=1&type=all&catid=95')
    # # Exploits
    # links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?page=1&type=all&catid=99')
    # # Methods
    # links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?catid=100')
    # # Exploit kits
    # links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?catid=101')
    # # Hacking Software
    # links.append('http://bohemiaobko4cecexkj5xmlaove6yn726dstp5wfw4pojjwp6762paqd.onion/listings?catid=103')

    return links

# Iterates through the links of interest; each link is opened and crawled,
# and both listing and description pages are saved along the way
# @param: selenium driver
def crawlForum(driver):
    print("Crawling the Bohemia Market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    driver.get(link)
                except:
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                # open and save every product (description) page found on this listing page
                productList = productPages(html)
                for item in productList:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    driver.back()

                    # limits the test crawl to one item per page; comment out for a full crawl
                    break

                # limits the test crawl to two listing pages; comment out for a full crawl
                if count == 1:
                    break

                # follow the "Next" pagination link, if any
                try:
                    nav = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div/div[2]/ul')
                    a = nav.find_element(by=By.PARTIAL_LINK_TEXT, value="Next")
                    link = a.get_attribute('href')
                    if link == "":
                        raise NoSuchElementException
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    input("Crawling Bohemia Market done successfully. Press ENTER to continue\n")

# Returns 'True' if the link is a description link
# @param: url of any url crawled
# return: true if it is a description page, false if not
def isDescriptionLink(url):
    if bool(re.search(r'\blisting\b', url)):  # accurate with bohemia
        return True
    return False


# Returns 'True' if the link is a listing page link
# @param: url of any url crawled
# return: true if it is a listing page, false if not
def isListingLink(url):
    if bool(re.search(r'\blistings\b', url)):  # accurate with bohemia
        return True
    return False

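# Note on the two regexes above: because \b requires a word boundary, r'\blisting\b' does not
# match the plural 'listings' used by the category pages in getInterestedLinks(), and
# r'\blistings\b' does not match the singular 'listing', so the two checks never both return
# True for the same URL.
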
# Calls the parser to extract the description links from a listing page;
# the html argument is the page source of a link from the interested-links list
# @param: html page source of a link from getInterestedLinks()
# return: list of description links that should be crawled through
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return bohemia_links_parser(soup)

# Drops links that "signout"
def isSignOut(url):
    # absURL = urlparse.urljoin(url.base_url, url.url)
    if 'signout' in url.lower() or 'logout' in url.lower():
        return True
    return False

def crawler():
    startCrawling()
    # print("Crawling and Parsing Bohemia Market .... DONE!")
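
# A minimal, hypothetical entry point for running this module as a standalone script;
# crawler() is presumably invoked elsewhere in the project, so this guard is only a
# convenience for manual testing:
if __name__ == '__main__':
    crawler()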