This crawler is based on the calsyslab project.


__author__ = '91Shadows'

'''
DarkFox marketplace Crawler
'''

import codecs
import socks, socket, time
from datetime import date
import urllib.parse as urlparse
import http.client as httplib
import mechanize
import os
import subprocess
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.DarkFox.parser import darkfox_links_parser

counter = 1

httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

baseURL = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'

socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)
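
# Note (assumption about the deployment): 9150 is the SOCKS port exposed by the
# Tor Browser bundle, while a standalone tor service usually listens on 9050,
# so the port above may need to change if Tor is not started through Tor Browser.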

# Opens Tor Browser, crawls the mkt
def startCrawling():
    opentor()
    getUrl()
    url = getFixedURL()
    mktName = getMKTName()
    credentials = getCredentials()
    br = getAccess(url, credentials)

    if br != 'down':
        crawlMkt(url, br)
        #new_parse(mktName, False)

    closetor()

# Opens Tor Browser
def opentor():
    global pid

    print("Connecting Tor...")
    path = open('../../path.txt').readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(5)
    input("Tor Connected. Press ENTER to continue\n")
    return

# Creates a connection through Tor Port
def getUrl(timeout=None):
    socket.socket = socks.socksocket
    socket.create_connection = create_connection
    return

# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
    sock = socks.socksocket()
    sock.connect(address)
    return sock
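
# Note: mechanize and http.client open sockets via socket.create_connection, so
# the monkey-patch applied in getUrl() should route every outgoing request
# through the SOCKS socket above; the timeout and source_address parameters are
# accepted only to match the standard-library signature and are ignored here.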

# Returns the name of the mkt (DarkFox)
def getMKTName():
    name = 'DarkFox'
    return name

# Returns credentials needed for the mkt, formatted as
# 'loginFieldName userName passwordFieldName password captchaFieldName formIndex'
def getCredentials():
    credentials = 'blank blank blank blank cap 0'
    return credentials

# Returns the link of the mkt (DarkFox link)
def getFixedURL():
    url = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
    return url

# Closes Tor Browser (Windows-specific: taskkill on the process started in opentor)
def closetor():
    global pid

    os.system("taskkill /pid " + str(pid))
    print('Closing Tor...')
    time.sleep(3)
    return

# Creates a Mechanize browser and initializes its options
def createBrowser():
    br = mechanize.Browser()
    cj = mechanize.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
                     ('Accept', '*/*')]

    return br

# Opens the login page, downloads the captcha image, and submits the login
# form with the captcha answer typed in by the operator
def getAccess(loginPage, credentials):
    logInName = credentials.split()[0]
    userName = credentials.split()[1]
    logInPass = credentials.split()[2]
    password = credentials.split()[3]
    captchaName = credentials.split()[4]
    formId = credentials.split()[5]

    br = createBrowser()

    try:
        keepTrying = True
        while keepTrying:
            br.open(loginPage)
            time.sleep(7)
            html = br.response()
            soup = BeautifulSoup(html, "html.parser")
            image_tags = soup.findAll('div', {"class": "imgWrap"})
            captchaLink = image_tags[0]
            imagelink = captchaLink['style'].split('url(')[1][:-1]
            data = br.open(imagelink).read()
            br.back()
            open('captcha.png', "wb").write(data)

            '''
            subprocess.Popen("python capt.py", shell=False)
            time.sleep(61)
            captchaAnswerFile = open("answer.txt", "r")
            captchaAnswer = captchaAnswerFile.read().__str__()
            '''

            captchaAnswer = input('Please provide me with captcha : ')
            formIndex = int(formId)
            br.select_form(nr=formIndex)
            #br[logInName] = userName
            #br[logInPass] = password
            br[captchaName] = str(captchaAnswer)
            br.submit()

            if br.geturl() != 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/':
                keepTrying = False

        return br

    except:
        return 'down'

# Saves the crawled html page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    a = page.read()
    open(filePath, "wb").write(a)
    return

# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
    else:
        fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
    return fullPath
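
# Resulting layout: pages are written under HTML_Pages\<MMDDYYYY>\Description
# (product pages) or HTML_Pages\<MMDDYYYY>\Listing (category pages), with the
# file name produced by getNameFromURL(url).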

# Creates the name of the file based on URL
def getNameFromURL(url):
    global counter

    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
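
# Example: getNameFromURL('http://x.onion/category/1') returns 'httpxonioncategory1';
# a URL with no alphanumeric characters falls back to the global counter value.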

# Hacking and Markets related topics
def getInterestedLinks():
    links = []

    # Guides and Tutorials
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
    # Digital Products
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
    # Software and Malware
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
    # Services
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
    # Miscellaneous
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
    # Hosting and Security
    links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')

    # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing1.html')
    # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing2.html')

    return links

# Crawls each listing category, saves the listing pages, and downloads a sample
# of product (description) pages linked from each listing
def crawlMkt(url, br):
    print("Crawling the DarkFox marketplace")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            savePage(page, link)

            for l in br.links():
                absURL = urlparse.urljoin(l.base_url, l.url)
                if absURL not in visited and not isSignOut(absURL) and isListingLink(absURL):
                    visited.add(absURL)
                    # disabling the process of finding other links
                    # linksToCrawl.append(absURL)

            # crawler asks parser to get links of ALL products on ALL listing pages
            productLinks = productPages(link)
            j = 0
            for item in productLinks:
                if j == 2:
                    break
                # itemURL = baseURL + str(item)
                try:
                    # itemPage = br.open(itemURL)
                    itemPage = br.open(item)
                    savePage(itemPage, item)
                except:
                    # print('Error in page: ', itemURL)
                    print('Error in page: ', item)
                j += 1

        except Exception as e:
            print(link, str(e))
        i += 1

    # finalTime = time.time()
    # print(finalTime - initialTime)

    input("Crawling DarkFox marketplace done successfully. Press ENTER to continue\n")
    return

# Returns True if the link is a description (product) page link
def isDescriptionLink(url):
    if 'product' in url:
        return True
    return False

# Returns True if the link is a listingPage link
def isListingLink(url):
    if 'category' in url:
        return True
    return False

# Calls the parser to extract the product links from the saved listing page
def productPages(url):
    soup = ""
    error = False

    try:
        html = codecs.open(
            r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
                "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
                "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html', encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
    except:
        try:
            html = open(
                r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
                    "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
                    "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html')
            soup = BeautifulSoup(html, "html.parser")
        except:
            error = True
            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")

    if error:
        return []
    else:
        return darkfox_links_parser(soup)

# Drops links that "signout" or "logout"
def isSignOut(url):
    # absURL = urlparse.urljoin(url.base_url, url.url)
    if 'signout' in url.lower() or 'logout' in url.lower():
        return True
    return False

def crawler():
    startCrawling()
    # print("Crawling and Parsing DarkFox .... DONE!")
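
# Minimal entry point, a sketch added for standalone testing (the project's own
# driver code is assumed to call crawler() itself):
if __name__ == '__main__':
    crawler()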