__author__ = '91Shadows'
'''
CryptBB Crawler (Mechanize)
Based on the CALSysLab project.
'''

import codecs, os, re
import socks, socket, time
from datetime import date
import urllib.parse as urlparse
import http.client as httplib
import mechanize
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.BestCardingWorld.parser import bestcardingworld_links_parser

counter = 1
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'

socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)
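# Note: the HTTP/1.0 downgrade above is presumably meant to avoid keep-alive /
# chunked-response issues over the SOCKS-wrapped socket. Port 9150 is the Tor
# Browser's default SOCKS5 port; a standalone tor daemon usually listens on 9050.
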

# Opens Tor Browser, crawls the website
def startCrawling():
    opentor()
    getUrl()
    forumName = getForumName()
    br = getAccess()

    if br != 'down':
        crawlForum(br)
        new_parse(forumName, False)

    closetor()

# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    path = open('../../path.txt').readline().strip()  # strip the trailing newline from the path
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(7.5)
    input("Tor Connected. Press ENTER to continue\n")
    return

# Creates a connection through Tor Port
def getUrl(timeout=None):
    socket.socket = socks.socksocket
    socket.create_connection = create_connection
    return

# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
    # timeout and source_address only mirror the signature of
    # socket.create_connection; the SOCKS socket ignores them here
    sock = socks.socksocket()
    sock.connect(address)
    return sock

# Returns the name of the website
def getForumName():
    name = 'CryptBB'
    return name


# Returns the link of the website
def getFixedURL():
    url = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'
    return url

# Closes Tor Browser
def closetor():
    global pid
    os.system("taskkill /pid " + str(pid))
    print('Closing Tor...')
    time.sleep(3)
    return

# Creates a Mechanize browser and initializes its options
def createBrowser():
    br = mechanize.Browser()
    cj = mechanize.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
                     ('Accept', '*/*')]
    return br

def getAccess():
    url = getFixedURL()
    br = createBrowser()
    try:
        br.open(url)
        return br
    except Exception:
        return 'down'

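# A minimal retry wrapper (a sketch, not part of the original project): onion
# services often fail transiently, so retrying getAccess() a few times before
# giving up can make the crawl more robust. The name and parameters below are
# hypothetical.
def getAccessWithRetries(attempts=3, delay=5):
    for _ in range(attempts):
        br = getAccess()
        if br != 'down':
            return br
        time.sleep(delay)  # back off before the next attempt
    return 'down'
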
# Saves the crawled html page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    a = page.read()
    with open(filePath, "wb") as f:  # close the handle once the page is written
        f.write(a)
    return

# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    # base directory for saved pages; it must match the directory that
    # findDescriptionPages() reads the listing pages back from
    baseDir = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/CryptBB/HTML_Pages/' + \
              date.today().strftime("%m%d%Y") + '/'
    if isDescriptionLink(url):
        fullPath = baseDir + 'Description/' + fileName + '.html'
    else:
        fullPath = baseDir + 'Listing/' + fileName + '.html'
    return fullPath

# Creates the name of the file based on URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name

# Hacking and Markets related topics
def getInterestedLinks():
    links = []
    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43&sid=e12864ffccc5df877b03b573534955be')
    return links

# Start crawling Forum pages
def crawlForum(br):
    print("Crawling CryptBB forum")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)  # collected but never consulted (see note below)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            savePage(page, link)

            # queue the next listing page, if the forum exposes one
            res = br.response().read()
            soup = BeautifulSoup(res, 'html.parser')
            next_link = soup.find("a", {"rel": "next"})
            if next_link is not None:
                full_url = urlparse.urljoin(linksToCrawl[i], next_link['href'])
                linksToCrawl.insert(i + 1, full_url)

            # download every topic (description) page found on this listing page
            listOfTopics = findDescriptionPages(link)
            for topic in listOfTopics:
                itemPage = br.open(str(topic))
                savePage(itemPage, topic)
        except Exception as e:
            print('Error getting link: ', link, e)
        i += 1

    # finalTime = time.time()
    # print(finalTime - initialTime)

    input("CryptBB forum done successfully. Press ENTER to continue\n")
    return

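# A possible refinement (a sketch, not in the original code): the visited set in
# crawlForum() is populated but never checked, so a forum whose "next" link cycles
# back to an earlier page would be re-queued forever. A guard like this would
# prevent that:
#
#     if next_link is not None:
#         full_url = urlparse.urljoin(link, next_link['href'])
#         if full_url not in visited:
#             visited.add(full_url)
#             linksToCrawl.insert(i + 1, full_url)
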
# Returns True if the link is a 'Topic' link; may need to change for different websites
def isDescriptionLink(url):
    if 'topic' in url:
        return True
    return False


# Returns True if the link is a listing page link; may need to change for different websites
def isListingLink(url):
    '''
    reg = 'board=[0-9]+.[0-9]+\Z'
    if len(re.findall(reg, url)) == 0:
        return False
    return True
    '''
    if 'forum' in url:
        return True
    return False

# Calls the parser to define the links
def findDescriptionPages(url):
    soup = ""
    error = False
    # read the listing page back from the same directory savePage() wrote it to
    listingPath = "C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\CryptBB\\HTML_Pages\\" + \
                  date.today().strftime("%m%d%Y") + "\\Listing\\" + getNameFromURL(url) + ".html"
    try:
        html = codecs.open(listingPath, encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
    except Exception:
        try:
            html = open(listingPath)
            soup = BeautifulSoup(html, "html.parser")
        except Exception:
            error = True
            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")

    if not error:
        return bestcardingworld_links_parser(soup)
    else:
        return []

def crawler():
    startCrawling()
    print("Crawling and Parsing CryptBB .... DONE!")
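

# Minimal entry point, assuming the module is run directly rather than driven
# by the wider project:
if __name__ == '__main__':
    crawler()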