__author__ = '91Shadows'

'''
CryptBB Crawler (Mechanize)
'''

import codecs, os, re
import socks, socket, time
from datetime import date

import urllib.parse as urlparse
import http.client as httplib
import mechanize
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.BestCardingWorld.parser import bestcardingworld_links_parser

counter = 1

# Force HTTP/1.0 so connections are not kept alive between requests
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'

# Route SOCKS sockets through Tor Browser's SOCKS5 port (9150 by default;
# a standalone tor daemon usually listens on 9050 instead)
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)
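# Note: setdefaultproxy only configures sockets created as socks.socksocket;
# ordinary sockets still bypass Tor until getUrl() swaps socket.socket below.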


# Opens Tor Browser, crawls the website
def startCrawling():
    opentor()
    getUrl()
    forumName = getForumName()
    br = getAccess()

    if br != 'down':
        crawlForum(br)
        new_parse(forumName, False)

    closetor()


# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    # strip the trailing newline so the path is passed to Popen cleanly
    path = open('../../path.txt').readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(7.5)
    input("Tor Connected. Press ENTER to continue\n")
    return


# Creates a connection through the Tor port
def getUrl(timeout=None):
    # Replace the standard socket machinery so every outgoing connection
    # is tunneled through the SOCKS proxy configured above
    socket.socket = socks.socksocket
    socket.create_connection = create_connection
    return


# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
    sock = socks.socksocket()
    sock.connect(address)
    return sock
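
# Both patches above are needed: http.client (which mechanize uses) opens
# connections via socket.create_connection rather than socket.socket, so
# replacing only the socket class would leave those requests outside the Tor
# tunnel. The replacement create_connection ignores timeout/source_address;
# the SOCKS handshake is handled by socks.socksocket via the default proxy.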


# Returns the name of the website
def getForumName():
    name = 'CryptBB'
    return name


# Returns the link of the website
def getFixedURL():
    url = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'

    return url


# Closes Tor Browser
def closetor():
    global pid
    print('Closing Tor...')
    # taskkill is Windows-specific; this module assumes a Windows host
    os.system("taskkill /pid " + str(pid))
    time.sleep(3)
    return


# Creates a Mechanize browser and initializes its options
def createBrowser():
    br = mechanize.Browser()
    cj = mechanize.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
                     ('Accept', '*/*')]

    return br


def getAccess():
    url = getFixedURL()
    br = createBrowser()

    try:
        br.open(url)
        return br
    except Exception:
        return 'down'


# Saves the crawled html page; takes the raw bytes so each response is read
# exactly once by the caller
def savePage(html, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, "wb") as file:
        file.write(html)
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    # Store pages under this crawler's own forum directory; the original
    # pointed at ThiefWorld, which findDescriptionPages never read back
    baseDir = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/CryptBB/HTML_Pages/'
    dateDir = date.today().strftime('%m%d%Y')
    if isDescriptionLink(url):
        fullPath = baseDir + dateDir + '/Description/' + fileName + '.html'
    else:
        fullPath = baseDir + dateDir + '/Listing/' + fileName + '.html'
    return fullPath
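
# For example, a listing URL crawled on a hypothetical date 06/15/2023 is
# saved as .../Forums/CryptBB/HTML_Pages/06152023/Listing/<alnum-url>.html,
# while topic pages go under .../06152023/Description/ instead.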


# Creates the name of the file based on URL
def getNameFromURL(url):
    global counter
    # keep only alphanumeric characters so the URL is a valid file name
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


# Hacking and Markets related topics
def getInterestedLinks():
    links = []

    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43&sid=e12864ffccc5df877b03b573534955be')

    return links
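
# Note: the 'sid' query parameter appears to be a phpBB session id tied to
# the browsing session that generated it; a stale sid may be rejected or
# reissued by the forum, so these seed links likely need refreshing per run.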


# Start crawling Forum pages
def crawlForum(br):
    print("Crawling CryptBB forum")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            html = page.read()
            savePage(html, link)

            soup = BeautifulSoup(html, 'html.parser')

            # queue the next listing page, if any, right after the current one
            next_link = soup.find("a", {"rel": "next"})
            if next_link is not None:
                full_url = urlparse.urljoin(link, next_link['href'])
                if full_url not in visited:
                    visited.add(full_url)
                    linksToCrawl.insert(i + 1, full_url)

            # fetch every topic (description) page found on this listing
            listOfTopics = findDescriptionPages(link)
            for topic in listOfTopics:
                itemPage = br.open(str(topic))
                savePage(itemPage.read(), topic)

        except Exception as e:
            print('Error getting link: ', link, e)
        i += 1

    print("Crawl took", time.time() - initialTime, "seconds")

    input("CryptBB forum done successfully. Press ENTER to continue\n")

    return
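
# Note on traversal order: inserting each rel="next" page at position i + 1
# walks a seed's pagination to the end before moving on to the next seed
# link, i.e. a depth-first walk along the pagination chain.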


# Returns True if the link is a 'Topic' link, may need to change for diff websites
def isDescriptionLink(url):
    if 'topic' in url:
        return True
    return False


# Returns True if the link is a listingPage link, may need to change for diff websites
def isListingLink(url):
    '''
    reg = 'board=[0-9]+.[0-9]+\Z'
    if len(re.findall(reg, url)) == 0:
        return False
    return True
    '''
    if 'forum' in url:
        return True
    return False


# Calls the parser to extract topic links from the saved listing page
def findDescriptionPages(url):
    soup = ""

    error = False
    try:
        # read the listing page back from the same directory savePage wrote to
        html = codecs.open(
            "C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\CryptBB\\HTML_Pages\\" +
            date.today().strftime('%m%d%Y') + "\\Listing\\" + getNameFromURL(url) + ".html", encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
    except Exception:
        try:
            html = open(
                "C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\CryptBB\\HTML_Pages\\" +
                date.today().strftime('%m%d%Y') + "\\Listing\\" + getNameFromURL(url) + ".html")
            soup = BeautifulSoup(html, "html.parser")
        except Exception:
            error = True
            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")

    if not error:
        return bestcardingworld_links_parser(soup)
    else:
        return []
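
# Design note: findDescriptionPages re-reads the listing page that crawlForum
# just saved to disk instead of reusing the in-memory soup, keeping link
# extraction decoupled from the crawl at the cost of a second read. The link
# parser itself is borrowed from the BestCardingWorld module; a
# CryptBB-specific parser would presumably replace it once one exists.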


def crawler():
    startCrawling()
    print("Crawling and Parsing CryptBB .... DONE!")