__author__ = '91Shadows'

'''
BestCardingWorld Crawler (Mechanize)
'''

import codecs, os, re
import socks, socket, time
from datetime import date
import urllib.parse as urlparse
import http.client as httplib
import mechanize
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.BestCardingWorld.parser import bestcardingworld_links_parser

counter = 1

# Force HTTP/1.0 so connections are not kept alive across the Tor proxy
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'

# Route all socket traffic through the Tor SOCKS5 proxy (Tor Browser listens on port 9150)
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)


# Opens Tor Browser, crawls the website
def startCrawling():
    opentor()
    getUrl()
    forumName = getForumName()
    br = getAccess()

    if br != 'down':
        crawlForum(br)
        new_parse(forumName, False)

    closetor()


# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    path = open('../../path.txt').readline()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(7.5)
    input("Tor Connected. Press ENTER to continue\n")
    return


# Creates a connection through the Tor port by monkey-patching the socket module
def getUrl(timeout=None):
    socket.socket = socks.socksocket
    socket.create_connection = create_connection
    return


# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
    sock = socks.socksocket()
    sock.connect(address)
    return sock


# Returns the name of the website
def getForumName():
    name = 'BestCardingWorld'
    return name


# Returns the link of the website
def getFixedURL():
    url = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'
    return url


# Closes Tor Browser
def closetor():
    global pid
    os.system("taskkill /pid " + str(pid))
    print('Closing Tor...')
    time.sleep(3)
    return


# Creates a Mechanize browser and initializes its options
def createBrowser():
    br = mechanize.Browser()
    cj = mechanize.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    br.addheaders = [('User-agent',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
                     ('Accept', '*/*')]

    return br


# Opens the forum's fixed URL; returns the browser on success, 'down' on any failure
def getAccess():
    url = getFixedURL()
    br = createBrowser()
    try:
        br.open(url)
        return br
    except:
        return 'down'


# Saves the crawled html page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    a = page.read()
    open(filePath, "wb").write(a)
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/BestCardingWorld/HTML_Pages/' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + '/' + 'Description/' + fileName + '.html'
    else:
        fullPath = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/BestCardingWorld/HTML_Pages/' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + '/' + 'Listing/' + fileName + '.html'
    return fullPath


# Creates the name of the file based on the URL (non-alphanumerics stripped);
# falls back to a global counter when nothing alphanumeric is left
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
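# A minimal illustration of the naming scheme above (illustrative only, never
# called by the crawler): with an assumed run date of March 5, 2024, a listing
# URL such as 'http://example.onion/viewforum.php?f=43' is stripped to
# 'httpexampleonionviewforumphpf43' and would be saved as
#   .../HTML_Pages/03052024/Listing/httpexampleonionviewforumphpf43.html
# while topic ('description') pages land under .../Description/ instead.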
# Hacking and Markets related topics
def getInterestedLinks():
    links = []
    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43&sid=e12864ffccc5df877b03b573534955be')
    return links


# Start crawling Forum pages
def crawlForum(br):
    print("Crawling The Best Carding World forum")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            savePage(page, link)

            # re-read the response (br.response() returns a copy) to look for
            # the 'next' pagination link and queue it right after this page
            res = br.response().read()
            soup = BeautifulSoup(res, 'html.parser')
            next_link = soup.find("a", {"rel": "next"})
            if next_link is not None:
                full_url = urlparse.urljoin(linksToCrawl[i], next_link['href'])
                linksToCrawl.insert(i + 1, full_url)

            # visit every topic (description) page found on this listing page
            listOfTopics = findDescriptionPages(link)
            for topic in listOfTopics:
                itemPage = br.open(str(topic))
                savePage(itemPage, topic)

        except Exception as e:
            print('Error getting link: ', link, e)
        i += 1

    # finalTime = time.time()
    # print(finalTime - initialTime)

    input("Crawling The Best Carding World forum done successfully. Press ENTER to continue\n")
    return


# Returns True if the link is a 'Topic' (description) link
def isDescriptionLink(url):
    if 'topic' in url:
        return True
    return False


# Returns True if the link is a listing page link
def isListingLink(url):
    '''
    reg = 'board=[0-9]+.[0-9]+\Z'
    if len(re.findall(reg, url)) == 0:
        return False
    return True
    '''
    if 'forum' in url:
        return True
    return False


# Calls the parser to extract the topic links from a saved listing page
def findDescriptionPages(url):
    soup = ""
    error = False

    try:
        html = codecs.open(
            "C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" + str(
                "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
                "%04d" % date.today().year) + "\\Listing\\" + getNameFromURL(url) + ".html", encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
    except:
        # fall back to the platform default encoding if the utf8 read fails
        try:
            html = open(
                "C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" + str(
                    "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
                    "%04d" % date.today().year) + "\\Listing\\" + getNameFromURL(url) + ".html")
            soup = BeautifulSoup(html, "html.parser")
        except:
            error = True
            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")

    if not error:
        return bestcardingworld_links_parser(soup)
    else:
        return []


def crawler():
    startCrawling()
    print("Crawling and Parsing The Best Carding World .... DONE!")
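
# A minimal entry point, assuming this module is meant to be run directly as a
# script; in the surrounding project, crawler() may instead be invoked by a
# driver module, so this guard is only a sketch.
if __name__ == '__main__':
    crawler()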