__author__ = '91Shadows'

'''
CryptBB Crawler (Mechanize)
'''

import codecs, os, re
import socks, socket, time
from datetime import date

import urllib.parse as urlparse
import http.client as httplib
import mechanize
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.BestCardingWorld.parser import bestcardingworld_links_parser

counter = 1

# Force HTTP/1.0 so connections are not kept alive between requests
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'

# Route SOCKS sockets through Tor Browser's SOCKS5 port (9150 by default;
# a standalone tor daemon usually listens on 9050 instead)
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)
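# Note: setdefaultproxy only configures sockets created as socks.socksocket;
# ordinary sockets still bypass Tor until getUrl() swaps socket.socket below.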


# Opens Tor Browser, crawls the website
def startCrawling():
    opentor()
    getUrl()
    forumName = getForumName()
    br = getAccess()

    if br != 'down':
        crawlForum(br)
        new_parse(forumName, False)

    closetor()


# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    # strip the trailing newline so the path is passed to Popen cleanly
    path = open('../../path.txt').readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(7.5)
    input("Tor Connected. Press ENTER to continue\n")
    return


# Creates a connection through the Tor port
def getUrl(timeout=None):
    # Replace the standard socket machinery so every outgoing connection
    # is tunneled through the SOCKS proxy configured above
    socket.socket = socks.socksocket
    socket.create_connection = create_connection
    return


# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
    sock = socks.socksocket()
    sock.connect(address)
    return sock
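
# Both patches above are needed: http.client (which mechanize uses) opens
# connections via socket.create_connection rather than socket.socket, so
# replacing only the socket class would leave those requests outside the Tor
# tunnel. The replacement create_connection ignores timeout/source_address;
# the SOCKS handshake is handled by socks.socksocket via the default proxy.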


# Returns the name of the website
def getForumName():
    name = 'CryptBB'
    return name


# Returns the link of the website
def getFixedURL():
    url = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'

    return url


# Closes Tor Browser
def closetor():
    global pid
    print('Closing Tor...')
    # taskkill is Windows-specific; this module assumes a Windows host
    os.system("taskkill /pid " + str(pid))
    time.sleep(3)
    return


# Creates a Mechanize browser and initializes its options
def createBrowser():
    br = mechanize.Browser()
    cj = mechanize.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
                     ('Accept', '*/*')]

    return br


def getAccess():
    url = getFixedURL()
    br = createBrowser()

    try:
        br.open(url)
        return br
    except Exception:
        return 'down'


# Saves the crawled html page; takes the raw bytes so each response is read
# exactly once by the caller
def savePage(html, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, "wb") as file:
        file.write(html)
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    # Store pages under this crawler's own forum directory; the original
    # pointed at ThiefWorld, which findDescriptionPages never read back
    baseDir = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/CryptBB/HTML_Pages/'
    dateDir = date.today().strftime('%m%d%Y')
    if isDescriptionLink(url):
        fullPath = baseDir + dateDir + '/Description/' + fileName + '.html'
    else:
        fullPath = baseDir + dateDir + '/Listing/' + fileName + '.html'
    return fullPath
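
# For example, a listing URL crawled on a hypothetical date 06/15/2023 is
# saved as .../Forums/CryptBB/HTML_Pages/06152023/Listing/<alnum-url>.html,
# while topic pages go under .../06152023/Description/ instead.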


# Creates the name of the file based on URL
def getNameFromURL(url):
    global counter
    # keep only alphanumeric characters so the URL is a valid file name
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


# Hacking and Markets related topics
def getInterestedLinks():
    links = []

    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43&sid=e12864ffccc5df877b03b573534955be')

    return links
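
# Note: the 'sid' query parameter appears to be a phpBB session id tied to
# the browsing session that generated it; a stale sid may be rejected or
# reissued by the forum, so these seed links likely need refreshing per run.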


# Start crawling Forum pages
def crawlForum(br):
    print("Crawling CryptBB forum")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            html = page.read()
            savePage(html, link)

            soup = BeautifulSoup(html, 'html.parser')

            # queue the next listing page, if any, right after the current one
            next_link = soup.find("a", {"rel": "next"})
            if next_link is not None:
                full_url = urlparse.urljoin(link, next_link['href'])
                if full_url not in visited:
                    visited.add(full_url)
                    linksToCrawl.insert(i + 1, full_url)

            # fetch every topic (description) page found on this listing
            listOfTopics = findDescriptionPages(link)
            for topic in listOfTopics:
                itemPage = br.open(str(topic))
                savePage(itemPage.read(), topic)

        except Exception as e:
            print('Error getting link: ', link, e)
        i += 1

    print("Crawl took", time.time() - initialTime, "seconds")

    input("CryptBB forum done successfully. Press ENTER to continue\n")

    return
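
# Note on traversal order: inserting each rel="next" page at position i + 1
# walks a seed's pagination to the end before moving on to the next seed
# link, i.e. a depth-first walk along the pagination chain.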


# Returns True if the link is a 'Topic' link, may need to change for diff websites
def isDescriptionLink(url):
    if 'topic' in url:
        return True
    return False


# Returns True if the link is a listingPage link, may need to change for diff websites
def isListingLink(url):
    '''
    reg = 'board=[0-9]+.[0-9]+\Z'
    if len(re.findall(reg, url)) == 0:
        return False
    return True
    '''
    if 'forum' in url:
        return True
    return False


# Calls the parser to extract topic links from the saved listing page
def findDescriptionPages(url):
    soup = ""

    error = False
    try:
        # read the listing page back from the same directory savePage wrote to
        html = codecs.open(
            "C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\CryptBB\\HTML_Pages\\" +
            date.today().strftime('%m%d%Y') + "\\Listing\\" + getNameFromURL(url) + ".html", encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
    except Exception:
        try:
            html = open(
                "C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\CryptBB\\HTML_Pages\\" +
                date.today().strftime('%m%d%Y') + "\\Listing\\" + getNameFromURL(url) + ".html")
            soup = BeautifulSoup(html, "html.parser")
        except Exception:
            error = True
            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")

    if not error:
        return bestcardingworld_links_parser(soup)
    else:
        return []
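
# Design note: findDescriptionPages re-reads the listing page that crawlForum
# just saved to disk instead of reusing the in-memory soup, keeping link
# extraction decoupled from the crawl at the cost of a second read. The link
# parser itself is borrowed from the BestCardingWorld module; a
# CryptBB-specific parser would presumably replace it once one exists.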


def crawler():
    startCrawling()
    print("Crawling and Parsing CryptBB .... DONE!")