
darknetarmy scraper completed

main
Helium 8 months ago
parent
commit
1808a03ee3
2 changed files with 544 additions and 0 deletions
  1. +282 -0  Forums/DarkNetArmy/crawler_selenium.py
  2. +262 -0  Forums/DarkNetArmy/parser.py

+282 -0  Forums/DarkNetArmy/crawler_selenium.py

@@ -0,0 +1,282 @@
__author__ = 'DarkWeb'

'''
DarkNetArmy Forum Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By

import urllib.parse as urlparse
import os, time
from datetime import date
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.DarkNetArmy.parser import darknetarmy_links_parser
from Forums.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://darknet77vonbqeatfsnawm5jtnoci5z22mxay6cizmoucgmz52mwyad.onion/'
# Opens Tor Browser, crawls the website, then parses the pages, then closes Tor.
# Acts as the main method for the crawler; the crawler() function at the end of this file calls it.
def startCrawling():
    forumName = getForumName()
    # driver = getAccess()
    #
    # if driver != 'down':
    #     try:
    #         crawlForum(driver)
    #     except Exception as e:
    #         print(driver.current_url, e)
    #     closeDriver(driver)

    new_parse(forumName, baseURL, True)
# Returns the name of the website
#return: name of site in string type
def getForumName():
    name = 'DarkNetArmy'
    return name


# Returns the base link of the website
#return: url of base site in string type
def getFixedURL():
    url = 'http://darknet77vonbqeatfsnawm5jtnoci5z22mxay6cizmoucgmz52mwyad.onion/'
    return url


# Closes Tor Browser
#@param: current selenium driver
def closeDriver(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return
# Creates the FireFox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver():
    from Forums.Initialization.forums_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)  # might need to turn off
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)  # Tor Browser's default SOCKS port (a standalone tor daemon listens on 9050)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", True)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
    driver.maximize_window()

    return driver
#the driver 'gets' the url, attempting to access the site; if it can't, it returns 'down'
#return: the selenium driver, or the string 'down'
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'
# Saves the crawled html page, making the directory path for html pages if not yet made
def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as file:
        file.write(cleanPage.encode('utf-8'))
    return
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
    from Forums.Initialization.forums_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    # use os.path.join throughout so the separators are correct on any OS
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Description', fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Listing', fileName + '.html')
    return fullPath
# Creates the file name from the passed URL; falls back to a distinct counter-based name
# if nothing is left of the URL after cleaning
#@param: raw url as crawler crawls through every site
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
# Returns the list of urls of interest that the crawler runs through
#in this example, the threads fall under a couple of categories such as
#rats/malware and hacking tutorials
def getInterestedLinks():
    links = []

    # rats, malware, ransomware
    links.append('http://darknet77vonbqeatfsnawm5jtnoci5z22mxay6cizmoucgmz52mwyad.onion/forums/rats-malwares-key-loggers-ransomware-tools.24/')
    # hacking tutorials
    links.append('http://darknet77vonbqeatfsnawm5jtnoci5z22mxay6cizmoucgmz52mwyad.onion/forums/hacking-cracking-tutorials-courses-methods.45/')
    # # hacking
    # links.append('http://darknet77vonbqeatfsnawm5jtnoci5z22mxay6cizmoucgmz52mwyad.onion/forums/hacking-cracking-tools-apps.21/')

    return links
# gets links of interest to crawl through, iterates through the list, where each link is visited and crawled
#topic and description pages are crawled through here, and both types of pages are saved
#@param: selenium driver
def crawlForum(driver):
    print("Crawling the DarkNetArmy forum")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    driver.get(link)
                except:
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                topics = topicPages(html)
                for topic in topics:
                    has_next_topic_page = True
                    counter = 1
                    page = topic

                    while has_next_topic_page:
                        itemURL = urlparse.urljoin(baseURL, str(page))
                        try:
                            driver.get(itemURL)
                        except:
                            driver.refresh()

                        if isListingLink(driver.current_url):
                            break

                        savePage(driver, driver.page_source, topic + f"page{counter}")  # very important

                        # comment out
                        # if counter == 2:
                        #     break

                        try:
                            # only advance 'page' here; 'link' must keep pointing at the listing page
                            page = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                            if page == "":
                                raise NoSuchElementException
                            counter += 1
                        except NoSuchElementException:
                            has_next_topic_page = False

                    # making sure we go back to the listing page (browser back button simulation)
                    try:
                        driver.get(link)
                    except:
                        driver.refresh()

                # comment out
                # break

                # comment out
                # if count == 1:
                #     break

                try:
                    link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                    if link == "":
                        raise NoSuchElementException
                    count += 1
                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the DarkNetArmy forum done.")
# Returns 'True' if the link is a description link
#@param: url of any url crawled
#return: true if is a description page, false if not
def isDescriptionLink(url):
    if 'threads' in url:
        return True
    return False


# Returns True if the link is a listingPage link
#@param: url of any url crawled
#return: true if is a Listing page, false if not
def isListingLink(url):
    if 'forums' in url:
        return True
    return False
# calls the parser to find the description links on a listing page
#@param: html of a listing page taken from the interested link list
#return: list of description links that should be crawled through
def topicPages(html):
    soup = BeautifulSoup(html, "html.parser")
    #print(soup.find('div', {"class": "forumbg"}).find('ul', {"class": "topiclist topics"}).find('li', {"class": "row bg1"}).find('a', {"class": "topictitle"}, href=True))
    return darknetarmy_links_parser(soup)


def crawler():
    startCrawling()
    # print("Crawling and Parsing DarkNetArmy .... DONE!")

+262 -0  Forums/DarkNetArmy/parser.py

@@ -0,0 +1,262 @@
__author__ = 'DarkWeb'

import re

# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
#takes the html of a description page as a soup object and parses it for the info it needs,
#storing that info in separate lists, which are organized and returned together
#@param: soup object looking at html page of description page
#return: 'row' that contains a variety of lists that each hold info on the description page
def darknetarmy_description_parser(soup):
    # Fields to be parsed
    topic = "-1"      # 0 topic name
    user = []         # 1 all users of each post
    status = []       # 2 all users' authority in each post such as (adm, member, dangerous)
    reputation = []   # 3 all users' karma in each post (usually found as a number)
    interest = []     # 4 all users' interests in each post
    sign = []         # 5 all users' signatures in each post (usually a standard message after the content of the post)
    post = []         # 6 all messages of each post
    feedback = []     # 7 all feedback of each user (this was found in just one Forum and with a number format)
    addDate = []      # 8 all dates of each post
    image_user = []   # 9 all user avatars of each post
    image_post = []   # 10 all first images of each post

    # Finding the topic (should be just one coming from the Listing Page)
    topic = soup.find("h1", {"class": "p-title-value"})
    topic = topic.text
    topic = topic.replace(",", "")
    topic = topic.replace("\n", "")
    topic = cleanString(topic.strip())

    # Finding the repeated tag that corresponds to the listing of posts
    # posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \
    #         soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"})
    posts = soup.findAll('article', class_=re.compile("message message--post js-post js-inlineModContainer.*"))

    # For each message (post), get all the fields we are interested in:
    for ipost in posts:
        # Finding the first level of the HTML page
        #post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"})
        post_wrapper = ipost.find('div', {"class": "message-inner"})

        # Finding the author (user) of the post
        author = post_wrapper.find('div', {'class': 'message-userName'}).find('h4').text
        user.append(cleanString(author))  # Remember to clean the problematic characters

        # Finding the status of the author
        try:
            membergroup = post_wrapper.find('h5', {'class': 'userTitle message-userTitle'}).text
        except:
            membergroup = '-1'
        status.append(cleanString(membergroup))

        # reputation
        temp = post_wrapper.find('div', {'class': 'message-userExtras'}).find_all('dl')
        rep = temp[2].find('dd').text
        if 'K' in rep or 'k' in rep:
            rep = rep.replace('K', '000').replace('k', '000')
        reputation.append(rep)

        # not available on this forum
        interest.append('-1')
        sign.append('-1')
        feedback.append('-1')
        image_post.append('-1')

        try:
            message = post_wrapper.find('article', {'class': 'message-body js-selectToQuote'}).text
            message = cleanString(message.strip())
        except:
            message = post_wrapper.find('div', {'class': 'message-content js-messageContent'}).text
            message = cleanString(message.strip())
        post.append(message)

        time = post_wrapper.find('ul', class_=re.compile(r'message-attribution-main listInline.*')).find('time').text
        if ',' in time:
            time = time.replace(',', '')
        if 'today' in time:
            time = datetime.today().strftime('%Y-%m-%d')
        elif 'at' in time or 'AM' in time or 'PM' in time:
            # relative timestamps such as "Sunday at 3:14 PM": map the weekday name
            # onto the matching date of the current (Sunday-start) week
            today = datetime.today()
            start_of_week = today - timedelta(days=(today.weekday() + 1) % 7)
            days_mapping = {
                'Sunday': start_of_week,
                'Monday': start_of_week + timedelta(days=1),
                'Tuesday': start_of_week + timedelta(days=2),
                'Wednesday': start_of_week + timedelta(days=3),
                'Thursday': start_of_week + timedelta(days=4),
                'Friday': start_of_week + timedelta(days=5),
                'Saturday': start_of_week + timedelta(days=6),
            }
            for day, date in days_mapping.items():
                if day in time:
                    time = date.strftime('%Y-%m-%d')
                    break
        addDate.append(time)

        try:
            image = post_wrapper.find('div', {'class': 'message-avatar'}).find('img').get('src').split('base64,')[-1]
        except:
            image = '-1'
        image_user.append(image)

    # Populate the final variable (this should be a list with all fields scraped)
    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)

    # Sending the results
    return row
# This is the method to parse the Listing Pages (one page with many topics)
#takes the html of a listing page as a soup object and parses it for the info it needs,
#storing that info in separate lists, which are organized and returned together
#@param: soup object looking at html page of listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def darknetarmy_listing_parser(soup):
    nm = 0                   # *this variable should receive the number of topics
    forum = "DarkNetArmy"    # 0 *forum name
    board = "-1"             # 1 *board name (the previous level of the topic in the Forum categorization tree.
                             # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
    author = []              # 2 *all authors of each topic
    topic = []               # 3 *all topics
    views = []               # 4 number of views of each topic
    posts = []               # 5 number of posts of each topic
    href = []                # 6 this variable should receive all cleaned urls (we will use this to do the merge between
                             # Listing and Description pages)
    addDate = []             # 7 when the topic was created (difficult to find)
    image_author = []        # 8 all author avatars used in each topic

    # Finding the board (should be just one)
    board = soup.find('h1', {"class": "p-title-value"}).text
    board = board.replace(u"\xbb", "")
    board = cleanString(board.strip())

    # Finding the repeated tag that corresponds to the listing of topics
    itopics = soup.find('div', {"class": "structItemContainer-group js-threadList"}).find_all('div', class_=re.compile(
        r'^structItem structItem--thread js-inlineModContainer js-threadListItem.*'))

    nm = len(itopics)

    for itopic in itopics:
        # authors
        a = itopic.find('ul', {"class": "structItem-parts"}).find('li').text
        a = cleanString(a.strip())
        author.append(a)

        # topic
        top = itopic.find('div', {"class": 'structItem-title'}).text
        top = cleanString(top.strip())
        topic.append(top)

        # href
        ref = itopic.find('div', {"class": 'structItem-title'}).find('a').get('href')
        href.append(ref)

        # image
        try:
            image = itopic.find('div', {"class": 'structItem-iconContainer'}).find('img').get('src').split('base64,')[-1]
        except:
            image = '-1'
        image_author.append(image)

        # add date
        try:
            # search within the current topic, not the whole page, so each topic gets its own date
            time = itopic.find('li', {"class": 'structItem-startDate'}).find('time').text
            if ',' in time:
                time = time.replace(',', '')
            time = time.strip()
            if 'today' in time:
                time = datetime.today().strftime('%Y-%m-%d')
            elif 'at' in time or 'AM' in time or 'PM' in time:
                today = datetime.today()
                start_of_week = today - timedelta(days=(today.weekday() + 1) % 7)
                days_mapping = {
                    'Sunday': start_of_week,
                    'Monday': start_of_week + timedelta(days=1),
                    'Tuesday': start_of_week + timedelta(days=2),
                    'Wednesday': start_of_week + timedelta(days=3),
                    'Thursday': start_of_week + timedelta(days=4),
                    'Friday': start_of_week + timedelta(days=5),
                    'Saturday': start_of_week + timedelta(days=6),
                }
                for day, date in days_mapping.items():
                    if day in time:
                        time = date.strftime('%Y-%m-%d')
                        break
            addDate.append(time)
        except:
            addDate.append('-1')

        # replies and views
        try:
            temp = itopic.find('div', class_=re.compile(r'^structItem-cell structItem-cell--meta.*')).find_all('dl')
            try:
                reply = temp[0].find('dd').text
                reply = cleanString(reply.strip())
                if 'K' in reply or 'k' in reply:
                    reply = reply.replace('K', '000').replace('k', '000')
            except:
                reply = '-1'
            posts.append(reply)

            # views
            try:
                view = temp[1].find('dd').text
                view = cleanString(view.strip())
                if 'K' in view or 'k' in view:
                    view = view.replace('K', '000').replace('k', '000')
            except:
                view = '-1'
            views.append(view)
        except:
            posts.append('-1')
            views.append('-1')

    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)
#called by the crawler to get description links on a listing page
#@param: beautifulsoup object that is using the correct html page (listing page)
#return: list of description links from a listing page
def darknetarmy_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find('div', {"class": "block-container block-container--nodes"}).findAll('div', {
        "class": "structItem-title"})

    for a in listing:
        bae = a.find('a', href=True)
        link = bae['href']
        href.append(link)

    return href
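
One caveat in the K/k handling above: str.replace turns '1.2K' into '1.2000' rather than 1200. A hedged sketch of a more robust conversion (a hypothetical helper, not part of this commit) that the reputation, reply, and view fields could share:

# normalize_count - hypothetical helper, not part of this commit
def normalize_count(raw):
    # Convert abbreviated forum counts ('1.2K', '3k', '847') to an integer string
    raw = raw.strip().replace(',', '')
    if raw and raw[-1] in ('K', 'k'):
        return str(int(float(raw[:-1]) * 1000))  # '1.2K' -> '1200'
    return raw if raw.isdigit() else '-1'

For example, normalize_count('1.2K') returns '1200' where the replace-based approach would produce '1.2000'.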
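
The relative-date block is also duplicated between the two parsers; a sketch of a shared helper (again hypothetical, assuming the same Sunday-start week convention used above) that both could call before appending to addDate:

from datetime import datetime, timedelta

# resolve_relative_date - hypothetical helper, not part of this commit
def resolve_relative_date(raw, today=None):
    # Map forum timestamps like 'today' or 'Sunday at 3:14 PM' to 'YYYY-MM-DD'
    today = today or datetime.today()
    raw = raw.replace(',', '').strip()
    if 'today' in raw.lower():
        return today.strftime('%Y-%m-%d')
    start_of_week = today - timedelta(days=(today.weekday() + 1) % 7)  # Sunday
    days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
    for offset, day in enumerate(days):
        if day in raw:
            return (start_of_week + timedelta(days=offset)).strftime('%Y-%m-%d')
    return raw  # absolute dates pass through unchanged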
