# Based on the calsyslab project
__author__ = 'DarkWeb'

import re
# datetime and timedelta are used below to resolve relative post dates
from datetime import datetime, timedelta

# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup

# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
# Parses description pages: takes the HTML page of a description page as a soup object and parses it for the info it needs.
# Stores the info in different lists; these lists are returned after being organized.
# @param: soup object looking at the HTML page of a description page
# @return: 'row' that contains a variety of lists that each hold info on the description page
def darknetarmy_description_parser(soup):

    # Fields to be parsed
    topic = "-1"        # 0 topic name
    user = []           # 1 all users of each post
    status = []         # 2 all users' authority in each post, such as (adm, member, dangerous)
    reputation = []     # 3 all users' karma in each post (usually found as a number)
    interest = []       # 4 all users' interests in each post
    sign = []           # 5 all users' signatures in each post (usually a standard message after the content of the post)
    post = []           # 6 all messages of each post
    feedback = []       # 7 all feedback on each user (this was found in just one forum, in a number format)
    addDate = []        # 8 all dates of each post
    image_user = []     # 9 all user avatars of each post
    image_post = []     # 10 all first images of each post

    # Finding the topic (should be just one, coming from the Listing Page)
    topic = soup.find("h1", {"class": "p-title-value"})
    topic = topic.text
    topic = topic.replace(",", "")
    topic = topic.replace("\n", "")
    topic = cleanString(topic.strip())

    # Finding the repeated tag that corresponds to the listing of posts
    # posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \
    #         soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"})
    posts = soup.findAll('article', class_=re.compile("message message--post js-post js-inlineModContainer.*"))

    # For each message (post), get all the fields we are interested in:
    for ipost in posts:

        # Finding the first level of the HTML page
        # post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"})
        post_wrapper = ipost.find('div', {"class": "message-inner"})

        # Finding the author (user) of the post
        author = post_wrapper.find('div', {'class': 'message-userName'}).find('h4').text
        user.append(cleanString(author))  # Remember to clean the problematic characters

        # Finding the status of the author
        try:
            membergroup = post_wrapper.find('h5', {'class': 'userTitle message-userTitle'}).text
        except:
            membergroup = '-1'
        status.append(cleanString(membergroup))

        # Finding the reputation: the third <dl> of the user extras holds the karma
        # count, which XenForo abbreviates with a 'K' suffix for thousands
        temp = post_wrapper.find('div', {'class': 'message-userExtras'}).find_all('dl')
        rep = temp[2].find('dd').text
        if 'K' in rep or 'k' in rep:
            rep = rep.replace('K', '000').replace('k', '000')
        reputation.append(rep)

        # Fields not available on this forum
        interest.append('-1')
        sign.append('-1')
        feedback.append('-1')
        image_post.append('-1')

        # Finding the content of the post
        try:
            message = post_wrapper.find('article', {'class': 'message-body js-selectToQuote'}).text
            message = cleanString(message.strip())
        except:
            message = post_wrapper.find('div', {'class': 'message-content js-messageContent'}).text
            message = cleanString(message.strip())
        post.append(message)

        # Finding the date of the post; relative dates ("today" or a weekday name)
        # are resolved against the current week
        time = post_wrapper.find('ul', class_=re.compile(r'message-attribution-main listInline.*')).find('time').text
        if ',' in time:
            time = time.replace(',', '')
        if 'today' in time:
            time = datetime.today().strftime('%Y-%m-%d')
        elif 'at' in time or 'AM' in time or 'PM' in time:
            today = datetime.today()
            start_of_week = today - timedelta(days=(today.weekday() + 1) % 7)
            days_mapping = {
                'Sunday': start_of_week,
                'Monday': start_of_week + timedelta(days=1),
                'Tuesday': start_of_week + timedelta(days=2),
                'Wednesday': start_of_week + timedelta(days=3),
                'Thursday': start_of_week + timedelta(days=4),
                'Friday': start_of_week + timedelta(days=5),
                'Saturday': start_of_week + timedelta(days=6),
            }
            for day, date in days_mapping.items():
                if day in time:
                    time = date.strftime('%Y-%m-%d')
                    break
        addDate.append(time)

        # Finding the user avatar (stored inline as base64)
        try:
            image = post_wrapper.find('div', {'class': 'message-avatar'}).find('img').get('src').split('base64,')[-1]
        except:
            image = '-1'
        image_user.append(image)

    # Populate the final variable (this should be a tuple with all the fields scraped)
    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)

    # Sending the results
    return row
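
# A minimal usage sketch (an assumption, not part of the original crawler): the
# parser above expects a BeautifulSoup object built from a saved description page.
# This helper and the default file path are hypothetical.
def _example_parse_description(path='darknetarmy_topic.html'):
    with open(path, encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    row = darknetarmy_description_parser(soup)
    # row[0] is the topic name; the remaining entries are per-post lists
    return row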

# This is the method to parse the Listing Pages (one page with many topics)
# Parses listing pages: takes the HTML page of a listing page as a soup object and parses it for the info it needs.
# Stores the info in different lists; these lists are returned after being organized.
# @param: soup object looking at the HTML page of a listing page
# @return: 'row' that contains a variety of lists that each hold info on the listing page
def darknetarmy_listing_parser(soup):

    nm = 0                  # *this variable should receive the number of topics
    forum = "DarkNetArmy"   # 0 *forum name
    board = "-1"            # 1 *board name (the previous level of the topic in the forum categorization tree.
                            # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
    author = []             # 2 *all authors of each topic
    topic = []              # 3 *all topics
    views = []              # 4 number of views of each topic
    posts = []              # 5 number of posts of each topic
    href = []               # 6 this variable should receive all cleaned urls (we will use this to do the merge between
                            # Listing and Description pages)
    addDate = []            # 7 when the topic was created (difficult to find)
    image_author = []       # 8 all author avatars used in each topic

    # Finding the board (should be just one)
    board = soup.find('h1', {"class": "p-title-value"}).text
    board = board.replace(u"\xbb", "")
    board = cleanString(board.strip())

    # Finding the repeated tag that corresponds to the listing of topics
    itopics = soup.find('div', {"class": "structItemContainer-group js-threadList"}).find_all('div', class_=re.compile(
        r'^structItem structItem--thread js-inlineModContainer js-threadListItem.*'))
    nm = len(itopics)

    for itopic in itopics:

        # Finding the author of the topic
        a = itopic.find('ul', {"class": "structItem-parts"}).find('li').text
        a = cleanString(a.strip())
        author.append(a)

        # Finding the topic title
        top = itopic.find('div', {"class": 'structItem-title'}).text
        top = cleanString(top.strip())
        topic.append(top)

        # Finding the hyperlink to the topic's description page
        ref = itopic.find('div', {"class": 'structItem-title'}).find('a').get('href')
        href.append(ref)

        # Finding the author avatar (stored inline as base64)
        try:
            image = itopic.find('div', {"class": 'structItem-iconContainer'}).find('img').get('src').split('base64,')[-1]
        except:
            image = '-1'
        image_author.append(image)

        # Finding the date the topic was created; relative dates ("today" or a
        # weekday name) are resolved against the current week
        try:
            time = itopic.find('li', {"class": 'structItem-startDate'}).find('time').text
            if ',' in time:
                time = time.replace(',', '')
            time = time.strip()
            if 'today' in time:
                time = datetime.today().strftime('%Y-%m-%d')
            elif 'at' in time or 'AM' in time or 'PM' in time:
                today = datetime.today()
                start_of_week = today - timedelta(days=(today.weekday() + 1) % 7)
                days_mapping = {
                    'Sunday': start_of_week,
                    'Monday': start_of_week + timedelta(days=1),
                    'Tuesday': start_of_week + timedelta(days=2),
                    'Wednesday': start_of_week + timedelta(days=3),
                    'Thursday': start_of_week + timedelta(days=4),
                    'Friday': start_of_week + timedelta(days=5),
                    'Saturday': start_of_week + timedelta(days=6),
                }
                for day, date in days_mapping.items():
                    if day in time:
                        time = date.strftime('%Y-%m-%d')
                        break
            addDate.append(time)
        except:
            addDate.append('-1')

        # Finding the number of replies and views; a 'K' suffix stands for thousands
        try:
            temp = itopic.find('div', class_=re.compile(r'^structItem-cell structItem-cell--meta.*')).find_all('dl')
            # replies
            try:
                reply = temp[0].find('dd').text
                reply = cleanString(reply.strip())
                if 'K' in reply or 'k' in reply:
                    reply = reply.replace('K', '000').replace('k', '000')
            except:
                reply = '-1'
            posts.append(reply)
            # views
            try:
                view = temp[1].find('dd').text
                view = cleanString(view.strip())
                if 'K' in view or 'k' in view:
                    view = view.replace('K', '000').replace('k', '000')
            except:
                view = '-1'
            views.append(view)
        except:
            posts.append('-1')
            views.append('-1')

    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)
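
# The weekday-resolution logic above is duplicated in both parsers. A sketch of how
# it could be factored into a single helper (a hypothetical refactor; the parsers
# above do not call it):
def _resolve_relative_date(time_str):
    # Map a bare weekday name, as XenForo shows for recent posts, onto the matching
    # date of the current (Sunday-based) week; other strings pass through unchanged.
    today = datetime.today()
    start_of_week = today - timedelta(days=(today.weekday() + 1) % 7)
    for offset, day in enumerate(['Sunday', 'Monday', 'Tuesday', 'Wednesday',
                                  'Thursday', 'Friday', 'Saturday']):
        if day in time_str:
            return (start_of_week + timedelta(days=offset)).strftime('%Y-%m-%d')
    return time_str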

# Called by the crawler to get the description links on a listing page
# @param: soup object looking at the HTML page of a listing page
# @return: list of description links from a listing page
def darknetarmy_links_parser(soup):

    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find('div', {"class": "block-container block-container--nodes"}).findAll('div', {
        "class": "structItem-title"})

    for a in listing:
        bae = a.find('a', href=True)
        link = bae['href']
        href.append(link)

    return href
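
# A minimal end-to-end sketch of how the crawler is expected to drive these parsers
# (an assumption based on the comments above; the file name is hypothetical):
if __name__ == '__main__':
    with open('darknetarmy_listing.html', encoding='utf-8') as f:
        listing_soup = BeautifulSoup(f.read(), 'html.parser')
    # Collect the description-page links the crawler should visit next
    links = darknetarmy_links_parser(listing_soup)
    print(len(links), 'description pages to visit')
    # Each fetched description page would then be parsed with:
    # row = darknetarmy_description_parser(BeautifulSoup(fetched_html, 'html.parser'))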