khangtran
/
dark_web_forums


								__author__ = 'DarkWeb'


								import re


								# Here, we are importing the auxiliary functions to clean or convert data

								from Forums.Utilities.utilities import *


								# Here, we are importing BeautifulSoup to search through the HTML tree

								from bs4 import BeautifulSoup


								# Parses a Description Page (one page per topic linked from the Listing Pages).
								# Takes the BeautifulSoup object for the description page's HTML and extracts
								# the information it needs into separate lists, which are returned together.
								# @param: soup object for the HTML of a description page
								# return: 'row', a tuple holding the topic name and the lists of per-post information

								def _description_expand_k(text):
								    """Rewrite counts abbreviated with a K/k suffix as plain integers.

								    '3K' becomes '3000' and '1.2K' becomes '1200'. Text without a numeric
								    K-suffix is returned unchanged.
								    """
								    def _to_integer(match):
								        return str(int(round(float(match.group(1)) * 1000)))

								    return re.sub(r'(\d+(?:\.\d+)?)\s*[Kk]\b', _to_integer, text)


								def _description_resolve_date(raw_date, now):
								    """Turn a relative post timestamp into a concrete date.

								    Commas are removed first. A "today" stamp yields *now* itself (a
								    datetime); a "yesterday" or weekday-name stamp yields a 'YYYY-MM-DD'
								    string; any other text is returned with only the commas removed.
								    """
								    text = raw_date.replace(',', '')
								    lowered = text.lower()

								    # Case-insensitive on purpose.
								    # NOTE(review): assumes the site capitalises "Today" - confirm on a saved page.
								    if 'today' in lowered:
								        return now
								    if 'yesterday' in lowered:
								        return (now - timedelta(days=1)).strftime('%Y-%m-%d')

								    # Tuple index equals datetime.weekday() (Monday == 0).
								    weekdays = ('monday', 'tuesday', 'wednesday', 'thursday',
								                'friday', 'saturday', 'sunday')
								    for index, name in enumerate(weekdays):
								        if name in lowered:
								            # A weekday stamp refers to a past post, so step back to the
								            # most recent such day rather than forward into this week.
								            days_back = (now.weekday() - index) % 7
								            return (now - timedelta(days=days_back)).strftime('%Y-%m-%d')

								    return text


								def darknetarmy_description_parser(soup):
								    """Parse one description (thread) page into per-post field lists.

								    @param soup: BeautifulSoup object built from the HTML of a description page
								    @return: tuple (topic, user, status, reputation, interest, sign, post,
								             feedback, addDate, image_user, image_post). topic is a string;
								             every other element is a list with one entry per post.
								             interest, sign, feedback and image_post are always '-1'.
								    @raise AttributeError: if the page has no title heading, or a post lacks
								             its 'message-inner' wrapper or author heading.

								    Optional per-post fields (status, reputation, message, date, avatar)
								    fall back to '-1' when their markup is missing.
								    """
								    user = []  # 1 all users of each post
								    status = []  # 2 each user's authority in each post (adm, member, dangerous, ...)
								    reputation = []  # 3 each user's karma in each post (usually a number)
								    interest = []  # 4 each user's interest in each post (not available here)
								    sign = []  # 5 each user's signature in each post (not available here)
								    post = []  # 6 the message of each post
								    feedback = []  # 7 feedback of each user (not available here)
								    addDate = []  # 8 the date of each post
								    image_user = []  # 9 the user avatar of each post
								    image_post = []  # 10 the first image of each post (not available here)

								    # Finding the topic (should be just one coming from the Listing Page)
								    title = soup.find("h1", {"class": "p-title-value"}).text
								    topic = cleanString(title.replace(",", "").replace("\n", "").strip())

								    # Finding the repeated tag that corresponds to the listing of posts
								    posts = soup.find_all('article', class_=re.compile("message message--post js-post js-inlineModContainer.*"))

								    # Loop invariants: compile the pattern and read the clock once per page.
								    attribution_pattern = re.compile(r'message-attribution-main listInline.*')
								    now = datetime.today()

								    for ipost in posts:
								        post_wrapper = ipost.find('div', {"class": "message-inner"})

								        # Author (required field: a missing heading raises)
								        author = post_wrapper.find('div', {'class': 'message-userName'}).find('h4').text
								        user.append(cleanString(author))

								        # Status of the author
								        title_tag = post_wrapper.find('h5', {'class': 'userTitle message-userTitle'})
								        membergroup = title_tag.text if title_tag is not None else '-1'
								        status.append(cleanString(membergroup))

								        # Reputation: taken from the third <dl> of the user-extras block.
								        # NOTE(review): assumes that <dl> holds the score - confirm on a saved page.
								        try:
								            extras = post_wrapper.find('div', {'class': 'message-userExtras'}).find_all('dl')
								            rep = _description_expand_k(extras[2].find('dd').text)
								        except (AttributeError, IndexError):
								            rep = '-1'
								        reputation.append(rep)

								        # Fields not available on this forum
								        interest.append('-1')
								        sign.append('-1')
								        feedback.append('-1')
								        image_post.append('-1')

								        # Message: prefer the message body, fall back to the content wrapper.
								        # The fallback is looked up by class (it used to query an attribute
								        # named 'content', which never matched).
								        body = post_wrapper.find('article', {'class': 'message-body js-selectToQuote'})
								        if body is None:
								            body = post_wrapper.find('div', {'class': 'message-content js-messageContent'})
								        message = cleanString(body.text.strip()) if body is not None else '-1'
								        post.append(message)

								        # Date of the post
								        try:
								            stamp = post_wrapper.find('ul', class_=attribution_pattern).find('time').text
								        except AttributeError:
								            posted = '-1'
								        else:
								            posted = _description_resolve_date(stamp, now)
								        addDate.append(posted)

								        # Avatar: a class string is compared exactly, so the lookup must not
								        # carry a trailing space. Keeps the base64 payload of a data URI, or
								        # the whole src otherwise.
								        try:
								            image = post_wrapper.find('div', {'class': 'message-avatar'}).find('img').get('src').split('base64,')[-1]
								        except AttributeError:
								            image = '-1'
								        image_user.append(image)

								    # Populate the final variable (a tuple with all fields scraped)
								    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)

								    return row


								# Parses a Listing Page (one page listing many topics).
								# Takes the BeautifulSoup object for the listing page's HTML and extracts the
								# information it needs into separate lists, one entry per topic.
								# @param: soup object for the HTML of a listing page
								# return: the result of organizeTopics(...), built from the forum name, the board name and the per-topic lists

								def _listing_expand_k(text):
								    """Rewrite counts abbreviated with a K/k suffix as plain integers.

								    '3K' becomes '3000' and '1.2K' becomes '1200'. Text without a numeric
								    K-suffix is returned unchanged.
								    """
								    def _to_integer(match):
								        return str(int(round(float(match.group(1)) * 1000)))

								    return re.sub(r'(\d+(?:\.\d+)?)\s*[Kk]\b', _to_integer, text)


								def _listing_resolve_date(raw_date, now):
								    """Turn a relative topic timestamp into a concrete date.

								    Commas and surrounding whitespace are removed first. A "today" stamp
								    yields *now* itself (a datetime); a "yesterday" or weekday-name stamp
								    yields a 'YYYY-MM-DD' string; any other text is returned cleaned only.
								    """
								    text = raw_date.replace(',', '').strip()
								    lowered = text.lower()

								    # Case-insensitive on purpose.
								    # NOTE(review): assumes the site capitalises "Today" - confirm on a saved page.
								    if 'today' in lowered:
								        return now
								    if 'yesterday' in lowered:
								        return (now - timedelta(days=1)).strftime('%Y-%m-%d')

								    # Tuple index equals datetime.weekday() (Monday == 0).
								    weekdays = ('monday', 'tuesday', 'wednesday', 'thursday',
								                'friday', 'saturday', 'sunday')
								    for index, name in enumerate(weekdays):
								        if name in lowered:
								            # A weekday stamp refers to a past topic, so step back to the
								            # most recent such day rather than forward into this week.
								            days_back = (now.weekday() - index) % 7
								            return (now - timedelta(days=days_back)).strftime('%Y-%m-%d')

								    return text


								def _listing_count(cells, position):
								    """Return the cleaned <dd> text of cells[position], or '-1' if absent."""
								    try:
								        raw = cells[position].find('dd').text
								    except (IndexError, AttributeError):
								        return '-1'
								    return _listing_expand_k(cleanString(raw.strip()))


								def darknetarmy_listing_parser(soup):
								    """Parse one listing (board) page into per-topic field lists.

								    @param soup: BeautifulSoup object built from the HTML of a listing page
								    @return: whatever organizeTopics(forum, nm, board, author, topic, views,
								             posts, href, addDate, image_author) returns
								    @raise AttributeError: if the board heading or the thread-list container
								             is missing, or a topic lacks its author or title cell.

								    Avatar, start date, reply count and view count fall back to '-1' when
								    their markup is missing.
								    """
								    forum = "DarkNetArmy"  # 0 *forum name

								    author = []  # 2 *all authors of each topic
								    topic = []  # 3 *all topics
								    views = []  # 4 number of views of each topic
								    posts = []  # 5 number of posts of each topic
								    href = []  # 6 all urls (used to merge Listing and Description pages)
								    addDate = []  # 7 when the topic was created
								    image_author = []  # 8 all author avatars used in each topic

								    # 1 *board name (the previous level of the topic in the forum tree;
								    # for Security/Malware/Tools to hack Facebook the board is Malware)
								    board = soup.find('h1', {"class": "p-title-value"}).text
								    board = cleanString(board.replace(u"\xbb", "").strip())

								    # Finding the repeated tag that corresponds to the listing of topics
								    itopics = soup.find('div', {"class": "structItemContainer-group js-threadList"}).find_all('div', class_=re.compile(
								        r'^structItem structItem--thread js-inlineModContainer js-threadListItem.*'))

								    nm = len(itopics)  # *number of topics

								    # Loop invariants: compile the pattern and read the clock once per page.
								    meta_cell_pattern = re.compile(r'^structItem-cell structItem-cell--meta.*')
								    now = datetime.today()

								    for itopic in itopics:
								        # Author (required field: missing markup raises)
								        starter = itopic.find('ul', {"class": "structItem-parts"}).find('li').text
								        author.append(cleanString(starter.strip()))

								        # Topic title and link share the same title cell
								        title_cell = itopic.find('div', {"class": 'structItem-title'})
								        topic.append(cleanString(title_cell.text.strip()))
								        href.append(title_cell.find('a').get('href'))

								        # Avatar: keeps the base64 payload of a data URI, or the whole src otherwise
								        try:
								            image = itopic.find('div', {"class": 'structItem-iconContainer'}).find('img').get('src').split('base64,')[-1]
								        except AttributeError:
								            image = '-1'
								        image_author.append(image)

								        # Start date: searched within this topic, not the whole page, so
								        # each topic gets its own date instead of the first topic's.
								        try:
								            stamp = itopic.find('li', {"class": 'structItem-startDate'}).find('time').text
								        except AttributeError:
								            addDate.append('-1')
								        else:
								            addDate.append(_listing_resolve_date(stamp, now))

								        # Replies and views: first and second <dl> of the meta cell
								        meta = itopic.find('div', class_=meta_cell_pattern)
								        cells = meta.find_all('dl') if meta is not None else []
								        posts.append(_listing_count(cells, 0))
								        views.append(_listing_count(cells, 1))

								    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)


								# Called by the crawler to get the description-page links found on a listing page.
								# @param: BeautifulSoup object built from the HTML of a listing page
								# return: list of description-page links found on that listing page

								def darknetarmy_links_parser(soup):
								    """Return every link the crawler should visit from a listing page.

								    @param soup: BeautifulSoup object built from the HTML of a listing page
								    @return: list with the href of the first anchor inside each
								             'structItem-title' cell of the nodes container
								    """
								    container = soup.find('div', {"class": "block-container block-container--nodes"})
								    title_cells = container.findAll('div', {"class": "structItem-title"})

								    # One link per title cell, in page order.
								    return [cell.find('a', href=True)['href'] for cell in title_cells]