khangtran
/
dark_web_forums


								__author__ = 'DarkWeb'


								# Here, we are importing the auxiliary functions to clean or convert data

								from typing import List

								from Forums.Utilities.utilities import *

								from datetime import date

								from datetime import timedelta

								import re

								import string


								# Here, we are importing BeautifulSoup to search through the HTML tree

								from bs4 import BeautifulSoup


								# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)


								def _onniForums_parse_post_date(date_text: str, now):
								    """Turn the cleaned date part of a post stamp into a datetime.

								    Args:
								        date_text: text found before the first comma of the post's date stamp.
								        now: reference ``datetime`` that relative stamps are measured from.

								    Returns:
								        The ``datetime`` the stamp refers to.

								    Raises:
								        ValueError: if the stamp is neither a recognised relative form nor
								            an ``mm-dd-YYYY`` date.
								    """
								    if date_text == 'Yesterday':
								        return now - timedelta(days=1)

								    # NOTE(review): 'Today' is assumed to be emitted the same way as
								    # 'Yesterday'; previously it fell through to strptime and raised.
								    if date_text == 'Today':
								        return now

								    # "<n> hours ago" / "<n> minutes ago": the amount is the first token.
								    if 'hour' in date_text:
								        return now - timedelta(hours=int(date_text.split(' ')[0]))

								    if 'minute' in date_text:
								        return now - timedelta(minutes=int(date_text.split(' ')[0]))

								    return datetime.strptime(date_text, "%m-%d-%Y")


								def onniForums_description_parser(soup: BeautifulSoup) -> tuple:
								    """Parse one Description Page (a single topic) into per-post columns.

								    Args:
								        soup: parsed HTML tree of the topic page.

								    Returns:
								        A 9-tuple ``(topicName, users, statuses, reputations, interests,
								        signs, posts, feedbacks, addDates)``. Every list holds one entry
								        per post, in page order. ``interests``, ``signs`` and ``feedbacks``
								        only hold the ``"-1"`` placeholder because this forum does not
								        expose them.

								    Raises:
								        AttributeError: if an expected element is missing from the page.
								        IndexError: if an expected ``div`` list is empty.
								        ValueError: if a post date stamp has an unrecognised format.
								    """
								    users       : List[str] = []  # 1 *all users of each post
								    statuses    : List[str] = []  # 2 all user's authority in each post such as (adm, member, dangerous)
								    reputations : List[str] = []  # 3 all user's karma in each post (usually found as a number)
								    interests   : List[str] = []  # 4 all user's interest in each post
								    signs       : List[str] = []  # 5 all user's signature in each post
								    posts       : List[str] = []  # 6 all messages of each post
								    feedbacks   : List[str] = []  # 7 all feedbacks of each vendor
								    addDates    : List[datetime] = []  # 8 all dates of each post

								    # 0 *topic name: last <div> inside the header cell of the thread table
								    header_table = soup.find("table", {"class": "tborder tfixed clear"})
								    title_cell = header_table.find("td", {"class": "thead"})
								    topicName: str = cleanString(title_cell.find_all("div")[-1].text.strip())

								    # One reference time for the whole page so that relative stamps
								    # ("2 hours ago") of every post are resolved against the same instant.
								    now = datetime.now()

								    for post_block in soup.find_all("div", {"class": "post"}):
								        # Extracting and cleaning author information
								        author_information = post_block.find("div", {"class": "author_information"})

								        username: str = author_information.find("span", {"class": "largetext"}).text
								        users.append(cleanString(username.strip()))

								        user_status: str = author_information.find("span", {"class": "smalltext"}).text

								        # Banned users often have weird text issues in HTML,
								        # so we detect banned users and give them a unique string.
								        # `in` (rather than find() > 0) also matches at position 0.
								        if "Banned" in user_status:
								            user_status_cleaned = "Banned"
								        elif "Unregistered" in user_status:
								            user_status_cleaned = "Unregistered"
								        else:
								            user_status_cleaned = cleanString(user_status.strip())
								        statuses.append(user_status_cleaned)

								        if user_status_cleaned in ('Unregistered', 'Banned'):
								            # The statistics block is not read for these users; use the
								            # same string placeholder as the other columns so the list
								            # stays List[str].
								            reputations.append("-1")
								        else:
								            author_statistics = post_block.find("div", {"class": "author_statistics"})
								            reputation: str = author_statistics.find_all("div", {"class": "float_right"})[-1].text
								            reputations.append(cleanString(reputation.strip()))

								        # `interests` and `signs` don't exist on this forum
								        interests.append("-1")
								        signs.append("-1")

								        # Clean post content of the hidden-content notice and excessive spaces
								        post_content: str = post_block.find("div", {"class": "post_body scaleimages"}).text
								        post_content = post_content.replace("[You must reply to view this hidden content]", "")
								        posts.append(cleanString(post_content.strip()))

								        # `feedbacks` don't exist on this forum
								        feedbacks.append("-1")

								        # Only the part before the first comma carries the date
								        date_posted: str = post_block.find("span", {"class": "post_date"}).text
								        date_posted_cleaned = cleanString(date_posted.split(",")[0])
								        addDates.append(_onniForums_parse_post_date(date_posted_cleaned, now))

								    # TESTING PURPOSES - DO NOT REMOVE

								    # Populate the final variable (this should be a tuple with all fields scraped)
								    row = (topicName, users, statuses, reputations, interests, signs, posts, feedbacks, addDates)

								    # Sending the results
								    return row


								def onniForums_listing_parser(soup: BeautifulSoup):
								    """Parse one Listing Page (a board with many threads).

								    Args:
								        soup: parsed HTML tree of the board page.

								    Returns:
								        Whatever ``organizeTopics`` returns for the collected columns
								        (forum name, thread count, board name, authors, topics, views,
								        reply counts, links and the -- always empty -- creation dates).
								    """
								    forum = "OnniForums"

								    topic   : List[str] = []  # subject of each thread
								    user    : List[str] = []  # author of each thread
								    post    : List[str] = []  # reply count of each thread (raw cell text)
								    view    : List[str] = []  # view count of each thread (raw cell text)
								    addDate : List[str] = []  # creation dates; never filled here (see note in the loop)
								    href    : List[str] = []  # link of each thread, used to merge Listing and Description pages

								    # Finding the board (should be just one): its name is the second <div>
								    # of the listing table.
								    board_table = soup.find("table", {"class": "tborder clear"})
								    boardName = cleanString(board_table.find_all("div")[1].text.strip())

								    # Every thread is one "inline_row" of that table
								    rows = board_table.find_all("tr", {"class": "inline_row"})
								    nm = len(rows)

								    for row in rows:
								        # Subject lives in a "subject_new" span, or "subject_old" when that is absent
								        subject_span = row.find("span", {"class": "subject_new"})
								        if subject_span is None:
								            subject_span = row.find("span", {"class": "subject_old"})
								        topic.append(cleanString(subject_span.text.strip()))

								        # Centered cells: index 2 holds the replies, index 3 the views
								        centered_cells = row.find_all("td", {"align": "center"})
								        replies_cell = centered_cells[2]
								        post.append(replies_cell.text)
								        view.append(centered_cells[3].text)

								        # NOTE: thread start dates are not collected; an earlier attempt read
								        # span "thread_start_datetime smalltext" and kept the text before the comma.

								        author_span = row.find("span", {"class": "author smalltext"})
								        user.append(cleanString(author_span.text.strip()))

								        # The thread link is taken from the anchor inside the replies cell
								        href.append(replies_cell.find('a').get('href'))

								    return organizeTopics(
								        forum=forum,
								        nm=nm,
								        board=boardName,
								        author=user,
								        topic=topic,
								        views=view,
								        posts=post,
								        href=href,
								        addDate=addDate
								    )


								# This is the method to collect the links (href) of every thread row found in a Listing Page


								def onniForums_links_parser(soup: BeautifulSoup):
								    """Collect the thread link of every row of a Listing Page.

								    Args:
								        soup: parsed HTML tree of the board page.

								    Returns:
								        List with one ``href`` value per ``inline_row``, in page order.

								    Raises:
								        AttributeError: if a row has neither a ``subject_old`` nor a
								            ``subject_new`` span containing an anchor.
								    """
								    href = []

								    for thread in soup.find_all('tr', {'class': 'inline_row'}):
								        try:
								            link = thread.find('span', {"class": "subject_old"}).find('a').get('href')
								        except AttributeError:
								            # find() returned None (span or anchor missing): use the
								            # "subject_new" span instead. Only AttributeError is caught so
								            # unrelated failures (e.g. KeyboardInterrupt) are not swallowed.
								            link = thread.find('span', {"class": "subject_new"}).find('a').get('href')

								        href.append(link)

								    return href