This is based on the calsyslab project.
__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from typing import List
from Forums.Utilities.utilities import *
from datetime import datetime, timedelta
import re
import string

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
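# NOTE: `cleanString` and `cleanNumbers` are pulled in by the star import from
# the project's utilities module (not shown here). As a rough sketch of their
# assumed behavior, hypothetical stand-ins for offline testing might look like:
#
#     def cleanString(s: str) -> str:       # hypothetical stand-in
#         return re.sub(r'\s+', ' ', s).strip()
#
#     def cleanNumbers(s: str) -> int:      # hypothetical stand-in
#         digits = re.sub(r'[^0-9]', '', s)
#         return int(digits) if digits else 0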
# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
def onniForums_description_parser(soup: BeautifulSoup) -> tuple:

    topicName: str = "-1"           # 0 *topic name
    users: List[str] = []           # 1 *all users of each post
    statuses: List[str] = []        # 2 each user's authority in each post, such as (adm, member, dangerous)
    reputations: List[str] = []     # 3 each user's karma in each post (usually found as a number)
    interests: List[str] = []       # 4 each user's interests in each post
    signs: List[str] = []           # 5 each user's signature in each post (usually a standard message after the content of the post)
    posts: List[str] = []           # 6 all messages of each post
    feedbacks: List[str] = []       # 7 all feedbacks of each vendor (this was found in just one forum and with a number format)
    addDates: List[datetime] = []   # 8 all dates of each post
    image_user: List[str] = []      # 9 all user avatars of each post
    image_post: List[str] = []      # 10 all first images of each post

    # Getting the topicName
    topicName = soup.find("table", {"class": "tborder tfixed clear"}) \
        .find("td", {"class": "thead"}) \
        .find_all("div")[-1].text
    topicName = cleanString(topicName.strip())

    topics_array = soup.find_all("div", {"class": "post"})

    for topic in topics_array:
        # Extracting and cleaning author information
        author_information: BeautifulSoup = topic.find("div", {"class": "author_information"})

        username: str = author_information.find("span", {"class": "largetext"}).text
        username_cleaned = cleanString(username.strip())
        users.append(username_cleaned)

        user_status: str = author_information.find("span", {"class": "smalltext"}).text

        # Banned users often have malformed text in the HTML,
        # so we detect them and assign a fixed label instead
        if "Banned" in user_status:
            user_status_cleaned = "Banned"
        elif "Unregistered" in user_status:
            user_status_cleaned = "Unregistered"
        else:
            user_status_cleaned = cleanString(user_status.strip())  # Remove excessive spaces in the string

        # Add the cleaned data into the array
        statuses.append(user_status_cleaned)

        if user_status_cleaned in ['Unregistered', 'Banned']:
            reputations.append("-1")
        else:
            author_statistics: BeautifulSoup = topic.find("div", {"class": "author_statistics"})
            reputation: str = author_statistics.find_all("div", {"class": "float_right"})[-1].text
            reputation_cleaned = cleanString(reputation.strip())
            reputations.append(reputation_cleaned)

        # Append "-1" to the `interests` and `signs` arrays since those fields don't exist on this forum
        interests.append("-1")
        signs.append("-1")

        post_content: str = topic.find("div", {"class": "post_body scaleimages"}).text
        # Clean the post content of excessive spaces and boilerplate
        post_content_cleaned = post_content.replace("[You must reply to view this hidden content]", "")
        post_content_cleaned = cleanString(post_content_cleaned.strip())
        posts.append(post_content_cleaned)

        # Append "-1" to the `feedbacks` array since that field doesn't exist on this forum
        feedbacks.append("-1")

        # Convert relative timestamps ("Yesterday", "2 hours ago", ...) into absolute dates
        date_posted: str = topic.find("span", {"class": "post_date"}).text
        date_posted_cleaned = cleanString(date_posted.split(",")[0])

        today = datetime.now()

        if date_posted_cleaned == 'Yesterday':
            date_object = today - timedelta(days=1)
        elif date_posted_cleaned.find('hour') > 0:
            hours_ago = int(date_posted_cleaned.split(' ')[0])
            date_object = today - timedelta(hours=hours_ago)
        elif date_posted_cleaned.find('minute') > 0:
            minutes_ago = int(date_posted_cleaned.split(' ')[0])
            date_object = today - timedelta(minutes=minutes_ago)
        else:
            date_object = datetime.strptime(date_posted_cleaned, "%m-%d-%Y")

        addDates.append(date_object)

        image_post.append("-1")

        img = topic.find('div', {"class": "author_avatar"}).find('img')
        if img is not None:
            img = img.get('src').split('base64,')[-1]
        else:
            img = "-1"
        image_user.append(img)

    # TESTING PURPOSES - DO NOT REMOVE

    # Populate the final variable (this should be a list with all the fields scraped)
    row = (topicName, users, statuses, reputations, interests, signs, posts, feedbacks, addDates, image_user, image_post)

    # Sending the results
    return row
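# A minimal usage sketch (not part of the original module): assuming a topic's
# Description Page was saved locally as "description_page.html", the parser
# above could be exercised like this:
#
#     with open("description_page.html", "r", encoding="utf-8") as f:
#         soup = BeautifulSoup(f.read(), "html.parser")
#     row = onniForums_description_parser(soup)
#     print(row[0])   # topic name
#     print(row[1])   # list of usernames, one per post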
# This is the method to parse the Listing Pages (one page with many topics)
def onniForums_listing_parser(soup: BeautifulSoup):

    nm = 0                          # this variable should receive the number of topics
    forum = "OnniForums"            # 0 *forum name
    boardName = "-1"                # 1 board name (the level above the topic in the forum categorization tree;
                                    # for instance: Security/Malware/Tools to hack Facebook -> the board here is Malware)
    user: List[str] = []            # 2 all users of each topic
    topic: List[str] = []           # 3 all topics
    view: List[int] = []            # 4 number of views of each topic
    post: List[int] = []            # 5 number of posts of each topic
    href: List[str] = []            # 6 this variable should receive all cleaned urls (used to merge Listing and Description pages)
    addDate: List[str] = []         # 7 when the topic was created (difficult to find)
    image_author: List[str] = []    # 8 all author avatars used in each topic

    # Finding the board (there should be just one)
    board_metadata: BeautifulSoup = soup.find("table", {"class": "tborder clear"})

    boardName = board_metadata.find_all("div")[1].text
    boardName = cleanString(boardName.strip())

    thread_arrays = board_metadata.find_all("tr", {"class": "inline_row"})  # gets the information of the threads

    nm = len(thread_arrays)

    for thread in thread_arrays:  # getting the information from the threads and sorting it into the arrays defined above
        body = thread.find("span", {"class": "subject_new"})
        try:
            post_subject: str = body.text  # getting the topic
        except AttributeError:
            body = thread.find("span", {"class": "subject_old"})
            post_subject: str = body.text

        post_subject_cleaned = cleanString(post_subject.strip())
        topic.append(post_subject_cleaned)

        author_icon = thread.find('div', {"class": "lavatar-old lavatar-old-f"})
        if author_icon is not None:
            author_icon = author_icon.find('img')
            author_icon = author_icon.get('src')
            author_icon = author_icon.split('base64,')[-1]
        else:
            author_icon = "-1"
        image_author.append(author_icon)

        reply_count = thread.find_all("td", {"align": "center"})[2].text
        post.append(cleanNumbers(reply_count))

        views = thread.find_all("td", {"align": "center"})[3].text
        view.append(cleanNumbers(views))

        # dates_added: str = thread.find("span", {"class": "thread_start_datetime smalltext"}).text
        # dates_added_cleaned = dates_added.split(',')[0]
        # addDate.append(dates_added_cleaned)

        author = thread.find("span", {"class": "author smalltext"}).text
        author_cleaned = cleanString(author.strip())
        user.append(author_cleaned)

        thread_link = body.find('a').get('href')
        href.append(thread_link)

    return organizeTopics(
        forum=forum,
        nm=nm,
        board=boardName,
        author=user,
        topic=topic,
        views=view,
        posts=post,
        href=href,
        addDate=addDate,
        image_author=image_author
    )
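# `organizeTopics` also comes from the utilities module (not shown); it is
# assumed to bundle the parallel lists above into the project's standard row
# format for Listing Pages. A hypothetical offline run, assuming a saved page
# named "listing_page.html":
#
#     with open("listing_page.html", "r", encoding="utf-8") as f:
#         listing_row = onniForums_listing_parser(BeautifulSoup(f.read(), "html.parser"))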
# This is the method to extract the links to each topic's Description Page from a Listing Page
def onniForums_links_parser(soup: BeautifulSoup):

    href = []
    listing = soup.find_all('tr', {'class': 'inline_row'})

    for thread in listing:
        try:
            link = thread.find('span', {"class": "subject_old"}).find('a').get('href')
        except AttributeError:
            link = thread.find('span', {"class": "subject_new"}).find('a').get('href')
        href.append(link)

    return href
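# A hedged end-to-end sketch (not in the original file): read one saved Listing
# Page and resolve each topic link. The file name and base URL are assumptions
# for illustration only; the real crawler supplies its own pages and URLs.
if __name__ == "__main__":
    from urllib.parse import urljoin

    with open("listing_page.html", "r", encoding="utf-8") as f:  # assumed local copy
        listing_soup = BeautifulSoup(f.read(), "html.parser")

    # Collect the relative links to each topic's Description Page
    for relative_link in onniForums_links_parser(listing_soup):
        # Hypothetical base URL; the real forum address is omitted on purpose
        print(urljoin("http://onniforums.example/", relative_link))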