__author__ = 'DarkWeb'
|
|
|
|
# Here, we are importing the auxiliary functions to clean or convert data
|
|
from typing import List
|
|
from Forums.Utilities.utilities import *
|
|
from datetime import date
|
|
from datetime import timedelta
|
|
import re
|
|
import string
|
|
|
|
# Here, we are importing BeautifulSoup to search through the HTML tree
|
|
from bs4 import BeautifulSoup
|
|
|
|
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
|
|
|
|
def onniForums_description_parser(soup: BeautifulSoup):
    """Parse one topic (description) page of OnniForums.

    :param soup: parsed HTML tree of a single topic page
    :return: tuple ``(topicName, posts, users, addDates, feedbacks,
             statuses, reputations, signs, interests)`` where each element
             except ``topicName`` is a per-post list
    """

    topicName: str = "-1"        # 0 *topic name
    users: List[str] = []        # 1 *author of each post
    statuses: List[str] = []     # 2 user's authority in each post (adm, member, banned, ...)
    reputations: List[int] = []  # 3 user's karma in each post (a number; -1 when unavailable)
    interests: List[str] = []    # 4 user's interests (not present on this forum)
    signs: List[str] = []        # 5 user's signature (not present on this forum)
    posts: List[str] = []        # 6 message body of each post
    feedbacks: List[str] = []    # 7 vendor feedback (not present on this forum)
    addDates: List[str] = []     # 8 date of each post

    # Getting the topicName: it is the last <div> inside the page-header cell
    topicName = soup.find("table", {"class": "tborder tfixed clear"}) \
        .find("td", {"class": "thead"}) \
        .find_all("div")[-1].text

    topics_array = soup.find_all("div", {"class": "post"})

    for topic in topics_array:
        # Extracting and cleaning author information
        author_information: BeautifulSoup = topic.find("div", {"class": "author_information"})

        username: str = author_information.find("span", {"class": "largetext"}).text
        users.append(username)

        user_status: str = author_information.find("span", {"class": "smalltext"}).text

        # Banned/unregistered users often have weird markup, so detect them by
        # substring membership. NOTE: the previous `.find(...) > 0` test missed
        # a match at index 0 (str.find returns 0 there); `in` handles that case.
        if "Banned" in user_status:
            user_status_cleaned = "Banned"
        elif "Unregistered" in user_status:
            user_status_cleaned = "Unregistered"
        else:
            # Remove the leading and the two trailing whitespace characters
            user_status_cleaned = user_status[1:len(user_status) - 2]

        # Add cleaned data into array
        statuses.append(user_status_cleaned)

        # Banned/unregistered users have no statistics block to read karma from
        if user_status_cleaned in ['Unregistered', 'Banned']:
            reputations.append(-1)
        else:
            author_statistics: BeautifulSoup = topic.find("div", {"class": "author_statistics"})
            reputation: str = author_statistics.find_all("div", {"class": "float_right"})[-1].text
            reputations.append(int(reputation))

        # Append a "-1" to `interests` and `signs` since they don't exist on this forum
        interests.append("-1")
        signs.append("-1")

        post_content: str = topic.find("div", {"class": "post_body scaleimages"}).text
        # Strip the hidden-content banner and the surrounding newline characters
        post_content_cleaned = post_content.replace("[You must reply to view this hidden content]", "")
        post_content_cleaned = post_content_cleaned[1:len(post_content_cleaned) - 1]
        posts.append(post_content_cleaned)

        # Append a "-1" to `feedbacks` since it doesn't exist on this forum
        feedbacks.append("-1")

        # Keep only the date part, dropping the time after the comma
        date_posted: str = topic.find("span", {"class": "post_date"}).text
        date_posted_cleaned = date_posted.split(",")[0]
        addDates.append(date_posted_cleaned)

    # TESTING PURPOSES - DO NOT REMOVE

    # Populate the final variable (a tuple with all scraped fields)
    row = (topicName, posts, users, addDates, feedbacks, statuses, reputations, signs, interests)

    # Sending the results
    return row
|
|
|
|
|
|
|
|
def onniForums_listing_parser(soup: BeautifulSoup):
    """Parse a board listing page of OnniForums.

    Collects, for every thread row on the page, the topic title, author,
    reply count, view count, creation date and thread URL, then hands the
    arrays to ``organizeTopics`` for packaging.

    :param soup: parsed HTML tree of one board listing page
    :return: whatever ``organizeTopics`` builds from the scraped arrays
    """

    # board name: the previous level of the topic in the Forum categorization
    # tree (e.g. Security/Malware/Tools -> the board here is Malware)
    boardName = "-1"
    nm = 0                       # number of topics found on this page

    topic: List[str] = []        # all topic titles
    user: List[str] = []         # author of each topic
    post: List[int] = []         # number of posts of each topic
    view: List[int] = []         # number of views of each topic
    addDate: List[str] = []      # creation date of each topic
    href: List[str] = []         # cleaned thread urls (used to merge Listing
                                 # and Description pages)

    # The board table should appear exactly once on the page
    board_metadata: BeautifulSoup = soup.find("table", {"class": "tborder clear"})
    boardName = board_metadata.find_all("div")[1].text

    # One <tr class="inline_row"> per thread on the page
    thread_arrays = board_metadata.find_all("tr", {"class": "inline_row"})
    nm = len(thread_arrays)

    for thread in thread_arrays:
        # Unread threads use "subject_new"; already-read ones "subject_old"
        try:
            subject_text = thread.find("span", {"class": "subject_new"}).text
        except AttributeError:
            subject_text = thread.find("span", {"class": "subject_old"}).text
        topic.append(subject_text)

        # The centered cells hold the reply count (index 2) and views (index 3)
        centered_cells = thread.find_all("td", {"align": "center"})
        post.append(centered_cells[2].text)
        view.append(centered_cells[3].text)

        # Keep only the date part, dropping the time after the comma
        started_at: str = thread.find("span", {"class": "thread_start_datetime smalltext"}).text
        addDate.append(started_at.split(',')[0])

        user.append(thread.find("span", {"class": "author smalltext"}).text)

        # The reply-count cell links to the thread itself
        href.append(centered_cells[2].find('a').get('href'))

    return organizeTopics(
        forum="OnniForums",
        nm=nm,
        board=boardName,
        author=user,
        topic=topic,
        views=view,
        posts=post,
        href=href,
        addDate=addDate
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# This is the method to collect the topic links (href) from a Listing Page
|
|
|
|
def onniForums_links_parser(soup: BeautifulSoup):
    """Collect the href of every thread listed on a board listing page.

    :param soup: parsed HTML tree of one board listing page
    :return: list of thread URLs (one per listing row)
    """

    href = []
    listing = soup.find_all('tr', {'class': 'inline_row'})

    for thread in listing:
        # Already-read threads use "subject_old"; unread ones "subject_new".
        # `find` returns None when the span is absent, so only AttributeError
        # is expected here — a bare except would hide real failures.
        try:
            link = thread.find('span', {"class": "subject_old"}).find('a').get('href')
        except AttributeError:
            link = thread.find('span', {"class": "subject_new"}).find('a').get('href')

        href.append(link)

    return href
|