__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from typing import List
from Forums.Utilities.utilities import *
from datetime import datetime, timedelta

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup

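# NOTE: cleanString, cleanNumbers, and organizeTopics come from the star import
# above (Forums.Utilities.utilities). To run this module outside the full
# project, a minimal stand-in sketch (an assumption, not the project's actual
# implementation) could be:
#
#   def cleanString(s: str) -> str:
#       return " ".join(s.split())  # collapse runs of whitespace
#
#   def cleanNumbers(s: str) -> int:
#       digits = "".join(c for c in s if c.isdigit())  # keep digits only
#       return int(digits) if digits else 0
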
# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
def onniForums_description_parser(soup: BeautifulSoup) -> tuple:

    topicName: str = "-1"          # 0 *topic name
    users: List[str] = []          # 1 *all users of each post
    statuses: List[str] = []       # 2 each user's authority in each post, such as (adm, member, dangerous)
    reputations: List[str] = []    # 3 each user's karma in each post (usually found as a number)
    interests: List[str] = []      # 4 each user's interests in each post
    signs: List[str] = []          # 5 each user's signature in each post (usually a standard message after the content of the post)
    posts: List[str] = []          # 6 all messages of each post
    feedbacks: List[str] = []      # 7 all feedback of each vendor (this was found in just one forum, in a number format)
    addDates: List[datetime] = []  # 8 all dates of each post

    # Getting the topicName
    topicName = soup.find("table", {"class": "tborder tfixed clear"}) \
        .find("td", {"class": "thead"}) \
        .find_all("div")[-1].text

    topicName = cleanString(topicName.strip())

    topics_array = soup.find_all("div", {"class": "post"})

    for topic in topics_array:
        # Extracting and cleaning author information
        author_information: BeautifulSoup = topic.find("div", {"class": "author_information"})

        username: str = author_information.find("span", {"class": "largetext"}).text
        username_cleaned = cleanString(username.strip())
        users.append(username_cleaned)

        user_status: str = author_information.find("span", {"class": "smalltext"}).text

        # Banned users often have weird text issues in their HTML,
        # so we detect banned users and give them a unique string
        if "Banned" in user_status:
            user_status_cleaned = "Banned"
        elif "Unregistered" in user_status:
            user_status_cleaned = "Unregistered"
        else:
            user_status_cleaned = cleanString(user_status.strip())  # Remove excessive spaces in the string

        # Add the cleaned data into the array
        statuses.append(user_status_cleaned)

        if user_status_cleaned in ['Unregistered', 'Banned']:
            reputations.append("-1")
        else:
            author_statistics: BeautifulSoup = topic.find("div", {"class": "author_statistics"})

            reputation: str = author_statistics.find_all("div", {"class": "float_right"})[-1].text
            reputation_cleaned = cleanString(reputation.strip())
            reputations.append(reputation_cleaned)

        # Append a "-1" to the `interests` and `signs` arrays since those fields don't exist on this forum
        interests.append("-1")
        signs.append("-1")

        post_content: str = topic.find("div", {"class": "post_body scaleimages"}).text
        # Clean the post content of excessive spaces and boilerplate characters
        post_content_cleaned = post_content.replace("[You must reply to view this hidden content]", "")
        post_content_cleaned = cleanString(post_content_cleaned.strip())
        posts.append(post_content_cleaned)

        # Append a "-1" to the `feedbacks` array since that field doesn't exist on this forum
        feedbacks.append("-1")

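        # Post dates are either relative ("Yesterday", "2 hours ago",
        # "30 minutes ago") or absolute in "%m-%d-%Y" form; the block below
        # normalizes both to datetime objects. (The sample strings here are
        # illustrative assumptions, not captured from a live page.)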
        date_posted: str = topic.find("span", {"class": "post_date"}).text
        date_posted_cleaned = cleanString(date_posted.split(",")[0])

        today = datetime.now()

        if date_posted_cleaned == 'Yesterday':
            date_object = today - timedelta(days=1)

        elif 'hour' in date_posted_cleaned:
            hours_ago = int(date_posted_cleaned.split(' ')[0])
            date_object = today - timedelta(hours=hours_ago)

        elif 'minute' in date_posted_cleaned:
            minutes_ago = int(date_posted_cleaned.split(' ')[0])
            date_object = today - timedelta(minutes=minutes_ago)

        else:
            date_object = datetime.strptime(date_posted_cleaned, "%m-%d-%Y")

        addDates.append(date_object)

    # TESTING PURPOSES - DO NOT REMOVE

    # Populate the final variable (this should be a tuple with all the fields scraped)
    row = (topicName, users, statuses, reputations, interests, signs, posts, feedbacks, addDates)

    # Sending the results
    return row
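

# A minimal usage sketch for the description parser; the file name is a
# hypothetical example (in the real project the crawler supplies the HTML):
#
#   with open("onniforums_topic.html", "r", encoding="utf-8") as f:
#       soup = BeautifulSoup(f.read(), "html.parser")
#   row = onniForums_description_parser(soup)
#   topicName, users, statuses = row[0], row[1], row[2]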


# This is the method to parse the Listing Pages (one board page with many topics)
def onniForums_listing_parser(soup: BeautifulSoup):

    boardName = "-1"  # board name (the previous level of the topic in the forum categorization tree.
                      # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
    forum = "OnniForums"
    nm = 0                   # this variable should receive the number of topics
    topic: List[str] = []    # all topics
    user: List[str] = []     # all users of each topic
    post: List[int] = []     # number of posts of each topic
    view: List[int] = []     # number of views of each topic
    addDate: List[str] = []  # when the topic was created (difficult to find)
    href: List[str] = []     # this variable should receive all cleaned urls (we will use this to do the merge
                             # between Listing and Description pages)

    # Finding the board (should be just one)
    board_metadata: BeautifulSoup = soup.find("table", {"class": "tborder clear"})

    boardName = board_metadata.find_all("div")[1].text
    boardName = cleanString(boardName.strip())

    thread_arrays = board_metadata.find_all("tr", {"class": "inline_row"})  # gets the information of the topics

    nm = len(thread_arrays)

    for thread in thread_arrays:  # getting the information from the topics and sorting it into the arrays defined above

        body = thread.find("span", {"class": "subject_new"})
        try:
            post_subject: str = body.text  # getting the topic
        except AttributeError:  # `body` is None when the topic has no unread posts
            body = thread.find("span", {"class": "subject_old"})
            post_subject: str = body.text

        post_subject_cleaned = cleanString(post_subject.strip())
        topic.append(post_subject_cleaned)

        reply_count = thread.find_all("td", {"align": "center"})[2].text
        post.append(cleanNumbers(reply_count))

        views = thread.find_all("td", {"align": "center"})[3].text
        view.append(cleanNumbers(views))

        # dates_added: str = thread.find("span", {"class": "thread_start_datetime smalltext"}).text
        # dates_added_cleaned = dates_added.split(',')[0]
        # addDate.append(dates_added_cleaned)

        author = thread.find("span", {"class": "author smalltext"}).text
        author_cleaned = cleanString(author.strip())
        user.append(author_cleaned)

        thread_link = body.find('a').get('href')
        href.append(thread_link)

    return organizeTopics(
        forum=forum,
        nm=nm,
        board=boardName,
        author=user,
        topic=topic,
        views=view,
        posts=post,
        href=href,
        addDate=addDate
    )
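

# organizeTopics is another Forums.Utilities.utilities helper (reached through
# the star import) that packs the parallel lists into the project's row format.
# A hedged usage sketch, assuming a saved board page with this file name:
#
#   with open("onniforums_board.html", "r", encoding="utf-8") as f:
#       soup = BeautifulSoup(f.read(), "html.parser")
#   listing_row = onniForums_listing_parser(soup)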


# This is the method to extract the description-page (topic) links from a Listing Page
def onniForums_links_parser(soup: BeautifulSoup):

    href = []
    listing = soup.find_all('tr', {'class': 'inline_row'})

    for thread in listing:
        try:
            link = thread.find('span', {"class": "subject_old"}).find('a').get('href')
        except AttributeError:  # unread topics use "subject_new" instead
            link = thread.find('span', {"class": "subject_new"}).find('a').get('href')

        href.append(link)

    return href
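

# A small smoke test for the parsers above. The command-line flow and file
# handling are assumptions for illustration; in production the crawler drives
# these functions and merges the listing and description results.
if __name__ == "__main__":
    import sys

    with open(sys.argv[1], "r", encoding="utf-8") as f:
        page = BeautifulSoup(f.read(), "html.parser")

    # Print the description-page links found on a listing page
    for link in onniForums_links_parser(page):
        print(link)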