__author__ = 'DarkWeb'

# Auxiliary functions to clean or convert scraped data
from typing import List
from Forums.Utilities.utilities import *
from datetime import date
from datetime import datetime
from datetime import timedelta
import re
import string

# BeautifulSoup is used to search through the HTML tree
from bs4 import BeautifulSoup


def onniForums_description_parser(soup: BeautifulSoup) -> tuple:
    """Parse an OnniForums Description Page (one page per topic).

    Walks every post on the topic page and collects per-post author data,
    content and dates.

    :param soup: parsed HTML of a single topic (description) page
    :return: tuple of (topicName, users, statuses, reputations, interests,
             signs, posts, feedbacks, addDates); fields this forum does not
             expose (interests, signs, feedbacks) are filled with "-1".
    """
    topicName: str = "-1"               # 0 *topic name
    users: List[str] = []               # 1 *all users of each post
    statuses: List[str] = []            # 2 user's authority in each post (adm, member, banned, ...)
    reputations: List[str] = []         # 3 user's karma in each post (usually a number)
    interests: List[str] = []           # 4 user's interests (not present on this forum)
    signs: List[str] = []               # 5 user's signature (not present on this forum)
    posts: List[str] = []               # 6 all messages of each post
    feedbacks: List[str] = []           # 7 vendor feedback (not present on this forum)
    addDates: List[datetime] = []       # 8 date of each post

    # The topic name lives in the last <div> of the page-header cell.
    topicName = soup.find("table", {"class": "tborder tfixed clear"}) \
        .find("td", {"class": "thead"}) \
        .find_all("div")[-1].text
    topicName = cleanString(topicName.strip())

    topics_array = soup.find_all("div", {"class": "post"})

    for topic in topics_array:
        # Extracting and cleaning author information
        author_information: BeautifulSoup = topic.find("div", {"class": "author_information"})

        username: str = author_information.find("span", {"class": "largetext"}).text
        users.append(cleanString(username.strip()))

        user_status: str = author_information.find("span", {"class": "smalltext"}).text

        # Banned users often have weird text issues in their HTML, so we
        # detect them and record a fixed label instead of the raw text.
        # (Substring test rather than find() > 0 so a match at index 0 counts.)
        if "Banned" in user_status:
            user_status_cleaned = "Banned"
        elif "Unregistered" in user_status:
            user_status_cleaned = "Unregistered"
        else:
            # Remove excessive spaces in string
            user_status_cleaned = cleanString(user_status.strip())

        statuses.append(user_status_cleaned)

        # Banned/unregistered accounts have no statistics panel to read.
        if user_status_cleaned in ['Unregistered', 'Banned']:
            # NOTE(review): this sentinel is an int while the other "missing"
            # sentinels in this file are the string "-1" — confirm downstream
            # consumers before unifying.
            reputations.append(-1)
        else:
            author_statistics: BeautifulSoup = topic.find("div", {"class": "author_statistics"})
            reputation: str = author_statistics.find_all("div", {"class": "float_right"})[-1].text
            reputations.append(cleanString(reputation.strip()))

        # These fields don't exist on this forum; append the "-1" sentinel.
        interests.append("-1")
        signs.append("-1")

        post_content: str = topic.find("div", {"class": "post_body scaleimages"}).text
        # Strip the hidden-content placeholder and excessive whitespace.
        post_content_cleaned = post_content.replace("[You must reply to view this hidden content]", "")
        post_content_cleaned = cleanString(post_content_cleaned.strip())
        posts.append(post_content_cleaned)

        # Feedbacks don't exist on this forum; append the "-1" sentinel.
        feedbacks.append("-1")

        # Dates come either as relative text ("Yesterday", "N hours ago",
        # "N minutes ago") or as an absolute MM-DD-YYYY date.
        date_posted: str = topic.find("span", {"class": "post_date"}).text
        date_posted_cleaned = cleanString(date_posted.split(",")[0])

        today = datetime.now()

        if date_posted_cleaned == 'Yesterday':
            date_object = today - timedelta(days=1)
        elif date_posted_cleaned.find('hour') > 0:
            hours_ago = int(date_posted_cleaned.split(' ')[0])
            date_object = today - timedelta(hours=hours_ago)
        elif date_posted_cleaned.find('minute') > 0:
            minutes_ago = int(date_posted_cleaned.split(' ')[0])
            date_object = today - timedelta(minutes=minutes_ago)
        else:
            date_object = datetime.strptime(date_posted_cleaned, "%m-%d-%Y")

        addDates.append(date_object)

    # Populate the final variable (this should be a list with all fields scraped)
    row = (topicName, users, statuses, reputations, interests, signs, posts,
           feedbacks, addDates)

    # Sending the results
    return row


def onniForums_listing_parser(soup: BeautifulSoup):
    """Parse an OnniForums Listing Page (one board page with many topics).

    :param soup: parsed HTML of a board (listing) page
    :return: result of organizeTopics(...) aggregating forum name, board
             name, topic count and the per-topic arrays.
    """
    # board name: the previous level of the topic in the Forum categorization
    # tree. For instance: Security/Malware/Tools to hack Facebook — the board
    # here would be Malware.
    boardName = "-1"
    forum = "OnniForums"
    nm = 0                      # number of topics on this page
    topic: List[str] = []       # all topics
    user: List[str] = []        # author of each topic
    post: List[int] = []        # number of posts of each topic
    view: List[int] = []        # number of views of each topic
    addDate: List[str] = []     # when the topic was created (difficult to find)
    href: List[str] = []        # cleaned urls, used to merge Listing and Description pages

    # Finding the board (should be just one)
    board_metadata: BeautifulSoup = soup.find("table", {"class": "tborder clear"})

    boardName = board_metadata.find_all("div")[1].text
    boardName = cleanString(boardName.strip())

    thread_arrays = board_metadata.find_all("tr", {"class": "inline_row"})  # gets the information of posts

    nm = len(thread_arrays)

    for thread in thread_arrays:  # getting the information from the posts and sorting them into the arrays defined above
        # Unread threads are tagged subject_new; already-read ones subject_old.
        # A missing span yields None, so .text raises AttributeError.
        body = thread.find("span", {"class": "subject_new"})
        try:
            post_subject: str = body.text  # getting the topic
        except AttributeError:
            body = thread.find("span", {"class": "subject_old"})
            post_subject: str = body.text

        post_subject_cleaned = cleanString(post_subject.strip())
        topic.append(post_subject_cleaned)

        # Columns 2 and 3 of the centered cells hold replies and views.
        reply_count = thread.find_all("td", {"align": "center"})[2].text
        post.append(cleanNumbers(reply_count))

        views = thread.find_all("td", {"align": "center"})[3].text
        view.append(cleanNumbers(views))

        # dates_added: str = thread.find("span", {"class": "thread_start_datetime smalltext"}).text
        # dates_added_cleaned = dates_added.split(',')[0]
        # addDate.append(dates_added_cleaned)

        author = thread.find("span", {"class": "author smalltext"}).text
        author_cleaned = cleanString(author.strip())
        user.append(author_cleaned)

        thread_link = body.find('a').get('href')
        href.append(thread_link)

    return organizeTopics(
        forum=forum,
        nm=nm,
        board=boardName,
        author=user,
        topic=topic,
        views=view,
        posts=post,
        href=href,
        addDate=addDate
    )


def onniForums_links_parser(soup: BeautifulSoup):
    """Collect the topic URLs from a Listing Page (one page with many posts).

    :param soup: parsed HTML of a board (listing) page
    :return: list of href strings, one per thread row.
    """
    href = []
    listing = soup.find_all('tr', {'class': 'inline_row'})

    for thread in listing:
        # Read threads use subject_old; fall back to subject_new for unread
        # ones (find() returns None -> AttributeError on .find('a')).
        try:
            link = thread.find('span', {"class": "subject_old"}).find('a').get('href')
        except AttributeError:
            link = thread.find('span', {"class": "subject_new"}).find('a').get('href')
        href.append(link)

    return href