__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
from datetime import timedelta
# datetime.strptime is used by the parsers below; import the class explicitly
# instead of relying on the star import above to provide it.
from datetime import datetime
import re

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup


# Matches shorthand counts with a decimal part, e.g. "1.5K".
_DECIMAL_K = re.compile(r'(\d+)\.(\d+)K')


def _avatar_payload(img):
    """Return the avatar data held by an ``<img>`` tag, or "-1" if unavailable.

    :param img: the tag returned by ``find('img')``, or None when the avatar
        has no image element.
    :return: the part of ``src`` after the last 'base64,' marker (the whole
        ``src`` when the marker is absent); "-1" when there is no tag or no
        ``src`` attribute.
    """
    if img is None:
        return "-1"
    src = img.get('src')
    if src is None:
        return "-1"
    return src.split('base64,')[-1]


def _expand_k_suffix(text):
    """Expand the 'K' (thousands) shorthand inside *text*.

    Decimal values are scaled first ("1.5K" -> "1500"); a plain textual
    replacement would yield "1.5000". Every remaining 'K' is then replaced
    by '000', so "2K" -> "2000". Fraction digits beyond the third are
    truncated. All other characters are left untouched.
    """
    def _scale(match):
        whole, fraction = match.groups()
        # Pad/truncate the fraction to exactly three digits, then drop any
        # leading zeros by round-tripping through int.
        return str(int(whole + fraction.ljust(3, '0')[:3]))

    return _DECIMAL_K.sub(_scale, text).replace('K', '000')


# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def altenens_description_parser(soup):
    """Parse one topic (description) page into per-post field lists.

    :param soup: parsed HTML tree of the topic page (BeautifulSoup).
    :return: tuple ``(topic, user, status, reputation, interest, sign, post,
        feedback, addDate, image_user, image_post)``; every element except
        ``topic`` is a list with one entry per post. Fields that are not
        scraped here are filled with "-1".
    :raises AttributeError: when a mandatory element (title, post container,
        author, user title, message body, timestamp, avatar wrapper) is
        missing from the page.
    """

    topic = "-1"        # 0 *topic name
    user = []           # 1 *all users of each post
    status = []         # 2 all user's authority in each post such as (adm, member, dangerous)
    reputation = []     # 3 all user's karma in each post (usually found as a number)
    interest = []       # 4 all user's interest in each post
    sign = []           # 5 all user's signature in each post (usually a standard message after the content of the post)
    post = []           # 6 all messages of each post
    feedback = []       # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
    addDate = []        # 8 all dates of each post
    image_user = []     # 9 all user avatars of each post
    image_post = []     # 10 all first images of each post

    topic = soup.find("h1", {"class": "p-title-value"}).text
    topic = cleanString(topic.strip())

    body = soup.find('div', {"class": "block-container lbContainer"})
    iposts = body.find_all('article', {"class": "message message--post js-post js-inlineModContainer"})

    for ipost in iposts:

        author = ipost.find('h4', {"class": "message-name"}).text
        user.append(cleanString(author.strip()))

        stat = ipost.find('h5', {"class": "userTitle message-userTitle"}).text
        status.append(cleanString(stat.strip()))

        # The reputation bar is optional; its value lives in a data attribute.
        bar = ipost.find('div', {"class": "xtr-progress-bar"})
        if bar is not None:
            rep = bar.find('p').get('data-value')
        else:
            rep = "-1"
        reputation.append(cleanString(rep))

        interest.append("-1")

        signature = ipost.find('aside', {"class": "message-signature"})
        if signature is not None:
            signature = signature.text.strip()
        else:
            signature = "-1"
        sign.append(cleanString(signature))

        # Only the first text node that is a direct child of the message
        # wrapper is kept (recursive=False skips nested tags).
        inner = ipost.find('div', {"class": "bbWrapper"}).find(text=True, recursive=False)
        if inner is not None:
            inner = inner.strip()
        else:
            inner = ""  # cannot use -1 because the post is hidden unless you reply
        post.append(cleanString(inner))

        feedback.append("-1")

        dt = ipost.find('time', {"class": "u-dt"}).get('datetime')
        date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
        addDate.append(date_time_obj)

        img = ipost.find('div', {"class": "message-avatar-wrapper"}).find('img')
        image_user.append(_avatar_payload(img))

        image_post.append("-1")

    # Populate the final variable (this should be a list with all fields scraped)
    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)

    # Sending the results
    return row


# This is the method to parse the Listing Pages (one page with many posts)
def altenens_listing_parser(soup):
    """Parse one listing page (a board with many topics).

    :param soup: parsed HTML tree of the listing page (BeautifulSoup).
    :return: the result of ``organizeTopics`` called with the forum name, the
        number of topics, the board name and the per-topic lists (author,
        topic, views, posts, href, addDate, image_author).
    :raises AttributeError: when a mandatory element (board title, topic
        title/link, author, timestamp, reply or view counters) is missing.
    """

    nm = 0                  # *this variable should receive the number of topics
    forum = "Altenens"      # 0 *forum name
    board = "-1"            # 1 *board name (the previous level of the topic in the Forum categorization tree.
                            #   For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
    author = []             # 2 *all authors of each topic
    topic = []              # 3 *all topics
    views = []              # 4 number of views of each topic
    posts = []              # 5 number of posts of each topic
    href = []               # 6 this variable should receive all cleaned urls (we will use this to do the merge between
                            #   Listing and Description pages)
    addDate = []            # 7 when the topic was created (difficult to find)
    image_author = []       # 8 all author avatars used in each topic

    board = soup.find('h1', {"class": "p-title-value"}).text
    board = cleanString(board.strip())

    regex = re.compile('structItem structItem--thread.*')
    itopics = soup.find_all('div', {"class": regex})

    nm = len(itopics)

    for itopic in itopics:

        title_cell = itopic.find('div', {"class": "structItem-title"})
        topic.append(cleanString(title_cell.text.strip()))

        # The avatar link may be absent, or present without an <img> child;
        # both cases are recorded as "-1".
        avatar_link = itopic.find('a', {"class": "avatar avatar--s"})
        if avatar_link is not None:
            image_author.append(_avatar_payload(avatar_link.find('img')))
        else:
            image_author.append("-1")

        link = title_cell.find('a').get('href')
        href.append(link)

        user = itopic.find('ul', {"class": "structItem-parts"}).find('a').text
        author.append(cleanString(user.strip()))

        dt = itopic.find('time', {"class": "u-dt"}).get('datetime')
        date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
        addDate.append(date_time_obj)

        # Counters come as label + value text; drop the label and expand
        # the thousands shorthand.
        nposts = itopic.find('dl', {"class": "pairs pairs--justified"}).text
        nposts = _expand_k_suffix(nposts.replace('Replies', ''))
        posts.append(cleanString(nposts))

        nviews = itopic.find('dl', {"class": "pairs pairs--justified structItem-minor"}).text
        nviews = _expand_k_suffix(nviews.replace('Views', ''))
        views.append(cleanString(nviews))

    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)


def altenens_links_parser(soup):
    """Collect the topic links of a listing page for the Crawler to visit.

    :param soup: parsed HTML tree of the listing page (BeautifulSoup).
    :return: list with one ``href`` value per main cell of each listed topic.
    :raises AttributeError: when a main cell contains no matching anchor.
    """

    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find_all('div', {"class": "structItem-cell structItem-cell--main"})

    for cell in listing:
        link = cell.find('a', {"class": ""}).get('href')
        href.append(link)

    return href