# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
import datetime
import re

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup


# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def endchan_description_parser(soup):
    """Parse an Endchan Description (topic) page.

    :param soup: BeautifulSoup object of one topic (thread) page
    :return: tuple ``(topic, user, status, reputation, interest, sign, post,
             feedback, addDate, image_user, image_post)`` — ``topic`` is the
             thread title, every other entry is a per-post parallel list
             (original post first, then each reply).
    """

    # Fields to be parsed
    topic = "-1"        # 0 *topic name
    user = []           # 1 *all users of each post
    status = []         # 2 all user's authority in each post such as (adm, member, dangerous)
    reputation = []     # 3 all user's karma in each post (usually found as a number)
    interest = []       # 4 all user's interest in each post
    sign = []           # 5 all user's signature in each post (usually a standard message after the content of the post)
    post = []           # 6 all messages of each post
    feedback = []       # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
    addDate = []        # 8 all dates of each post
    image_user = []     # 9 all user avatars of each post
    image_post = []     # 10 all first images of each post

    # Finding the topic (should be just one coming from the Listing Page)
    entire_post = soup.find('div', {"id": "threadList"}).find('div', {"id": "divThreads"}).find('div', class_=re.compile('opCell'))
    original_post = entire_post.find('div', {"class": "innerOP"})
    post_header = original_post.find('div', {"class": "opHead"})

    topic = post_header.find('span', {"class": "labelSubject"}).text
    topic = re.sub(r"\[\w*\]", '', topic)  # strip bracketed markers (e.g. board tags) from the title
    topic = topic.replace(",", "")
    topic = topic.replace("\n", "")
    topic = cleanString(topic.strip())

    # The replies are separated from the original post, so the original post
    # is parsed first and then the repeated reply tags, reusing the helper
    # closures below for both.

    def get_user(area):
        # Finding the author name of a post
        name = area.find('a', class_=re.compile('linkName'))
        author = name.text.strip()
        user.append(cleanString(author))

    def get_post(area):
        # Finding the message content of a post
        content = area.find('div', {"class": "divMessage"})
        content = content.text.strip()
        post.append(cleanString(content))

    def get_date(area):
        # Finding the creation date of a post.
        # BUG FIX: the original called datetime.strftime on the module
        # (AttributeError) — strptime is what parses the scraped string.
        # NOTE(review): the '%m-%d-%Y' format is kept from the original;
        # confirm it matches the live site's date layout.
        dt = area.find('span', {"class": "labelCreated"}).text
        dt = dt.strip().split()
        date_time_obj = datetime.datetime.strptime(dt[0], '%m-%d-%Y')
        addDate.append(date_time_obj)

    def get_user_img(area):
        # Finding the author's avatar of a post (inline base64 payload)
        avatar_img = area.find('img', class_=re.compile('imgFlag'))
        if avatar_img is not None:
            avatar_img = avatar_img.get('src').split('base64,')[-1]
        else:
            avatar_img = "-1"
        image_user.append(avatar_img)

    def get_first_img(area):
        # Finding the first uploaded image of a post (inline base64 payload).
        # BUG FIX: guard the panelUploads lookup — posts without uploads made
        # the original chained .find(...).find(...) raise AttributeError.
        img = "-1"
        uploads = area.find('div', class_=re.compile('panelUploads'))
        if uploads is not None:
            img_cell = uploads.find('figure', {"class": "uploadCell"})
            if img_cell is not None:
                img = img_cell.find('img').get('src').split('base64,')[-1]
        image_post.append(img)

    # Endchan does not have status, blurb, reputation, signature or feedback
    def set_other_lists():
        status.append("-1")
        reputation.append("-1")
        interest.append("-1")
        sign.append("-1")
        feedback.append("-1")

    # For the original post, get all fields we are interested in
    get_user(post_header)
    get_post(original_post)
    get_date(post_header)
    get_user_img(post_header)
    get_first_img(original_post)
    # no status, interest, reputation, feedback, or signature on Endchan
    set_other_lists()

    # Finding the repeated tag that corresponds to the listing of posts
    post_replies = entire_post.find('div', {"class": "divPosts"}).find_all('div', class_=re.compile('postCell'))

    # For all replies, get all the fields we are interested in
    for ipost in post_replies:
        post_area = ipost.find('div', {"class": "innerPost"})
        get_user(post_area)
        get_date(post_area)
        get_post(post_area)
        get_first_img(post_area)
        get_user_img(post_area)
        set_other_lists()

    # Populate the final variable (this should be a list with all fields scraped)
    row = (topic, user, status, reputation, interest, sign, post, feedback,
           addDate, image_user, image_post)

    # Sending the results
    return row


# This is the method to parse the Listing Pages (one page with many topics)
def endchan_listing_parser(soup):
    """Parse an Endchan Listing (board) page.

    :param soup: BeautifulSoup object of one board listing page
    :return: result of ``organizeTopics(...)`` built from the per-topic
             parallel lists scraped below.
    """

    nm = 0              # *this variable should receive the number of topics
    forum = "Endchan"   # 0 *forum name
    board = "-1"        # 1 *board name (the previous level of the topic in the Forum categorization tree.
                        # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
    author = []         # 2 *all authors of each topic
    topic = []          # 3 *all topics
    views = []          # 4 number of views of each topic
    posts = []          # 5 number of posts of each topic
    href = []           # 6 this variable should receive all cleaned urls (we will use this to do the merge between
                        # Listing and Description pages)
    addDate = []        # 7 when the topic was created (difficult to find)
    image_author = []   # 8 all author avatars used in each topic

    # Finding the board (should be just one)
    header = soup.find("header", {"class": "boardHeader"})
    labelName = header.find("p", {"id": "labelName"}).text
    board = cleanString(labelName.strip())

    # Finding the repeated tag that corresponds to the listing of topics.
    # BUG FIX: renamed from `topics` — the original rebound this name inside
    # the loop, clobbering the very list being iterated.
    threads = soup.find('div', {"id": "threadList"}).find('div', {"id": "divThreads"}).find_all('div', class_=re.compile('opCell'))

    # Counting how many topics
    nm = len(threads)

    for itopic in threads:
        post_header = itopic.find('div', {"class": "innerOP"}).find('div', {"class": "opHead"})

        # Adding the topic to the topic list.
        # BUG FIX: the original passed the bs4 Tag itself to re.sub
        # (TypeError) — the tag's text must be extracted first.
        subject = post_header.find('span', {"class": "labelSubject"}).text
        subject = re.sub(r"\[\w*\]", '', subject)
        topic.append(cleanString(subject))

        # get author avatar (inline base64 payload)
        avatar_img = post_header.find('img', class_=re.compile('imgFlag'))
        if avatar_img is not None:
            avatar_img = avatar_img.get('src').split('base64,')[-1]
        else:
            avatar_img = "-1"
        image_author.append(avatar_img)

        # Adding the url to the list of urls
        # using linkSelf to get link, because the website is formatted differently.
        link = post_header.find('a', {"class": "linkSelf"}).get('href')
        href.append(link)

        # Finding the author of the topic.
        # BUG FIX: Tag has no .strip(); take the tag's text first.
        name = post_header.find('a', class_=re.compile('linkName'))
        user = name.text.strip()
        author.append(cleanString(user))

        # Finding the number of replies
        replies = itopic.find('div', {"class": "divPosts"}).find_all('div', class_=re.compile('postCell'))
        if replies is not None:
            num_replies = str(len(replies))
            posts.append(cleanString(num_replies))
        else:
            posts.append('-1')

        # No information on number of Views
        views.append('-1')

        # get date topic was added.
        # BUG FIX: strptime (parse), not strftime on the module.
        dt = post_header.find('span', {"class": "labelCreated"}).text
        dt = dt.strip().split()
        date_time_obj = datetime.datetime.strptime(dt[0], '%m-%d-%Y')
        addDate.append(date_time_obj)

    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)


def endchan_links_parser(soup):
    """Return all topic links on a Listing Page that the Crawler should visit.

    :param soup: BeautifulSoup object of one board listing page
    :return: list of href strings, one per topic
    """
    href = []
    listing = soup.find('div', {"id": "threadList"}).find('div', {"id": "divThreads"}).find_all('div', class_=re.compile('opCell'))

    for a in listing:
        link = a.find('a', {"class": "linkSelf"}).get('href')
        href.append(link)

    return href