__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import datetime
from datetime import date
from datetime import timedelta
import re

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag


# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
def cardingleaks_description_parser(soup: Tag):
    # Fields to be parsed
    topic = "-1"     # 0 *topic name
    user = []        # 1 *all users of each post
    status = []      # 2 all users' authority in each post such as (adm, member, dangerous)
    reputation = []  # 3 all users' karma in each post (usually found as a number)
    interest = []    # 4 all users' interest in each post
    sign = []        # 5 all users' signature in each post (usually a standard message after the content of the post)
    post = []        # 6 all messages of each post
    feedback = []    # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
    addDate = []     # 8 all dates of each post

    li = soup.find("h1", {"class": "p-title-value"})
    topic = cleanString(li.text.strip())

    post_list: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all(
        "article", {"data-author": True})

    for ipost in post_list:
        username = ipost.get('data-author')
        user.append(username)

        user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text
        status.append(cleanString(user_status.strip()))

        # The user statistics are rendered as <dl> label/value pairs; only "Points" is kept as reputation
        user_statistics: ResultSet[Tag] = ipost.find("div", {"class": "message-userExtras"}).find_all(
            "dl", {"class": "pairs pairs--justified"})

        user_reputation = "-1"

        for stat in user_statistics:
            data_type = stat.find("span").get("data-original-title")
            if data_type == "Points":
                user_reputation = stat.find("dd").text
                break

        reputation.append(cleanString(user_reputation.strip()))

        # This forum does not expose user interests or signatures on the description page
        interest.append("-1")
        sign.append("-1")

        user_post = ipost.find("div", {"class": "message-content js-messageContent"}).text
        post.append(cleanString(user_post.strip()))

        feedback.append("-1")

        datetime_text = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
        datetime_obj = datetime.strptime(datetime_text, "%Y-%m-%dT%H:%M:%S%z")
        addDate.append(datetime_obj)

    # Populate the final variable (this should be a list with all fields scraped)
    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)

    # Sending the results
    return row


# This is the method to parse the Listing Pages (one page with many topics)
def cardingleaks_listing_parser(soup: Tag):
    nm = 0                  # *this variable should receive the number of topics
    forum = "Cardingleaks"  # 0 *forum name
    board = "-1"            # 1 *board name (the previous level of the topic in the Forum categorization tree.
                            # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
    author = []             # 2 *all authors of each topic
    topic = []              # 3 *all topics
    views = []              # 4 number of views of each topic
    posts = []              # 5 number of posts of each topic
    href = []               # 6 this variable should receive all cleaned urls (we will use this to do the merge between
                            # Listing and Description pages)
    addDate = []            # 7 when the topic was created (difficult to find)

    # Finding the board (should be just one)
    li = soup.find("h1", {"class": "p-title-value"})
    board = cleanString(li.text.strip())

    thread_list: ResultSet[Tag] = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all(
        "div", {"data-author": True})

    nm = len(thread_list)

    for thread in thread_list:
        thread_author = thread.get("data-author")
        author.append(thread_author)

        thread_topic = thread.find("div", {"class": "structItem-title"}).text
        topic.append(cleanString(thread_topic.strip()))

        thread_view = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find("dd").text
        # Convert an abbreviated view count (e.g., 8.8K) to a numerical string (e.g., 8800)
        if "K" in thread_view:
            thread_view = str(int(float(thread_view.replace("K", "")) * 1000))
        views.append(thread_view)

        thread_posts = thread.find("dl", {"class": "pairs pairs--justified"}).find("dd").text
        posts.append(cleanString(thread_posts.strip()))

        thread_href = thread.find("div", {"class": "structItem-title"}).find("a").get("href")
        href.append(thread_href)

        thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime")
        datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z")
        addDate.append(datetime_obj)

    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)


def cardingleaks_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find_all('div', {"class": "structItem-title"})

    for a in listing:
        link = a.find('a').get('href')
        href.append(link)

    return href
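

# The sketch below shows one way these parsers could be exercised on a locally saved page.
# It is a minimal example, not part of the scraper pipeline: the file path is hypothetical,
# and cleanString/organizeTopics are assumed to be provided by the utilities import above.
if __name__ == "__main__":
    # Parse a saved listing page and print the topic links the Crawler would visit
    with open("cardingleaks_listing.html", "r", encoding="utf-8") as f:  # hypothetical capture
        listing_soup = BeautifulSoup(f.read(), "html.parser")
    for link in cardingleaks_links_parser(listing_soup):
        print(link)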