|
|
@ -13,17 +13,20 @@ from bs4 import BeautifulSoup, ResultSet, Tag |
|
|
|
# This method parses the Description Pages (one page for each topic in the Listing Pages) |
|
|
|
|
|
|
|
def HiddenAnswers_description_parser(soup: BeautifulSoup): |
|
|
|
topic: str = "-1" # topic name |
|
|
|
user: List[str] = [] # all users of each post |
|
|
|
addDate: List[datetime] = [] # all dates of each post |
|
|
|
feedback: List[str] = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format) |
|
|
|
status: List[str] = [] # all user's authority in each post such as (adm, member, dangerous) |
|
|
|
reputation: List[str] = [] # all user's karma in each post (usually found as a number) |
|
|
|
sign: List[str] = [] # all user's signature in each post (usually a standard message after the content of the post) |
|
|
|
post: List[str] = [] # all messages of each post |
|
|
|
interest: List[str] = [] # all user's interest in each post |
|
|
|
image = [] |
|
|
|
image_user = [] |
|
|
|
|
|
|
|
# Fields to be parsed |
|
|
|
|
|
|
|
topic: str = "-1" # 0 topic name |
|
|
|
user: List[str] = [] # 1 all users of each post |
|
|
|
addDate: List[datetime] = [] # 2 all dates of each post |
|
|
|
feedback: List[str] = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format) |
|
|
|
status: List[str] = [] # 4 all user's authority in each post such as (adm, member, dangerous) |
|
|
|
reputation: List[str] = [] # 5 all user's karma in each post (usually found as a number) |
|
|
|
sign: List[str] = [] # 6 all user's signature in each post (usually a standard message after the content of the post) |
|
|
|
post: List[str] = [] # 7 all messages of each post |
|
|
|
interest: List[str] = [] # 8 all user's interest in each post |
|
|
|
image_user = [] # 9 all user avatars of each post |
|
|
|
image_post = [] # 10 all first images of each post |
|
|
|
|
|
|
|
# Finding the topic (should be just one coming from the Listing Page) |
|
|
|
li = soup.find("h1").find("span", {"itemprop": "name"}) |
|
|
@ -60,7 +63,7 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): |
|
|
|
img = img.get('src').split('base64,')[-1] |
|
|
|
else: |
|
|
|
img = "-1" |
|
|
|
image.append(img) |
|
|
|
image_post.append(img) |
|
|
|
|
|
|
|
img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img') |
|
|
|
if img is not None: |
|
|
@ -103,7 +106,7 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): |
|
|
|
img = img.get('src').split('base64,')[-1] |
|
|
|
else: |
|
|
|
img = "-1" |
|
|
|
image.append(img) |
|
|
|
image_post.append(img) |
|
|
|
|
|
|
|
img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img') |
|
|
|
if img is not None: |
|
|
@ -114,24 +117,27 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): |
|
|
|
|
|
|
|
# Populate the final variable (this should be a tuple with all fields scraped) |
|
|
|
|
|
|
|
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) |
|
|
|
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) |
|
|
|
|
|
|
|
# Sending the results |
|
|
|
return row |
|
|
|
|
|
|
|
|
|
|
|
def HiddenAnswers_listing_parser(soup: BeautifulSoup): |
|
|
|
board = "-1" # board name (the previous level of the topic in the Forum categorization tree. |
|
|
|
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) |
|
|
|
forum: str = "HiddenAnswers" |
|
|
|
nm: int = 0 # this variable should receive the number of topics |
|
|
|
topic: List[str] = [] # all topics |
|
|
|
user: List[str] = [] # all users of each topic |
|
|
|
post: List[int] = [] # number of posts of each topic |
|
|
|
view: List[int] = [] # number of views of each topic |
|
|
|
addDate: List[str] = [] # when the topic was created (difficult to find) |
|
|
|
href: List[str] = [] # this variable should receive all cleaned urls (we will use this to do the merge between |
|
|
|
# Listing and Description pages) |
|
|
|
|
|
|
|
nm: int = 0 # this variable should receive the number of topics |
|
|
|
forum: str = "HiddenAnswers" # 0 *forum name |
|
|
|
board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree. |
|
|
|
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) |
|
|
|
user: List[str] = [] # 2 all users of each topic |
|
|
|
topic: List[str] = [] # 3 all topics |
|
|
|
view: List[int] = [] # 4 number of views of each topic |
|
|
|
post: List[int] = [] # 5 number of posts of each topic |
|
|
|
href: List[str] = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between |
|
|
|
# Listing and Description pages) |
|
|
|
addDate: List[str] = [] # 7 when the topic was created (difficult to find) |
|
|
|
image_user = [] # 8 all user avatars used in each topic |
|
|
|
|
|
|
|
# Finding the board |
|
|
|
literature = soup.find("div", {"class": "qa-main-heading"}).find("h1") |
|
|
|
board = literature.text |
|
|
@ -141,6 +147,8 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup): |
|
|
|
for queries in queries_by_user: |
|
|
|
topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text |
|
|
|
topic.append(cleanString(topic_of_query.strip())) |
|
|
|
|
|
|
|
image_user.append("-1") |
|
|
|
|
|
|
|
author = queries.find("span", {"class": "qa-q-item-who-data"}).find("a").text |
|
|
|
user.append(cleanString(author.strip())) |
|
|
@ -167,7 +175,7 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup): |
|
|
|
|
|
|
|
nm = len(topic) |
|
|
|
|
|
|
|
return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate) |
|
|
|
return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate, image_user) |
|
|
|
|
|
|
|
# TODO: this method needs to be changed |
|
|
|
def hiddenanswers_links_parser(soup): |
|
|
|