diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py index 1d274ac..70abf79 100644 --- a/Forums/Altenens/parser.py +++ b/Forums/Altenens/parser.py @@ -100,7 +100,7 @@ def altenens_listing_parser(soup): href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between # Listing and Description pages) addDate = [] # 7 when the topic was created (difficult to find) - image_user = [] # 9 all user avatars used in each topic + image_user = [] # 8 all user avatars used in each topic board = soup.find('h1', {"class": "p-title-value"}).text board = cleanString(board.strip()) diff --git a/Forums/Cardingleaks/parser.py b/Forums/Cardingleaks/parser.py index 4d0b345..022fbe1 100644 --- a/Forums/Cardingleaks/parser.py +++ b/Forums/Cardingleaks/parser.py @@ -25,8 +25,8 @@ def cardingleaks_description_parser(soup: Tag): post = [] # 6 all messages of each post feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) addDate = [] # 8 all dates of each post - image = [] - image_user = [] + image_user = [] # 9 all user avatars of each post + image_post = [] # 10 all first images of each post li = soup.find("h1", {"class": "p-title-value"}) topic = cleanString(li.text.strip()) @@ -70,15 +70,18 @@ def cardingleaks_description_parser(soup: Tag): img = img.get('src').split('base64,')[-1] else: img = "-1" - image.append(img) + image_post.append(img) img = ipost.find('div', {"class": "message-avatar"}).find('img') - img = img.get('src').split('base64,')[-1] + if img is not None: + img = img.get('src').split('base64,')[-1] + else: + img = "-1" image_user.append(img) # Populate the final variable (this should be a list with all fields scraped) - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) + row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) # Sending the results @@ -88,17 +91,18 @@ def 
cardingleaks_description_parser(soup: Tag): def cardingleaks_listing_parser(soup: Tag): - nm = 0 # *this variable should receive the number of topics + nm = 0 # *this variable should receive the number of topics forum = "Cardingleaks" # 0 *forum name - board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) - author = [] # 2 *all authors of each topic - topic = [] # 3 *all topics - views = [] # 4 number of views of each topic - posts = [] # 5 number of posts of each topic - href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between - # Listing and Description pages) - addDate = [] # 7 when the topic was created (difficult to find) + board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) + author = [] # 2 *all authors of each topic + topic = [] # 3 *all topics + views = [] # 4 number of views of each topic + posts = [] # 5 number of posts of each topic + href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between + # Listing and Description pages) + addDate = [] # 7 when the topic was created (difficult to find) + image_user = [] # 8 all user avatars used in each topic # Finding the board (should be just one) @@ -115,6 +119,11 @@ def cardingleaks_listing_parser(soup: Tag): thread_topic = thread.find("div", {"class": "structItem-title"}).text topic.append(cleanString(thread_topic.strip())) + + author_icon = thread.find("a", {"class": "avatar avatar--s"}).find("img") + author_icon = author_icon.get('src') + author_icon = author_icon.split('base64,')[-1] + image_user.append(author_icon) thread_view = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find("dd").text # Context text view count (i.e.,
8.8K) to numerical (i.e., 8800) @@ -132,7 +141,7 @@ def cardingleaks_listing_parser(soup: Tag): datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z") addDate.append(datetime_obj) - return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) + return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_user) def cardingleaks_links_parser(soup): diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py index c137336..5b83ab5 100644 --- a/Forums/CryptBB/parser.py +++ b/Forums/CryptBB/parser.py @@ -25,8 +25,8 @@ def cryptBB_description_parser(soup): post = [] # 6 all messages of each post feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) addDate = [] # 8 all dates of each post - image = [] - image_user = [] + image_user = [] # 9 all user avatars of each post + image_post = [] # 10 all first images of each post # Finding the topic (should be just one coming from the Listing Page) @@ -162,15 +162,18 @@ def cryptBB_description_parser(soup): img = img.get('src').split('base64,')[-1] else: img = "-1" - image.append(img) + image_post.append(img) img = ipost.find('div', {"class": "author_avatar"}).find('img') - img = img.get('src').split('base64,')[-1] + if img is not None: + img = img.get('src').split('base64,')[-1] + else: + img = "-1" image_user.append(img) # Populate the final variable (this should be a list with all fields scraped) - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) + row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) # Sending the results @@ -191,6 +194,8 @@ def cryptBB_listing_parser(soup): href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between # Listing and Description pages) addDate = [] # 7 when the topic was created (difficult to find) + image_user = [] # 8 all user avatars used in each 
topic + # Finding the board (should be just one) @@ -214,6 +219,8 @@ def cryptBB_listing_parser(soup): topics = re.sub("\[\w*\]", '', topics) topic.append(cleanString(topics)) + image_user.append("-1") + # Counting how many topics we have found so far nm = len(topic) @@ -244,7 +251,7 @@ def cryptBB_listing_parser(soup): addDate.append("-1") - return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) + return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_user) def cryptBB_links_parser(soup): diff --git a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py index bcf8e33..995a7f0 100644 --- a/Forums/HiddenAnswers/parser.py +++ b/Forums/HiddenAnswers/parser.py @@ -13,17 +13,20 @@ from bs4 import BeautifulSoup, ResultSet, Tag # This is the method to parse the Description Pages (one page to each topic in the Listing Pages) def HiddenAnswers_description_parser(soup: BeautifulSoup): - topic: str = "-1" # topic name - user: List[str] = [] # all users of each post - addDate: List[datetime] = [] # all dated of each post - feedback: List[str] = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format) - status: List[str] = [] # all user's authority in each post such as (adm, member, dangerous) - reputation: List[str] = [] # all user's karma in each post (usually found as a number) - sign: List[str] = [] # all user's signature in each post (usually a standard message after the content of the post) - post: List[str] = [] # all messages of each post - interest: List[str] = [] # all user's interest in each post - image = [] - image_user = [] + + # Fields to be parsed + + topic: str = "-1" # 0 topic name + user: List[str] = [] # 1 all users of each post + addDate: List[datetime] = [] # 2 all dates of each post + feedback: List[str] = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format) + status: List[str] = [] # 4 all user's
authority in each post such as (adm, member, dangerous) + reputation: List[str] = [] # 5 all user's karma in each post (usually found as a number) + sign: List[str] = [] # 6 all user's signature in each post (usually a standard message after the content of the post) + post: List[str] = [] # 7 all messages of each post + interest: List[str] = [] # 8 all user's interest in each post + image_user = [] # 9 all user avatars of each post + image_post = [] # 10 all first images of each post # Finding the topic (should be just one coming from the Listing Page) li = soup.find("h1").find("span", {"itemprop": "name"}) @@ -60,7 +63,7 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): img = img.get('src').split('base64,')[-1] else: img = "-1" - image.append(img) + image_post.append(img) img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img') if img is not None: @@ -103,7 +106,7 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): img = img.get('src').split('base64,')[-1] else: img = "-1" - image.append(img) + image_post.append(img) img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img') if img is not None: @@ -114,24 +117,27 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): # Populate the final variable (this should be a list with all fields scraped) - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) + row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) # Sending the results return row def HiddenAnswers_listing_parser(soup: BeautifulSoup): - board = "-1" # board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. 
The board here should be Malware) - forum: str = "HiddenAnswers" - nm: int = 0 # this variable should receive the number of topics - topic: List[str] = [] # all topics - user: List[str] = [] # all users of each topic - post: List[int] = [] # number of posts of each topic - view: List[int] = [] # number of views of each topic - addDate: List[str] = [] # when the topic was created (difficult to find) - href: List[str] = [] # this variable should receive all cleaned urls (we will use this to do the merge between - # Listing and Description pages) + + nm: int = 0 # this variable should receive the number of topics + forum: str = "HiddenAnswers" # 0 *forum name + board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) + user: List[str] = [] # 2 all users of each topic + topic: List[str] = [] # 3 all topics + view: List[int] = [] # 4 number of views of each topic + post: List[int] = [] # 5 number of posts of each topic + href: List[str] = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between + # Listing and Description pages) + addDate: List[str] = [] # 7 when the topic was created (difficult to find) + image_user = [] # 8 all user avatars used in each topic + # Finding the board literature = soup.find("div", {"class": "qa-main-heading"}).find("h1") board = literature.text @@ -141,6 +147,8 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup): for queries in queries_by_user: topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text topic.append(cleanString(topic_of_query.strip())) + + image_user.append("-1") author = queries.find("span", {"class": "qa-q-item-who-data"}).find("a").text user.append(cleanString(author.strip())) @@ -167,7 +175,7 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup): nm = len(topic) - return organizeTopics(forum, nm, board, user, topic, view, 
post, href, addDate) + return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate, image_user) #need to change this method def hiddenanswers_links_parser(soup):