Browse Source

Image tracking.

main
ericssonmarin-cpp 1 year ago
parent
commit
cfa4a1c501
4 changed files with 73 additions and 49 deletions
  1. +1
    -1
      Forums/Altenens/parser.py
  2. +25
    -16
      Forums/Cardingleaks/parser.py
  3. +13
    -6
      Forums/CryptBB/parser.py
  4. +34
    -26
      Forums/HiddenAnswers/parser.py

+ 1
- 1
Forums/Altenens/parser.py View File

@ -100,7 +100,7 @@ def altenens_listing_parser(soup):
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
image_user = [] # 9 all user avatars used in each topic
image_user = [] # 8 all user avatars used in each topic
board = soup.find('h1', {"class": "p-title-value"}).text
board = cleanString(board.strip())


+ 25
- 16
Forums/Cardingleaks/parser.py View File

@ -25,8 +25,8 @@ def cardingleaks_description_parser(soup: Tag):
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image = []
image_user = []
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
li = soup.find("h1", {"class": "p-title-value"})
topic = cleanString(li.text.strip())
@ -70,15 +70,18 @@ def cardingleaks_description_parser(soup: Tag):
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image.append(img)
image_post.append(img)
img = ipost.find('div', {"class": "message-avatar"}).find('img')
img = img.get('src').split('base64,')[-1]
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
@ -88,17 +91,18 @@ def cardingleaks_description_parser(soup: Tag):
def cardingleaks_listing_parser(soup: Tag):
nm = 0 # *this variable should receive the number of topics
nm = 0 # *this variable should receive the number of topics
forum = "Cardingleaks" # 0 *forum name
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
topic = [] # 3 *all topics
views = [] # 4 number of views of each topic
posts = [] # 5 number of posts of each topic
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
image_user = [] # 8 all user avatars used in each topic
# Finding the board (should be just one)
@ -115,6 +119,11 @@ def cardingleaks_listing_parser(soup: Tag):
thread_topic = thread.find("div", {"class": "structItem-title"}).text
topic.append(cleanString(thread_topic.strip()))
author_icon = thread.find("a", {"class": "avatar avatar--s"}).find("img")
author_icon = author_icon.get('src')
author_icon = author_icon.split('base64,')[-1]
image_user.append(author_icon)
thread_view = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find("dd").text
# Convert text view count (i.e., 8.8K) to numerical (i.e., 8800)
@ -132,7 +141,7 @@ def cardingleaks_listing_parser(soup: Tag):
datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z")
addDate.append(datetime_obj)
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_user)
def cardingleaks_links_parser(soup):


+ 13
- 6
Forums/CryptBB/parser.py View File

@ -25,8 +25,8 @@ def cryptBB_description_parser(soup):
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image = []
image_user = []
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
# Finding the topic (should be just one coming from the Listing Page)
@ -162,15 +162,18 @@ def cryptBB_description_parser(soup):
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image.append(img)
image_post.append(img)
img = ipost.find('div', {"class": "author_avatar"}).find('img')
img = img.get('src').split('base64,')[-1]
if img is not None:
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image_user.append(img)
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
@ -191,6 +194,8 @@ def cryptBB_listing_parser(soup):
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
image_user = [] # 8 all user avatars used in each topic
# Finding the board (should be just one)
@ -214,6 +219,8 @@ def cryptBB_listing_parser(soup):
topics = re.sub("\[\w*\]", '', topics)
topic.append(cleanString(topics))
image_user.append(-1)
# Counting how many topics we have found so far
nm = len(topic)
@ -244,7 +251,7 @@ def cryptBB_listing_parser(soup):
addDate.append("-1")
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_user)
def cryptBB_links_parser(soup):


+ 34
- 26
Forums/HiddenAnswers/parser.py View File

@ -13,17 +13,20 @@ from bs4 import BeautifulSoup, ResultSet, Tag
# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
def HiddenAnswers_description_parser(soup: BeautifulSoup):
topic: str = "-1" # topic name
user: List[str] = [] # all users of each post
addDate: List[datetime] = [] # all dates of each post
feedback: List[str] = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format)
status: List[str] = [] # all user's authority in each post such as (adm, member, dangerous)
reputation: List[str] = [] # all user's karma in each post (usually found as a number)
sign: List[str] = [] # all user's signature in each post (usually a standard message after the content of the post)
post: List[str] = [] # all messages of each post
interest: List[str] = [] # all user's interest in each post
image = []
image_user = []
# Fields to be parsed
topic: str = "-1" # 0 topic name
user: List[str] = [] # 1 all users of each post
addDate: List[datetime] = [] # 2 all dates of each post
feedback: List[str] = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format)
status: List[str] = [] # 4 all user's authority in each post such as (adm, member, dangerous)
reputation: List[str] = [] # 5 all user's karma in each post (usually found as a number)
sign: List[str] = [] # 6 all user's signature in each post (usually a standard message after the content of the post)
post: List[str] = [] # 7 all messages of each post
interest: List[str] = [] # 8 all user's interest in each post
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
# Finding the topic (should be just one coming from the Listing Page)
li = soup.find("h1").find("span", {"itemprop": "name"})
@ -60,7 +63,7 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image.append(img)
image_post.append(img)
img = question.find('span', {"class": "qa-q-view-avatar-meta"}).find('img')
if img is not None:
@ -103,7 +106,7 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
img = img.get('src').split('base64,')[-1]
else:
img = "-1"
image.append(img)
image_post.append(img)
img = replies.find('span', {"class": "qa-a-item-avatar-meta"}).find('img')
if img is not None:
@ -114,24 +117,27 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
return row
def HiddenAnswers_listing_parser(soup: BeautifulSoup):
board = "-1" # board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
forum: str = "HiddenAnswers"
nm: int = 0 # this variable should receive the number of topics
topic: List[str] = [] # all topics
user: List[str] = [] # all users of each topic
post: List[int] = [] # number of posts of each topic
view: List[int] = [] # number of views of each topic
addDate: List[str] = [] # when the topic was created (difficult to find)
href: List[str] = [] # this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
nm: int = 0 # this variable should receive the number of topics
forum: str = "HiddenAnswers" # 0 *forum name
board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
user: List[str] = [] # 2 all users of each topic
topic: List[str] = [] # 3 all topics
view: List[int] = [] # 4 number of views of each topic
post: List[int] = [] # 5 number of posts of each topic
href: List[str] = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages)
addDate: List[str] = [] # 7 when the topic was created (difficult to find)
image_user = [] # 8 all user avatars used in each topic
# Finding the board
literature = soup.find("div", {"class": "qa-main-heading"}).find("h1")
board = literature.text
@ -141,6 +147,8 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
for queries in queries_by_user:
topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text
topic.append(cleanString(topic_of_query.strip()))
image_user.append("-1")
author = queries.find("span", {"class": "qa-q-item-who-data"}).find("a").text
user.append(cleanString(author.strip()))
@ -167,7 +175,7 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
nm = len(topic)
return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate)
return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate, image_user)
#need to change this method
def hiddenanswers_links_parser(soup):


Loading…
Cancel
Save