This is based on the calsyslab project.
__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
from datetime import datetime  # needed for datetime.strptime below
from datetime import timedelta
import re

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag


# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
def libre_description_parser(soup: Tag):
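    """
    Parses a Libre Description Page (a topic and all of its posts) and returns
    the scraped fields as a single row tuple.
    """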
    # Fields to be parsed
    topic = "-1"        # 0 *topic name
    user = []           # 1 *all users of each post
    status = []         # 2 all users' authority in each post, such as (adm, member, dangerous)
    reputation = []     # 3 all users' karma in each post (usually found as a number)
    interest = []       # 4 all users' interests in each post
    sign = []           # 5 all users' signatures in each post (usually a standard message after the content of the post)
    post = []           # 6 all messages of each post
    feedback = []       # 7 all feedback of each vendor (this was found in just one forum, and in a number format)
    addDate = []        # 8 all dates of each post
    image_user = []     # 9 all user avatars of each post
    image_post = []     # 10 all first images of each post

    # Finding the topic (should be just one coming from the Listing Page)
    topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text
    topic = cleanString(topic_found.strip())

    # The original post sits in its own container, separate from the replies
    original_post: Tag = soup.find("div", {"class": "flex items-start"})

    original_user = original_post.find("div", {"class": "info-p"}).find("a", {"class": "link"}).text
    user.append(cleanString(original_user.replace("/u/", "").strip()))

    original_user_statistics: ResultSet[Tag] = original_post.find("div", {"class": "info-p"}).find_all("span")

    # The first statistics span holds the post time, prefixed by a two-character separator
    original_time = original_user_statistics[0].text[2:]
    datetime_append = datetime.strptime(original_time, "%Y-%m-%d %H:%M:%S GMT")
    addDate.append(datetime_append)

    # Slice from index 2 (as with the time above) so multi-digit karma values
    # are kept intact rather than a single character
    original_karma = original_user_statistics[1].text[2:]
    reputation.append(cleanString(original_karma.strip()))

    original_content = soup.find("div", {"class": "content-p"}).text
    post.append(cleanString(original_content.strip()))

    # Fields Libre does not expose for the original post
    status.append("-1")
    interest.append("-1")
    sign.append("-1")
    feedback.append("-1")
    image_post.append("-1")

    # Avatars are embedded as base64 data URIs, so keep only the encoded payload
    img = original_post.find('img')
    if img is not None:
        img = img.get('src').split('base64,')[-1]
    else:
        img = "-1"
    image_user.append(img)

    # Finding the repeated tag that corresponds to the listing of posts (the replies)
    posts: ResultSet[Tag] = soup.find_all("div", {"class": "flex items-stretch"})

    # For each message (post), get all the fields we are interested in:
    for ipost in posts:
        # Finding the author (user) of the post
        user_name = ipost.find("p", {"class": "text-zinc-400 text-justify"}).find("a", {"class": "link"}).text
        user.append(cleanString(user_name.replace("/u/", "").strip()))  # Remember to clean the problematic characters

        status.append("-1")

        # Finding the interest of the author
        # Libre does not have a blurb, so just assign "-1"
        interest.append("-1")

        # Finding the reputation of the user
        # Libre does have reputation (karma); it sits at a fixed position in the
        # whitespace-separated statistics line
        karma = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
        karma_cleaned = karma.split(" ")[6]
        reputation.append(cleanString(karma_cleaned.strip()))

        # The same tag also holds the post date; strip the username and the
        # surrounding separators, leaving just the timestamp
        date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
        date_time_cleaned = date_posted.replace(user_name, "")[3:-12]
        datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT")
        addDate.append(datetime_append)

        # Finding the post content
        user_post = ipost.find("div", {"class": "content-c"}).text
        post.append(cleanString(user_post))

        # As no information about the user's signature was found, just assign "-1" to the variable
        sign.append("-1")

        # As no information about the user's feedback was found, just assign "-1" to the variable
        feedback.append("-1")

        # As no information about the post's image was found, just assign "-1" to the variable
        image_post.append("-1")

        # As no information about the user's image was found, just assign "-1" to the variable
        image_user.append("-1")

    # Populate the final variable (this should be a tuple with all the fields scraped)
    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)

    # Sending the results
    return row
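
# Note: every list in the returned row is aligned by post: index 0 holds the
# original post and later indices hold the replies, while topic is a single
# string shared by all of them.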


# This is the method to parse the Listing Pages (one page with many topics)
def libre_listing_parser(soup):
    nm = 0              # *this variable should receive the number of topics
    forum = "Libre"     # 0 *forum name
    board = "-1"        # 1 *board name (the previous level of the topic in the forum categorization tree.
                        # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
    author = []         # 2 *all authors of each topic
    topic = []          # 3 *all topics
    views = []          # 4 number of views of each topic
    posts = []          # 5 number of posts of each topic
    href = []           # 6 this variable should receive all cleaned urls (we will use this to do the merge between
                        # Listing and Description pages)
    addDate = []        # 7 when the topic was created (difficult to find)
    image_author = []   # 8 all author avatars used in each topic

    # Finding the board (should be just one)
    board = soup.find('div', {"class": "title"}).find("h1").text
    board = cleanString(board.strip())

    # Finding the repeated tag that corresponds to the listing of topics
    itopics = soup.find("div", {"class": "space-y-2 mt-4"}).find_all('div', {"class": "flex box"})

    for itopic in itopics:
        nm += 1

        # For each topic found, the structure holding the rest of the information
        # can be of two types, so we test all of them in order not to miss any topic

        # Adding the topic to the topic list
        topic_string = itopic.find("a", {"class": "link text-xl text-zinc-300"}).text
        cleaned_topic_string = cleanString(topic_string.strip())
        topic.append(cleaned_topic_string)

        # As no information about the author's image was found, just assign "-1" to the variable
        image_author.append("-1")

        # Adding the url to the list of urls
        link_to_clean = itopic.find("a", {"class": "link text-xl text-zinc-300"}).get("href")
        href.append(link_to_clean)

        # Finding the author of the topic
        username_not_cleaned = itopic.find('div', {"class": "flex-grow p-2 text-justify"}).find('a').text
        username_cleaned = username_not_cleaned.split("/")[-1]
        author.append(cleanString(username_cleaned))

        # Finding the number of views
        num_views = itopic.find_all("div", {"class": "flex items-center"})[0].find("p").text
        views.append(cleanString(num_views))

        # Finding the number of replies
        num_replies = itopic.find_all("div", {"class": "flex items-center"})[1].find("p").text
        posts.append(cleanString(num_replies))

        # Finding when the topic was added: the timestamp shares its tag with the
        # username, so strip the username and the leading separator first
        date_time_concatenated = itopic.find("p", {"class": "text-sm text-zinc-400 italic"}).text
        date_time_cleaned = date_time_concatenated.replace(username_not_cleaned, "")

        # Creating the datetime object
        date_time_array = date_time_cleaned[3:]
        datetime_append = datetime.strptime(date_time_array, "%Y-%m-%d %H:%M:%S GMT")
        addDate.append(datetime_append)

    return organizeTopics(
        forum=forum,
        nm=nm,
        board=board,
        author=author,
        topic=topic,
        views=views,
        posts=posts,
        href=href,
        addDate=addDate,
        image_author=image_author
    )


def libre_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find_all('div', {"class": "flex-grow p-2 text-justify"})

    for a in listing:
        link = a.find('div', {'class': 'flex space-x-2 items-center'}).find('a').get('href')
        href.append(link)

    return href
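

# A minimal usage sketch, assuming the project's usual flow (the crawler saves
# raw HTML to disk and hands each page to these parsers). The file name below
# is a hypothetical placeholder, not part of the project.
if __name__ == "__main__":
    with open("libre_listing_sample.html", "r", encoding="utf-8") as sample:
        listing_soup = BeautifulSoup(sample.read(), "html.parser")

    # Each topic link collected here would be fetched by the crawler, and the
    # resulting Description Page passed through libre_description_parser
    for topic_link in libre_links_parser(listing_soup):
        print(topic_link)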