# Based on the calsyslab project
__author__ = 'DarkWeb'

# Importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
# Importing datetime explicitly for the strptime calls below, in case the wildcard
# import above does not already provide it
from datetime import datetime
# Importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup

# This is the method to parse the Description Pages (one page per topic in the Listing Pages).
# It takes the HTML of a description page as a soup object, parses it for the information
# it needs, and stores that information in lists that are organized and returned.
#@param: soup object looking at the HTML page of a description page
#return: 'row', a tuple of lists that each hold info on the description page
def incogsnoo_description_parser(soup):

    # Fields to be parsed

    topic = "-1"            # 0 topic name ***$
    user = []               # 1 all users of each post ***$ author
    status = []             # 2 all users' authority in each post such as (adm, member, dangerous)
    reputation = []         # 3 all users' karma in each post (usually found as a number) ??? ups
    interest = []           # 4 all users' interests in each post
    sign = []               # 5 all users' signatures in each post (usually a standard message after the content of the post)
    post = []               # 6 all messages of each post
    feedback = []           # 7 all feedback of each user (this was found in just one forum and with a number format)
    addDate = []            # 8 all dates of each post ***$ created
    image_user = []         # 9 all user avatars of each post
    image_post = []         # 10 all first images of each post
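    # Fields this mirror does not expose are filled with the placeholder "-1",
    # keeping every list aligned at one entry per post.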

    # Finding the topic (should be just one coming from the Listing Page)
    topic = soup.find("div", {"class": "title"}).find("h2").text
    topic = topic.replace('"', '')
    topic = cleanString(topic.strip())

    # The first post's HTML is separated from all subsequent comments/replies,
    # so parse the first post by itself first.

    # Finding the body of the first post
    post_text = soup.find("div", {"class": "md"})
    if post_text:
        post_text = post_text.text.strip()
        post.append(cleanString(post_text))
    else:   # some posts are just links to other sites/articles/videos and have no text of their own
        post_link = soup.find("div", {"class": "title"}).find("a").get("href")
        post_link = cleanLink(post_link)
        post.append(post_link)

    # Finding the user
    p_tag = soup.find("p", {"class": "submitted"})
    author = p_tag.find("a")
    if author:
        author = author.text.strip()
    elif "[deleted]" in p_tag.text:
        author = "deleted"
    else:
        author = "-1"
    user.append(cleanString(author))

    # Finding the status of the author
    status.append("-1")

    # Finding the reputation of the user
    reputation.append("-1")

    # Finding the interest of the author
    interest.append("-1")

    # Finding signature
    sign.append("-1")

    # Finding feedback
    upvote = soup.find("div", {"class": "score"}).find("span")
    if upvote:
        upvote = upvote.text.strip()
    else:
        upvote = "-1"
    feedback.append(cleanString(upvote))

    # Finding the date of the post - e.g. "Fri, 18 Dec 2023 05:49:20 GMT" (abbreviated month, matching %b below)
    dt = soup.find("p", {"class": "submitted"}).find("span")["title"]
    # Convert to a datetime object - e.g. 2023-12-18 05:49:20
    date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z')
    # sdate = date_time_obj.strftime('%m %d %Y')
    # stime = date_time_obj.strftime('%I:%M %p')
    # date = convertDate(sdate, "english", datetime.now()) + " " + stime
    # e.g. "12/18/2023 05:49 AM"
    addDate.append(date_time_obj)

    image_user.append("-1")
    image_post.append("-1")

    # Each comment/reply is wrapped in a <details> element under the comments div
    posts = soup.find("div", {"class": "comments"}).findAll("details")

    # For each message (post), get all the fields we are interested in:
    for ipost in posts:

        # Finding the user
        p_tag = ipost.find("p", {"class": "author"})
        author = p_tag.find("a")
        if author:
            author = author.text.strip()
        elif "[deleted]" in p_tag.text:
            author = "deleted"
        else:
            author = "-1"
        user.append(cleanString(author))

        # Finding the status of the author
        status.append("-1")

        # Finding the reputation of the user
        reputation.append("-1")

        # Finding the interest of the author
        interest.append("-1")

        # Finding signature
        sign.append("-1")

        # Finding the post
        comment = ipost.find("div", {"class": "md"})
        if comment:
            comment = comment.text.strip()
        else:
            comment = "-1"
        post.append(cleanString(comment))

        # Finding feedback - the first token of the "ups" text is the vote count
        upvote = ipost.find("p", {"class": "ups"})
        if upvote:
            upvote = upvote.text.strip().split()[0]
        else:
            upvote = "-1"
        feedback.append(cleanString(upvote))

        # Finding the date of the post - e.g. "Fri, 18 Dec 2023 05:49:20 GMT"
        dt = ipost.find("p", {"class": "created"})["title"]
        # Convert to a datetime object - e.g. 2023-12-18 05:49:20
        date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z')
        # sdate = date_time_obj.strftime('%m %d %Y')
        # stime = date_time_obj.strftime('%I:%M %p')
        # date = convertDate(sdate, "english", datetime.now()) + " " + stime
        # e.g. "12/18/2023 05:49 AM"
        addDate.append(date_time_obj)

        image_user.append("-1")
        image_post.append("-1")

    # Populate the final variable (this should be a tuple with all the fields scraped)
    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)

    # Sending the results
    return row
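
# A minimal usage sketch, assuming a locally saved description page (the file name
# is a hypothetical placeholder; the real crawler supplies pages fetched over Tor):
def _demo_description_parse(path="incogsnoo_description.html"):
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    return incogsnoo_description_parser(soup)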

# This is the method to parse the Listing Pages (one page with many topics).
# It takes the HTML of a listing page as a soup object, parses it for the information
# it needs, and stores that information in lists that are organized and returned.
#@param: soup object looking at the HTML page of a listing page
#return: 'row' that contains a variety of lists that each hold info on the listing page
def incogsnoo_listing_parser(soup):

    nm = 0                  # *this variable should receive the number of topics
    forum = "Incogsnoo"     # 0 *forum name
    board = "-1"            # 1 *board name (the previous level of the topic in the forum categorization tree.
                            # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
    author = []             # 2 *all authors of each topic
    topic = []              # 3 *all topics
    views = []              # 4 number of views of each topic
    posts = []              # 5 number of posts of each topic
    href = []               # 6 this variable should receive all cleaned urls (we will use this to do the merge between
                            # Listing and Description pages)
    addDate = []            # 7 when the topic was created (difficult to find)
    image_author = []       # 8 all author avatars used in each topic

    # Finding the board (should be just one)
    board = soup.find("a", {"class": "subreddit"}).find("h2")
    board = cleanString(board.text.strip())

    # Finding the repeated tag that corresponds to the listing of topics
    itopics = soup.find("div", {"id": "links", "class": "sr"}).findAll("div", {"class": "link"})
    # The last matched element is not a topic entry, so drop it
    itopics.pop()

    # Counting how many topics we have found so far
    nm = len(itopics)

    for itopic in itopics:

        # Finding the author of the topic
        p_tag = itopic.find("p", {"class": "submitted"})
        user = p_tag.find("a")
        if user:
            user = user.text.strip()
        elif "[deleted]" in p_tag.text:
            user = "deleted"
        else:
            user = "-1"
        author.append(cleanString(user))

        # Adding the topic to the topic list
        topic_title = itopic.find("div", {"class": "title"}).find("h2").text
        topic.append(cleanString(topic_title))

        # Finding the number of views
        views.append("-1")

        # Finding the number of posts (the first token of the comment-link text is the count)
        comments = itopic.find("a", {"class": "comments"}).text
        number_comments = comments.split()[0]
        posts.append(cleanString(number_comments))

        # Adding the url to the list of urls
        link = itopic.find("a", {"class": "comments"}).get("href")
        href.append(link)

        # Finding the date the topic was created - e.g. "Fri, 18 Dec 2023 05:49:20 GMT"
        p_tag = itopic.find("p", {"class": "submitted"})
        dt = p_tag.find("span")["title"]
        date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z')
        # sdate = date_time_obj.strftime('%m %d %Y')
        # stime = date_time_obj.strftime('%I:%M %p')
        # date = convertDate(sdate, "english", datetime.now()) + " " + stime
        # e.g. "12/18/2023 05:49 AM"
        addDate.append(date_time_obj)

        image_author.append("-1")

    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)
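
# A minimal usage sketch, assuming a locally saved listing page (hypothetical file
# name): the returned row is whatever organizeTopics() builds from the parsed lists.
def _demo_listing_parse(path="incogsnoo_listing.html"):
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    return incogsnoo_listing_parser(soup)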

# This method is called by the crawler to get the description links on a listing page
#@param: soup object looking at the HTML page of a listing page
#return: list of description links from a listing page
def incogsnoo_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []

    listing_parent = soup.find("div", {"id": "links", "class": "sr"})
    listing = listing_parent.findAll("div", {"class": "entry"})

    for entry in listing:
        parent_div = entry.find("div", {"class": "meta"}).find("div", {"class": "links"})
        a_tag = parent_div.find("a", {"class": "comments"})
        if a_tag:
            href.append(a_tag.get("href"))

    return href
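
# A minimal end-to-end sketch, assuming a locally saved listing page (the file name
# is hypothetical; the real crawler fetches pages over Tor before calling the parsers):
if __name__ == "__main__":
    with open("incogsnoo_listing.html", "r", encoding="utf-8") as f:
        listing_soup = BeautifulSoup(f.read(), "html.parser")
    row = incogsnoo_listing_parser(listing_soup)
    # Collect the description links the crawler should visit next
    for link in incogsnoo_links_parser(listing_soup):
        print(link)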