# This parser is based on the CalSysLab project.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

188 lines
7.2 KiB

__author__ = 'Helium'

# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date, datetime, timedelta
import re

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag
  9. # This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
  10. def procrax_description_parser(soup: Tag):
  11. # Fields to be parsed
  12. topic = "-1" # 0 topic name
  13. user = [] # 1 all users of each post
  14. addDate = [] # 2 all dated of each post
  15. feedback = [] # 3 all feedbacks of each vendor (this was found in just one Forum and with a number format)
  16. status = [] # 4 all user's authority in each post such as (adm, member, dangerous)
  17. reputation = [] # 5 all user's karma in each post (usually found as a number)
  18. sign = [] # 6 all user's signature in each post (usually a standard message after the content of the post)
  19. post = [] # 7 all messages of each post
  20. interest = [] # 8 all user's interest in each post
  21. image_user = [] # 9 all user avatars of each post
  22. image_post = [] # 10 all first images of each post
  23. # Finding the topic (should be just one coming from the Listing Page)
  24. li = soup.find("h1", {"class": "p-title-value"})
  25. topic = li.text
  26. thread: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True})
  27. for ipost in thread:
  28. username = ipost.find("h4", {"class": "message-name"}).text
  29. user.append(cleanString(username.strip()))
  30. date_posted = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime")
  31. datetime_obj = datetime.strptime(date_posted, "%Y-%m-%dT%H:%M:%S%z")
  32. addDate.append(datetime_obj)
  33. feedback.append("-1")
  34. user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text
  35. status.append(cleanString(user_status.strip()))
  36. user_lvl = ipost.find("div", {"class": "afAwardLevel"})
  37. if user_lvl is not None:
  38. user_lvl = user_lvl.text
  39. reputation.append(cleanString(user_lvl.strip()))
  40. else:
  41. reputation.append('-1')
  42. sign.append("-1")
  43. user_post = ipost.find("article", {"class": "message-body js-selectToQuote"}).text
  44. post.append(cleanString(user_post.strip()))
  45. interest.append("-1")
  46. bbWrapper = ipost.find('div', {"class": "bbWrapper"})
  47. if bbWrapper is not None:
  48. img = bbWrapper.find('img')
  49. if img is not None:
  50. img = img.get('src').split('base64,')[-1]
  51. else:
  52. img = "-1"
  53. else:
  54. img = "-1"
  55. image_post.append(img)
  56. avatar = ipost.find("a", {"class": "avatar avatar--m"})
  57. if avatar is not None:
  58. img = avatar.find('img')
  59. if img is not None:
  60. img = img.get('src').split('base64,')[-1]
  61. else:
  62. img = "-1"
  63. else:
  64. img = "-1"
  65. image_user.append(img)
  66. # Populate the final variable (this should be a list with all fields scraped)
  67. row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
  68. # Sending the results
  69. return row
  70. # This is the method to parse the Listing Pages (one page with many posts)
  71. def procrax_listing_parser(soup: Tag):
  72. nm = 0 # this variable should receive the number of topics
  73. forum: str = "Procrax" # 0 *forum name
  74. board = "-1" # 1 board name (the previous level of the topic in the Forum categorization tree.
  75. # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
  76. author = [] # 2 all authors of each topic
  77. topic = [] # 3 all topics
  78. views = [] # 4 number of views of each topic
  79. posts = [] # 5 number of posts of each topic
  80. href = [] # 6this variable should receive all cleaned urls (we will use this to do the marge between
  81. # Listing and Description pages)
  82. addDate = [] # 7 when the topic was created (difficult to find)
  83. image_author = [] # 8 all author avatars used in each topic
  84. # Finding the board (should be just one)
  85. li = soup.find("h1", {"class": "p-title-value"})
  86. board = cleanString(li.text.strip())
  87. threads_list = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True})
  88. sticky = soup.find("div", {"class": "structItemContainer-group structItemContainer-group--sticky"})
  89. if sticky is not None:
  90. threads_list = sticky.find_all("div", {"data-author": True}) + threads_list
  91. nm = len(threads_list)
  92. for thread in threads_list:
  93. thread_title = thread.find("div", {"class": "structItem-title"}).text
  94. topic.append(cleanString(thread_title.strip()))
  95. author_icon = thread.find('a', {"class": "avatar avatar--s"})
  96. if author_icon != None:
  97. author_icon = author_icon.find('img')
  98. if author_icon != None:
  99. author_icon = author_icon.get('src')
  100. author_icon = author_icon.split('base64,')[-1]
  101. else:
  102. author_icon = "-1"
  103. else:
  104. author_icon = "-1"
  105. image_author.append(author_icon)
  106. thread_author = thread.get("data-author")
  107. author.append(cleanString(thread_author))
  108. thread_views = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find('dd').text
  109. thread_views = thread_views.lower().replace("k", "000")
  110. thread_views = thread_views.lower().replace("m", "000000")
  111. views.append(thread_views.strip())
  112. thread_replies = thread.find("dl", {"class": "pairs pairs--justified"}).find('dd').text
  113. # All threads contain one topic post and reply posts
  114. thread_total_posts = thread_replies.lower().replace("k", "000")
  115. posts.append(thread_total_posts.strip())
  116. thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime")
  117. datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z")
  118. addDate.append(datetime_obj)
  119. thread_link: str = thread.find("div", {"class": "structItem-title"}).find('a', {'class': ''}).get('href')
  120. href.append(thread_link)
  121. return organizeTopics(
  122. forum=forum,
  123. nm=nm,
  124. board=board,
  125. author=author,
  126. topic=topic,
  127. views=views,
  128. posts=posts,
  129. addDate=addDate,
  130. href=href,
  131. image_author=image_author
  132. )
  133. def procrax_links_parser(soup):
  134. # Returning all links that should be visited by the Crawler
  135. href = []
  136. listing = soup.find_all('div', {"class": "structItem-title"})
  137. for a in listing:
  138. link = a.find('a', {'class': ''}).get('href')
  139. href.append(link)
  140. return href