this is based on calsyslab project
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

177 lines
6.9 KiB

  1. __author__ = 'DarkWeb'
  2. # Here, we are importing the auxiliary functions to clean or convert data
  3. from typing import List
  4. from Forums.Utilities.utilities import *
  5. from datetime import date
  6. from datetime import timedelta
  7. import re
  8. import string
  9. # Here, we are importing BeautifulSoup to search through the HTML tree
  10. from bs4 import BeautifulSoup
  11. # This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
  12. def onniForums_description_parser(soup: BeautifulSoup):
  13. topicName: str = "-1" # 0 *topic name
  14. users : List[str] = [] # 1 *all users of each post
  15. statuses : List[str] = [] # 2 all user's authority in each post such as (adm, member, dangerous)
  16. reputations : List[int] = [] # 3 all user's karma in each post (usually found as a number)
  17. interests : List[str] = [] # 4 all user's interest in each post
  18. signs : List[str] = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
  19. posts : List[int] = [] # 6 all messages of each post
  20. feedbacks : List[str] = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
  21. addDates : List[str] = [] # 8 all dates of each post
  22. # Getting the topicName
  23. topicName = soup.find("table", {"class": "tborder tfixed clear"}) \
  24. .find("td", {"class": "thead"}) \
  25. .find_all("div")[-1].text
  26. topics_array = soup.find_all("div", {"class": "post"})
  27. for topic in topics_array:
  28. # Extracting and cleaning author information
  29. author_information: BeautifulSoup = topic.find("div", {"class": "author_information"})
  30. username: str = author_information.find("span", {"class": "largetext"}).text
  31. users.append(username)
  32. user_status: str = author_information.find("span", {"class": "smalltext"}).text
  33. # Banned users often have weird text issues in HTML
  34. # So we detect banned users and give them a unique string
  35. if user_status.find("Banned") > 0: user_status_cleaned = "Banned"
  36. elif user_status.find("Unregistered") > 0: user_status_cleaned = "Unregistered"
  37. else: user_status_cleaned = user_status[1:len(user_status)-2] # Remove excessive spaces in string
  38. # Add cleaned data into array
  39. statuses.append(user_status_cleaned)
  40. if user_status_cleaned in ['Unregistered', 'Banned']: reputations.append(-1)
  41. else:
  42. author_statistics: BeautifulSoup = topic.find("div", {"class": "author_statistics"})
  43. reputation: str = author_statistics.find_all("div", {"class": "float_right"})[-1].text
  44. reputations.append(int(reputation))
  45. # Append a "-1" to `interests` and `signs` array since they don't exist on this forum
  46. interests.append("-1")
  47. signs.append("-1")
  48. post_content: str = topic.find("div", {"class": "post_body scaleimages"}).text
  49. # Clean post content of excessive spaces and characters
  50. post_content_cleaned = post_content.replace("[You must reply to view this hidden content]", "")
  51. post_content_cleaned = post_content_cleaned[1:len(post_content_cleaned)-1]
  52. posts.append(post_content_cleaned)
  53. # Append a "-1" to `feedbacks` array since they don't exists on this forum
  54. feedbacks.append("-1")
  55. date_posted: str = topic.find("span", {"class": "post_date"}).text
  56. date_posted_cleaned = date_posted.split(",")[0]
  57. addDates.append(date_posted_cleaned)
  58. # TESTING PURPOSES - DO NOT REMOVE
  59. # Populate the final variable (this should be a list with all fields scraped)
  60. row = (topicName, posts, users, addDates, feedbacks, statuses, reputations, signs, interests)
  61. # Sending the results
  62. return row
  63. def onniForums_listing_parser(soup: BeautifulSoup):
  64. boardName = "-1" # board name (the previous level of the topic in the Forum categorization tree.
  65. # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
  66. nm = 0 # this variable should receive the number of topics
  67. topic : List[str] = [] # all topics
  68. user : List[str] = [] # all users of each topic
  69. post : List[int] = [] # number of posts of each topic
  70. view : List[int] = [] # number of views of each topic
  71. addDate : List[str] = [] # when the topic was created (difficult to find)
  72. href : List[str] = [] # this variable should receive all cleaned urls (we will use this to do the merge between
  73. # Listing and Description pages)
  74. # Finding the board (should be just one)
  75. board_metadata: BeautifulSoup = soup.find("table",{"class" : "tborder clear"})
  76. boardName = board_metadata.find_all("div")[1].text
  77. thread_arrays = board_metadata.find_all("tr", {"class":"inline_row"}) # gets the information of posts
  78. nm = len(thread_arrays)
  79. for thread in thread_arrays: #getting the information from the posts and sorting them into the arrays defined above
  80. try:
  81. post_subject = thread.find("span",{"class": "subject_new"}).text #getting the topic
  82. except AttributeError:
  83. post_subject = thread.find("span",{"class": "subject_old"}).text
  84. topic.append(post_subject)
  85. reply_count = thread.find_all("td", {"align": "center"})[2].text
  86. post.append(reply_count)
  87. views = thread.find_all("td", {"align": "center"})[3].text
  88. view.append(views)
  89. dates_added: str = thread.find("span",{"class" : "thread_start_datetime smalltext"}).text
  90. dates_added_cleaned = dates_added.split(',')[0]
  91. addDate.append(dates_added_cleaned)
  92. author = thread.find("span",{"class" : "author smalltext"}).text
  93. user.append(author)
  94. reply_anchor = thread.find_all("td", {"align": "center"})[2].find('a')
  95. thread_link = reply_anchor.get('href')
  96. href.append(thread_link)
  97. return organizeTopics(
  98. forum="OnniForums",
  99. nm=nm,
  100. board=boardName,
  101. author=user,
  102. topic=topic,
  103. views=view,
  104. posts=post,
  105. href=href,
  106. addDate=addDate
  107. )
# This is the method to extract the link (href) of each topic from a Listing Page
  109. def onniForums_links_parser(soup: BeautifulSoup):
  110. href = []
  111. listing = soup.find_all('tr', {'class': 'inline_row'})
  112. for thread in listing:
  113. try:
  114. link = thread.find('span', {"class": "subject_old"}).find('a').get('href')
  115. except:
  116. link = thread.find('span', {"class": "subject_new"}).find('a').get('href')
  117. href.append(link)
  118. return href