This is based on the calsyslab project.
__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from typing import List
from Forums.Utilities.utilities import *
from datetime import datetime, timedelta
import re
import string

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
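# NOTE: `cleanString` and `cleanNumbers` are pulled in by the star import from
# the project's utilities module (not shown here). As a rough sketch of their
# assumed behavior, hypothetical stand-ins for offline testing might look like:
#
#     def cleanString(s: str) -> str:       # hypothetical stand-in
#         return re.sub(r'\s+', ' ', s).strip()
#
#     def cleanNumbers(s: str) -> int:      # hypothetical stand-in
#         digits = re.sub(r'[^0-9]', '', s)
#         return int(digits) if digits else 0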
# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
def onniForums_description_parser(soup: BeautifulSoup) -> tuple:

    topicName: str = "-1"           # 0 *topic name
    users: List[str] = []           # 1 *all users of each post
    statuses: List[str] = []        # 2 each user's authority in each post, such as (adm, member, dangerous)
    reputations: List[str] = []     # 3 each user's karma in each post (usually found as a number)
    interests: List[str] = []       # 4 each user's interests in each post
    signs: List[str] = []           # 5 each user's signature in each post (usually a standard message after the content of the post)
    posts: List[str] = []           # 6 all messages of each post
    feedbacks: List[str] = []       # 7 all feedbacks of each vendor (this was found in just one forum and with a number format)
    addDates: List[datetime] = []   # 8 all dates of each post
    image_user: List[str] = []      # 9 all user avatars of each post
    image_post: List[str] = []      # 10 all first images of each post

    # Getting the topicName
    topicName = soup.find("table", {"class": "tborder tfixed clear"}) \
        .find("td", {"class": "thead"}) \
        .find_all("div")[-1].text
    topicName = cleanString(topicName.strip())

    topics_array = soup.find_all("div", {"class": "post"})

    for topic in topics_array:
        # Extracting and cleaning author information
        author_information: BeautifulSoup = topic.find("div", {"class": "author_information"})

        username: str = author_information.find("span", {"class": "largetext"}).text
        username_cleaned = cleanString(username.strip())
        users.append(username_cleaned)

        user_status: str = author_information.find("span", {"class": "smalltext"}).text

        # Banned users often have malformed text in the HTML,
        # so we detect them and assign a fixed label instead
        if "Banned" in user_status:
            user_status_cleaned = "Banned"
        elif "Unregistered" in user_status:
            user_status_cleaned = "Unregistered"
        else:
            user_status_cleaned = cleanString(user_status.strip())  # Remove excessive spaces in the string

        # Add the cleaned data into the array
        statuses.append(user_status_cleaned)

        if user_status_cleaned in ['Unregistered', 'Banned']:
            reputations.append("-1")
        else:
            author_statistics: BeautifulSoup = topic.find("div", {"class": "author_statistics"})
            reputation: str = author_statistics.find_all("div", {"class": "float_right"})[-1].text
            reputation_cleaned = cleanString(reputation.strip())
            reputations.append(reputation_cleaned)

        # Append "-1" to the `interests` and `signs` arrays since those fields don't exist on this forum
        interests.append("-1")
        signs.append("-1")

        post_content: str = topic.find("div", {"class": "post_body scaleimages"}).text
        # Clean the post content of excessive spaces and boilerplate
        post_content_cleaned = post_content.replace("[You must reply to view this hidden content]", "")
        post_content_cleaned = cleanString(post_content_cleaned.strip())
        posts.append(post_content_cleaned)

        # Append "-1" to the `feedbacks` array since that field doesn't exist on this forum
        feedbacks.append("-1")

        # Convert relative timestamps ("Yesterday", "2 hours ago", ...) into absolute dates
        date_posted: str = topic.find("span", {"class": "post_date"}).text
        date_posted_cleaned = cleanString(date_posted.split(",")[0])

        today = datetime.now()

        if date_posted_cleaned == 'Yesterday':
            date_object = today - timedelta(days=1)
        elif date_posted_cleaned.find('hour') > 0:
            hours_ago = int(date_posted_cleaned.split(' ')[0])
            date_object = today - timedelta(hours=hours_ago)
        elif date_posted_cleaned.find('minute') > 0:
            minutes_ago = int(date_posted_cleaned.split(' ')[0])
            date_object = today - timedelta(minutes=minutes_ago)
        else:
            date_object = datetime.strptime(date_posted_cleaned, "%m-%d-%Y")

        addDates.append(date_object)

        image_post.append("-1")

        img = topic.find('div', {"class": "author_avatar"}).find('img')
        if img is not None:
            img = img.get('src').split('base64,')[-1]
        else:
            img = "-1"
        image_user.append(img)

    # TESTING PURPOSES - DO NOT REMOVE

    # Populate the final variable (this should be a list with all the fields scraped)
    row = (topicName, users, statuses, reputations, interests, signs, posts, feedbacks, addDates, image_user, image_post)

    # Sending the results
    return row
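# A minimal usage sketch (not part of the original module): assuming a topic's
# Description Page was saved locally as "description_page.html", the parser
# above could be exercised like this:
#
#     with open("description_page.html", "r", encoding="utf-8") as f:
#         soup = BeautifulSoup(f.read(), "html.parser")
#     row = onniForums_description_parser(soup)
#     print(row[0])   # topic name
#     print(row[1])   # list of usernames, one per post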
# This is the method to parse the Listing Pages (one page with many topics)
def onniForums_listing_parser(soup: BeautifulSoup):

    nm = 0                          # this variable should receive the number of topics
    forum = "OnniForums"            # 0 *forum name
    boardName = "-1"                # 1 board name (the level above the topic in the forum categorization tree;
                                    # for instance: Security/Malware/Tools to hack Facebook -> the board here is Malware)
    user: List[str] = []            # 2 all users of each topic
    topic: List[str] = []           # 3 all topics
    view: List[int] = []            # 4 number of views of each topic
    post: List[int] = []            # 5 number of posts of each topic
    href: List[str] = []            # 6 this variable should receive all cleaned urls (used to merge Listing and Description pages)
    addDate: List[str] = []         # 7 when the topic was created (difficult to find)
    image_author: List[str] = []    # 8 all author avatars used in each topic

    # Finding the board (there should be just one)
    board_metadata: BeautifulSoup = soup.find("table", {"class": "tborder clear"})

    boardName = board_metadata.find_all("div")[1].text
    boardName = cleanString(boardName.strip())

    thread_arrays = board_metadata.find_all("tr", {"class": "inline_row"})  # gets the information of the threads

    nm = len(thread_arrays)

    for thread in thread_arrays:  # getting the information from the threads and sorting it into the arrays defined above
        body = thread.find("span", {"class": "subject_new"})
        try:
            post_subject: str = body.text  # getting the topic
        except AttributeError:
            body = thread.find("span", {"class": "subject_old"})
            post_subject: str = body.text

        post_subject_cleaned = cleanString(post_subject.strip())
        topic.append(post_subject_cleaned)

        author_icon = thread.find('div', {"class": "lavatar-old lavatar-old-f"})
        if author_icon is not None:
            author_icon = author_icon.find('img')
            author_icon = author_icon.get('src')
            author_icon = author_icon.split('base64,')[-1]
        else:
            author_icon = "-1"
        image_author.append(author_icon)

        reply_count = thread.find_all("td", {"align": "center"})[2].text
        post.append(cleanNumbers(reply_count))

        views = thread.find_all("td", {"align": "center"})[3].text
        view.append(cleanNumbers(views))

        # dates_added: str = thread.find("span", {"class": "thread_start_datetime smalltext"}).text
        # dates_added_cleaned = dates_added.split(',')[0]
        # addDate.append(dates_added_cleaned)

        author = thread.find("span", {"class": "author smalltext"}).text
        author_cleaned = cleanString(author.strip())
        user.append(author_cleaned)

        thread_link = body.find('a').get('href')
        href.append(thread_link)

    return organizeTopics(
        forum=forum,
        nm=nm,
        board=boardName,
        author=user,
        topic=topic,
        views=view,
        posts=post,
        href=href,
        addDate=addDate,
        image_author=image_author
    )
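# `organizeTopics` also comes from the utilities module (not shown); it is
# assumed to bundle the parallel lists above into the project's standard row
# format for Listing Pages. A hypothetical offline run, assuming a saved page
# named "listing_page.html":
#
#     with open("listing_page.html", "r", encoding="utf-8") as f:
#         listing_row = onniForums_listing_parser(BeautifulSoup(f.read(), "html.parser"))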
# This is the method to extract the links to each topic's Description Page from a Listing Page
def onniForums_links_parser(soup: BeautifulSoup):

    href = []
    listing = soup.find_all('tr', {'class': 'inline_row'})

    for thread in listing:
        try:
            link = thread.find('span', {"class": "subject_old"}).find('a').get('href')
        except AttributeError:
            link = thread.find('span', {"class": "subject_new"}).find('a').get('href')
        href.append(link)

    return href
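# A hedged end-to-end sketch (not in the original file): read one saved Listing
# Page and resolve each topic link. The file name and base URL are assumptions
# for illustration only; the real crawler supplies its own pages and URLs.
if __name__ == "__main__":
    from urllib.parse import urljoin

    with open("listing_page.html", "r", encoding="utf-8") as f:  # assumed local copy
        listing_soup = BeautifulSoup(f.read(), "html.parser")

    # Collect the relative links to each topic's Description Page
    for relative_link in onniForums_links_parser(listing_soup):
        # Hypothetical base URL; the real forum address is omitted on purpose
        print(urljoin("http://onniforums.example/", relative_link))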