This is based on the CalSysLab project.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

278 lines
10 KiB

  1. __author__ = 'Helium'
  2. # Here, we are importing the auxiliary functions to clean or convert data
  3. from Forums.Utilities.utilities import *
  4. from datetime import date
  5. from datetime import timedelta
  6. import re
  7. # Here, we are importing BeautifulSoup to search through the HTML tree
  8. from bs4 import BeautifulSoup
  9. # This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
  10. def nemesisforums_description_parser(soup):
  11. # Fields to be parsed
  12. topic = "-1" # 0 *topic name
  13. user = [] # 1 *all users of each post
  14. status = [] # 2 all user's authority in each post such as (adm, member, dangerous)
  15. reputation = [] # 3 all user's karma in each post (usually found as a number)
  16. interest = [] # 4 all user's interest in each post
  17. sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post)
  18. post = [] # 6 all messages of each post
  19. feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
  20. addDate = [] # 8 all dates of each post
  21. image_user = [] # 9 all user avatars of each post
  22. image_post = [] # 10 all first images of each post
  23. # Finding the topic (should be just one coming from the Listing Page)
  24. li = soup.find("td", {"class": "thead"}).find('strong')
  25. topic = li.text
  26. topic = re.sub("\[\w*\]", '', topic)
  27. topic = topic.replace(",","")
  28. topic = topic.replace("\n","")
  29. topic = cleanString(topic.strip())
  30. # Finding the repeated tag that corresponds to the listing of posts
  31. posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
  32. 'div', {"class": "post"})
  33. # For each message (post), get all the fields we are interested to:
  34. for ipost in posts:
  35. if ipost.find('div', {"class": "deleted_post_author"}):
  36. continue
  37. # Finding a first level of the HTML page
  38. post_wrapper = ipost.find('span', {"class": "largetext"})
  39. # Finding the author (user) of the post
  40. author = post_wrapper.text.strip()
  41. user.append(cleanString(author)) # Remember to clean the problematic characters
  42. # Finding the status of the author
  43. smalltext = ipost.find('div', {"class": "post_author"})
  44. if smalltext is not None:
  45. # CryptBB does have membergroup and postgroup
  46. membergroup = smalltext.find('div', {"class": "profile-rank"})
  47. postgroup = smalltext.find('div', {"class": "postgroup"})
  48. if membergroup != None:
  49. membergroup = membergroup.text.strip()
  50. if postgroup != None:
  51. postgroup = postgroup.text.strip()
  52. membergroup = membergroup + " - " + postgroup
  53. else:
  54. if postgroup != None:
  55. membergroup = postgroup.text.strip()
  56. else:
  57. membergroup = "-1"
  58. status.append(cleanString(membergroup))
  59. # Finding the interest of the author
  60. # CryptBB does not have blurb
  61. blurb = smalltext.find('li', {"class": "blurb"})
  62. if blurb != None:
  63. blurb = blurb.text.strip()
  64. else:
  65. blurb = "-1"
  66. interest.append(cleanString(blurb))
  67. # Finding the reputation of the user
  68. # CryptBB does have reputation
  69. author_stats = smalltext.find('div', {"class": "author_statistics"})
  70. karma = author_stats.find('strong')
  71. if karma != None:
  72. karma = karma.text
  73. karma = karma.replace("Community Rating: ", "")
  74. karma = karma.replace("Karma: ", "")
  75. karma = karma.strip()
  76. else:
  77. karma = "-1"
  78. reputation.append(cleanString(karma))
  79. else:
  80. status.append('-1')
  81. interest.append('-1')
  82. reputation.append('-1')
  83. # Getting here another good tag to find the post date, post content and users' signature
  84. postarea = ipost.find('div', {"class": "post_content"})
  85. dt = postarea.find('span', {"class": "post_date"}).text
  86. # dt = dt.strip().split()
  87. dt = dt.strip()
  88. day=date.today()
  89. if "Today" in dt:
  90. today = day.strftime('%m-%d-%Y')
  91. stime = dt.replace('Today,','').strip()
  92. date_time_obj = today + ', '+stime
  93. date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
  94. elif "Yesterday" in dt:
  95. yesterday = day - timedelta(days=1)
  96. yesterday = yesterday.strftime('%m-%d-%Y')
  97. stime = dt.replace('Yesterday,','').strip()
  98. date_time_obj = yesterday + ', '+stime
  99. date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
  100. elif "ago" in dt:
  101. date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
  102. date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
  103. else:
  104. date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
  105. addDate.append(date_time_obj)
  106. # Finding the post
  107. inner = postarea.find('div', {"class": "post_body scaleimages"})
  108. quote = inner.find('blockquote')
  109. if quote is not None:
  110. quote.decompose()
  111. inner = inner.text.strip()
  112. post.append(cleanString(inner))
  113. # Finding the user's signature
  114. # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
  115. signature = ipost.find('div', {"class": "signature scaleimages"})
  116. if signature != None:
  117. signature = signature.text.strip()
  118. # print(signature)
  119. else:
  120. signature = "-1"
  121. sign.append(cleanString(signature))
  122. # As no information about user's feedback was found, just assign "-1" to the variable
  123. feedback.append("-1")
  124. img = ipost.find('div', {"class": "post_body scaleimages"}).find('img')
  125. if img is not None:
  126. img = img.get('src').split('base64,')[-1]
  127. else:
  128. img = "-1"
  129. image_post.append(img)
  130. avatar = ipost.find('div', {"class": "author_avatar"})
  131. if avatar is not None:
  132. img = avatar.find('img')
  133. if img is not None:
  134. img = img.get('src').split('base64,')[-1]
  135. else:
  136. img = "-1"
  137. else:
  138. img = "-1"
  139. image_user.append(img)
  140. # Populate the final variable (this should be a list with all fields scraped)
  141. row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
  142. # Sending the results
  143. return row
  144. # This is the method to parse the Listing Pages (one page with many posts)
  145. def nemesisforums_listing_parser(soup):
  146. nm = 0 # *this variable should receive the number of topics
  147. forum = "NemesisForums" # 0 *forum name
  148. board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
  149. # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
  150. author = [] # 2 *all authors of each topic
  151. topic = [] # 3 *all topics
  152. views = [] # 4 number of views of each topic
  153. posts = [] # 5 number of posts of each topic
  154. href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between
  155. # Listing and Description pages)
  156. addDate = [] # 7 when the topic was created (difficult to find)
  157. image_author = [] # 8 all author avatars used in each topic
  158. # Finding the board (should be just one)
  159. board = soup.find('span', {"class": "active"}).text
  160. board = cleanString(board.strip())
  161. # Finding the repeated tag that corresponds to the listing of topics
  162. itopics = soup.find_all('tr',{"class": "inline_row"})
  163. # Counting how many topics
  164. nm = len(itopics)
  165. for itopic in itopics:
  166. # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
  167. # to don't miss any topic
  168. # Adding the topic to the topic list
  169. try:
  170. topics = itopic.find('span', {"class": "subject_old"}).find('a').text
  171. except:
  172. topics = itopic.find('span', {"class": "subject_new"}).find('a').text
  173. topics = re.sub("\[\w*\]", '', topics)
  174. topic.append(cleanString(topics))
  175. image_author.append(-1)
  176. # Adding the url to the list of urls
  177. try:
  178. link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
  179. except:
  180. link = itopic.find('span',{"class": "subject_new"}).find('a').get('href')
  181. href.append(link)
  182. # Finding the author of the topic
  183. ps = itopic.find('div', {"class":"author smalltext"}).text
  184. user = ps.strip()
  185. author.append(cleanString(user))
  186. # Finding the number of replies
  187. columns = itopic.findChildren('td',recursive=False)
  188. replies = columns[3].text
  189. if replies == '-':
  190. posts.append('-1')
  191. else:
  192. posts.append(cleanString(replies))
  193. # Finding the number of Views
  194. tview = columns[4].text
  195. if tview == '-':
  196. views.append('-1')
  197. else:
  198. views.append(cleanString(tview))
  199. # If no information about when the topic was added, just assign "-1" to the variable
  200. addDate.append("-1")
  201. return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)
  202. def nemesisforums_links_parser(soup):
  203. # Returning all links that should be visited by the Crawler
  204. href = []
  205. listing = soup.find('div', {"class": "card-body"}).find_all('div', {"class": "d-flex border-2 border-bottom overflow-hidden position-relative px-6 pt-4 pb-3"})
  206. for a in listing:
  207. link = a.find('div', {"class": "d-flex align-items-center"}).find('a').get('href')
  208. href.append(link)
  209. return href