# Based on the calsyslab project
__author__ = 'DarkWeb'

import re
# datetime and timedelta are used below to resolve relative post dates
from datetime import datetime, timedelta

# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup

# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
# Parses description pages: takes the HTML page of a description page as a soup object and parses it for the info it needs.
# Stores the info in different lists; these lists are returned after being organized.
# @param: soup object looking at the HTML page of a description page
# @return: 'row' that contains a variety of lists that each hold info on the description page
def darknetarmy_description_parser(soup):

    # Fields to be parsed
    topic = "-1"        # 0 topic name
    user = []           # 1 all users of each post
    status = []         # 2 all users' authority in each post, such as (adm, member, dangerous)
    reputation = []     # 3 all users' karma in each post (usually found as a number)
    interest = []       # 4 all users' interests in each post
    sign = []           # 5 all users' signatures in each post (usually a standard message after the content of the post)
    post = []           # 6 all messages of each post
    feedback = []       # 7 all feedback on each user (this was found in just one forum, in a number format)
    addDate = []        # 8 all dates of each post
    image_user = []     # 9 all user avatars of each post
    image_post = []     # 10 all first images of each post

    # Finding the topic (should be just one, coming from the Listing Page)
    topic = soup.find("h1", {"class": "p-title-value"})
    topic = topic.text
    topic = topic.replace(",", "")
    topic = topic.replace("\n", "")
    topic = cleanString(topic.strip())

    # Finding the repeated tag that corresponds to the listing of posts
    # posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \
    #         soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"})
    posts = soup.findAll('article', class_=re.compile("message message--post js-post js-inlineModContainer.*"))

    # For each message (post), get all the fields we are interested in:
    for ipost in posts:

        # Finding the first level of the HTML page
        # post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"})
        post_wrapper = ipost.find('div', {"class": "message-inner"})

        # Finding the author (user) of the post
        author = post_wrapper.find('div', {'class': 'message-userName'}).find('h4').text
        user.append(cleanString(author))  # Remember to clean the problematic characters

        # Finding the status of the author
        try:
            membergroup = post_wrapper.find('h5', {'class': 'userTitle message-userTitle'}).text
        except:
            membergroup = '-1'
        status.append(cleanString(membergroup))

        # Finding the reputation: the third <dl> of the user extras holds the karma
        # count, which XenForo abbreviates with a 'K' suffix for thousands
        temp = post_wrapper.find('div', {'class': 'message-userExtras'}).find_all('dl')
        rep = temp[2].find('dd').text
        if 'K' in rep or 'k' in rep:
            rep = rep.replace('K', '000').replace('k', '000')
        reputation.append(rep)

        # Fields not available on this forum
        interest.append('-1')
        sign.append('-1')
        feedback.append('-1')
        image_post.append('-1')

        # Finding the content of the post
        try:
            message = post_wrapper.find('article', {'class': 'message-body js-selectToQuote'}).text
            message = cleanString(message.strip())
        except:
            message = post_wrapper.find('div', {'class': 'message-content js-messageContent'}).text
            message = cleanString(message.strip())
        post.append(message)

        # Finding the date of the post; relative dates ("today" or a weekday name)
        # are resolved against the current week
        time = post_wrapper.find('ul', class_=re.compile(r'message-attribution-main listInline.*')).find('time').text
        if ',' in time:
            time = time.replace(',', '')
        if 'today' in time:
            time = datetime.today().strftime('%Y-%m-%d')
        elif 'at' in time or 'AM' in time or 'PM' in time:
            today = datetime.today()
            start_of_week = today - timedelta(days=(today.weekday() + 1) % 7)
            days_mapping = {
                'Sunday': start_of_week,
                'Monday': start_of_week + timedelta(days=1),
                'Tuesday': start_of_week + timedelta(days=2),
                'Wednesday': start_of_week + timedelta(days=3),
                'Thursday': start_of_week + timedelta(days=4),
                'Friday': start_of_week + timedelta(days=5),
                'Saturday': start_of_week + timedelta(days=6),
            }
            for day, date in days_mapping.items():
                if day in time:
                    time = date.strftime('%Y-%m-%d')
                    break
        addDate.append(time)

        # Finding the user avatar (stored inline as base64)
        try:
            image = post_wrapper.find('div', {'class': 'message-avatar'}).find('img').get('src').split('base64,')[-1]
        except:
            image = '-1'
        image_user.append(image)

    # Populate the final variable (this should be a tuple with all the fields scraped)
    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)

    # Sending the results
    return row
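
# A minimal usage sketch (an assumption, not part of the original crawler): the
# parser above expects a BeautifulSoup object built from a saved description page.
# This helper and the default file path are hypothetical.
def _example_parse_description(path='darknetarmy_topic.html'):
    with open(path, encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    row = darknetarmy_description_parser(soup)
    # row[0] is the topic name; the remaining entries are per-post lists
    return row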

# This is the method to parse the Listing Pages (one page with many topics)
# Parses listing pages: takes the HTML page of a listing page as a soup object and parses it for the info it needs.
# Stores the info in different lists; these lists are returned after being organized.
# @param: soup object looking at the HTML page of a listing page
# @return: 'row' that contains a variety of lists that each hold info on the listing page
def darknetarmy_listing_parser(soup):

    nm = 0                  # *this variable should receive the number of topics
    forum = "DarkNetArmy"   # 0 *forum name
    board = "-1"            # 1 *board name (the previous level of the topic in the forum categorization tree.
                            # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
    author = []             # 2 *all authors of each topic
    topic = []              # 3 *all topics
    views = []              # 4 number of views of each topic
    posts = []              # 5 number of posts of each topic
    href = []               # 6 this variable should receive all cleaned urls (we will use this to do the merge between
                            # Listing and Description pages)
    addDate = []            # 7 when the topic was created (difficult to find)
    image_author = []       # 8 all author avatars used in each topic

    # Finding the board (should be just one)
    board = soup.find('h1', {"class": "p-title-value"}).text
    board = board.replace(u"\xbb", "")
    board = cleanString(board.strip())

    # Finding the repeated tag that corresponds to the listing of topics
    itopics = soup.find('div', {"class": "structItemContainer-group js-threadList"}).find_all('div', class_=re.compile(
        r'^structItem structItem--thread js-inlineModContainer js-threadListItem.*'))
    nm = len(itopics)

    for itopic in itopics:

        # Finding the author of the topic
        a = itopic.find('ul', {"class": "structItem-parts"}).find('li').text
        a = cleanString(a.strip())
        author.append(a)

        # Finding the topic title
        top = itopic.find('div', {"class": 'structItem-title'}).text
        top = cleanString(top.strip())
        topic.append(top)

        # Finding the hyperlink to the topic's description page
        ref = itopic.find('div', {"class": 'structItem-title'}).find('a').get('href')
        href.append(ref)

        # Finding the author avatar (stored inline as base64)
        try:
            image = itopic.find('div', {"class": 'structItem-iconContainer'}).find('img').get('src').split('base64,')[-1]
        except:
            image = '-1'
        image_author.append(image)

        # Finding the date the topic was created; relative dates ("today" or a
        # weekday name) are resolved against the current week
        try:
            time = itopic.find('li', {"class": 'structItem-startDate'}).find('time').text
            if ',' in time:
                time = time.replace(',', '')
            time = time.strip()
            if 'today' in time:
                time = datetime.today().strftime('%Y-%m-%d')
            elif 'at' in time or 'AM' in time or 'PM' in time:
                today = datetime.today()
                start_of_week = today - timedelta(days=(today.weekday() + 1) % 7)
                days_mapping = {
                    'Sunday': start_of_week,
                    'Monday': start_of_week + timedelta(days=1),
                    'Tuesday': start_of_week + timedelta(days=2),
                    'Wednesday': start_of_week + timedelta(days=3),
                    'Thursday': start_of_week + timedelta(days=4),
                    'Friday': start_of_week + timedelta(days=5),
                    'Saturday': start_of_week + timedelta(days=6),
                }
                for day, date in days_mapping.items():
                    if day in time:
                        time = date.strftime('%Y-%m-%d')
                        break
            addDate.append(time)
        except:
            addDate.append('-1')

        # Finding the number of replies and views; a 'K' suffix stands for thousands
        try:
            temp = itopic.find('div', class_=re.compile(r'^structItem-cell structItem-cell--meta.*')).find_all('dl')
            # replies
            try:
                reply = temp[0].find('dd').text
                reply = cleanString(reply.strip())
                if 'K' in reply or 'k' in reply:
                    reply = reply.replace('K', '000').replace('k', '000')
            except:
                reply = '-1'
            posts.append(reply)
            # views
            try:
                view = temp[1].find('dd').text
                view = cleanString(view.strip())
                if 'K' in view or 'k' in view:
                    view = view.replace('K', '000').replace('k', '000')
            except:
                view = '-1'
            views.append(view)
        except:
            posts.append('-1')
            views.append('-1')

    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)
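
# The weekday-resolution logic above is duplicated in both parsers. A sketch of how
# it could be factored into a single helper (a hypothetical refactor; the parsers
# above do not call it):
def _resolve_relative_date(time_str):
    # Map a bare weekday name, as XenForo shows for recent posts, onto the matching
    # date of the current (Sunday-based) week; other strings pass through unchanged.
    today = datetime.today()
    start_of_week = today - timedelta(days=(today.weekday() + 1) % 7)
    for offset, day in enumerate(['Sunday', 'Monday', 'Tuesday', 'Wednesday',
                                  'Thursday', 'Friday', 'Saturday']):
        if day in time_str:
            return (start_of_week + timedelta(days=offset)).strftime('%Y-%m-%d')
    return time_str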

# Called by the crawler to get the description links on a listing page
# @param: soup object looking at the HTML page of a listing page
# @return: list of description links from a listing page
def darknetarmy_links_parser(soup):

    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find('div', {"class": "block-container block-container--nodes"}).findAll('div', {
        "class": "structItem-title"})

    for a in listing:
        bae = a.find('a', href=True)
        link = bae['href']
        href.append(link)

    return href
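
# A minimal end-to-end sketch of how the crawler is expected to drive these parsers
# (an assumption based on the comments above; the file name is hypothetical):
if __name__ == '__main__':
    with open('darknetarmy_listing.html', encoding='utf-8') as f:
        listing_soup = BeautifulSoup(f.read(), 'html.parser')
    # Collect the description-page links the crawler should visit next
    links = darknetarmy_links_parser(listing_soup)
    print(len(links), 'description pages to visit')
    # Each fetched description page would then be parsed with:
    # row = darknetarmy_description_parser(BeautifulSoup(fetched_html, 'html.parser'))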