# Based on the calsyslab project.
__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
# The parsers below construct datetime objects directly, so the datetime class itself is imported
from datetime import datetime, date, timedelta
import re
import traceback

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup


# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
def dread_description_parser(soup):

    # Fields to be parsed
    topic = "-1"        # 0 *topic name
    user = []           # 1 *all users of each post
    status = []         # 2 each user's authority in each post, such as (adm, member, dangerous)
    reputation = []     # 3 each user's karma in each post (usually found as a number)
    interest = []       # 4 each user's interests in each post
    sign = []           # 5 each user's signature in each post (usually a standard message after the content of the post)
    post = []           # 6 all messages of each post
    feedback = []       # 7 all feedback of each vendor (this was found in just one forum, and in a number format)
    addDate = []        # 8 all dates of each post

    # Finding the topic (should be just one coming from the Listing Page)
    container = soup.find('div', {"class": "content"})
    li = container.find("a", {"class": "title"})
    if li is None:
        return None
    topic = li.text
    topic = topic.replace(u'\xa0', ' ')
    topic = topic.replace(",", "")
    topic = topic.replace("\n", "")
    topic = cleanString(topic.strip())
    # Parsing the initial post first, since its markup is separate from the comments

    # author name
    init_post = container.find('div', {"class": "item"})
    author = init_post.find('div', {"class": "author"}).select_one('a[href^="/u/"]').text
    flair = init_post.find('div', {"class": "author"}).find("span", {"class": "flair"})
    try:
        # The flair text is embedded in the author element, so strip it out of the name
        flair = flair.text.strip()
        author = author.replace(flair, '')
    except AttributeError:
        pass
    author = author.strip()
    user.append(cleanString(author))

    # status
    flair = init_post.find("span", {"class": "flair"})
    if flair is not None:
        flair = flair.text.strip()
    else:
        flair = "-1"
    status.append(cleanString(flair))

    # no blurb on Dread
    interest.append("-1")

    # points for the post
    karma = init_post.find("div", {"class": "voteCount"})
    if karma is not None:
        karma = karma.text
        karma = karma.replace("points", "")
        karma = karma.replace(":", "")
        karma = karma.strip()
    else:
        karma = "-1"
    reputation.append(cleanString(karma))

    # date
    spans = init_post.find('div', {"class": "author"}).find('span', recursive=False)
    dt = spans['title']
    month = find_month(dt)
    split_text = dt.split()
    day = int(re.search(r'\d+', split_text[0]).group())
    year = int(split_text[2])
    hm = re.findall(r'\d+', split_text[-1])
    hm[0] = int(hm[0])
    hm[1] = int(hm[1])
    date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1])
    addDate.append(date_time_obj)

    # content
    inner = init_post.find("div", {"class": "postContent"})
    inner = inner.text.strip()
    post.append(cleanString(inner))

    # no signature on Dread
    sign.append("-1")
    # no feedback on Dread
    feedback.append("-1")
    comments = soup.find('div', {"class": "postComments"})
    if comments is None:
        # No comments: the row holds only the initial post (same field order as the final return below)
        row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
        return row

    comments = comments.find_all('div', "comment")

    # For each message (post), get all the fields we are interested in:
    for ipost in comments:

        # Finding the author of the comment
        cc = ipost.find('div', {"class": "commentContent"})
        post_wrapper = cc.find('a', {"class": "username"}).text
        flair = cc.find("span", {"class": "flair"})
        try:
            flair = flair.text.strip()
            post_wrapper = post_wrapper.replace(flair, '')
        except AttributeError:
            pass
        author = post_wrapper.strip()
        user.append(cleanString(author))

        # Finding the status of the author
        # Dread does not have membergroup and postgroup, but it has flair, which is similar enough
        if flair is not None:
            status.append(cleanString(flair))
        else:
            status.append("-1")

        # Finding the interest of the author
        # Dread does not have a blurb
        interest.append("-1")

        # Finding the reputation of the user
        # Dread doesn't have reputation per user; instead, each post has its own point count
        karma = cc.find('div', {"class": "votes"})
        if karma is not None:
            karma = karma.text
            karma = karma.replace("points", "")
            karma = karma.replace(":", "")
            karma = karma.strip()
        else:
            karma = "-1"
        reputation.append(cleanString(karma))

        # Finding the post date
        postarea = ipost.find('div', {"class": "timestamp"}).find('span', recursive=False)
        dt = postarea['title']
        month = find_month(dt)
        split_text = dt.split()
        day = int(re.search(r'\d+', split_text[0]).group())
        year = int(split_text[2])
        hm = re.findall(r'\d+', split_text[-1])
        hm[0] = int(hm[0])
        hm[1] = int(hm[1])
        date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1])
        addDate.append(date_time_obj)

        # Finding the post content
        inner = ipost.find('div', {"class": "commentBody"})
        inner = inner.text.strip()
        post.append(cleanString(inner))

        # No signature on Dread
        sign.append("-1")

        # As no information about the users' feedback was found, just assign "-1" to the variable
        feedback.append("-1")

    # Populate the final variable (this should be a tuple with all fields scraped)
    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)

    # Sending the results
    return row
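
# A minimal usage sketch (not part of the original module), assuming the crawler has
# already saved a Dread topic page to disk; the file name below is hypothetical. It shows
# the intended contract of dread_description_parser: pass in the BeautifulSoup tree of the
# whole page and get back the row tuple described above, or None when no topic title is found.
#
#   with open('dread_topic_page.html', 'r', encoding='utf-8') as f:
#       page_soup = BeautifulSoup(f.read(), 'html.parser')
#   row = dread_description_parser(page_soup)
#   if row is not None:
#       topic_name = row[0]   # position 0 is the topic name, per the field list above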


# This is the method to parse the Listing Pages (one page with many topics)
def dread_listing_parser(soup):

    nm = 0              # *this variable should receive the number of topics
    forum = "Dread"     # 0 *forum name
    board = "-1"        # 1 *board name (the level above the topic in the forum's categorization tree.
                        # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
    author = []         # 2 *all authors of each topic
    topic = []          # 3 *all topics
    views = []          # 4 number of views of each topic
    posts = []          # 5 number of posts of each topic
    href = []           # 6 this variable should receive all cleaned urls (we will use this to do the merge between
                        # Listing and Description pages)
    addDate = []        # 7 when the topic was created (difficult to find)

    # Finding the board (should be just one)
    board = soup.find('a', {"class": "banner-top"}).text
    board = cleanString(board.strip())

    # Finding the repeated tag that corresponds to the listing of topics
    itopics = soup.find("div", {"class": "postBoard"}).find_all("div", {"class": "item"}, recursive=False)

    for itopic in itopics:

        # Adding the topic to the topic list (the title may embed a flair span, which is stripped out first
        # so we don't miss or pollute any topic name)
        topic_title = itopic.find("a", {"class": "title"})
        title_flair = topic_title.find('span', {"class": "flair"})
        topics = topic_title.text
        try:
            title_flair = title_flair.text.strip()
            topics = topics.replace(title_flair, '')
        except AttributeError:
            pass
        topics = topics.replace(u'\xa0', ' ')
        topics = topics.replace(",", "")
        topics = topics.replace("\n", "")
        topic.append(cleanString(topics.strip()))

        # Counting how many topics we have found so far
        nm = len(topic)

        # Adding the url to the list of urls
        link = topic_title['href']
        link = cleanLink(link)
        href.append(link)

        # Finding the author of the topic
        ps = itopic.find('div', {"class": "author"})
        post_wrapper = ps.select_one('a[href^="/u/"]').text
        flair = ps.find("span", {"class": "flair"})
        try:
            flair = flair.text.strip()
            post_wrapper = post_wrapper.replace(flair, '')
        except AttributeError:
            pass
        user = post_wrapper.strip()
        author.append(cleanString(user))

        # Finding the number of replies
        meta = itopic.find("div", {"class": "postMain"})
        post = meta.find("a").text
        post = post.replace("comments", '').strip()
        posts.append(cleanString(post))

        # Finding the number of views - not shown on Dread
        views.append("-1")

        # Finding when the topic was added
        spans = itopic.find('div', {"class": "author"}).find('span', recursive=False)
        dt = spans['title']
        month = find_month(dt)
        split_text = dt.split()
        day = int(re.search(r'\d+', split_text[0]).group())
        year = int(split_text[2])
        hm = re.findall(r'\d+', split_text[-1])
        hm[0] = int(hm[0])
        hm[1] = int(hm[1])
        date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1])
        addDate.append(date_time_obj)

    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
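
# The href list built above is the join key between Listing and Description pages. A hedged
# sketch of that merge follows; this helper is illustrative only, since the real pipeline
# performs the join elsewhere in the project and its internals are not shown here:
def merge_listing_and_descriptions(listing_hrefs, description_rows):
    # description_rows: a dict mapping each cleaned topic url to the row tuple that
    # dread_description_parser returned for that page
    merged = []
    for url in listing_hrefs:
        row = description_rows.get(url)
        if row is not None:
            merged.append((url, row))
    return merged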


def dread_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find("div", {"class": "postBoard"}).find_all("div", {"class": "item"}, recursive=False)

    for a in listing:
        link = a.find("a", {"class": "title"})
        link = link['href']
        href.append(link)

    return href
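
# How the crawler is expected to consume these links, as a hedged illustration; fetch_page
# is a hypothetical stand-in for the project's real page-retrieval step:
#
#   for link in dread_links_parser(listing_soup):
#       topic_soup = fetch_page(link)   # hypothetical fetch step
#       row = dread_description_parser(topic_soup)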


def find_month(s):
    # Map month names to their numbers; returns None if the string names no month
    months = {
        'January': 1, 'February': 2, 'March': 3, 'April': 4,
        'May': 5, 'June': 6, 'July': 7, 'August': 8,
        'September': 9, 'October': 10, 'November': 11, 'December': 12
    }
    for name, number in months.items():
        if name in s:
            return number
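
# The date-parsing block above is repeated three times with the same shape. A consolidated
# sketch follows; it assumes the span's title attribute looks roughly like
# "25 December 2022 14:30" (day digits in the first token, month name anywhere in the
# string, year as the third token, hour:minute in the last token). The exact Dread format
# is not documented here, so treat that layout as an assumption.
def parse_dread_timestamp(dt):
    month = find_month(dt)                               # month name -> number
    split_text = dt.split()
    day = int(re.search(r'\d+', split_text[0]).group())  # day digits from the first token
    year = int(split_text[2])                            # year as the third token
    hm = re.findall(r'\d+', split_text[-1])              # hour and minute from the last token
    return datetime(year, month, day, hour=int(hm[0]), minute=int(hm[1]))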