# This parser is based on the calsyslab project.
__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import datetime
import re
import traceback

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
def dread_description_parser(soup):

    # Fields to be parsed
    topic = "-1"     # topic name
    user = []        # all users of each post
    addDate = []     # all dates of each post
    feedback = []    # all feedback ratings of each vendor (found in just one forum, in a number format)
    status = []      # each user's authority in each post, such as (adm, member, dangerous)
    reputation = []  # each user's karma in each post (usually found as a number)
    sign = []        # each user's signature in each post (usually a standard message after the content of the post)
    post = []        # all messages of each post
    interest = []    # each user's interest in each post

    # Finding the topic (should be just one coming from the Listing Page)
    container = soup.find('div', {"class": "content"})
    li = container.find("a", {"class": "title"})
    if li is None:
        return None
    topic = li.text
    topic = topic.replace(u'\xa0', ' ')
    topic = topic.replace(",", "")
    topic = topic.replace("\n", "")
    topic = cleanString(topic.strip())

    try:
        # Parsing the initial post first, since it is kept separate from the comments

        # author name
        init_post = container.find('div', {"class": "item"})
        author = init_post.find('div', {"class": "author"}).select_one('a[href^="/u/"]').text
        flair = init_post.find('div', {"class": "author"}).find("span", {"class": "flair"})
        try:
            flair = flair.text.strip()
            author = author.replace(flair, '')
        except Exception:
            pass
        author = author.strip()
        user.append(cleanString(author))

        # status
        flair = init_post.find("span", {"class": "flair"})
        if flair is not None:
            flair = flair.text.strip()
        else:
            flair = "-1"
        status.append(cleanString(flair))

        # no blurb on Dread
        interest.append("-1")

        # points for the post
        karma = init_post.find("div", {"class": "voteCount"})
        if karma is not None:
            karma = karma.text
            karma = karma.replace("points", "")
            karma = karma.replace(":", "")
            karma = karma.strip()
        else:
            karma = "-1"
        reputation.append(cleanString(karma))

        # date
        spans = init_post.find('div', {"class": "author"}).find('span', recursive=False)
        dt = spans['title']
        month = find_month(dt)
        split_text = dt.split()
        day = int(re.search(r'\d+', split_text[0]).group())
        year = int(split_text[2])
        hm = re.findall(r'\d+', split_text[-1])
        hm[0] = int(hm[0])
        hm[1] = int(hm[1])
        date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1])
        addDate.append(date_time_obj)

        # content
        inner = init_post.find("div", {"class": "postContent"})
        inner = inner.text.strip()
        post.append(cleanString(inner))

        # no signature on Dread
        sign.append("-1")

        # no feedback on Dread
        feedback.append("-1")

        comments = soup.find('div', {"class": "postComments"})
        if comments is None:
            row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)
            return row
        else:
            comments = comments.find_all('div', "comment")

            # For each comment, get all the fields we are interested in:
            for ipost in comments:

                # Finding the author of the comment
                cc = ipost.find('div', {"class": "commentContent"})
                post_wrapper = cc.find('a', {"class": "username"}).text
                flair = cc.find("span", {"class": "flair"})
                try:
                    flair = flair.text.strip()
                    post_wrapper = post_wrapper.replace(flair, '')
                except Exception:
                    pass
                author = post_wrapper.strip()
                user.append(cleanString(author))

                # Finding the status of the author
                # Dread does not have membergroup and postgroup, but it has flair, which is similar enough
                if flair is None:
                    flair = "-1"
                status.append(cleanString(flair))

                # Finding the interest of the author
                # Dread does not have a blurb
                interest.append("-1")

                # Finding the reputation of the user
                # Dread doesn't have reputation per user; instead, each post has its own point score
                karma = cc.find('div', {"class": "votes"})
                if karma is not None:
                    karma = karma.text
                    karma = karma.replace("points", "")
                    karma = karma.replace(":", "")
                    karma = karma.strip()
                else:
                    karma = "-1"
                reputation.append(cleanString(karma))

                # Finding the post date
                postarea = ipost.find('div', {"class": "timestamp"}).find('span', recursive=False)
                dt = postarea['title']
                month = find_month(dt)
                split_text = dt.split()
                day = int(re.search(r'\d+', split_text[0]).group())
                year = int(split_text[2])
                hm = re.findall(r'\d+', split_text[-1])
                hm[0] = int(hm[0])
                hm[1] = int(hm[1])
                date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1])
                addDate.append(date_time_obj)

                # Finding the post content
                inner = ipost.find('div', {"class": "commentBody"})
                inner = inner.text.strip()
                post.append(cleanString(inner))

                # No signature on Dread
                sign.append("-1")

                # As no information about users' feedback was found, just assign "-1" to the variable
                feedback.append("-1")

    except Exception:
        traceback.print_exc()

    # Populate the final variable (this should be a tuple with all the fields scraped)
    row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)

    # Sending the results
    return row
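# Note on the return value: both return paths above fill the same row layout,
#   (topic, post, user, addDate, feedback, status, reputation, sign, interest)
# where topic is a single string and every other field is a list with one entry
# per post: the initial post first, then each comment in page order.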
# This is the method to parse the Listing Pages (one page with many topics)
def dread_listing_parser(soup):

    board = "-1"  # board name (the previous level of the topic in the forum categorization tree.
                  # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
    nm = 0        # this variable should receive the number of topics
    topic = []    # all topics
    user = []     # all users of each topic
    post = []     # number of posts of each topic
    view = []     # number of views of each topic
    addDate = []  # when the topic was created (difficult to find)
    href = []     # this variable should receive all cleaned urls (we will use this to do the merge between
                  # Listing and Description pages)

    # Finding the board (should be just one)
    board = soup.find('a', {"class": "banner-top"}).text
    board = cleanString(board.strip())

    # Finding the repeated tag that corresponds to the listing of topics
    itopics = soup.find("div", {"class": "postBoard"}).find_all("div", {"class": "item"}, recursive=False)

    for itopic in itopics:

        # Adding the topic to the topic list, stripping the title flair first so no topic name is polluted
        topic_title = itopic.find("a", {"class": "title"})
        title_flair = topic_title.find('span', {"class": "flair"})
        topics = topic_title.text
        try:
            title_flair = title_flair.text.strip()
            topics = topics.replace(title_flair, '')
        except Exception:
            pass
        topics = topics.replace(u'\xa0', ' ')
        topics = topics.replace(",", "")
        topics = topics.replace("\n", "")
        topic.append(cleanString(topics.strip()))

        # Counting how many topics we have found so far
        nm = len(topic)

        # Adding the url to the list of urls
        link = topic_title['href']
        link = cleanLink(link)
        href.append(link)

        # Finding the author of the topic
        ps = itopic.find('div', {"class": "author"})
        post_wrapper = ps.select_one('a[href^="/u/"]').text
        flair = ps.find("span", {"class": "flair"})
        try:
            flair = flair.text.strip()
            post_wrapper = post_wrapper.replace(flair, '')
        except Exception:
            pass
        author = post_wrapper.strip()
        user.append(cleanString(author))

        # Finding the number of replies
        meta = itopic.find("div", {"class": "postMain"})
        posts = meta.find("a").text
        posts = posts.replace("comments", '').strip()
        post.append(cleanString(posts))

        # Finding the number of views - not shown on Dread
        view.append("-1")

        # Finding when the topic was added
        spans = itopic.find('div', {"class": "author"}).find('span', recursive=False)
        dt = spans['title']
        month = find_month(dt)
        split_text = dt.split()
        day = int(re.search(r'\d+', split_text[0]).group())
        year = int(split_text[2])
        hm = re.findall(r'\d+', split_text[-1])
        hm[0] = int(hm[0])
        hm[1] = int(hm[1])
        date_time_obj = datetime(year, month, day, hour=hm[0], minute=hm[1])
        addDate.append(date_time_obj)

    return organizeTopics("Dread", nm, topic, board, view, post, user, addDate, href)
def dread_links_parser(soup):

    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find("div", {"class": "postBoard"}).find_all("div", {"class": "item"}, recursive=False)

    for a in listing:
        link = a.find("a", {"class": "title"})
        link = link['href']
        href.append(link)

    return href
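# The same timestamp-parsing block appears three times above (initial post,
# comments, and listing rows); a helper like the sketch below could collapse
# them. This is an illustrative refactor, not part of the original module, and
# the sample title string in the docstring is an assumed format, inferred only
# from how the code above indexes the split text.
def parse_post_timestamp(dt):
    """Sketch: turn a timestamp title attribute such as '21st July 2023 12:30pm'
    (assumed format) into a datetime object."""
    month = find_month(dt)                               # month name -> number
    split_text = dt.split()
    day = int(re.search(r'\d+', split_text[0]).group())  # digits of the day token
    year = int(split_text[2])                            # third token is the year
    hm = re.findall(r'\d+', split_text[-1])              # hour and minute digits
    return datetime(year, month, day, hour=int(hm[0]), minute=int(hm[1]))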
def find_month(s):
    # Map the English month name found in the timestamp string to its number
    months = {'January': 1, 'February': 2, 'March': 3, 'April': 4,
              'May': 5, 'June': 6, 'July': 7, 'August': 8,
              'September': 9, 'October': 10, 'November': 11, 'December': 12}
    for name, number in months.items():
        if name in s:
            return number
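# A minimal usage sketch (not part of the original module): feed the parsers
# pages previously saved by the crawler. The two file names below are
# hypothetical placeholders; everything else comes from this file.
if __name__ == "__main__":

    with open("dread_listing.html", "r", encoding="utf8") as f:
        listing_soup = BeautifulSoup(f.read(), "html.parser")

    # Links to the Description Pages that the crawler should visit next
    print(dread_links_parser(listing_soup))

    # The organized listing built by organizeTopics
    print(dread_listing_parser(listing_soup))

    with open("dread_description.html", "r", encoding="utf8") as f:
        description_soup = BeautifulSoup(f.read(), "html.parser")

    # One row: (topic, post, user, addDate, feedback, status, reputation, sign, interest)
    print(dread_description_parser(description_soup))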