# This is based on the calsyslab project.
__author__ = 'cern'

import re

# Auxiliary functions used to clean or convert the scraped data
from MarketPlaces.Utilities.utilities import *
# BeautifulSoup is used to search through the HTML tree
from bs4 import BeautifulSoup


# Parses description pages: takes the soup object of a description page and
# extracts the information it needs into individual fields.
# @param: soup object of the HTML description page
# @return: 'row', a tuple in which each entry holds one piece of info scraped
#          from the description page
def BlackPyramid_description_parser(soup):
    # Fields to be parsed
    name = "-1"       # 0 Product_Name
    describe = "-1"   # 1 Product_Description
    lastSeen = "-1"   # 2 Product_LastViewDate
    rules = "-1"      # 3 NOT USED ...
    CVE = "-1"        # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"         # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"     # 6 Product_Number_Of_Reviews
    category = "-1"   # 7 Product_Category
    shipFrom = "-1"   # 8 Product_ShippedFrom
    shipTo = "-1"     # 9 Product_ShippedTo
    left = "-1"       # 10 Product_QuantityLeft
    escrow = "-1"     # 11 Vendor_Warranty
    terms = "-1"      # 12 Vendor_TermsAndConditions
    vendor = "-1"     # 13 Vendor_Name
    sold = "-1"       # 14 Product_QuantitySold
    addDate = "-1"    # 15 Product_AddedDate
    available = "-1"  # 16 NOT USED ...
    endDate = "-1"    # 17 NOT USED ...
    BTC = "-1"        # 18 Product_BTC_SellingPrice
    USD = "-1"        # 19 Product_USD_SellingPrice
    rating = "-1"     # 20 Vendor_Rating
    success = "-1"    # 21 Vendor_Successful_Transactions
    EURO = "-1"       # 22 Product_EURO_SellingPrice
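
    # "-1" is the sentinel value used for fields that are missing from the
    # page (or not used for this marketplace), so downstream code can tell
    # "not found" apart from an empty string.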

    # Finding Product Name
    name = soup.find('div', {'class': 'panel39002'}).find('span').next_sibling
    name = name.replace('\n', ' ')
    name = name.replace(",", "")
    name = name.strip()

    # Finding Product Description
    describe = soup.findAll('div', {'class': 'fer048953'})[1].text
    describe = describe.replace('\n', ' ')
    describe = describe.replace(",", "")
    describe = describe.strip()

    # Finding Vendor
    vendor = soup.find('div', {'class': 'bold03905 vstat364'}).text
    vendor = vendor.split(" ")
    vendor = vendor[2][:-1]
    vendor = vendor.replace('\n', ' ')
    vendor = vendor.replace(",", "")
    vendor = vendor.strip()

    # Finding Vendor Rating
    rating_span = soup.find('span', {'class': 'to3098503t'}).find_next_sibling('span')
    rating_num = rating_span.find('b').text
    if rating_num != 'N/A':
        rating = rating_num[0:3]

    # Finding Successful Transactions
    success_container = soup.find('ul', {'class': 'ul3o00953'}).findAll('li')[1]
    success = success_container.find('div').text
    success = success.replace('"', '')
    success = success.replace("\n", " ")
    success = success.replace(",", "")
    success = success.strip()

    # Finding Prices
    USD_text = soup.find('li', {'class': 'vul2994 vghul995'}).find('div').text
    USD = USD_text.split(',')[1]
    USD = USD.replace('\n', ' ')
    USD = USD.replace(",", "")
    USD = USD.strip()

    container = soup.find('ul', {'class': 'bic03095'})

    # Finding Number Sold
    sold_container = container.find('li')
    sold_div = sold_container.findAll('div')[2]
    sold = sold_div.find('b').next_sibling
    sold = sold.replace('"', '')
    sold = sold.replace("\n", " ")
    sold = sold.replace(",", "")
    sold = sold.strip()

    # Finding the Product Quantity Available
    left_container = container.find('li')
    left_div = left_container.findAll('div')[3]
    left = left_div.find('b').next_sibling
    left = left.replace('"', '')
    left = left.replace("\n", " ")
    left = left.replace(",", "")
    left = left.strip()

    # Finding the number of reviews (sum of positive, neutral and negative counts)
    positive = soup.find('span', {'class': 'ar04999324'}).text
    neutral = soup.find('span', {'class': 'ti9400005 can39953'}).text
    negative = soup.find('span', {'class': 'ti9400005 ti90088 can39953'}).text
    review = int(positive) + int(neutral) + int(negative)
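    # Example (illustrative): positive = '12', neutral = '1', negative = '2'
    # gives review = 15; note that int() raises ValueError if any of the
    # counts is non-numeric.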

    # Searching for CVE and MS categories
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " "
        for idx in cve:
            CVE += (idx)
            CVE += " "
            CVE = CVE.replace(',', ' ')
            CVE = CVE.replace('\n', '')
    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
    if ms:
        MS = " "
        for im in ms:
            MS += (im)
            MS += " "
            MS = MS.replace(',', ' ')
            MS = MS.replace('\n', '')
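    # Example (illustrative): if the page mentions CVE-2021-44228 and
    # CVE-2017-0144, CVE ends up as " CVE-2021-44228  CVE-2017-0144 ",
    # i.e. a single space-separated string rather than a list.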

    # Populating the final variable (a tuple with all fields scraped)
    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)

    # Sending the results
    return row
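

# A minimal usage sketch (illustrative only; 'description_page.html' is a
# hypothetical file saved from a BlackPyramid description page):
#
#   with open('description_page.html', encoding='utf-8') as f:
#       soup = BeautifulSoup(f.read(), 'html.parser')
#   row = BlackPyramid_description_parser(soup)
#   print(row[0])   # Product_Name
#   print(row[19])  # Product_USD_SellingPrice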


# Parses listing pages: takes the soup object of a listing page and collects
# the info it needs into per-product lists, which are organized and returned.
# @param: soup object of the HTML listing page
# @return: 'row' that contains a variety of lists, each holding info scraped
#          from the listing page
def BlackPyramid_listing_parser(soup):
    # Fields to be parsed
    nm = 0                    # Total_Products (Should be Integer)
    mktName = "BlackPyramid"  # 0 Marketplace_Name
    name = []                 # 1 Product_Name
    CVE = []                  # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                   # 3 Product_MS_Classification (Microsoft Security)
    category = []             # 4 Product_Category
    describe = []             # 5 Product_Description
    escrow = []               # 6 Vendor_Warranty
    views = []                # 7 Product_Number_Of_Views
    reviews = []              # 8 Product_Number_Of_Reviews
    addDate = []              # 9 Product_AddDate
    lastSeen = []             # 10 Product_LastViewDate
    BTC = []                  # 11 Product_BTC_SellingPrice
    USD = []                  # 12 Product_USD_SellingPrice
    EURO = []                 # 13 Product_EURO_SellingPrice
    sold = []                 # 14 Product_QuantitySold
    qLeft = []                # 15 Product_QuantityLeft
    shipFrom = []             # 16 Product_ShippedFrom
    shipTo = []               # 17 Product_ShippedTo
    rating_item = []          # 18 Product_Rating
    vendor = []               # 19 Vendor
    rating = []               # 20 Vendor_Rating
    success = []              # 21 Vendor_Successful_Transactions
    href = []                 # 23 Product_Links (Urls)

    listing = soup.findAll('article', {"class": "product"})

    # Some listing pages have an additional article section which is blank
    if not listing[-1].findAll('a', href=True):
        listing = listing[:-1]

    # Populating the Number of Products
    nm = len(listing)

    for card in listing:
        bae = card.findAll('a', href=True)

        # Adding the url to the list of urls
        link = bae[2].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding the Product
        product = bae[3].text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.replace("...", "")
        product = product.strip()
        name.append(product)

        # Finding the Description
        # 'recursive=False' only searches direct children
        desc = card.findChildren('div', recursive=False)[0]
        desc = desc.findAll('div', recursive=False)[3].text
        desc = desc.replace('\n', ' ')
        desc = desc.replace(",", "")
        desc = desc.strip()
        describe.append(desc)

        # Finding the Vendor Name
        vendor_name = bae[4].find('span').text
        vendor_name = vendor_name.split(' ')[1]
        vendor_name = vendor_name.replace('\n', ' ')
        vendor_name = vendor_name.replace(",", "")
        vendor_name = vendor_name.strip()
        vendor.append(vendor_name)

        # Finding the Category
        cat = card.findAll('div', recursive=False)[0].findAll('div', recursive=False)[1].find('span').text
        cat = cat.replace("\n", "")
        cat = cat.replace(",", "")
        cat = cat.strip()
        category.append(cat)

        bae = card.findAll('div', recursive=False)[1].findAll('div', recursive=False)[1]

        # Finding the Amount Left
        left = bae.findAll('div', recursive=False)[1].text
        left = left.replace("x", "")
        left = left.replace('\n', ' ')
        left = left.replace(",", "")
        left = left.strip()
        qLeft.append(left)

        # Finding the Amount Sold
        qsold = bae.findAll('div', recursive=False)[2].text
        qsold = qsold.replace('\n', ' ')
        qsold = qsold.replace("x", "")
        qsold = qsold.replace(",", "")
        qsold = qsold.strip()
        sold.append(qsold)

        # Searching for CVE and MS categories
        cve = card.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = card.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views, reviews,
                            rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
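

# Note: the argument order above must match the signature of organizeProducts
# in MarketPlaces.Utilities.utilities. rating_item (Product_Rating) is
# declared but never populated by this parser, so an empty list is passed in
# that slot.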


# Called by the crawler to get the description links on a listing page
# @param: soup object of the HTML listing page
# @return: list of description links from the listing page
def BlackPyramid_links_parser(soup):

    # Returning all links that should be visited by the Crawler
    href = []
    listing = soup.findAll('article', {"class": "product"})

    for item in listing:
        container = item.find('a', {"class": "ah39063"})
        if container:
            href.append(container['href'])

    return href
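

# A minimal sketch of how the crawler is expected to combine these parsers
# (illustrative; fetch_page is a hypothetical helper that returns page HTML):
#
#   listing_soup = BeautifulSoup(fetch_page(listing_url), 'html.parser')
#   for link in BlackPyramid_links_parser(listing_soup):
#       desc_soup = BeautifulSoup(fetch_page(link), 'html.parser')
#       row = BlackPyramid_description_parser(desc_soup)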