# This is based on the calsyslab project.
__author__ = 'DarkWeb'

import re

# Importing the auxiliary functions used to clean or convert scraped data
from MarketPlaces.Utilities.utilities import *
# Importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup


# Parses a description page: takes the BeautifulSoup object of a description page's
# HTML and pulls out the information it needs, field by field.
# (A minimal usage sketch follows the function.)
# @param: soup object looking at the HTML page of a description page
# @return: 'row' tuple that holds every field of info scraped from the description page
def cyphermarketplace_description_parser(soup):

    # Fields to be parsed
    name = "-1"       # 0 Product_Name
    describe = "-1"   # 1 Product_Description
    lastSeen = "-1"   # 2 Product_LastViewDate
    rules = "-1"      # 3 NOT USED ...
    CVE = "-1"        # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"         # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"     # 6 Product_Number_Of_Reviews
    category = "-1"   # 7 Product_Category
    shipFrom = "-1"   # 8 Product_ShippedFrom
    shipTo = "-1"     # 9 Product_ShippedTo
    left = "-1"       # 10 Product_QuantityLeft
    escrow = "-1"     # 11 Vendor_Warranty
    terms = "-1"      # 12 Vendor_TermsAndConditions
    vendor = "-1"     # 13 Vendor_Name
    sold = "-1"       # 14 Product_QuantitySold
    addDate = "-1"    # 15 Product_AddedDate
    available = "-1"  # 16 NOT USED ...
    endDate = "-1"    # 17 NOT USED ...
    BTC = "-1"        # 18 Product_BTC_SellingPrice
    USD = "-1"        # 19 Product_USD_SellingPrice
    rating = "-1"     # 20 Vendor_Rating
    success = "-1"    # 21 Vendor_Successful_Transactions
    EURO = "-1"       # 22 Product_EURO_SellingPrice

    # Finding the Product Name
    name = soup.find('h1').text
    name = name.replace('\n', ' ')
    name = name.replace(",", "")
    name = name.strip()

    # Finding the Vendor
    vendor = soup.find('h3').find('a').text.strip()

    # Finding the Vendor Rating
    rating = soup.find('span', {'class': "tag is-dark"}).text.strip()

    # Finding the number of Successful Transactions
    # (the vendor's name and the surrounding parentheses are stripped from the h3 text)
    success = soup.find('h3').text
    success = success.replace("Vendor: ", "")
    success = success.replace(vendor, "")
    success = success.replace("(", "")
    success = success.replace(")", "")
    success = success.strip()

    bae = soup.find('div', {'class': "box"}).find_all('ul')

    # Finding Prices
    USD = bae[1].find('strong').text.strip()

    li = bae[2].find_all('li')

    # Finding Escrow
    escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding the Product Category
    category = li[1].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding the Product Quantity Available
    left = li[3].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding the Number Sold
    sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()

    li = bae[3].find_all('li')

    # Finding Shipment Information (Origin)
    if "Ships from:" in li[-2].text:
        shipFrom = li[-2].text
        shipFrom = shipFrom.replace("Ships from: ", "")
        # shipFrom = shipFrom.replace(",", "")
        shipFrom = shipFrom.strip()

    # Finding Shipment Information (Destination)
    shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
    shipTo = shipTo.replace("Ships to: ", "")
    shipTo = shipTo.strip()

    # If the destination is only given as "certain countries", collect the
    # individual country tags into a comma-separated list instead
    if "certain countries" in shipTo:
        countries = ""
        tags = li[-1].find_all('span', {'class': "tag"})
        for tag in tags:
            country = tag.text.strip()
            countries += country + ", "
        shipTo = countries.strip(", ")

    # Finding the Product Description
    describe = soup.find('div', {'class': "pre-line"}).text
    describe = describe.replace("\n", " ")
    describe = describe.strip()

    '''# Finding the Number of Product Reviews
    tag = soup.findAll(text=re.compile('Reviews'))
    for index in tag:
        reviews = index
        par = reviews.find('(')
        if par >= 0:
            reviews = reviews.replace("Reviews (", "")
            reviews = reviews.replace(")", "")
            reviews = reviews.split(",")
            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
        else:
            review = "-1"'''

    # Searching for CVE and MS categories
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " "
        for idx in cve:
            CVE += (idx)
            CVE += " "
            CVE = CVE.replace(',', ' ')
            CVE = CVE.replace('\n', '')
    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
    if ms:
        MS = " "
        for im in ms:
            MS += (im)
            MS += " "
            MS = MS.replace(',', ' ')
            MS = MS.replace('\n', '')

    # Populating the final variable (a tuple holding every field scraped above)
    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)

    # Sending the results
    return row
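
# A minimal usage sketch, not part of the original project: parse one locally
# saved description page. The file path below is a hypothetical placeholder;
# in the real pipeline the crawler hands the fetched HTML to the parser.
def _example_parse_description(html_path='CypherMarketplace/description.html'):
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    # Returns the 23-field tuple built by cyphermarketplace_description_parser
    return cyphermarketplace_description_parser(soup)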

# Parses a listing page: takes the BeautifulSoup object of a listing page's HTML
# and collects the information it needs into parallel lists, one entry per product.
# (A minimal usage sketch follows the function.)
# @param: soup object looking at the HTML page of a listing page
# @return: 'row' that contains a variety of lists that each hold info on the listing page
def cyphermarketplace_listing_parser(soup):

    # Fields to be parsed
    nm = 0                         # Total_Products (should be an integer)
    mktName = "CypherMarketplace"  # 0 Marketplace_Name
    name = []                      # 1 Product_Name
    CVE = []                       # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                        # 3 Product_MS_Classification (Microsoft Security)
    category = []                  # 4 Product_Category
    describe = []                  # 5 Product_Description
    escrow = []                    # 6 Vendor_Warranty
    views = []                     # 7 Product_Number_Of_Views
    reviews = []                   # 8 Product_Number_Of_Reviews
    addDate = []                   # 9 Product_AddDate
    lastSeen = []                  # 10 Product_LastViewDate
    BTC = []                       # 11 Product_BTC_SellingPrice
    USD = []                       # 12 Product_USD_SellingPrice
    EURO = []                      # 13 Product_EURO_SellingPrice
    sold = []                      # 14 Product_QuantitySold
    qLeft = []                     # 15 Product_QuantityLeft
    shipFrom = []                  # 16 Product_ShippedFrom
    shipTo = []                    # 17 Product_ShippedTo
    vendor = []                    # 18 Vendor
    rating = []                    # 19 Vendor_Rating
    success = []                   # 20 Vendor_Successful_Transactions
    href = []                      # 23 Product_Links (URLs)

    listing = soup.findAll('div', {"class": "col-12"})

    # Populating the Number of Products
    nm = len(listing)

    for a in listing:
        bae = a.findAll('a', href=True)

        # Adding the product page URL to the list of URLs
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding the Product name
        product = bae[1].find('p').text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.replace("...", "")
        product = product.strip()
        name.append(product)

        bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')

        # These fields are only present when the listing card is fully populated
        if len(bae) >= 5:
            # Finding Prices
            price = bae[0].text
            ud = price.replace(" USD", " ")
            # u = ud.replace("$", "")
            u = ud.replace(",", "")
            u = u.strip()
            USD.append(u)
            # bc = (prc[1]).strip(' BTC')
            # BTC.append(bc)

            # Finding the Vendor
            vendor_name = bae[1].find('a').text
            vendor_name = vendor_name.replace(",", "")
            vendor_name = vendor_name.strip()
            vendor.append(vendor_name)

            # Finding the Category
            cat = bae[2].find('small').text
            cat = cat.replace("Category: ", "")
            cat = cat.replace(",", "")
            cat = cat.strip()
            category.append(cat)

            # Finding the Number Sold and the Quantity Left
            num = bae[3].text
            num = num.replace("Sold: ", "")
            num = num.strip()
            sold.append(num)

            quant = bae[4].find('small').text
            quant = quant.replace("In stock: ", "")
            quant = quant.strip()
            qLeft.append(quant)

            # Finding Successful Transactions
            # (strip the vendor's name and level out of the header text)
            freq = bae[1].text
            freq = freq.replace(vendor_name, "")
            freq = re.sub(r'Vendor Level \d+', "", freq)
            freq = freq.replace("(", "")
            freq = freq.replace(")", "")
            freq = freq.strip()
            success.append(freq)

        # Searching for CVE and MS categories
        cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)

    # Populate the final variable (the organized product rows for this page)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
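
# A minimal usage sketch, not part of the original project: parse one locally
# saved listing page. The path is a hypothetical placeholder; in the real
# pipeline the crawler feeds fetched HTML straight into the listing parser,
# and organizeProducts (from the shared utilities) assembles the parallel lists.
def _example_parse_listing(html_path='CypherMarketplace/listing.html'):
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    return cyphermarketplace_listing_parser(soup)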

# Called by the crawler to get the description links on a listing page
# (a minimal crawling sketch follows the function)
# @param: BeautifulSoup object of the HTML page of a listing page
# @return: list of description links found on the listing page
def cyphermarketplace_links_parser(soup):

    # Returning all the links that should be visited by the Crawler
    href = []
    listing = soup.findAll('div', {"class": "col-12"})

    for a in listing:
        bae = a.find('a', href=True)
        link = bae['href']
        href.append(link)

    return href
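
# A minimal sketch of how a crawler might use the links parser (an assumption,
# not the project's actual crawler, which fetches pages itself rather than
# reading local files): collect the description-page URLs on one saved listing page.
def _example_collect_links(html_path='CypherMarketplace/listing.html'):
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    for link in cyphermarketplace_links_parser(soup):
        print(link)  # each link points at a product description page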