# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# Here, we are importing re to match the CVE and MS identifiers searched for below
import re


# This function parses description pages: it takes the html page of a description page as a soup object
# and extracts the info it needs into different fields, which are organized and returned together.
# @param: soup object looking at the html page of a description page
# @return: 'row', a tuple that holds the info scraped from the description page
def darkbazar_description_parser(soup):

    # Fields to be parsed
    vendor = "-1"            # 0 *Vendor_Name
    success = "-1"           # 1 Vendor_Successful_Transactions
    rating_vendor = "-1"     # 2 Vendor_Rating
    name = "-1"              # 3 *Product_Name
    describe = "-1"          # 4 Product_Description
    CVE = "-1"               # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"                # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"          # 7 Product_Category
    views = "-1"             # 8 Product_Number_Of_Views
    reviews = "-1"           # 9 Product_Number_Of_Reviews
    rating_item = "-1"       # 10 Product_Rating
    addDate = "-1"           # 11 Product_AddedDate
    BTC = "-1"               # 12 Product_BTC_SellingPrice
    USD = "-1"               # 13 Product_USD_SellingPrice
    EURO = "-1"              # 14 Product_EURO_SellingPrice
    sold = "-1"              # 15 Product_QuantitySold
    left = "-1"              # 16 Product_QuantityLeft
    shipFrom = "-1"          # 17 Product_ShippedFrom
    shipTo = "-1"            # 18 Product_ShippedTo

    # Finding Product Name
    divmb = soup.findAll('div', {'class': "mb-1"})
    name = divmb[0].text
    name = name.replace('\n', ' ')
    name = name.replace(",", "")
    name = name.strip()

    # Finding Vendor
    vendor = divmb[1].find('a').text.strip()

    # Finding Vendor Rating
    rating_vendor = soup.find('div', {'class': ""}).text
    rating_vendor = rating_vendor.replace("Vendor's Review : ", "")
    rating_vendor = rating_vendor.strip()

    # Finding Successful Transactions
    success = divmb[3].text
    success = success.replace("Level:", "")
    success = success.strip()

    # Finding Prices
    USD = soup.find('div', {'class': "h3 text-primary"}).text.strip()

    # Finding Escrow
    escrow = divmb[5].find('span', {'class': "badge badge-danger"}).text.strip()

    # Finding the Product Category
    pmb = soup.findAll('p', {'class': "mb-1"})
    category = pmb[-1].text
    category = category.replace("Category: ", "").strip()

    # Finding the Product Quantity Available
    left = divmb[-1].text
    left = left.split(",", 1)[1]
    left = left.replace("in stock", "")
    left = left.strip()

    # Finding Number Sold
    sold = divmb[-1].text
    sold = sold.split(",", 1)[0]
    sold = sold.replace("sold", "")
    sold = sold.strip()

    # Finding Shipment Information (Origin)
    shipFrom = pmb[0].text
    shipFrom = shipFrom.replace("Ships from: ", "").strip()

    # Finding Shipment Information (Destination)
    shipTo = pmb[1].text
    shipTo = shipTo.replace("Ships to: ", "").strip()

    # Finding the Product description
    cardbody = soup.findAll('div', {'class': "card-body"})
    describe = cardbody[1].text.strip()

    # Finding the Number of Product Reviews
    reviews = soup.find('div', {'class': "product-rating"}).text
    reviews = reviews.replace("(", "")
    reviews = reviews.replace(" review)", "")
    reviews = reviews.strip()

    # Searching for CVE and MS categories
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " "
        for idx in cve:
            CVE += (idx)
            CVE += " "
            CVE = CVE.replace(',', ' ')
            CVE = CVE.replace('\n', '')
    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
    if ms:
        MS = " "
        for im in ms:
            MS += (im)
            MS += " "
            MS = MS.replace(',', ' ')
            MS = MS.replace('\n', '')

    # Populating the final variable (this should be a tuple with all the fields scraped)
    row = (name, describe, CVE, MS, reviews, category, shipFrom, shipTo, left, escrow, vendor,
           sold, addDate, BTC, USD, rating_vendor, success, EURO)

    # Sending the results
    return row
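
# A minimal usage sketch for the parser above. 'description.html' is a hypothetical
# locally saved copy of a DarkBazar description page, named here only for illustration:
#
#   with open('description.html', 'r', encoding='utf-8') as file:
#       soup = BeautifulSoup(file.read(), 'html.parser')
#   row = darkbazar_description_parser(soup)
#   print(row[0])  # the scraped product name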


# This function parses listing pages: it takes the html page of a listing page as a soup object
# and extracts the info it needs into different lists, which are organized and returned together.
# @param: soup object looking at the html page of a listing page
# @return: 'row' that contains a variety of lists that each hold info from the listing page
def darkbazar_listing_parser(soup):

    # Fields to be parsed
    nm = 0                      # *Total_Products (Should be Integer)
    mktName = "DarkBazar"       # 0 *Marketplace_Name
    name = []                   # 1 Product_Name
    CVE = []                    # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                     # 3 Product_MS_Classification (Microsoft Security)
    category = []               # 4 Product_Category
    describe = []               # 5 Product_Description
    escrow = []                 # 6 Vendor_Warranty
    views = []                  # 7 Product_Number_Of_Views
    reviews = []                # 8 Product_Number_Of_Reviews
    addDate = []                # 9 Product_AddDate
    lastSeen = []               # 10 Product_LastViewDate
    BTC = []                    # 11 Product_BTC_SellingPrice
    USD = []                    # 12 Product_USD_SellingPrice
    EURO = []                   # 13 Product_EURO_SellingPrice
    sold = []                   # 14 Product_QuantitySold
    qLeft = []                  # 15 Product_QuantityLeft
    shipFrom = []               # 16 Product_ShippedFrom
    shipTo = []                 # 17 Product_ShippedTo
    vendor = []                 # 18 Vendor
    rating = []                 # 19 Vendor_Rating
    success = []                # 20 Vendor_Successful_Transactions
    href = []                   # 23 Product_Links (Urls)

    listing = soup.findAll('div', {"id": "itembox"})

    # Populating the Number of Products
    nm = len(listing)

    for a in listing:
        bae = a.findAll('a', href=True)
        lb = a.findAll('div', {"id": "littlebox"})

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding the Product
        product = lb[1].find('a').text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.replace("...", "")
        product = product.strip()
        name.append(product)

        # Finding Prices
        price = lb[-1].find('div', {"class": "mb-1"}).text
        price = price.replace("$", "")
        price = price.strip()
        USD.append(price)

        # Finding the Vendor
        vendor_name = lb[-1].find("a").text
        vendor_name = vendor_name.replace(",", "")
        vendor_name = vendor_name.strip()
        vendor.append(vendor_name)

        # Finding the Category
        cat = lb[-1].find("span").text
        cat = cat.replace("class:", "")
        cat = cat.strip()
        category.append(cat)

        # Finding Number Sold and Quantity Left
        span = lb[1].findAll("span")
        num = span[-1].text
        num = num.replace("Sold:", "")
        num = num.strip()
        sold.append(num)

        quant = span[1].text
        quant = quant.replace("stock:", "")
        quant = quant.strip()
        qLeft.append(quant)

        # Searching for CVE and MS categories
        cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)

    # Populate the final variable (this should be a list with all the fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
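
# Likewise, a minimal sketch of driving the listing parser above. 'listing.html' is a
# hypothetical saved copy of a DarkBazar listing page, used only for illustration:
#
#   with open('listing.html', 'r', encoding='utf-8') as file:
#       soup = BeautifulSoup(file.read(), 'html.parser')
#   row = darkbazar_listing_parser(soup)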


# called by the crawler to get description links on a listing page
# @param: beautifulsoup object that is using the correct html page (listing page)
# @return: list of description links from a listing page
def darkbazar_links_parser(soup):

    # Returning all links that should be visited by the Crawler
    href = []
    listing = soup.findAll('div', {"id": "itembox"})

    # for a in listing:
    #     bae = a.find('a', {"class": "text-info"}, href=True)
    #     link = bae['href']
    #     href.append(link)

    for a in listing:
        bae = a.findAll('a', href=True)

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

    return href
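

# Illustrative entry point sketching how a crawler could tie the parsers together.
# Assumptions (not part of this file): 'listing.html' is a locally saved listing page;
# in the full project the crawler component is responsible for fetching pages first.
if __name__ == '__main__':
    with open('listing.html', 'r', encoding='utf-8') as file:
        listing_soup = BeautifulSoup(file.read(), 'html.parser')

    # Collect the description links that the crawler would visit next
    links = darkbazar_links_parser(listing_soup)
    print("Found", len(links), "description links")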