# This parser is based on the calsyslab project.
__author__ = 'DarkWeb'

# re is used below for the CVE/MS regex searches
import re

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup

# Parses description pages: takes the HTML page of a description page as a soup object
# and parses it for the info it needs, storing that info in fields that are
# organized and returned as one record.
# @param: soup object looking at the html page of a description page
# @return: 'row' that contains a variety of fields holding info on the description page
def metaversemarket_description_parser(soup):
    # Fields to be parsed

    name = "-1"      # 0 Product_Name
    describe = "-1"  # 1 Product_Description
    lastSeen = "-1"  # 2 Product_LastViewDate
    CVE = "-1"       # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"        # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"    # 6 Product_Number_Of_Reviews
    category = "-1"  # 7 Product_Category
    shipFrom = "-1"  # 8 Product_ShippedFrom
    shipTo = "-1"    # 9 Product_ShippedTo
    left = "-1"      # 10 Product_QuantityLeft
    escrow = "-1"    # 11 Vendor_Warranty
    terms = "-1"     # 12 Vendor_TermsAndConditions
    vendor = "-1"    # 13 Vendor_Name
    sold = "-1"      # 14 Product_QuantitySold
    addDate = "-1"   # 15 Product_AddedDate
    BTC = "-1"       # 18 Product_BTC_SellingPrice
    USD = "-1"       # 19 Product_USD_SellingPrice
    rating = "-1"    # 20 Vendor_Rating
    success = "-1"   # 21 Vendor_Successful_Transactions
    EURO = "-1"      # 22 Product_EURO_SellingPrice
    # Finding Product Name
    name = soup.find('div', {'class': "panel-heading"}).text.strip()

    # Finding Vendor
    temp = soup.findAll('div', {'class': "col-xs-12 col-sm-6 mt-5"})
    temp = temp[1].findAll('span')
    temp = temp[1].find('b').text
    vendor = temp.replace("@", "").strip()
    # Finding Product Reviews
    review = soup.find('span', {'class': "badge bg-success fs-12px"}).text.strip()

    # Finding Successful Transactions
    # NA

    # Finding Prices
    USD = soup.find('h3', {'class': "mb-2"}).text
    USD = USD.replace("Price: $", "").strip()
    # Finding Escrow
    escrow = soup.find('div', {'class': "alert alert-info text-center fw-bold"}).text
    escrow = escrow.replace('You are protected by ', "").strip()

    # Finding the Product Category
    temp = soup.select('div[class="mt-2"]')
    temp = temp[0].findAll('span')
    category = temp[1].text.strip()

    # Finding the Product Quantity Available
    # temp = soup.find('em', {'class': "icon ni ni-layers-fill"}).parent.parent.parent
    # left = temp.text
    # left = left.replace("Supply:", "")
    # left = left.strip()
    temp = soup.findAll('span', {'class': "badge bg-success"})
    temp = temp[1].text.split("/")
    left = temp[1].strip()

    # Finding Number Sold
    sold = temp[0].strip()

    # Finding Shipment Information (Origin)
    temp = soup.findAll('div', {'class': "alert alert-info"})
    temp = temp[1].text.split("to")
    shipFrom = temp[0].replace("Shipping from ", "").strip()

    # Finding Shipment Information (Destination)
    shipTo = temp[1].split("for")
    shipTo = shipTo[0].strip()
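    # Illustrative trace (assumed sample text, not from the source): a banner like
    # "Shipping from Germany to Worldwide for $5.00" splits on "to" into
    # shipFrom = "Germany" and, after the "for" split, shipTo = "Worldwide".
    # Note that str.split("to") is a plain substring split, so an origin whose
    # name happens to contain "to" (e.g. "Estonia") would break this parse.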
    # Finding the Product description
    describe = soup.find('p', {'class': "card-text"}).text
    describe = describe.replace("\n", " ")
    describe = describe.strip()
    '''# Finding the Number of Product Reviews
    tag = soup.findAll(text=re.compile('Reviews'))
    for index in tag:
        reviews = index
        par = reviews.find('(')
        if par >= 0:
            reviews = reviews.replace("Reviews (", "")
            reviews = reviews.replace(")", "")
            reviews = reviews.split(",")
            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
        else:
            review = "-1"'''
    # Searching for CVE and MS categories
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " "
        for idx in cve:
            CVE += (idx)
            CVE += " "
            CVE = CVE.replace(',', ' ')
            CVE = CVE.replace('\n', '')

    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
    if ms:
        MS = " "
        for im in ms:
            MS += (im)
            MS += " "
            MS = MS.replace(',', ' ')
            MS = MS.replace('\n', '')
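    # At this point CVE/MS hold space-separated matches; e.g. an (assumed) text
    # node "Exploit for CVE-2021-1234" would leave CVE = " CVE-2021-1234 ".
    # Pages with no matches keep the "-1" sentinel set during initialization.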
    # Populating the final variable (this should be a list with all fields scraped)
    row = (name, describe, lastSeen, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
           sold, addDate, BTC, USD, rating, success, EURO)

    # Sending the results
    return row
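
# A minimal usage sketch for the parser above (assumption: 'description.html' is
# a saved copy of a MetaVerse Market description page; the file name is hypothetical):
#
#     with open('description.html', 'r', encoding='utf-8') as f:
#         soup = BeautifulSoup(f.read(), 'html.parser')
#     row = metaversemarket_description_parser(soup)
#     print(row[0])  # 0 Product_Name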

# Parses listing pages: takes the HTML page of a listing page as a soup object
# and parses it for the info it needs, storing that info in per-product lists
# that are organized and returned.
# @param: soup object looking at the html page of a listing page
# @return: 'row' that contains a variety of lists that each hold info on the listing page
def metaversemarket_listing_parser(soup):
    # Fields to be parsed
    nm = 0                       # Total_Products (Should be Integer)
    mktName = "MetaVerseMarket"  # 0 Marketplace_Name
    name = []                    # 1 Product_Name
    CVE = []                     # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                      # 3 Product_MS_Classification (Microsoft Security)
    category = []                # 4 Product_Category
    describe = []                # 5 Product_Description
    escrow = []                  # 6 Vendor_Warranty
    views = []                   # 7 Product_Number_Of_Views
    reviews = []                 # 8 Product_Number_Of_Reviews
    addDate = []                 # 9 Product_AddDate
    lastSeen = []                # 10 Product_LastViewDate
    BTC = []                     # 11 Product_BTC_SellingPrice
    USD = []                     # 12 Product_USD_SellingPrice
    EURO = []                    # 13 Product_EURO_SellingPrice
    sold = []                    # 14 Product_QuantitySold
    qLeft = []                   # 15 Product_QuantityLeft
    shipFrom = []                # 16 Product_ShippedFrom
    shipTo = []                  # 17 Product_ShippedTo
    vendor = []                  # 18 Vendor
    rating = []                  # 19 Vendor_Rating
    success = []                 # 20 Vendor_Successful_Transactions
    href = []                    # 23 Product_Links (Urls)
    listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"})

    # Populating the Number of Products
    nm = len(listing)
    for a in listing:
        bae = a.findAll('a', href=True)

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding the Product
        product = bae[1].find('span', {"class": "text-primary"}).text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.replace("...", "")
        product = product.strip()
        name.append(product)

        # Finding Prices
        price = a.find('strong').text
        price = price.replace("Buy for $", "")
        price = price.strip()
        USD.append(price)

        # Finding the Vendor
        temp = a.find('div', {'class': "mt-1 fs-12px"})
        temp = temp.findAll('span')
        temp = temp[1].find('b').text
        vendor_name = temp.replace("@", "").strip()
        vendor.append(vendor_name)

        # Finding the Category
        cat = a.select_one('div[class="fs-12px"]')
        cat = cat.findAll('span')[1].text
        cat = cat.strip()
        category.append(cat)
        badge = a.findAll('span', {'class': "badge bg-success"})

        # Finding Number Sold and Quantity Left
        temp = badge[1].text
        temp = temp.split("/")
        num = temp[0]
        num = num.strip()
        sold.append(num)

        quant = temp[1]
        quant = quant.strip()
        qLeft.append(quant)

        # Finding Successful Transactions
        # NA

        # Finding Product review
        review = a.find('span', {'class': "badge bg-success fs-12px"}).text
        review = review.replace("+ ", "")
        reviews.append(review)
        # Finding Description
        description = a.find('p', {'class': "alert alert-light text-ssbold p-1"}).text
        description = description.replace("\n", " ")
        description = description.strip()
        describe.append(description)
        # Finding Escrow
        es = a.find('span', {'class': "fw-bold"}).text.strip()
        escrow.append(es)

        # Finding Number of Views
        view = a.find('span', {'class': "badge bg-primary"}).text.strip()
        views.append(view)
        # Find where ships from
        ships = a.find('div', {'class': "alert alert-info item_alert fs-12px p-1"})
        ships = ships.findAll('b')
        sFrom = ships[0].text.strip()
        shipFrom.append(sFrom)

        # Find where it ships to
        sTo = ships[1].text.strip()
        shipTo.append(sTo)
        # Searching for CVE and MS categories
        cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)
    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
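
# Each per-product list above is appended to exactly once per loop iteration, so
# index i of name, USD, vendor, etc. all describe the same product; organizeProducts
# (from MarketPlaces.Utilities.utilities) relies on that alignment. A minimal usage
# sketch (assumption: 'listing.html' is a saved listing page; the file name is hypothetical):
#
#     with open('listing.html', 'r', encoding='utf-8') as f:
#         soup = BeautifulSoup(f.read(), 'html.parser')
#     row = metaversemarket_listing_parser(soup)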

# Called by the crawler to get description links on a listing page
# @param: soup object looking at the html page of a listing page
# @return: list of description links from a listing page
def metaversemarket_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.findAll('div', {"class": "col-12 col-sm-4 col-xl-3 product_item_col p-1"})

    for a in listing:
        bae = a.find('a', href=True)
        link = bae['href']
        href.append(link)

    return href
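
# A minimal usage sketch tying the pieces together (assumption: 'soup' was built
# from a listing page as in the sketch above):
#
#     links = metaversemarket_links_parser(soup)
#     for link in links:
#         # each link is a description-page URL the crawler would visit, then
#         # parse with metaversemarket_description_parser
#         print(link)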