__author__ = 'DarkWeb'

import re  # used for the CVE/MS regex searches below

# Auxiliary functions to clean or convert the scraped data
from MarketPlaces.Utilities.utilities import *
# BeautifulSoup is used to search through the HTML tree
from bs4 import BeautifulSoup
# Parses description pages: takes the BeautifulSoup object of a description page
# and scrapes the fields it needs, organizing them into a single row.
# @param: soup object of the HTML of a description page
# @return: 'row' tuple holding the fields scraped from the description page
def darkfox_description_parser(soup):
    # Fields to be parsed
    name = "-1"      # 0 Product_Name
    describe = "-1"  # 1 Product_Description
    lastSeen = "-1"  # 2 Product_LastViewDate
    CVE = "-1"       # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"        # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"    # 6 Product_Number_Of_Reviews
    category = "-1"  # 7 Product_Category
    shipFrom = "-1"  # 8 Product_ShippedFrom
    shipTo = "-1"    # 9 Product_ShippedTo
    left = "-1"      # 10 Product_QuantityLeft
    escrow = "-1"    # 11 Vendor_Warranty
    terms = "-1"     # 12 Vendor_TermsAndConditions
    vendor = "-1"    # 13 Vendor_Name
    sold = "-1"      # 14 Product_QuantitySold
    addDate = "-1"   # 15 Product_AddedDate
    BTC = "-1"       # 18 Product_BTC_SellingPrice
    USD = "-1"       # 19 Product_USD_SellingPrice
    rating = "-1"    # 20 Vendor_Rating
    success = "-1"   # 21 Vendor_Successful_Transactions
    EURO = "-1"      # 22 Product_EURO_SellingPrice
    # Finding Product Name
    name = soup.find('h1').text
    name = name.replace('\n', ' ')
    name = name.replace(",", "")
    name = name.strip()

    # Finding Vendor
    vendor = soup.find('h3').find('a').text.strip()

    # Finding Vendor Rating
    rating = soup.find('span', {'class': "tag is-dark"}).text.strip()

    # Finding Successful Transactions
    success = soup.find('h3').text
    success = success.replace("Vendor: ", "")
    success = success.replace(vendor, "")
    success = success.replace("(", "")
    success = success.replace(")", "")
    success = success.strip()

    bae = soup.find('div', {'class': "box"}).find_all('ul')

    # Finding Prices
    USD = bae[1].find('strong').text.strip()

    li = bae[2].find_all('li')

    # Finding Escrow
    escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding the Product Category
    category = li[1].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding the Product Quantity Available
    left = li[3].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding Number Sold
    sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()

    li = bae[3].find_all('li')

    # Finding Shipment Information (Origin)
    if "Ships from:" in li[-2].text:
        shipFrom = li[-2].text
        shipFrom = shipFrom.replace("Ships from: ", "")
        # shipFrom = shipFrom.replace(",", "")
        shipFrom = shipFrom.strip()

    # Finding Shipment Information (Destination)
    shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
    shipTo = shipTo.replace("Ships to: ", "")
    shipTo = shipTo.strip()
    if "certain countries" in shipTo:
        countries = ""
        tags = li[-1].find_all('span', {'class': "tag"})
        for tag in tags:
            country = tag.text.strip()
            countries += country + ", "
        shipTo = countries.strip(", ")

    # Finding the Product description
    describe = soup.find('div', {'class': "pre-line"}).text
    describe = describe.replace("\n", " ")
    describe = describe.strip()
    '''# Finding the Number of Product Reviews
    tag = soup.findAll(text=re.compile('Reviews'))
    for index in tag:
        reviews = index
        par = reviews.find('(')
        if par >= 0:
            reviews = reviews.replace("Reviews (", "")
            reviews = reviews.replace(")", "")
            reviews = reviews.split(",")
            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
        else:
            review = "-1"'''
    # Searching for CVE and MS categories
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " "
        for idx in cve:
            CVE += (idx)
            CVE += " "
            CVE = CVE.replace(',', ' ')
            CVE = CVE.replace('\n', '')
    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
    if ms:
        MS = " "
        for im in ms:
            MS += (im)
            MS += " "
            MS = MS.replace(',', ' ')
            MS = MS.replace('\n', '')
    # Populating the final variable (a tuple with all the fields scraped)
    row = (name, describe, lastSeen, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
           sold, addDate, BTC, USD, rating, success, EURO)

    # Sending the results
    return row
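
# A minimal usage sketch (not part of the original module): the file name and
# the 'html.parser' choice are assumptions for illustration; in the project,
# the soup object is built by the crawler from saved HTML pages.
def _example_parse_description(path='description_page.html'):
    with open(path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    # returns the row tuple documented above (name, describe, lastSeen, ...)
    return darkfox_description_parser(soup)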

# Parses listing pages: takes the BeautifulSoup object of a listing page
# and scrapes the fields it needs into per-product lists, returned organized.
# @param: soup object of the HTML of a listing page
# @return: 'row' of lists, each holding one field for every product on the listing page
def darkfox_listing_parser(soup):
    # Fields to be parsed
    nm = 0               # Total_Products (should be an integer)
    mktName = "DarkFox"  # 0 Marketplace_Name
    name = []            # 1 Product_Name
    CVE = []             # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []              # 3 Product_MS_Classification (Microsoft Security)
    category = []        # 4 Product_Category
    describe = []        # 5 Product_Description
    escrow = []          # 6 Vendor_Warranty
    views = []           # 7 Product_Number_Of_Views
    reviews = []         # 8 Product_Number_Of_Reviews
    addDate = []         # 9 Product_AddDate
    lastSeen = []        # 10 Product_LastViewDate
    BTC = []             # 11 Product_BTC_SellingPrice
    USD = []             # 12 Product_USD_SellingPrice
    EURO = []            # 13 Product_EURO_SellingPrice
    sold = []            # 14 Product_QuantitySold
    qLeft = []           # 15 Product_QuantityLeft
    shipFrom = []        # 16 Product_ShippedFrom
    shipTo = []          # 17 Product_ShippedTo
    vendor = []          # 18 Vendor
    rating = []          # 19 Vendor_Rating
    success = []         # 20 Vendor_Successful_Transactions
    href = []            # 23 Product_Links (URLs)
    listing = soup.findAll('div', {"class": "card"})

    # Populating the Number of Products
    nm = len(listing)

    for a in listing:
        bae = a.findAll('a', href=True)

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)
        # Finding the Product
        product = bae[1].find('p').text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.replace("...", "")
        product = product.strip()
        name.append(product)

        bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')

        if len(bae) >= 5:
            # Finding Prices
            price = bae[0].text
            ud = price.replace(" USD", " ")
            # u = ud.replace("$", "")
            u = ud.replace(",", "")
            u = u.strip()
            USD.append(u)
            # bc = (prc[1]).strip(' BTC')
            # BTC.append(bc)

            # Finding the Vendor
            vendor_name = bae[1].find('a').text
            vendor_name = vendor_name.replace(",", "")
            vendor_name = vendor_name.strip()
            vendor.append(vendor_name)

            # Finding the Category
            cat = bae[2].find('small').text
            cat = cat.replace("Category: ", "")
            cat = cat.replace(",", "")
            cat = cat.strip()
            category.append(cat)

            # Finding Number Sold and Quantity Left
            num = bae[3].text
            num = num.replace("Sold: ", "")
            num = num.strip()
            sold.append(num)

            quant = bae[4].find('small').text
            quant = quant.replace("In stock: ", "")
            quant = quant.strip()
            qLeft.append(quant)

            # Finding Successful Transactions
            freq = bae[1].text
            freq = freq.replace(vendor_name, "")
            freq = re.sub(r'Vendor Level \d+', "", freq)
            freq = freq.replace("(", "")
            freq = freq.replace(")", "")
            freq = freq.strip()
            success.append(freq)
        # Searching for CVE and MS categories
        cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)
    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
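
# A minimal usage sketch for the listing parser (the file name is a
# hypothetical stand-in; organizeProducts, imported from
# MarketPlaces.Utilities.utilities, assembles the per-product lists into rows).
def _example_parse_listing(path='listing_page.html'):
    with open(path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    return darkfox_listing_parser(soup)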

# Called by the crawler to get the description links on a listing page
# @param: soup object of the HTML of a listing page
# @return: list of description links found on the listing page
def metaversemarket_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []
    listing = soup.findAll('div', {"class": "col-12 p-0"})
    for a in listing:
        bae = a.find('a', href=True)
        link = bae['href']
        href.append(link)
    return href
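
# A minimal sketch of how the crawler side might consume the links parser
# (listing_html is assumed to be the raw HTML of an already-fetched listing
# page; the fetching itself is handled elsewhere in the project).
def _example_collect_links(listing_html):
    soup = BeautifulSoup(listing_html, 'html.parser')
    # each returned link points at a product description page to visit next
    return metaversemarket_links_parser(soup)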