This is based on the calsyslab project.


__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *

# Here, we are importing the re module for the CVE/MS regular expressions used below
import re

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup


# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
def kerberos_description_parser(soup):

    # Fields to be parsed
    name = "-1"       # 0 Product_Name y
    describe = "-1"   # 1 Product_Description y
    lastSeen = "-1"   # 2 Product_LastViewDate
    rules = "-1"      # 3 NOT USED ...
    CVE = "-1"        # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"         # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"     # 6 Product_Number_Of_Reviews
    category = "-1"   # 7 Product_Category
    shipFrom = "-1"   # 8 Product_ShippedFrom
    shipTo = "-1"     # 9 Product_ShippedTo
    left = "-1"       # 10 Product_QuantityLeft y
    escrow = "-1"     # 11 Vendor_Warranty y
    terms = "-1"      # 12 Vendor_TermsAndConditions
    vendor = "-1"     # 13 Vendor_Name y
    sold = "-1"       # 14 Product_QuantitySold y
    addDate = "-1"    # 15 Product_AddedDate
    available = "-1"  # 16 NOT USED ...
    endDate = "-1"    # 17 NOT USED ...
    BTC = "-1"        # 18 Product_BTC_SellingPrice y
    USD = "-1"        # 19 Product_USD_SellingPrice y
    rating = "-1"     # 20 Vendor_Rating
    success = "-1"    # 21 Vendor_Successful_Transactions
    EURO = "-1"       # 22 Product_EURO_SellingPrice
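
    # Main "col-9" container that holds the product details (name, metadata rows, prices, description)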
    bae = soup.find('div', {'class': "col-9"})

    # Finding Product Name
    name = bae.find('h2').text
    name = name.replace('\n', ' ')
    name = name.replace(",", "")
    name = name.strip()
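
    # The "mb-1" divs are the metadata rows under the title (indices used below: 0 = vendor, 2 = payment/warranty, 4 = sold/stock counts)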
    mb = bae.findAll('div', {"class": "mb-1"})

    # Finding Vendor
    vendor = mb[0].text
    vendor = vendor.replace(",", "")
    vendor = vendor.replace("Sold by:", "")
    vendor = vendor.strip()

    # # Finding Vendor Rating
    # full_stars = bae[2].find_all('i', {'class': "fas fa-star"})
    # half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"})
    # rating = len(full_stars) + (0.5 if half_star is not None else 0)

    # Finding Warranty
    escrow = mb[2].text
    escrow = escrow.replace("Payment:", "")
    escrow = escrow.strip()

    # Finding Quantity Sold and Left
    temp = mb[4].text.split(',')

    sold = temp[0].replace("sold", "")
    sold = sold.strip()

    left = temp[1].replace("in stock", "")
    left = left.strip()

    # Finding USD
    USD = bae.find('div', {"class": "h3 text-secondary"}).text
    USD = USD.replace("$", "")
    USD = USD.strip()

    # Finding BTC
    temp = bae.find('div', {"class": "small"}).text.split("BTC")
    BTC = temp[0].strip()

    # shipping_info = bae[4].text
    # if "Digital" not in shipping_info:
    #     shipping_info = shipping_info.split(" ")
    #
    #     # Finding Shipment Information (Origin)
    #     shipFrom = shipping_info[0].strip()
    #
    #     # Finding Shipment Information (Destination)
    #     shipTo = shipping_info[1].strip()

    # Finding the Product description
    describe = bae.find('div', {"class": "card border-top-0"}).text
    describe = describe.replace("\n", " ")
    describe = describe.replace("\r", " ")
    describe = describe.strip()
    # Searching for CVE and MS categories
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " "
        for idx in cve:
            CVE += (idx)
            CVE += " "
            CVE = CVE.replace(',', ' ')
            CVE = CVE.replace('\n', '')
    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
    if ms:
        MS = " "
        for im in ms:
            MS += (im)
            MS += " "
            MS = MS.replace(',', ' ')
            MS = MS.replace('\n', '')
    # Populating the final variable (this should be a list with all fields scraped)
    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)

    # Sending the results
    return row


# This is the method to parse the Listing Pages
def kerberos_listing_parser(soup):

    # Fields to be parsed
    nm = 0                # Total_Products (Should be Integer)
    mktName = "Kerberos"  # 0 Marketplace_Name
    name = []             # 1 Product_Name y
    CVE = []              # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []               # 3 Product_MS_Classification (Microsoft Security)
    category = []         # 4 Product_Category y
    describe = []         # 5 Product_Description
    escrow = []           # 6 Vendor_Warranty
    views = []            # 7 Product_Number_Of_Views
    reviews = []          # 8 Product_Number_Of_Reviews y
    addDate = []          # 9 Product_AddDate
    lastSeen = []         # 10 Product_LastViewDate
    BTC = []              # 11 Product_BTC_SellingPrice
    USD = []              # 12 Product_USD_SellingPrice y
    EURO = []             # 13 Product_EURO_SellingPrice
    sold = []             # 14 Product_QuantitySold
    qLeft = []            # 15 Product_QuantityLeft
    shipFrom = []         # 16 Product_ShippedFrom
    shipTo = []           # 17 Product_ShippedTo
    vendor = []           # 18 Vendor y
    rating = []           # 19 Vendor_Rating
    success = []          # 20 Vendor_Successful_Transactions
    href = []             # 24 Product_Links (Urls)
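
    # Each product on the listing page is rendered as one of these cards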
    listing = soup.findAll('div', {"class": "card product-card mb-3"})

    # Populating the Number of Products
    nm = len(listing)

    # Finding Category
    cat = soup.find("div", {"class": "col-9"})
    cat = cat.find("h2").text
    cat = cat.replace("Category: ", "")
    cat = cat.replace(",", "")
    cat = cat.strip()

    for card in listing:
        category.append(cat)

        bae = card.findAll('a')

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding Product Name
        product = bae[1].text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.strip()
        name.append(product)

        # Finding Vendor
        vendor_name = bae[2].text
        vendor_name = vendor_name.replace(",", "")
        vendor_name = vendor_name.strip()
        vendor.append(vendor_name)

        # Finding USD
        usd = card.find('div', {"class": "mb-1"}).text
        usd = usd.replace("$", "")
        usd = usd.strip()
        USD.append(usd)

        # Finding Reviews
        num = card.find("span", {"class": "rate-count"}).text
        num = num.replace("(", "")
        num = num.replace("review)", "")
        num = num.replace("reviews)", "")
        num = num.strip()
        reviews.append(num)
        # Searching for CVE and MS categories
        cve = card.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = card.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)
    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)


def kerberos_links_parser(soup):

    # Returning all links that should be visited by the Crawler
    href = []

    content = soup.find('div', {"id": "content-pos"})
    listing = content.findAll('div', {"class": "item-block"})

    for div in listing:
        ae = div.find('div', {"class": "ae zx300"})
        links = ae.findAll('a')
        href.append(links[1]['href'])

    return href
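

# A minimal usage sketch, assuming a Kerberos listing page has already been saved to disk.
# The file name 'kerberos_listing.html' is hypothetical, and 'html.parser' merely stands in
# for whatever parser the project's crawler actually uses.
if __name__ == '__main__':
    with open('kerberos_listing.html', 'r', encoding='utf-8') as f:
        page = BeautifulSoup(f.read(), 'html.parser')

    # Flattened per-product rows, in the field order documented above
    print(kerberos_listing_parser(page))

    # Product links the crawler would visit next
    print(kerberos_links_parser(page))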