This is based on the calsyslab project.
__author__ = 'DarkWeb'

# re is needed for the CVE/MS regular expressions used below
import re

# Here, we are importing the auxiliary functions to clean or convert data
from typing import List, Tuple
from MarketPlaces.Utilities.utilities import *

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
def mikesGrandStore_description_parser(soup: BeautifulSoup) -> Tuple:

    # Fields to be parsed
    name = "-1"       # 0 Product_Name
    describe = "-1"   # 1 Product_Description
    lastSeen = "-1"   # 2 Product_LastViewDate
    rules = "-1"      # 3 NOT USED ...
    CVE = "-1"        # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"         # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"     # 6 Product_Number_Of_Reviews
    category = "-1"   # 7 Product_Category
    shipFrom = "-1"   # 8 Product_ShippedFrom
    shipTo = "-1"     # 9 Product_ShippedTo
    left = "-1"       # 10 Product_QuantityLeft
    escrow = "-1"     # 11 Vendor_Warranty
    terms = "-1"      # 12 Vendor_TermsAndConditions
    vendor = "-1"     # 13 Vendor_Name
    sold = "-1"       # 14 Product_QuantitySold
    addDate = "-1"    # 15 Product_AddedDate
    available = "-1"  # 16 NOT USED ...
    endDate = "-1"    # 17 NOT USED ...
    BTC = "-1"        # 18 Product_BTC_SellingPrice
    USD = "-1"        # 19 Product_USD_SellingPrice
    rating = "-1"     # 20 Vendor_Rating
    success = "-1"    # 21 Vendor_Successful_Transactions
    EURO = "-1"       # 22 Product_EURO_SellingPrice
    # Finding the product name
    name: str = soup.find("h1", {"class": "product-title product_title entry-title"}).text

    # Finding the product description
    describe = soup.find("div", {"id": "tab-description"}).text

    # Finding the date of the most recent review, which doubles as the last-seen date
    commentsList: List[BeautifulSoup] = soup.find("ol", {"class": "commentlist"}).find_all("li")
    if len(commentsList) > 0:
        lastReview: BeautifulSoup = commentsList[0]
        # get("datetime") already returns a string, so no .text call is needed here
        lastSeen = lastReview.find("time").get("datetime")

    # Finding the number of reviews from the tab label, e.g. "Reviews (12)" -> "12"
    # (the 'reivews' spelling is kept from the original selector)
    reviewTab: str = soup.find('a', {'href': '#tab-reivews'}).text
    review = reviewTab.split('(')[1].split(')')[0]

    # Finding the product category from the breadcrumb navigation bar
    navbarBreadcrumbs: List[BeautifulSoup] = soup.find('nav', {'class': 'woocommerce-breadcrumb breadcrumbs '}).find_all('a')
    category = navbarBreadcrumbs[1].text

    # Finding the USD price
    USD = soup.find("div", {"class": "price-wrapper"}).text

    # Finding the vendor rating, e.g. "Rated 4.60 out of 5" -> "4.60"
    reviewStats: str = soup.find("div", {"class": "star-rating"}).text
    rating = reviewStats.split(' ')[1]
    row = (
        name,
        describe,
        lastSeen,
        rules,
        CVE,
        MS,
        review,
        category,
        shipFrom,
        shipTo,
        left,
        escrow,
        terms,
        vendor,
        sold,
        addDate,
        available,
        endDate,
        BTC,
        USD,
        rating,
        success,
        EURO
    )

    return row
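# A minimal usage sketch (an addition for illustration, not part of the original
# file): assuming a product page saved locally under a hypothetical name such as
# "mikes_product.html", the description parser above could be exercised like this:
#
#     with open("mikes_product.html", "r", encoding="utf-8") as f:
#         page = BeautifulSoup(f.read(), "html.parser")
#     row = mikesGrandStore_description_parser(page)
#     print(row[0], row[19], row[20])  # Product_Name, USD price, Vendor_Rating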
def mikesGrandStore_listing_parser(soup: BeautifulSoup) -> List:

    # Fields to be parsed
    nm = 0                       # Total_Products (Should be Integer)
    mktName = "MikesGrandStore"  # 0 Marketplace_Name
    name = []                    # 1 Product_Name
    CVE = []                     # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                      # 3 Product_MS_Classification (Microsoft Security)
    category = []                # 4 Product_Category
    describe = []                # 5 Product_Description
    escrow = []                  # 6 Vendor_Warranty
    views = []                   # 7 Product_Number_Of_Views
    reviews = []                 # 8 Product_Number_Of_Reviews
    addDate = []                 # 9 Product_AddDate
    lastSeen = []                # 10 Product_LastViewDate
    BTC = []                     # 11 Product_BTC_SellingPrice
    USD = []                     # 12 Product_USD_SellingPrice
    EURO = []                    # 13 Product_EURO_SellingPrice
    sold = []                    # 14 Product_QuantitySold
    qLeft = []                   # 15 Product_QuantityLeft
    shipFrom = []                # 16 Product_ShippedFrom
    shipTo = []                  # 17 Product_ShippedTo
    vendor = []                  # 18 Vendor
    rating = []                  # 19 Vendor_Rating
    success = []                 # 20 Vendor_Successful_Transactions
    href = []                    # 23 Product_Links (Urls)

    # NOTE: the scraping logic for this market's listing pages has not been written
    # yet (the original left a bare `pass` here). Returning the organized, still-empty
    # lists keeps the function's interface consistent with the other listing parsers.
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
# Parses listing pages: takes the HTML of a listing page as a soup object and parses it for the info it needs.
# The info is stored in different lists; these lists are returned after being organized.
# @param: soup object looking at the HTML page of a listing page
# @return: 'row' that contains a variety of lists that each hold info on the listing page
def darkfox_listing_parser(soup):

    # Fields to be parsed
    nm = 0                # Total_Products (Should be Integer)
    mktName = "DarkFox"   # 0 Marketplace_Name
    name = []             # 1 Product_Name
    CVE = []              # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []               # 3 Product_MS_Classification (Microsoft Security)
    category = []         # 4 Product_Category
    describe = []         # 5 Product_Description
    escrow = []           # 6 Vendor_Warranty
    views = []            # 7 Product_Number_Of_Views
    reviews = []          # 8 Product_Number_Of_Reviews
    addDate = []          # 9 Product_AddDate
    lastSeen = []         # 10 Product_LastViewDate
    BTC = []              # 11 Product_BTC_SellingPrice
    USD = []              # 12 Product_USD_SellingPrice
    EURO = []             # 13 Product_EURO_SellingPrice
    sold = []             # 14 Product_QuantitySold
    qLeft = []            # 15 Product_QuantityLeft
    shipFrom = []         # 16 Product_ShippedFrom
    shipTo = []           # 17 Product_ShippedTo
    vendor = []           # 18 Vendor
    rating = []           # 19 Vendor_Rating
    success = []          # 20 Vendor_Successful_Transactions
    href = []             # 23 Product_Links (Urls)
    listing = soup.findAll('div', {"class": "card"})

    # Populating the Number of Products
    nm = len(listing)

    for a in listing:
        bae = a.findAll('a', href=True)

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding the Product
        product = bae[1].find('p').text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.replace("...", "")
        product = product.strip()
        name.append(product)

        bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')

        if len(bae) >= 5:
            # Finding Prices
            price = bae[0].text
            ud = price.replace(" USD", " ")
            # u = ud.replace("$", "")
            u = ud.replace(",", "")
            u = u.strip()
            USD.append(u)
            # bc = (prc[1]).strip(' BTC')
            # BTC.append(bc)

            # Finding the Vendor
            vendor_name = bae[1].find('a').text
            vendor_name = vendor_name.replace(",", "")
            vendor_name = vendor_name.strip()
            vendor.append(vendor_name)

            # Finding the Category
            cat = bae[2].find('small').text
            cat = cat.replace("Category: ", "")
            cat = cat.replace(",", "")
            cat = cat.strip()
            category.append(cat)

            # Finding Number Sold and Quantity Left
            num = bae[3].text
            num = num.replace("Sold: ", "")
            num = num.strip()
            sold.append(num)

            quant = bae[4].find('small').text
            quant = quant.replace("In stock: ", "")
            quant = quant.strip()
            qLeft.append(quant)

            # Finding Successful Transactions
            freq = bae[1].text
            freq = freq.replace(vendor_name, "")
            freq = re.sub(r'Vendor Level \d+', "", freq)
            freq = freq.replace("(", "")
            freq = freq.replace(")", "")
            freq = freq.strip()
            success.append(freq)
        # Searching for CVE and MS categories
        # (raw strings avoid the invalid-escape-sequence warning on \d)
        cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
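# For illustration (an addition, not in the original file): what the two regular
# expressions above match on sample strings, shown here with re.findall:
#
#     >>> re.findall(r'CVE-\d{4}-\d{4}', "Exploit for CVE-2021-4034 (PwnKit)")
#     ['CVE-2021-4034']
#     >>> re.findall(r'MS\d{2}-\d{3}', "Patched since MS17-010")
#     ['MS17-010']
#
# Note that CVE ids can have more than four digits in the final group
# (e.g. CVE-2021-34527), which this pattern would only partially match.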
# Called by the crawler to get description links on a listing page
# @param: BeautifulSoup object that is using the correct HTML page (listing page)
# @return: list of description links from a listing page
def mikesgrandstore_links_parser(soup):

    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.findAll('div', {"class": "box-image"})

    for a in listing:
        bae = a.find('div', {"class": "image-fade_in_back"}).find('a', href=True)
        link = bae['href']
        href.append(link)

    return href
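# A minimal end-to-end sketch (an addition for illustration, not part of the
# original file): given a listing page saved locally under a hypothetical name
# such as "mikes_listing.html", collect the description links the crawler would
# visit next.
if __name__ == "__main__":
    with open("mikes_listing.html", "r", encoding="utf-8") as f:  # hypothetical saved page
        listing_soup = BeautifulSoup(f.read(), "html.parser")
    links = mikesgrandstore_links_parser(listing_soup)
    print(len(links), "product links found")
    for url in links:
        print(url)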