This is based on the CalSysLab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

210 lines
8.8 KiB

  1. __author__ = 'DarkWeb'
  2. # Here, we are importing the auxiliary functions to clean or convert data
  3. from MarketPlaces.Utilities.utilities import *
  4. # Here, we are importing BeautifulSoup to search through the HTML tree
  5. from bs4 import BeautifulSoup, ResultSet, Tag
  6. def apocalypse_description_parser(soup: Tag):
  7. # Fields to be parsed
  8. vendor = "-1" # 0 *Vendor_Name
  9. success = "-1" # 1 Vendor_Successful_Transactions
  10. rating_vendor = "-1" # 2 Vendor_Rating
  11. name = "-1" # 3 *Product_Name
  12. describe = "-1" # 4 Product_Description
  13. CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  14. MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
  15. category = "-1" # 7 Product_Category
  16. views = "-1" # 8 Product_Number_Of_Views
  17. reviews = "-1" # 9 Product_Number_Of_Reviews
  18. rating_item = "-1" # 10 Product_Rating
  19. addDate = "-1" # 11 Product_AddedDate
  20. BTC = "-1" # 12 Product_BTC_SellingPrice
  21. USD = "-1" # 13 Product_USD_SellingPrice
  22. EURO = "-1" # 14 Product_EURO_SellingPrice
  23. sold = "-1" # 15 Product_QuantitySold
  24. left = "-1" # 16 Product_QuantityLeft
  25. shipFrom = "-1" # 17 Product_ShippedFrom
  26. shipTo = "-1" # 18 Product_ShippedTo
  27. content: Tag = soup.find("div", {'id': "article_page"})
  28. product_name = content.find("p", {"class": "list-group-item text-center mb-0 box"}).text
  29. name = cleanString(product_name.strip())
  30. product_description = content.find("pre").text
  31. describe = cleanString(product_description.strip())
  32. product_reviews_list: Tag = content.find("table", {"class": "table product_reviews"}) \
  33. .find_all("li")
  34. review = str(len(product_reviews_list))
  35. product_category = content.find("a", {"class": "badge badge-danger"}).text
  36. category = cleanString(product_category.strip())
  37. product_ships_from = content.find("span", {"class": "badge badge-info"}).text
  38. shipFrom = cleanString(product_ships_from.strip())
  39. product_success_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-success"})
  40. product_ships_to = product_success_badge[1].text
  41. shipTo = cleanString(product_ships_to.strip())
  42. product_supply = content.find("span", {"class": "badge badge-warning"}).text
  43. left = cleanString(product_supply.strip())
  44. product_primary_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-primary"})
  45. # Product vendor comes in the form of "@ vendor_name"
  46. product_vendor = product_primary_badge[0].text.replace("@", "")
  47. vendor = cleanString(product_vendor.strip())
  48. sold = cleanString(product_primary_badge[1].text.strip())
  49. product_prices: Tag = content.find("p", {"style": "border-bottom:1px solid GREY;"})
  50. USD = product_prices.find("span", {"class": "pr"}).text
  51. prices_array: ResultSet[Tag] = product_prices.find_all("span", {"class": "pr1"})
  52. BTC = prices_array[1].text
  53. # Populating the final variable (this should be a list with all fields scraped)
  54. row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
  55. BTC, USD, EURO, sold, left, shipFrom, shipTo)
  56. # Sending the results
  57. return row
  58. def apocalypse_listing_parser(soup: Tag):
  59. # Fields to be parsed
  60. nm = 0 # Total_Products (Should be Integer)
  61. mktName = "Apocalypse" # 0 Marketplace_Name
  62. name = [] # 1 Product_Name
  63. CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  64. MS = [] # 3 Product_MS_Classification (Microsoft Security)
  65. category = [] # 4 Product_Category
  66. describe = [] # 5 Product_Description
  67. escrow = [] # 6 Vendor_Warranty
  68. views = [] # 7 Product_Number_Of_Views
  69. reviews = [] # 8 Product_Number_Of_Reviews
  70. addDate = [] # 9 Product_AddDate
  71. lastSeen = [] # 10 Product_LastViewDate
  72. BTC = [] # 11 Product_BTC_SellingPrice
  73. USD = [] # 12 Product_USD_SellingPrice
  74. EURO = [] # 13 Product_EURO_SellingPrice
  75. sold = [] # 14 Product_QuantitySold
  76. qLeft =[] # 15 Product_QuantityLeft
  77. shipFrom = [] # 16 Product_ShippedFrom
  78. shipTo = [] # 17 Product_ShippedTo
  79. vendor = [] # 18 Vendor
  80. rating = [] # 19 Vendor_Rating
  81. success = [] # 20 Vendor_Successful_Transactions
  82. href = [] # 23 Product_Links (Urls)
  83. listings: ResultSet[Tag] = soup.find("div", {"class": "col-lg-9 my-4"}).find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
  84. for prod in listings:
  85. product_name = prod.find('h5', {"class": "art_title"}).text
  86. name.append(cleanString(product_name.strip()))
  87. CVE.append("-1")
  88. MS.append("-1")
  89. describe.append("-1")
  90. escrow.append("-1")
  91. reviews.append("-1")
  92. addDate.append("-1")
  93. lastSeen.append("-1")
  94. BTC.append("-1")
  95. EURO.append("-1")
  96. shipTo.append("-1")
  97. success.append("-1")
  98. product_price = prod.find("span", {"class": "priceP"}).text
  99. USD.append(cleanString(product_price.strip()))
  100. product_sold = prod.find("span", {"class": "badge badge-success"}).text
  101. sold.append(cleanString(product_sold.strip()))
  102. product_statistics: ResultSet[Tag] = prod.find_all("p", {"class": "mb-0 card-text"})
  103. product_category = product_statistics[0].find("a").text
  104. category.append(cleanString(product_category.strip()))
  105. product_sold = product_statistics[1].find("span").text
  106. sold.append(cleanString(product_sold.strip()))
  107. product_quantity_left = product_statistics[2].find("span", {"class": "badge bluebadge"}).text
  108. qLeft.append(cleanString(product_quantity_left.strip()))
  109. product_views = product_statistics[3].find("span").text
  110. views.append(cleanString(product_views.strip()))
  111. product_ships_from = product_statistics[4].find("span").text
  112. shipFrom.append(cleanString(product_ships_from.strip()))
  113. product_vendor_tag: Tag = product_statistics[5].find("a").find("span", {"class": "badge badge-primary"})
  114. # Product vendors & ratings are displayed as "vender_name ★ 5.0"
  115. # When split by the star (★), it should return a 2-value array
  116. product_vendor, product_vendor_rating = product_vendor_tag.text.split("")
  117. try:
  118. vendor.append(cleanString(product_vendor.strip()))
  119. rating.append(cleanString(product_vendor_rating.strip()))
  120. except Exception as e:
  121. raise e
  122. product_href = prod.find('a').get('href')
  123. href.append(product_href)
  124. nm += 1
  125. return organizeProducts(
  126. marketplace=mktName,
  127. nm=nm,
  128. vendor=vendor,
  129. rating_vendor=rating,
  130. success_vendor=success,
  131. nombre=name,
  132. CVE=CVE,
  133. MS=MS,
  134. category=category,
  135. describe=describe,
  136. views=views,
  137. reviews=reviews,
  138. rating_item=["-1" for _ in range(nm)],
  139. addDate=addDate,
  140. BTC=BTC,
  141. USD=USD,
  142. EURO=EURO,
  143. sold=sold,
  144. qLeft=qLeft,
  145. shipFrom=shipFrom,
  146. shipTo=shipTo,
  147. href=href
  148. )
  149. #called by the crawler to get description links on a listing page
  150. #@param: beautifulsoup object that is using the correct html page (listing page)
  151. #return: list of description links from a listing page
  152. def apocalypse_links_parser(soup):
  153. # Returning all links that should be visited by the Crawler
  154. href = []
  155. listing = soup.findAll('div', {"class": "col-lg-4 col-md-6 mb-1"})
  156. for a in listing:
  157. bae = a.find('a', href=True)
  158. link = bae['href']
  159. href.append(link)
  160. return href