This is based on the calsyslab project.

__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag
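
# Note: cleanString and organizeProducts used below are not defined in this
# file; they are assumed to come from the wildcard utilities import above.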


def apocalypse_description_parser(soup: Tag):
    # Fields to be parsed
    vendor = "-1"  # 0 *Vendor_Name
    success = "-1"  # 1 Vendor_Successful_Transactions
    rating_vendor = "-1"  # 2 Vendor_Rating
    name = "-1"  # 3 *Product_Name
    describe = "-1"  # 4 Product_Description
    CVE = "-1"  # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"  # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"  # 7 Product_Category
    views = "-1"  # 8 Product_Number_Of_Views
    reviews = "-1"  # 9 Product_Number_Of_Reviews
    rating_item = "-1"  # 10 Product_Rating
    addDate = "-1"  # 11 Product_AddedDate
    BTC = "-1"  # 12 Product_BTC_SellingPrice
    USD = "-1"  # 13 Product_USD_SellingPrice
    EURO = "-1"  # 14 Product_EURO_SellingPrice
    sold = "-1"  # 15 Product_QuantitySold
    left = "-1"  # 16 Product_QuantityLeft
    shipFrom = "-1"  # 17 Product_ShippedFrom
    shipTo = "-1"  # 18 Product_ShippedTo
    image = "-1"  # 19 Product_Image
    vendor_image = "-1"  # 20 Vendor_Image

    content: Tag = soup.find("div", {'id': "article_page"})

    product_name = content.find("p", {"class": "list-group-item text-center mb-0 box"}).text
    name = cleanString(product_name.strip())

    product_description = content.find("pre").text
    describe = cleanString(product_description.strip())

    # Finding Product Image (the src attribute holds a base64-encoded image)
    image = soup.find('div', {'class': 'col-md-7 text-center'}).find('img')
    image = image.get('src').split('base64,')[-1]

    # Counting the listed review entries gives the review total
    product_reviews_list: ResultSet[Tag] = content.find("table", {"class": "table product_reviews"}) \
        .find_all("li")
    reviews = str(len(product_reviews_list))

    product_category = content.find("a", {"class": "badge badge-danger"}).text
    category = cleanString(product_category.strip())

    product_ships_from = content.find("span", {"class": "badge badge-info"}).text
    shipFrom = cleanString(product_ships_from.strip())

    product_success_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-success"})
    product_ships_to = product_success_badge[1].text
    shipTo = cleanString(product_ships_to.strip())

    product_supply = content.find("span", {"class": "badge badge-warning"}).text
    left = cleanString(product_supply.strip())

    product_primary_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-primary"})
    # Product vendor comes in the form of "@ vendor_name"
    product_vendor = product_primary_badge[0].text.replace("@", "")
    vendor = cleanString(product_vendor.strip())
    sold = cleanString(product_primary_badge[1].text.strip())

    product_prices: Tag = content.find("p", {"style": "border-bottom:1px solid GREY;"})
    USD = product_prices.find("span", {"class": "pr"}).text
    prices_array: ResultSet[Tag] = product_prices.find_all("span", {"class": "pr1"})
    BTC = prices_array[1].text

    # Populating the final variable (this should be a list with all fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)

    # Sending the results
    return row
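
# A minimal usage sketch (not part of the original crawler): assuming a product
# description page was saved locally, e.g. as "apocalypse_description.html"
# (hypothetical path), the parser above can be run on it directly:
#
#     with open("apocalypse_description.html", "r", encoding="utf-8") as f:
#         desc_soup = BeautifulSoup(f.read(), "html.parser")
#     row = apocalypse_description_parser(desc_soup)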


def apocalypse_listing_parser(soup: Tag):
    # Fields to be parsed
    nm = 0  # Total_Products (Should be Integer)
    mktName = "Apocalypse"  # 0 Marketplace_Name
    name = []  # 1 Product_Name
    CVE = []  # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []  # 3 Product_MS_Classification (Microsoft Security)
    category = []  # 4 Product_Category
    describe = []  # 5 Product_Description
    escrow = []  # 6 Vendor_Warranty
    views = []  # 7 Product_Number_Of_Views
    reviews = []  # 8 Product_Number_Of_Reviews
    addDate = []  # 9 Product_AddDate
    lastSeen = []  # 10 Product_LastViewDate
    BTC = []  # 11 Product_BTC_SellingPrice
    USD = []  # 12 Product_USD_SellingPrice
    EURO = []  # 13 Product_EURO_SellingPrice
    sold = []  # 14 Product_QuantitySold
    qLeft = []  # 15 Product_QuantityLeft
    shipFrom = []  # 16 Product_ShippedFrom
    shipTo = []  # 17 Product_ShippedTo
    vendor = []  # 18 Vendor
    rating = []  # 19 Vendor_Rating
    success = []  # 20 Vendor_Successful_Transactions
    image = []  # 21 Product_Image
    image_vendor = []  # 22 Vendor_Image
    href = []  # 23 Product_Links

    table = soup.find("div", {"class": "col-lg-9 my-4"})
    if table is None:
        table = soup.find("div", {"class": "col-lg-9"})

    listings: ResultSet[Tag] = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})

    for prod in listings:
        product_name = prod.find('h5', {"class": "art_title"}).text
        name.append(cleanString(product_name.strip()))

        # Finding Product Image (the src attribute holds a base64-encoded image)
        product_image = prod.find('img', {'class': 'customHeight'})
        product_image = product_image.get('src').split('base64,')[-1]
        image.append(product_image)

        # Fields that are not present on the listing page default to "-1"
        CVE.append("-1")
        MS.append("-1")
        describe.append("-1")
        escrow.append("-1")
        reviews.append("-1")
        addDate.append("-1")
        lastSeen.append("-1")
        BTC.append("-1")
        EURO.append("-1")
        shipTo.append("-1")
        success.append("-1")
        image_vendor.append("-1")

        product_price = prod.find("span", {"class": "priceP"}).text
        USD.append(cleanString(product_price.strip()))

        product_statistics: ResultSet[Tag] = prod.find_all("p", {"class": "mb-0 card-text"})

        product_category = product_statistics[0].find("a").text
        category.append(cleanString(product_category.strip()))

        product_sold = product_statistics[1].find("span").text
        sold.append(cleanString(product_sold.strip()))

        product_quantity_left = product_statistics[2].find("span", {"class": "badge bluebadge"}).text
        qLeft.append(cleanString(product_quantity_left.strip()))

        product_views = product_statistics[3].find("span").text
        views.append(cleanString(product_views.strip()))

        product_ships_from = product_statistics[4].find("span").text
        shipFrom.append(cleanString(product_ships_from.strip()))

        product_vendor_tag: Tag = product_statistics[5].find("a").find("span", {"class": "badge badge-primary"})
        # Product vendors & ratings are displayed as "vendor_name ★ 5.0";
        # when split by the star (★), this should return a 2-value array
        product_vendor, product_vendor_rating = product_vendor_tag.text.split("★")
        vendor.append(cleanString(product_vendor.strip()))
        rating.append(cleanString(product_vendor_rating.strip()))

        product_href = prod.find('a').get('href')
        href.append(product_href)

        nm += 1

    return organizeProducts(
        marketplace=mktName,
        nm=nm,
        vendor=vendor,
        rating_vendor=rating,
        success_vendor=success,
        nombre=name,
        CVE=CVE,
        MS=MS,
        category=category,
        describe=describe,
        views=views,
        reviews=reviews,
        rating_item=["-1" for _ in range(nm)],
        addDate=addDate,
        BTC=BTC,
        USD=USD,
        EURO=EURO,
        sold=sold,
        qLeft=qLeft,
        shipFrom=shipFrom,
        shipTo=shipTo,
        href=href,
        image=image,
        image_vendor=image_vendor
    )


# Called by the crawler to get description links on a listing page
# @param: beautifulsoup object that is using the correct html page (listing page)
# @return: list of description links from a listing page
def apocalypse_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find_all('div', {"class": "col-lg-4 col-md-6 mb-1"})

    for a in listing:
        bae = a.find('a', href=True)
        link = bae['href']
        href.append(link)

    return href
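
# A minimal usage sketch (not part of the original crawler): assuming a listing
# page was saved locally as "apocalypse_listing.html" (hypothetical path), the
# listing and link parsers above can be exercised like this.
if __name__ == "__main__":
    with open("apocalypse_listing.html", "r", encoding="utf-8") as f:
        listing_soup = BeautifulSoup(f.read(), "html.parser")
    # Description-page links the crawler would visit next
    print(apocalypse_links_parser(listing_soup))
    # Product rows as organized by organizeProducts (from the utilities import)
    print(apocalypse_listing_parser(listing_soup))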