this is based on calsyslab project
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

225 lines
9.4 KiB

1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
  1. __author__ = 'DarkWeb'
  2. # Here, we are importing the auxiliary functions to clean or convert data
  3. from MarketPlaces.Utilities.utilities import *
  4. # Here, we are importing BeautifulSoup to search through the HTML tree
  5. from bs4 import BeautifulSoup, ResultSet, Tag
  6. def apocalypse_description_parser(soup: Tag):
  7. # Fields to be parsed
  8. vendor = "-1" # 0 *Vendor_Name
  9. success = "-1" # 1 Vendor_Successful_Transactions
  10. rating_vendor = "-1" # 2 Vendor_Rating
  11. name = "-1" # 3 *Product_Name
  12. describe = "-1" # 4 Product_Description
  13. CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  14. MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
  15. category = "-1" # 7 Product_Category
  16. views = "-1" # 8 Product_Number_Of_Views
  17. reviews = "-1" # 9 Product_Number_Of_Reviews
  18. rating_item = "-1" # 10 Product_Rating
  19. addDate = "-1" # 11 Product_AddedDate
  20. BTC = "-1" # 12 Product_BTC_SellingPrice
  21. USD = "-1" # 13 Product_USD_SellingPrice
  22. EURO = "-1" # 14 Product_EURO_SellingPrice
  23. sold = "-1" # 15 Product_QuantitySold
  24. left = "-1" # 16 Product_QuantityLeft
  25. shipFrom = "-1" # 17 Product_ShippedFrom
  26. shipTo = "-1" # 18 Product_ShippedTo
  27. image = "-1" # 19 Product_Image
  28. vendor_image = "-1" # 20 Vendor_Image
  29. content: Tag = soup.find("div", {'id': "article_page"})
  30. product_name = content.find("p", {"class": "list-group-item text-center mb-0 box"}).text
  31. name = cleanString(product_name.strip())
  32. product_description = content.find("pre").text
  33. describe = cleanString(product_description.strip())
  34. # Finding Product Image
  35. image = soup.find('div', {'class': 'col-md-7 text-center'}).find('img')
  36. image = image.get('src').split('base64,')[-1]
  37. product_reviews_list: Tag = content.find("table", {"class": "table product_reviews"}) \
  38. .find_all("li")
  39. review = str(len(product_reviews_list))
  40. product_category = content.find("a", {"class": "badge badge-danger"}).text
  41. category = cleanString(product_category.strip())
  42. product_ships_from = content.find("span", {"class": "badge badge-info"}).text
  43. shipFrom = cleanString(product_ships_from.strip())
  44. product_success_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-success"})
  45. product_ships_to = product_success_badge[1].text
  46. shipTo = cleanString(product_ships_to.strip())
  47. product_supply = content.find("span", {"class": "badge badge-warning"}).text
  48. left = cleanString(product_supply.strip())
  49. product_primary_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-primary"})
  50. # Product vendor comes in the form of "@ vendor_name"
  51. product_vendor = product_primary_badge[0].text.replace("@", "")
  52. vendor = cleanString(product_vendor.strip())
  53. sold = cleanString(product_primary_badge[1].text.strip())
  54. product_prices: Tag = content.find("p", {"style": "border-bottom:1px solid GREY;"})
  55. USD = product_prices.find("span", {"class": "pr"}).text
  56. prices_array: ResultSet[Tag] = product_prices.find_all("span", {"class": "pr1"})
  57. BTC = prices_array[1].text
  58. # Populating the final variable (this should be a list with all fields scraped)
  59. row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
  60. BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
  61. # Sending the results
  62. return row
  63. def apocalypse_listing_parser(soup: Tag):
  64. # Fields to be parsed
  65. nm = 0 # Total_Products (Should be Integer)
  66. mktName = "Apocalypse" # 0 Marketplace_Name
  67. name = [] # 1 Product_Name
  68. CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  69. MS = [] # 3 Product_MS_Classification (Microsoft Security)
  70. category = [] # 4 Product_Category
  71. describe = [] # 5 Product_Description
  72. escrow = [] # 6 Vendor_Warranty
  73. views = [] # 7 Product_Number_Of_Views
  74. reviews = [] # 8 Product_Number_Of_Reviews
  75. addDate = [] # 9 Product_AddDate
  76. lastSeen = [] # 10 Product_LastViewDate
  77. BTC = [] # 11 Product_BTC_SellingPrice
  78. USD = [] # 12 Product_USD_SellingPrice
  79. EURO = [] # 13 Product_EURO_SellingPrice
  80. sold = [] # 14 Product_QuantitySold
  81. qLeft =[] # 15 Product_QuantityLeft
  82. shipFrom = [] # 16 Product_ShippedFrom
  83. shipTo = [] # 17 Product_ShippedTo
  84. vendor = [] # 18 Vendor
  85. rating = [] # 19 Vendor_Rating
  86. success = [] # 20 Vendor_Successful_Transactions
  87. image = [] # 20 Product_Image
  88. image_vendor = [] # 21 Vendor_Image
  89. href = [] # 22 Product_Links
  90. listings: ResultSet[Tag] = soup.find("div", {"class": "col-lg-9 my-4"}).find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})
  91. for prod in listings:
  92. product_name = prod.find('h5', {"class": "art_title"}).text
  93. name.append(cleanString(product_name.strip()))
  94. # Finding Product Image
  95. product_image = prod.find('img', {'class': 'customHeight'})
  96. product_image = product_image.get('src').split('base64,')[-1]
  97. image.append(product_image)
  98. CVE.append("-1")
  99. MS.append("-1")
  100. describe.append("-1")
  101. escrow.append("-1")
  102. reviews.append("-1")
  103. addDate.append("-1")
  104. lastSeen.append("-1")
  105. BTC.append("-1")
  106. EURO.append("-1")
  107. shipTo.append("-1")
  108. success.append("-1")
  109. image_vendor.append("-1")
  110. product_price = prod.find("span", {"class": "priceP"}).text
  111. USD.append(cleanString(product_price.strip()))
  112. product_sold = prod.find("span", {"class": "badge badge-success"}).text
  113. sold.append(cleanString(product_sold.strip()))
  114. product_statistics: ResultSet[Tag] = prod.find_all("p", {"class": "mb-0 card-text"})
  115. product_category = product_statistics[0].find("a").text
  116. category.append(cleanString(product_category.strip()))
  117. product_sold = product_statistics[1].find("span").text
  118. sold.append(cleanString(product_sold.strip()))
  119. product_quantity_left = product_statistics[2].find("span", {"class": "badge bluebadge"}).text
  120. qLeft.append(cleanString(product_quantity_left.strip()))
  121. product_views = product_statistics[3].find("span").text
  122. views.append(cleanString(product_views.strip()))
  123. product_ships_from = product_statistics[4].find("span").text
  124. shipFrom.append(cleanString(product_ships_from.strip()))
  125. product_vendor_tag: Tag = product_statistics[5].find("a").find("span", {"class": "badge badge-primary"})
  126. # Product vendors & ratings are displayed as "vender_name ★ 5.0"
  127. # When split by the star (★), it should return a 2-value array
  128. product_vendor, product_vendor_rating = product_vendor_tag.text.split("")
  129. try:
  130. vendor.append(cleanString(product_vendor.strip()))
  131. rating.append(cleanString(product_vendor_rating.strip()))
  132. except Exception as e:
  133. raise e
  134. product_href = prod.find('a').get('href')
  135. href.append(product_href)
  136. nm += 1
  137. return organizeProducts(
  138. marketplace=mktName,
  139. nm=nm,
  140. vendor=vendor,
  141. rating_vendor=rating,
  142. success_vendor=success,
  143. nombre=name,
  144. CVE=CVE,
  145. MS=MS,
  146. category=category,
  147. describe=describe,
  148. views=views,
  149. reviews=reviews,
  150. rating_item=["-1" for _ in range(nm)],
  151. addDate=addDate,
  152. BTC=BTC,
  153. USD=USD,
  154. EURO=EURO,
  155. sold=sold,
  156. qLeft=qLeft,
  157. shipFrom=shipFrom,
  158. shipTo=shipTo,
  159. href=href,
  160. image=image,
  161. image_vendor=image_vendor
  162. )
  163. #called by the crawler to get description links on a listing page
  164. #@param: beautifulsoup object that is using the correct html page (listing page)
  165. #return: list of description links from a listing page
  166. def apocalypse_links_parser(soup):
  167. # Returning all links that should be visited by the Crawler
  168. href = []
  169. listing = soup.findAll('div', {"class": "col-lg-4 col-md-6 mb-1"})
  170. for a in listing:
  171. bae = a.find('a', href=True)
  172. link = bae['href']
  173. href.append(link)
  174. return href