# Based on the CALSysLab project.
__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
import re


# parses description pages: takes the html of a description page as a soup object and parses it for the info it needs
# stores the info in individual fields, which are organized into a row and returned
# @param: soup object looking at the html page of a description page
# return: 'row' that contains a variety of fields that each hold info on the description page
def atlas_description_parser(soup):
    # Fields to be parsed
    vendor = "-1"        # 0 *Vendor_Name y
    success = "-1"       # 1 Vendor_Successful_Transactions n
    rating_vendor = "-1" # 2 Vendor_Rating y
    name = "-1"          # 3 *Product_Name y
    describe = "-1"      # 4 Product_Description y
    CVE = "-1"           # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"            # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"      # 7 Product_Category y
    views = "-1"         # 8 Product_Number_Of_Views y
    reviews = "-1"       # 9 Product_Number_Of_Reviews n
    rating_item = "-1"   # 10 Product_Rating n
    addDate = "-1"       # 11 Product_AddedDate y
    BTC = "-1"           # 12 Product_BTC_SellingPrice y
    USD = "-1"           # 13 Product_USD_SellingPrice y
    EURO = "-1"          # 14 Product_EURO_SellingPrice n
    sold = "-1"          # 15 Product_QuantitySold n
    left = "-1"          # 16 Product_QuantityLeft n
    shipFrom = "-1"      # 17 Product_ShippedFrom n
    shipTo = "-1"        # 18 Product_ShippedTo n
    image = "-1"         # 19 Product_Image n
    vendor_image = "-1"  # 20 Vendor_Image n

    # Finding the vendor
    vendor = soup.find('div', {'class': 'flex items-center space-x-3'})
    vendor = vendor.find('div', {'class': 'flex items-center space-x-3'}).find('a').text
    vendor = cleanString(vendor).strip()

    # Finding the number of products sold
    try:
        sell = soup.find('p', {'class': 'pl-3 text-sm font-semibold text-slate-700 dark:text-slate-400'}).text
        reg = r'product sold (.+)'
        sell = re.search(reg, sell)
        if sell:
            # group(1) holds everything after "product sold"; strip the "time(s)" suffix
            sell = sell.group(1)
            sell = sell.replace('time', '').replace('s', '')
            sell = cleanString(sell.strip())
        else:
            sell = '-1'
    except:
        sell = '-1'
    sold = sell

    # Finding the number of successful transactions
    suc = soup.find('div', {'class': 'mt-4 grid w-full grid-cols-6 gap-4 rounded border p-3 shadow-sm dark:border-slate-700'}).find_all('div')
    suc = suc[2].find_all('p')
    suc = suc[1].text
    success = cleanString(suc.strip())

    # Finding the vendor rating (a bug in their system shows a standard rating)
    rating_vendor = '-1'
    rating_vendor = cleanNumbers(rating_vendor).strip()

    # Finding the product name
    name = soup.find('h2').text
    name = cleanString(name).strip()

    # Finding the product description
    try:
        describe = soup.find('div', {'class': 'prose mb-12 mt-5 break-words dark:prose-invert'}).text
        describe = cleanString(describe).strip()
    except:
        describe = '-1'

    # Finding the category
    try:
        div_category = soup.find('ol', {'class': "leading-node flex items-center gap-1 text-sm font-medium text-gray-600 dark:text-gray-300"}).find_all('li', class_=lambda x: x is None)
        category = div_category[1].find('a').text
    except:
        category = '-1'
    category = cleanString(category).strip()

    # Product rating - check
    rating = '-1'
    rating_item = cleanString(rating).strip()

    # Finding BTC and USD/GOLD
    BTC = '-1'
    usd = soup.find('h1', {'class': 'text-2xl font-bold leading-none tracking-tight text-slate-800 dark:text-slate-300'}).text
    if '$' in usd:
        usd = usd.replace('$', '')
    usd = cleanString(usd).strip()
    USD = usd

    # Finding the product image (base64-encoded in the src attribute)
    try:
        image = soup.find('div', {'class': 'w-full flex-1 flex-shrink-0'}).find('img').get('src').split('base64,')[-1]
    except:
        image = '-1'

    # Populating the final variable (this should be a list with all fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)

    # Sending the results
    return row
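
# Example usage (an illustrative sketch, not called by the pipeline): parsing a
# saved Atlas description page from disk. The file name 'atlas_description.html'
# is hypothetical; any saved description-page HTML works the same way.
#
#   with open('atlas_description.html', 'r', encoding='utf-8') as f:
#       soup = BeautifulSoup(f.read(), 'html.parser')
#   row = atlas_description_parser(soup)
#   print(row[0], row[13])  # vendor name and USD price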


# parses listing pages: takes the html of a listing page as a soup object and parses it for the info it needs
# stores the info in per-field lists, which are organized and returned
# @param: soup object looking at the html page of a listing page
# return: 'row' that contains a variety of lists that each hold info on the listing page
def atlas_listing_parser(soup):
    # Fields to be parsed
    nm = 0             # *Total_Products (should be an integer)
    mktName = "Atlas"  # 0 *Marketplace_Name y
    vendor = []        # 1 *Vendor y
    rating_vendor = [] # 2 Vendor_Rating y
    success = []       # 3 Vendor_Successful_Transactions n
    name = []          # 4 *Product_Name y
    CVE = []           # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) don't worry about this
    MS = []            # 6 Product_MS_Classification (Microsoft Security) don't worry about this
    category = []      # 7 Product_Category y
    describe = []      # 8 Product_Description n
    views = []         # 9 Product_Number_Of_Views y
    reviews = []       # 10 Product_Number_Of_Reviews n
    rating_item = []   # 11 Product_Rating n
    addDate = []       # 12 Product_AddDate y
    BTC = []           # 13 Product_BTC_SellingPrice y
    USD = []           # 14 Product_USD_SellingPrice y
    EURO = []          # 15 Product_EURO_SellingPrice n
    sold = []          # 16 Product_QuantitySold n
    qLeft = []         # 17 Product_QuantityLeft n
    shipFrom = []      # 18 Product_ShippedFrom n
    shipTo = []        # 19 Product_ShippedTo n
    image = []         # 20 Product_Image n
    image_vendor = []  # 21 Vendor_Image n
    href = []          # 22 Product_Links y

    listings = soup.find('div', {'class': 'grid h-fit grid-cols-1 gap-x-8 gap-y-10 lg:grid-cols-4 md:grid-cols-3'}).findAll('div', class_=lambda x: x is None)

    # The page heading reads "... results in <category>", so the category is pulled from it
    temp = soup.find('h1', {'class': 'text-4xl font-bold tracking-tight text-gray-900 dark:text-slate-200'}).text
    reg = r'results in (.+)'
    cat = re.search(reg, temp)
    if cat:
        cat = cat.group(1)
    else:
        cat = '-1'

    # Populating the number of products
    nm = len(listings)

    for listing in listings:
        # Finding the vendor name
        vendor_name = listing.find('p', {'class': 'text-sm font-medium leading-none dark:text-slate-100'}).text
        vendor_name = vendor_name.strip()
        vendor.append(vendor_name)

        # Finding the vendor rating
        vendor_level = listing.find('p', {'class': "flex space-x-1 text-xs text-gray-700 dark:text-slate-500"}).text
        vendor_level = vendor_level.strip().split(' ')
        vendor_level = cleanNumbers(vendor_level[-1])
        rating_vendor.append(vendor_level)

        # Finding the product name
        product_name = listing.find('p', {'class': 'my-1 line-clamp-2 text-sm text-slate-700 group-hover:underline dark:text-slate-300'}).text
        product_name = cleanString(product_name).strip()
        name.append(product_name)

        # Searching for CVE and MS categories
        cve = listing.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += idx
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
                cee = cee.replace('\t', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = listing.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += im
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)

        # Finding the category (taken from the page heading above)
        category_text = cleanString(cat).strip()
        category.append(category_text)

        # Finding the hrefs
        description_link = listing.find('a', {'class': 'group relative block'})['href']
        href.append(description_link)

        # Finding the views - check
        views_text = '-1'
        views.append(views_text)

        # Finding the date added
        date = '-1'
        # date = datetime.strptime(date, "%d-%m-%Y")
        addDate.append(date)

        # EURO
        EURO.append('-1')

        # Finding the BTC and USD/GOLD
        btc_price = '-1'
        BTC.append(btc_price)
        try:
            money = listing.find('p', {'class': 'mt-2 text-lg font-bold tracking-wider text-slate-900 dark:text-slate-200'}).text
            if '$' in money:
                usd_price = money.strip().replace('$', '')
                usd_price = cleanString(usd_price).strip()
            else:
                usd_price = '-1'
        except:
            usd_price = '-1'
        USD.append(usd_price)

        sold.append('-1')
        qLeft.append('-1')
        shipTo.append('-1')
        shipf = '-1'
        shipFrom.append(shipf)

        # Finding the product image (base64-encoded in the src attribute)
        try:
            temp = listing.find('div', {'class': 'overflow-hidden rounded-lg'}).find('img').get('src').split('base64,')[-1]
        except:
            temp = '-1'
        image.append(temp)
        image_vendor.append('-1')

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image,
                            image_vendor)
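
# Example usage (an illustrative sketch): running a saved listing page through the
# parser. 'atlas_listing.html' is a hypothetical file name; organizeProducts comes
# from MarketPlaces.Utilities.utilities, imported above.
#
#   with open('atlas_listing.html', 'r', encoding='utf-8') as f:
#       soup = BeautifulSoup(f.read(), 'html.parser')
#   products = atlas_listing_parser(soup)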


# called by the crawler to get description links on a listing page
# @param: soup object looking at the html page of a listing page
# return: list of description links from a listing page
def atlas_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []

    listings = soup.find('div', {'class': 'grid h-fit grid-cols-1 gap-x-8 gap-y-10 lg:grid-cols-4 md:grid-cols-3'}).findAll('div')
    for listing in listings:
        # Adding the url to the list of urls
        try:
            description_link = listing.find('a', {'class': 'group relative block'}).get('href')
            href.append(description_link)
        except:
            pass

    return href
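

# Minimal smoke test (an illustrative sketch; only runs when this module is executed
# directly, and assumes a listing page has already been saved to disk):
if __name__ == '__main__':
    import sys

    # usage: python <this module> <saved_listing_page.html>
    with open(sys.argv[1], 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # print every description link the crawler would visit on this page
    for link in atlas_links_parser(soup):
        print(link)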