# This is based on the calsyslab project
__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
import re


# Parses a description page: takes the HTML of a description page as a soup
# object and extracts the info it needs into separate fields, which are
# returned after being organized.
# @param: soup object of the HTML page of a description page
# return: 'row' that contains a variety of fields, each holding info from the description page
def vortex_description_parser(soup):
    # Fields to be parsed
    vendor = "-1"        # 0 *Vendor_Name y
    success = "-1"       # 1 Vendor_Successful_Transactions n
    rating_vendor = "-1" # 2 Vendor_Rating y
    name = "-1"          # 3 *Product_Name y
    describe = "-1"      # 4 Product_Description y
    CVE = "-1"           # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"            # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"      # 7 Product_Category y
    views = "-1"         # 8 Product_Number_Of_Views y
    reviews = "-1"       # 9 Product_Number_Of_Reviews n
    rating_item = "-1"   # 10 Product_Rating n
    addDate = "-1"       # 11 Product_AddedDate y
    BTC = "-1"           # 12 Product_BTC_SellingPrice y
    USD = "-1"           # 13 Product_USD_SellingPrice y
    EURO = "-1"          # 14 Product_EURO_SellingPrice n
    sold = "-1"          # 15 Product_QuantitySold n
    left = "-1"          # 16 Product_QuantityLeft n
    shipFrom = "-1"      # 17 Product_ShippedFrom n
    shipTo = "-1"        # 18 Product_ShippedTo n
    image = "-1"         # 19 Product_Image n
    vendor_image = "-1"  # 20 Vendor_Image n
    temp2 = soup.find('div', {'class': 'col-auto font-weight-bold'})

    # Finding Vendor - check
    vendor = temp2.find('a').text
    vendor = cleanString(vendor).strip()

    # Finding Vendor Rating (bug in their system shows standard rating)
    try:
        rating_vendor = temp2.find('span', {'class': "badge badge-pill mr-2"}).text
        rating_vendor = rating_vendor.replace('VendorLvl:', '')
    except:
        rating_vendor = '-1'
    rating_vendor = cleanNumbers(rating_vendor).strip()
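    # The badge text is assumed to look like e.g. 'VendorLvl: 3' (hypothetical
    # value), so stripping the label and running cleanNumbers leaves just '3'.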
    # Finding Product Name - check
    name = soup.find('h3', {'class': "d-inline-block font-weight-bold"}).text
    name = cleanString(name).strip()

    # Finding Product description - check
    try:
        describe = soup.find('p', {'class': 'description pl-3 py-2 border rounded-3'}).text
        describe = cleanString(describe).strip()
    except:
        describe = '-1'
    # print(describe)

    # Finding category - check
    div_category = soup.find('ol', {'class': "breadcrumb breadcrumb-navbar py-0 px-2 px-md-4 m-0 ml-2 mx-auto justify-content-center h-auto d-flex bg-transparent"}).find_all('li')
    category = div_category[1].find('a').text
    category = cleanString(category).strip()
    # Product rating - check
    try:
        rating = temp2.find('span', {'class': 'text-success mr-2 py-1'}).text
        rating = re.sub(r'[()+%]', '', rating)
    except:
        rating = '-1'
    rating_item = cleanString(rating).strip()
    # Finding BTC and USD/GOLD
    div_price = soup.find('h5', {'class': "product-price"}).find('span', {'class': 'small product-price ml-2'}).text
    div_price = div_price.split('|')
    bit = div_price[0]
    if 'btc' in bit:
        bit = bit.replace('btc', '')
        bit = cleanString(bit).strip()
        BTC = bit
    usd = div_price[2]
    if 'usdt_tron' in usd:
        usd = usd.replace('usdt_tron', '')
        usd = cleanString(usd).strip()
        USD = usd
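    # The raw price string is assumed to split on '|' into three tokens, with a
    # 'btc' amount first and a 'usdt_tron' amount third, e.g.
    # '0.002 btc | 3.1 xmr | 25.0 usdt_tron' (hypothetical values); the middle
    # token is not used here.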
    left = soup.find('div', {'class': 'row mt-3'}).find('div', {'class': 'col-6 font-weight-bold'}).text
    left = cleanString(left).strip()

    try:
        image = soup.find('li', {'class': 'carousel__slide'}).find('img').get('src').split('base64,')[-1]
    except:
        image = '-1'
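    # The image src is assumed to be an inline data URI of the form
    # 'data:image/...;base64,<payload>'; splitting on 'base64,' keeps only the
    # base64 payload for storage.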
    # Populating the final variable (a tuple with all fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)

    # Sending the results
    return row
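
# For example, a caller can unpack fields from the returned row by position
# (a sketch; the indices follow the field order assembled above):
#   vendor, rating_vendor = row[0], row[1]
#   usd_price = row[13]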


# Parses a listing page: takes the HTML of a listing page as a soup object and
# extracts the info it needs into parallel lists (one entry per product),
# which are returned after being organized.
# @param: soup object of the HTML page of a listing page
# return: 'row' that contains a variety of lists, each holding info from the listing page
def vortex_listing_parser(soup):
    # Fields to be parsed
    nm = 0                # *Total_Products (Should be Integer)
    mktName = "Vortex"    # 0 *Marketplace_Name y
    vendor = []           # 1 *Vendor y
    rating_vendor = []    # 2 Vendor_Rating y
    success = []          # 3 Vendor_Successful_Transactions n
    name = []             # 4 *Product_Name y
    CVE = []              # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) don't worry about this
    MS = []               # 6 Product_MS_Classification (Microsoft Security) don't worry about this
    category = []         # 7 Product_Category y
    describe = []         # 8 Product_Description n
    views = []            # 9 Product_Number_Of_Views y
    reviews = []          # 10 Product_Number_Of_Reviews n
    rating_item = []      # 11 Product_Rating n
    addDate = []          # 12 Product_AddDate y
    BTC = []              # 13 Product_BTC_SellingPrice y
    USD = []              # 14 Product_USD_SellingPrice y
    EURO = []             # 15 Product_EURO_SellingPrice n
    sold = []             # 16 Product_QuantitySold n
    qLeft = []            # 17 Product_QuantityLeft n
    shipFrom = []         # 18 Product_ShippedFrom n
    shipTo = []           # 19 Product_ShippedTo n
    image = []            # 20 Product_Image n
    image_vendor = []     # 21 Vendor_Image n
    href = []             # 22 Product_Links y
    temp = soup.find('main', {'id': 'main'}).find('section', {'id': 'page_container'})
    listings = temp.findAll('div', {"class": "product-card col-sm-6 col-md-3 col-xl-4 mb-0"})

    # cat = soup.find('section', {'class': 'row px-md-4 mx-0 my-3'}).find('ol').find_all('li')
    # cat = cat[1].find('a').text

    # Populating the Number of Products
    nm = len(listings)

    for listing in listings:
        listing = listing.find('div', {'class': 'product-details'})

        # Finding vendor name - checked
        vendor_name = listing.find('span', {'class': 'd-inline-block w-100 mb-1'}).find('a').text
        if 'ships from' in vendor_name:
            reg = re.compile(r"(.+?)'s shop")
            match = reg.match(vendor_name)
            if match:
                vendor_name = match.group(1)
        vendor_name = vendor_name.strip()
        vendor.append(vendor_name)
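        # For example, an anchor text like "alice's shop ships from Germany"
        # (hypothetical) is reduced to the bare vendor name "alice" above.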
        # Finding the vendor rating - checked
        vendor_level = listing.find('span', {'class': "badge badge-pill mr-2 font-weight-normal"}).text
        vendor_level = vendor_level.split(' ')
        vendor_level = cleanNumbers(vendor_level[-1])
        rating_vendor.append(vendor_level)

        # Finding the product name - checked
        product_name = listing.find('h4').find('a').text
        product_name = cleanString(product_name).strip()
        name.append(product_name)
        # Searching for CVE and MS categories
        cve = listing.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
                cee = cee.replace('\t', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = listing.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)
        # # Finding the category - check
        # category_text = cleanString(cat).strip()
        # category.append(category_text)

        # Finding the hrefs - check
        description_link = listing.find('h4').find('a')['href']
        href.append(description_link)

        # Finding the views - check
        views_text = '-1'
        views.append(views_text)

        # Finding the date added
        date = '-1'
        # date = datetime.strptime(date, "%d-%m-%Y")
        addDate.append(date)

        # EURO
        EURO.append('-1')
        # Finding the BTC and USD/GOLD
        # Initialize with the sentinel so a failed lookup can never leave the
        # variables unbound before the appends below.
        btc_price = '-1'
        usd_price = '-1'
        try:
            money = listing.find('div', {"class": 'w-100 small product-price mb-1 text-right'}).text
            money = money.split('|')
            if 'btc' in money[0]:
                btc_price = cleanString(money[0].replace('btc', '')).strip()
            if len(money) > 2 and 'usd' in money[2]:
                usd_price = cleanString(money[2].strip().replace('usdt_tron', '')).strip()
        except:
            pass
        BTC.append(btc_price)
        USD.append(usd_price)
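        # As on the description page, the price text above is assumed to split
        # on '|' with a 'btc' token first and a 'usdt_tron' token third;
        # anything else leaves the sentinel '-1' in place.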
        sold.append('-1')
        qLeft.append('-1')
        shipTo.append('-1')

        # Finding the ships-from location in the same vendor anchor text
        shipf = listing.find('span', {'class': 'd-inline-block w-100 mb-1'}).find('a').text
        if 'ships from' in shipf:
            pattern = re.compile(r"ships from (.*)")
            match = pattern.search(shipf)
            shipf = match.group(1) if match else '-1'
        else:
            shipf = '-1'
        shipf = cleanString(shipf).strip()
        shipFrom.append(shipf)
        try:
            temp = listing.find('p', {'class': 'w-100 mb-2'}).find('img').get('src').split('base64,')[-1]
        except:
            temp = '-1'
        image.append(temp)

        image_vendor.append('-1')
    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image,
                            image_vendor)
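
# organizeProducts comes from MarketPlaces.Utilities.utilities (the star
# import above); it is assumed to zip these parallel per-product lists into
# rows matching the tuple layout returned by vortex_description_parser.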


# Called by the crawler to get the description links on a listing page
# @param: soup object of the correct HTML page (a listing page)
# return: list of description links from a listing page
def vortex_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []

    listings = soup.find('main').findAll('div', {"class": "product-card col-sm-6 col-md-3 col-xl-4 mb-0"})
    for listing in listings:
        # Adding the url to the list of urls
        description_link = listing.find('h4').find('a')['href']
        href.append(description_link)

    return href
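

# A minimal, hypothetical smoke test for the parsers above (an addition, not
# part of the original crawler). It assumes locally saved copies of a Vortex
# listing page and description page; the file names are illustrative only.
if __name__ == '__main__':
    for page, parser in [('vortex_listing.html', vortex_listing_parser),
                         ('vortex_description.html', vortex_description_parser)]:
        try:
            with open(page, 'r', encoding='utf-8') as f:
                page_soup = BeautifulSoup(f.read(), 'html.parser')
            print(parser(page_soup))
        except FileNotFoundError:
            print(page + ' not found; skipping')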