this is based on calsyslab project
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

456 lines
18 KiB

  1. __author__ = 'DarkWeb'
  2. # Here, we are importing the auxiliary functions to clean or convert data
  3. from typing import List, Tuple
  4. from MarketPlaces.Utilities.utilities import *
  5. # Here, we are importing BeautifulSoup to search through the HTML tree
  6. from bs4 import BeautifulSoup, ResultSet, Tag
  7. def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple:
  8. # Fields to be parsed
  9. name = "-1" # 0 Product_Name
  10. describe = "-1" # 1 Product_Description
  11. lastSeen = "-1" # 2 Product_LastViewDate
  12. rules = "-1" # 3 NOT USED ...
  13. CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  14. MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
  15. review = "-1" # 6 Product_Number_Of_Reviews
  16. category = "-1" # 7 Product_Category
  17. shipFrom = "-1" # 8 Product_ShippedFrom
  18. shipTo = "-1" # 9 Product_ShippedTo
  19. left = "-1" # 10 Product_QuantityLeft
  20. escrow = "-1" # 11 Vendor_Warranty
  21. terms = "-1" # 12 Vendor_TermsAndConditions
  22. vendor = "-1" # 13 Vendor_Name
  23. sold = "-1" # 14 Product_QuantitySold
  24. addDate = "-1" # 15 Product_AddedDate
  25. available = "-1" # 16 NOT USED ...
  26. endDate = "-1" # 17 NOT USED ...
  27. BTC = "-1" # 18 Product_BTC_SellingPrice
  28. USD = "-1" # 19 Product_USD_SellingPrice
  29. rating = "-1" # 20 Vendor_Rating
  30. success = "-1" # 21 Vendor_Successful_Transactions
  31. EURO = "-1" # 22 Product_EURO_SellingPrice
  32. name = soup.find("h1", {'class': 'title'}).text
  33. name = cleanString(name.strip())
  34. describe = soup.find('div', {'id': 'descriptionContent'}).text
  35. describe = cleanString(describe.strip())
  36. commentListTag: Tag = soup.find('ul', {'class': 'comment_list scrollbar'})
  37. commentList = commentListTag.find_all('li')
  38. review = str(len(commentList))
  39. citySelection: str = soup.find('ul', {'class': 'meta text-muted i_location'}).text
  40. shipFrom = cleanString(citySelection.strip())
  41. vendor = soup.find('h1', {'class': 'title over'}).text
  42. vendor = cleanString(vendor.strip)
  43. usdTag: Tag = soup.find('div', {'class': 'product_price__big'}).find('span')
  44. usdText = usdTag.text.strip('/')[0]
  45. # usdText format: "<value> USD " (i.e., "70 000 USD ")
  46. USD = cleanString(usdText.replace("USD", "").strip())
  47. ratingDiv = soup.find('div', {'class': 'rating_star'})
  48. rating = ratingDiv.get('title').strip(' ')[1]
  49. row = (
  50. name,
  51. describe,
  52. lastSeen,
  53. rules,
  54. CVE,
  55. MS,
  56. review,
  57. category,
  58. shipFrom,
  59. shipTo,
  60. left,
  61. escrow,
  62. terms,
  63. vendor,
  64. sold,
  65. addDate,
  66. available,
  67. endDate,
  68. BTC,
  69. USD,
  70. rating,
  71. success,
  72. EURO
  73. )
  74. return row
def thiefWorld_listing_parser(soup: BeautifulSoup):
    # NOTE(review): empty placeholder. A second, complete definition of
    # thiefWorld_listing_parser appears later in this file and shadows this
    # one at import time — confirm this stub can be deleted.
    pass
  77. #parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
  78. #stores info it needs in different lists, these lists are returned after being organized
  79. #@param: soup object looking at html page of description page
  80. #return: 'row' that contains a variety of lists that each hold info on the description page
def darkfox_description_parser(soup):
    """Parse a DarkFox product-description page.

    Extracts product and vendor fields from the page and returns them as a
    23-element tuple in the project's fixed column order; fields not present
    on the page keep the sentinel value "-1".

    NOTE(review): this function uses `re`, which is not imported by name in
    this file — presumably it is re-exported by the star import from
    MarketPlaces.Utilities.utilities; confirm.

    :param soup: BeautifulSoup object of a description page's HTML
    :return: 'row' tuple with all scraped description fields
    """
    # Fields to be parsed ("-1" = not provided / not found)
    name = "-1"  # 0 Product_Name
    describe = "-1"  # 1 Product_Description
    lastSeen = "-1"  # 2 Product_LastViewDate
    rules = "-1"  # 3 NOT USED ...
    CVE = "-1"  # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"  # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"  # 6 Product_Number_Of_Reviews
    category = "-1"  # 7 Product_Category
    shipFrom = "-1"  # 8 Product_ShippedFrom
    shipTo = "-1"  # 9 Product_ShippedTo
    left = "-1"  # 10 Product_QuantityLeft
    escrow = "-1"  # 11 Vendor_Warranty
    terms = "-1"  # 12 Vendor_TermsAndConditions
    vendor = "-1"  # 13 Vendor_Name
    sold = "-1"  # 14 Product_QuantitySold
    addDate = "-1"  # 15 Product_AddedDate
    available = "-1"  # 16 NOT USED ...
    endDate = "-1"  # 17 NOT USED ...
    BTC = "-1"  # 18 Product_BTC_SellingPrice
    USD = "-1"  # 19 Product_USD_SellingPrice
    rating = "-1"  # 20 Vendor_Rating
    success = "-1"  # 21 Vendor_Successful_Transactions
    EURO = "-1"  # 22 Product_EURO_SellingPrice

    # Finding Product Name
    name = soup.find('h1').text
    name = name.replace('\n', ' ')
    name = name.replace(",", "")
    name = name.strip()

    # Finding Vendor
    vendor = soup.find('h3').find('a').text.strip()

    # Finding Vendor Rating
    rating = soup.find('span', {'class': "tag is-dark"}).text.strip()

    # Finding Successful Transactions
    # The <h3> reads "Vendor: <name> (<count>)"; strip everything but the count.
    success = soup.find('h3').text
    success = success.replace("Vendor: ", "")
    success = success.replace(vendor, "")
    success = success.replace("(", "")
    success = success.replace(")", "")
    success = success.strip()

    # The page's info box holds several <ul> lists, accessed by position.
    bae = soup.find('div', {'class': "box"}).find_all('ul')

    # Finding Prices
    USD = bae[1].find('strong').text.strip()

    li = bae[2].find_all('li')

    # Finding Escrow
    escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding the Product Category
    category = li[1].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding the Product Quantity Available
    left = li[3].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding Number Sold
    sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()

    # Shipping details live in the fourth <ul>, read from the end so extra
    # leading items don't shift the fields.
    li = bae[3].find_all('li')

    # Finding Shipment Information (Origin)
    if "Ships from:" in li[-2].text:
        shipFrom = li[-2].text
        shipFrom = shipFrom.replace("Ships from: ", "")
        # shipFrom = shipFrom.replace(",", "")
        shipFrom = shipFrom.strip()

    # Finding Shipment Information (Destination)
    shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
    shipTo = shipTo.replace("Ships to: ", "")
    shipTo = shipTo.strip()
    if "certain countries" in shipTo:
        # Destination is a scrollable country list; join the country tags
        # into one comma-separated string.
        countries = ""
        tags = li[-1].find_all('span', {'class': "tag"})
        for tag in tags:
            country = tag.text.strip()
            countries += country + ", "
        shipTo = countries.strip(", ")

    # Finding the Product description
    describe = soup.find('div', {'class': "pre-line"}).text
    describe = describe.replace("\n", " ")
    describe = describe.strip()

    '''# Finding the Number of Product Reviews
    tag = soup.findAll(text=re.compile('Reviews'))
    for index in tag:
        reviews = index
        par = reviews.find('(')
        if par >=0:
            reviews = reviews.replace("Reviews (","")
            reviews = reviews.replace(")","")
            reviews = reviews.split(",")
            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
        else :
            review = "-1"'''

    # Searching for CVE and MS categories
    cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " "
        for idx in cve:
            CVE += (idx)
            CVE += " "
            CVE = CVE.replace(',', ' ')
            CVE = CVE.replace('\n', '')
    ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
    if ms:
        MS = " "
        for im in ms:
            MS += (im)
            MS += " "
            MS = MS.replace(',', ' ')
            MS = MS.replace('\n', '')

    # Populating the final variable (this should be a list with all fields scraped)
    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)

    # Sending the results
    return row
  190. def thiefWorld_listing_parser(soup: BeautifulSoup):
  191. # Fields to be parsed
  192. nm = 0 # Total_Products (Should be Integer)
  193. mktName = "ThiefWorld" # 0 Marketplace_Name
  194. name = [] # 1 Product_Name
  195. CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  196. MS = [] # 3 Product_MS_Classification (Microsoft Security)
  197. category = [] # 4 Product_Category
  198. describe = [] # 5 Product_Description
  199. escrow = [] # 6 Vendor_Warranty
  200. views = [] # 7 Product_Number_Of_Views
  201. reviews = [] # 8 Product_Number_Of_Reviews
  202. addDate = [] # 9 Product_AddDate
  203. lastSeen = [] # 10 Product_LastViewDate
  204. BTC = [] # 11 Product_BTC_SellingPrice
  205. USD = [] # 12 Product_USD_SellingPrice
  206. EURO = [] # 13 Product_EURO_SellingPrice
  207. sold = [] # 14 Product_QuantitySold
  208. qLeft =[] # 15 Product_QuantityLeft
  209. shipFrom = [] # 16 Product_ShippedFrom
  210. shipTo = [] # 17 Product_ShippedTo
  211. vendor = [] # 18 Vendor
  212. rating = [] # 19 Vendor_Rating
  213. success = [] # 20 Vendor_Successful_Transactions
  214. href = [] # 23 Product_Links (Urls)
  215. productList: ResultSet[Tag] = soup.find_all('div', {'class': 'catalog_item'})
  216. nm = len(productList)
  217. for product in productList:
  218. productTitle: Tag = product.find('div', {'class': 'title'}).find('a')
  219. productName = cleanString(productTitle.text.strip())
  220. name.append(productName)
  221. productHref = productTitle.get('href')
  222. href.append(productHref)
  223. CVE.append('-1')
  224. MS.append('-1')
  225. category.append('-1')
  226. productDescription = product.find('div', {'class': 'text'}).text
  227. productDescription = cleanString(productDescription.strip())
  228. describe.append(productDescription)
  229. escrow.append('-1')
  230. views.append('-1')
  231. reviews.append('-1')
  232. addDate.append('-1')
  233. lastSeen.append('-1')
  234. BTC.append('-1')
  235. priceText = product.find('span', {'class': 'price'}).find('span').text
  236. priceText = priceText.split('USD')[0]
  237. priceText = cleanString(priceText.strip())
  238. USD.append(priceText)
  239. EURO.append('-1')
  240. sold.append('-1')
  241. qLeft.append('-1')
  242. shipFrom.append('-1')
  243. shipTo.append('-1')
  244. productVendor = product.find('div', {'class': 'market over'}).find('a').text
  245. productVendor = cleanString(productVendor.strip())
  246. vendor.append(productVendor)
  247. rating.append('-1')
  248. success.append('-1')
  249. return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
  250. BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
  251. #parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
  252. #stores info it needs in different lists, these lists are returned after being organized
  253. #@param: soup object looking at html page of listing page
  254. #return: 'row' that contains a variety of lists that each hold info on the listing page
def darkfox_listing_parser(soup):
    """Parse a DarkFox listing (search/category) page.

    Collects one entry per product card into parallel per-field lists and
    hands them to organizeProducts.

    NOTE(review): several lists (describe, escrow, views, reviews, addDate,
    lastSeen, BTC, EURO, shipFrom, shipTo, rating) are never appended to in
    the loop below — presumably organizeProducts tolerates or back-fills
    empty columns; confirm before relying on their lengths.

    NOTE(review): uses `re` and `cleanLink`, neither imported by name here —
    presumably provided by the utilities star import; confirm.

    :param soup: BeautifulSoup object of a listing page's HTML
    :return: the value produced by organizeProducts for this page
    """
    # Fields to be parsed
    nm = 0  # Total_Products (Should be Integer)
    mktName = "DarkFox"  # 0 Marketplace_Name
    name = []  # 1 Product_Name
    CVE = []  # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []  # 3 Product_MS_Classification (Microsoft Security)
    category = []  # 4 Product_Category
    describe = []  # 5 Product_Description
    escrow = []  # 6 Vendor_Warranty
    views = []  # 7 Product_Number_Of_Views
    reviews = []  # 8 Product_Number_Of_Reviews
    addDate = []  # 9 Product_AddDate
    lastSeen = []  # 10 Product_LastViewDate
    BTC = []  # 11 Product_BTC_SellingPrice
    USD = []  # 12 Product_USD_SellingPrice
    EURO = []  # 13 Product_EURO_SellingPrice
    sold = []  # 14 Product_QuantitySold
    qLeft =[]  # 15 Product_QuantityLeft
    shipFrom = []  # 16 Product_ShippedFrom
    shipTo = []  # 17 Product_ShippedTo
    vendor = []  # 18 Vendor
    rating = []  # 19 Vendor_Rating
    success = []  # 20 Vendor_Successful_Transactions
    href = []  # 23 Product_Links (Urls)

    listing = soup.findAll('div', {"class": "card"})

    # Populating the Number of Products
    nm = len(listing)

    for a in listing:
        bae = a.findAll('a', href=True)

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding the Product
        product = bae[1].find('p').text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.replace("...", "")
        product = product.strip()
        name.append(product)

        # `bae` is rebound here to the card's detail <div>s, indexed by position.
        bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')

        if len(bae) >= 5:
            # Finding Prices
            price = bae[0].text
            ud = price.replace(" USD", " ")
            # u = ud.replace("$","")
            u = ud.replace(",", "")
            u = u.strip()
            USD.append(u)
            # bc = (prc[1]).strip(' BTC')
            # BTC.append(bc)

            # Finding the Vendor
            vendor_name = bae[1].find('a').text
            vendor_name = vendor_name.replace(",", "")
            vendor_name = vendor_name.strip()
            vendor.append(vendor_name)

            # Finding the Category
            cat = bae[2].find('small').text
            cat = cat.replace("Category: ", "")
            cat = cat.replace(",", "")
            cat = cat.strip()
            category.append(cat)

            # Finding Number Sold and Quantity Left
            num = bae[3].text
            num = num.replace("Sold: ", "")
            num = num.strip()
            sold.append(num)

            quant = bae[4].find('small').text
            quant = quant.replace("In stock: ", "")
            quant = quant.strip()
            qLeft.append(quant)

            # Finding Successful Transactions
            # bae[1] reads "<vendor> Vendor Level N (<count>)"; strip down to the count.
            freq = bae[1].text
            freq = freq.replace(vendor_name, "")
            freq = re.sub(r'Vendor Level \d+', "", freq)
            freq = freq.replace("(", "")
            freq = freq.replace(")", "")
            freq = freq.strip()
            success.append(freq)

        # Searching for CVE and MS categories
        cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue="-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue=cee
        CVE.append(cveValue)

        ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
        if not ms:
            MSValue="-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue=me
        MS.append(MSValue)

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
  363. #called by the crawler to get description links on a listing page
  364. #@param: beautifulsoup object that is using the correct html page (listing page)
  365. #return: list of description links from a listing page
  366. def thiefworld_links_parser(soup):
  367. # Returning all links that should be visited by the Crawler
  368. href = []
  369. listing = soup.find('div', {"class": "row tile__list tileitems_filter pad15 tileproduct__list"}).findAll('div', {"class": "desc"})
  370. for a in listing:
  371. bae = a.find('div', {"class": "title"}).find('a', href=True)
  372. link = bae['href']
  373. href.append(link)
  374. return href