# Based on the calsyslab project.
__author__ = 'DarkWeb'

# re is needed below for the CVE / MS regular-expression searches
import re

from typing import List, Tuple

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag
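
# parses description pages: takes the html page of a description page as a soup object and parses it for the info it needs
# stores the info it needs in different variables, which are returned after being organized into a row
# @param: soup object looking at the html page of a description page
# @return: 'row', a tuple holding the fields parsed from the description page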
def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple:

    # Fields to be parsed
    name = "-1"             # 0 Product_Name
    describe = "-1"         # 1 Product_Description
    lastSeen = "-1"         # 2 Product_LastViewDate
    rules = "-1"            # 3 NOT USED ...
    CVE = "-1"              # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"               # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"           # 6 Product_Number_Of_Reviews
    category = "-1"         # 7 Product_Category
    shipFrom = "-1"         # 8 Product_ShippedFrom
    shipTo = "-1"           # 9 Product_ShippedTo
    left = "-1"             # 10 Product_QuantityLeft
    escrow = "-1"           # 11 Vendor_Warranty
    terms = "-1"            # 12 Vendor_TermsAndConditions
    vendor = "-1"           # 13 Vendor_Name
    sold = "-1"             # 14 Product_QuantitySold
    addDate = "-1"          # 15 Product_AddedDate
    available = "-1"        # 16 NOT USED ...
    endDate = "-1"          # 17 NOT USED ...
    BTC = "-1"              # 18 Product_BTC_SellingPrice
    USD = "-1"              # 19 Product_USD_SellingPrice
    rating = "-1"           # 20 Vendor_Rating
    success = "-1"          # 21 Vendor_Successful_Transactions
    EURO = "-1"             # 22 Product_EURO_SellingPrice

    # Finding Product Name
    name = soup.find("h1", {'class': 'title'}).text
    name = cleanString(name.strip())

    # Finding Product Description
    describe = soup.find('div', {'id': 'descriptionContent'}).text
    describe = cleanString(describe.strip())

    # Finding the Number of Reviews (one comment per review)
    commentListTag: Tag = soup.find('ul', {'class': 'comment_list scrollbar'})
    commentList = commentListTag.find_all('li')
    review = str(len(commentList))

    # Finding Shipment Information (Origin)
    citySelection: str = soup.find('ul', {'class': 'meta text-muted i_location'}).text
    shipFrom = cleanString(citySelection.strip())

    # Finding Vendor
    vendor = soup.find('h1', {'class': 'title over'}).text
    vendor = cleanString(vendor.strip())

    # Finding the USD Price (the USD amount precedes the '/' separator)
    usdTag: Tag = soup.find('div', {'class': 'product_price__big'}).find('span')
    usdText = usdTag.text.split('/')[0]
    # usdText format: "<value> USD " (i.e., "70 000 USD ")
    USD = cleanString(usdText.replace("USD", "").strip())

    # Finding Vendor Rating (the div's title attribute appears to hold the rating as its second token)
    ratingDiv = soup.find('div', {'class': 'rating_star'})
    rating = ratingDiv.get('title').split(' ')[1]

    # Populating the final variable (this should be a list with all fields scraped)
    row = (
        name,
        describe,
        lastSeen,
        rules,
        CVE,
        MS,
        review,
        category,
        shipFrom,
        shipTo,
        left,
        escrow,
        terms,
        vendor,
        sold,
        addDate,
        available,
        endDate,
        BTC,
        USD,
        rating,
        success,
        EURO
    )

    # Sending the results
    return row

# parses description pages: takes the html page of a description page as a soup object and parses it for the info it needs
# stores the info it needs in different variables, which are returned after being organized into a row
# @param: soup object looking at the html page of a description page
# @return: 'row', a tuple holding the fields parsed from the description page
def darkfox_description_parser(soup):

    # Fields to be parsed
    name = "-1"             # 0 Product_Name
    describe = "-1"         # 1 Product_Description
    lastSeen = "-1"         # 2 Product_LastViewDate
    rules = "-1"            # 3 NOT USED ...
    CVE = "-1"              # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"               # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"           # 6 Product_Number_Of_Reviews
    category = "-1"         # 7 Product_Category
    shipFrom = "-1"         # 8 Product_ShippedFrom
    shipTo = "-1"           # 9 Product_ShippedTo
    left = "-1"             # 10 Product_QuantityLeft
    escrow = "-1"           # 11 Vendor_Warranty
    terms = "-1"            # 12 Vendor_TermsAndConditions
    vendor = "-1"           # 13 Vendor_Name
    sold = "-1"             # 14 Product_QuantitySold
    addDate = "-1"          # 15 Product_AddedDate
    available = "-1"        # 16 NOT USED ...
    endDate = "-1"          # 17 NOT USED ...
    BTC = "-1"              # 18 Product_BTC_SellingPrice
    USD = "-1"              # 19 Product_USD_SellingPrice
    rating = "-1"           # 20 Vendor_Rating
    success = "-1"          # 21 Vendor_Successful_Transactions
    EURO = "-1"             # 22 Product_EURO_SellingPrice

    # Finding Product Name
    name = soup.find('h1').text
    name = name.replace('\n', ' ')
    name = name.replace(",", "")
    name = name.strip()

    # Finding Vendor
    vendor = soup.find('h3').find('a').text.strip()

    # Finding Vendor Rating
    rating = soup.find('span', {'class': "tag is-dark"}).text.strip()

    # Finding Successful Transactions
    success = soup.find('h3').text
    success = success.replace("Vendor: ", "")
    success = success.replace(vendor, "")
    success = success.replace("(", "")
    success = success.replace(")", "")
    success = success.strip()

    bae = soup.find('div', {'class': "box"}).find_all('ul')

    # Finding Prices
    USD = bae[1].find('strong').text.strip()

    li = bae[2].find_all('li')

    # Finding Escrow
    escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding the Product Category
    category = li[1].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding the Product Quantity Available
    left = li[3].find('span', {'class': "tag is-dark"}).text.strip()

    # Finding Number Sold
    sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()

    li = bae[3].find_all('li')

    # Finding Shipment Information (Origin)
    if "Ships from:" in li[-2].text:
        shipFrom = li[-2].text
        shipFrom = shipFrom.replace("Ships from: ", "")
        # shipFrom = shipFrom.replace(",", "")
        shipFrom = shipFrom.strip()

    # Finding Shipment Information (Destination)
    shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
    shipTo = shipTo.replace("Ships to: ", "")
    shipTo = shipTo.strip()
    if "certain countries" in shipTo:
        countries = ""
        tags = li[-1].find_all('span', {'class': "tag"})
        for tag in tags:
            country = tag.text.strip()
            countries += country + ", "
        shipTo = countries.strip(", ")

    # Finding the Product description
    describe = soup.find('div', {'class': "pre-line"}).text
    describe = describe.replace("\n", " ")
    describe = describe.strip()

    '''# Finding the Number of Product Reviews
    tag = soup.findAll(text=re.compile('Reviews'))
    for index in tag:
        reviews = index
        par = reviews.find('(')
        if par >= 0:
            reviews = reviews.replace("Reviews (", "")
            reviews = reviews.replace(")", "")
            reviews = reviews.split(",")
            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
        else:
            review = "-1"'''

    # Searching for CVE and MS categories
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " "
        for idx in cve:
            CVE += (idx)
            CVE += " "
            CVE = CVE.replace(',', ' ')
            CVE = CVE.replace('\n', '')
    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
    if ms:
        MS = " "
        for im in ms:
            MS += (im)
            MS += " "
            MS = MS.replace(',', ' ')
            MS = MS.replace('\n', '')

    # Populating the final variable (this should be a list with all fields scraped)
    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)

    # Sending the results
    return row
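
# parses listing pages: takes the html page of a listing page as a soup object and parses it for the info it needs
# stores the info it needs in different lists, these lists are returned after being organized
# @param: soup object looking at the html page of a listing page
# @return: 'row' that contains a variety of lists that each hold info on the listing page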
def thiefWorld_listing_parser(soup: BeautifulSoup):

    # Fields to be parsed
    nm = 0                          # Total_Products (Should be Integer)
    mktName = "ThiefWorld"          # 0 Marketplace_Name
    name = []                       # 1 Product_Name
    CVE = []                        # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                         # 3 Product_MS_Classification (Microsoft Security)
    category = []                   # 4 Product_Category
    describe = []                   # 5 Product_Description
    escrow = []                     # 6 Vendor_Warranty
    views = []                      # 7 Product_Number_Of_Views
    reviews = []                    # 8 Product_Number_Of_Reviews
    addDate = []                    # 9 Product_AddDate
    lastSeen = []                   # 10 Product_LastViewDate
    BTC = []                        # 11 Product_BTC_SellingPrice
    USD = []                        # 12 Product_USD_SellingPrice
    EURO = []                       # 13 Product_EURO_SellingPrice
    sold = []                       # 14 Product_QuantitySold
    qLeft = []                      # 15 Product_QuantityLeft
    shipFrom = []                   # 16 Product_ShippedFrom
    shipTo = []                     # 17 Product_ShippedTo
    vendor = []                     # 18 Vendor
    rating = []                     # 19 Vendor_Rating
    success = []                    # 20 Vendor_Successful_Transactions
    href = []                       # 23 Product_Links (Urls)

    productList: ResultSet[Tag] = soup.find_all('div', {'class': 'catalog_item'})

    # Populating the Number of Products
    nm = len(productList)

    for product in productList:
        # Finding the Product Name
        productTitle: Tag = product.find('div', {'class': 'title'}).find('a')
        productName = cleanString(productTitle.text.strip())
        name.append(productName)

        # Adding the url to the list of urls
        productHref = productTitle.get('href')
        href.append(productHref)

        CVE.append('-1')
        MS.append('-1')
        category.append('-1')

        # Finding the Product Description
        productDescription = product.find('div', {'class': 'text'}).text
        productDescription = cleanString(productDescription.strip())
        describe.append(productDescription)

        escrow.append('-1')
        views.append('-1')
        reviews.append('-1')
        addDate.append('-1')
        lastSeen.append('-1')
        BTC.append('-1')

        # Finding the USD Price (the USD amount precedes the 'USD' marker)
        priceText = product.find('span', {'class': 'price'}).find('span').text
        priceText = priceText.split('USD')[0]
        priceText = cleanString(priceText.strip())
        USD.append(priceText)

        EURO.append('-1')
        sold.append('-1')
        qLeft.append('-1')
        shipFrom.append('-1')
        shipTo.append('-1')

        # Finding the Vendor
        productVendor = product.find('div', {'class': 'market over'}).find('a').text
        productVendor = cleanString(productVendor.strip())
        vendor.append(productVendor)

        rating.append('-1')
        success.append('-1')

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)

# parses listing pages: takes the html page of a listing page as a soup object and parses it for the info it needs
# stores the info it needs in different lists, these lists are returned after being organized
# @param: soup object looking at the html page of a listing page
# @return: 'row' that contains a variety of lists that each hold info on the listing page
def darkfox_listing_parser(soup):

    # Fields to be parsed
    nm = 0                          # Total_Products (Should be Integer)
    mktName = "DarkFox"             # 0 Marketplace_Name
    name = []                       # 1 Product_Name
    CVE = []                        # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                         # 3 Product_MS_Classification (Microsoft Security)
    category = []                   # 4 Product_Category
    describe = []                   # 5 Product_Description
    escrow = []                     # 6 Vendor_Warranty
    views = []                      # 7 Product_Number_Of_Views
    reviews = []                    # 8 Product_Number_Of_Reviews
    addDate = []                    # 9 Product_AddDate
    lastSeen = []                   # 10 Product_LastViewDate
    BTC = []                        # 11 Product_BTC_SellingPrice
    USD = []                        # 12 Product_USD_SellingPrice
    EURO = []                       # 13 Product_EURO_SellingPrice
    sold = []                       # 14 Product_QuantitySold
    qLeft = []                      # 15 Product_QuantityLeft
    shipFrom = []                   # 16 Product_ShippedFrom
    shipTo = []                     # 17 Product_ShippedTo
    vendor = []                     # 18 Vendor
    rating = []                     # 19 Vendor_Rating
    success = []                    # 20 Vendor_Successful_Transactions
    href = []                       # 23 Product_Links (Urls)

    listing = soup.findAll('div', {"class": "card"})

    # Populating the Number of Products
    nm = len(listing)

    for a in listing:
        bae = a.findAll('a', href=True)

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding the Product
        product = bae[1].find('p').text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.replace("...", "")
        product = product.strip()
        name.append(product)

        bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')

        if len(bae) >= 5:
            # Finding Prices
            price = bae[0].text
            ud = price.replace(" USD", " ")
            # u = ud.replace("$","")
            u = ud.replace(",", "")
            u = u.strip()
            USD.append(u)
            # bc = (prc[1]).strip(' BTC')
            # BTC.append(bc)

            # Finding the Vendor
            vendor_name = bae[1].find('a').text
            vendor_name = vendor_name.replace(",", "")
            vendor_name = vendor_name.strip()
            vendor.append(vendor_name)

            # Finding the Category
            cat = bae[2].find('small').text
            cat = cat.replace("Category: ", "")
            cat = cat.replace(",", "")
            cat = cat.strip()
            category.append(cat)

            # Finding Number Sold and Quantity Left
            num = bae[3].text
            num = num.replace("Sold: ", "")
            num = num.strip()
            sold.append(num)

            quant = bae[4].find('small').text
            quant = quant.replace("In stock: ", "")
            quant = quant.strip()
            qLeft.append(quant)

            # Finding Successful Transactions
            freq = bae[1].text
            freq = freq.replace(vendor_name, "")
            freq = re.sub(r'Vendor Level \d+', "", freq)
            freq = freq.replace("(", "")
            freq = freq.replace(")", "")
            freq = freq.strip()
            success.append(freq)

        # Searching for CVE and MS categories
        cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)

# called by the crawler to get description links on a listing page
# @param: beautifulsoup object that is using the correct html page (listing page)
# @return: list of description links from a listing page
def thiefworld_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find('div', {"class": "row tile__list tileitems_filter pad15 tileproduct__list"}).findAll('div', {"class": "desc"})

    for a in listing:
        bae = a.find('div', {"class": "title"}).find('a', href=True)
        link = bae['href']
        href.append(link)

    return href
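
# A minimal usage sketch (an illustration, not part of the original module): it
# assumes a ThiefWorld listing page saved locally at the hypothetical path
# 'thiefworld_listing.html' and only chains together the functions defined above;
# the project's crawler drives these parsers differently after fetching pages.
if __name__ == '__main__':
    # Build a soup object from the saved HTML, mirroring what the crawler
    # passes into the parsers after downloading a page
    with open('thiefworld_listing.html', 'r', encoding='utf-8') as pageFile:
        testSoup = BeautifulSoup(pageFile.read(), 'html.parser')

    # Collect the description-page links the crawler would visit next
    descriptionLinks = thiefworld_links_parser(testSoup)

    # Parse the listing page into the organized row structure
    listingRow = thiefWorld_listing_parser(testSoup)

    # Each collected link would then be fetched and its page handed to
    # thiefWorld_description_parser in the same way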