This is based on the calsyslab project.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

189 lines
8.1 KiB

  1. __author__ = 'DarkWeb'
  2. # Here, we are importing the auxiliary functions to clean or convert data
  3. from typing import List, Tuple
  4. from MarketPlaces.Utilities.utilities import *
  5. # Here, we are importing BeautifulSoup to search through the HTML tree
  6. from bs4 import BeautifulSoup, ResultSet, Tag
  7. def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple:
  8. # Fields to be parsed
  9. vendor = "-1" # 0 *Vendor_Name
  10. success = "-1" # 1 Vendor_Successful_Transactions
  11. rating_vendor = "-1" # 2 Vendor_Rating
  12. name = "-1" # 3 *Product_Name
  13. describe = "-1" # 4 Product_Description
  14. CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  15. MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
  16. category = "-1" # 7 Product_Category
  17. views = "-1" # 8 Product_Number_Of_Views
  18. reviews = "-1" # 9 Product_Number_Of_Reviews
  19. rating_item = "-1" # 10 Product_Rating
  20. addDate = "-1" # 11 Product_AddedDate
  21. BTC = "-1" # 12 Product_BTC_SellingPrice
  22. USD = "-1" # 13 Product_USD_SellingPrice
  23. EURO = "-1" # 14 Product_EURO_SellingPrice
  24. sold = "-1" # 15 Product_QuantitySold
  25. left = "-1" # 16 Product_QuantityLeft
  26. shipFrom = "-1" # 17 Product_ShippedFrom
  27. shipTo = "-1" # 18 Product_ShippedTo
  28. image = "-1" # 19 Product_Image
  29. vendor_image = "-1" # 20 Vendor_Image
  30. name = soup.find("h1", {'class': 'title'}).text
  31. name = cleanString(name.strip())
  32. describe = soup.find('div', {'id': 'descriptionContent'}).text
  33. describe = cleanString(describe.strip())
  34. # Finding Product Image
  35. image = soup.find('div', {'class': 'product_img_big'}).find('img')
  36. image = image.get('src')
  37. image = image.split('base64,')[-1]
  38. commentListTag: Tag = soup.find('ul', {'class': 'comment_list scrollbar'})
  39. commentList = commentListTag.find_all('li')
  40. review = str(len(commentList))
  41. citySelection: str = soup.find('ul', {'class': 'meta text-muted i_location'}).text
  42. shipFrom = cleanString(citySelection.strip())
  43. vendor = soup.find('h1', {'class': 'title over'}).text
  44. vendor = cleanString(vendor.strip())
  45. usdTag: Tag = soup.find('div', {'class': 'product_price__big'}).find('span')
  46. usdText = usdTag.text.strip('/')[0]
  47. # usdText format: "<value> USD " (i.e., "70 000 USD ")
  48. USD = cleanString(usdText.replace("USD", "").strip())
  49. ratingDiv = soup.find('div', {'class': 'rating_star'})
  50. rating_vendor = ratingDiv.get('title').split(' ')[1]
  51. rating_item = soup.find('div', {'class': 'product_rate'}).text
  52. rating_item = rating_item.replace("rating", "")
  53. rating_item = cleanString(rating_item.strip())
  54. # Populating the final variable (this should be a list with all fields scraped)
  55. row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
  56. BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
  57. # Sending the results
  58. return row
  59. def thiefWorld_listing_parser(soup: BeautifulSoup):
  60. # Fields to be parsed
  61. nm = 0 # Total_Products (Should be Integer)
  62. mktName = "ThiefWorld" # 0 Marketplace_Name
  63. vendor = [] # 1 *Vendor y
  64. rating_vendor = [] # 2 Vendor_Rating
  65. success = [] # 3 Vendor_Successful_Transactions
  66. name = [] # 4 *Product_Name y
  67. CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  68. MS = [] # 6 Product_MS_Classification (Microsoft Security)
  69. category = [] # 7 Product_Category y
  70. describe = [] # 8 Product_Description
  71. views = [] # 9 Product_Number_Of_Views
  72. reviews = [] # 10 Product_Number_Of_Reviews
  73. rating_item = [] # 11 Product_Rating
  74. addDate = [] # 12 Product_AddDate
  75. BTC = [] # 13 Product_BTC_SellingPrice
  76. USD = [] # 14 Product_USD_SellingPrice y
  77. EURO = [] # 15 Product_EURO_SellingPrice
  78. sold = [] # 16 Product_QuantitySold
  79. qLeft =[] # 17 Product_QuantityLeft
  80. shipFrom = [] # 18 Product_ShippedFrom
  81. shipTo = [] # 19 Product_ShippedTo
  82. image = [] # 20 Product_Image
  83. image_vendor = [] # 21 Vendor_Image
  84. href = [] # 22 Product_Links
  85. productList: ResultSet[Tag] = soup.find_all('div', {'class': 'catalog_item'})
  86. nm = len(productList)
  87. for product in productList:
  88. productTitle: Tag = product.find('div', {'class': 'title'}).find('a')
  89. productName = cleanString(productTitle.text.strip())
  90. name.append(productName)
  91. # Finding Product Image
  92. product_image = product.find('noscript').find('img')
  93. product_image = product_image.get('src')
  94. product_image = product_image.split('base64,')[-1]
  95. image.append(product_image)
  96. productHref = productTitle.get('href')
  97. href.append(productHref)
  98. CVE.append('-1')
  99. MS.append('-1')
  100. cat = soup.find('calsys-cat').text
  101. category.append(cat.strip())
  102. productDescription = product.find('div', {'class': 'text'}).text
  103. productDescription = cleanString(productDescription.strip())
  104. describe.append(productDescription)
  105. views.append('-1')
  106. reviews.append('-1')
  107. addDate.append('-1')
  108. BTC.append('-1')
  109. priceText = product.find('span', {'class': 'price'}).find('span').text
  110. priceText = priceText.split('USD')[0]
  111. priceText = cleanString(priceText.strip())
  112. USD.append(priceText)
  113. EURO.append('-1')
  114. sold.append('-1')
  115. qLeft.append('-1')
  116. shipFrom.append('-1')
  117. shipTo.append('-1')
  118. productVendor = product.find('div', {'class': 'market over'}).find('a').text
  119. productVendor = cleanString(productVendor.strip())
  120. vendor.append(productVendor)
  121. image_vendor.append('-1')
  122. rating_vendor.append('-1')
  123. #rating_item.append('-1')
  124. rating = product.find('div', {'class': 'rating_star_yellow'}).attrs.get('style')
  125. rating = rating.replace("width: ", "")
  126. rating_item.append(cleanString(rating))
  127. success.append('-1')
  128. # Populate the final variable (this should be a list with all fields scraped)
  129. return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
  130. reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
  131. #called by the crawler to get description links on a listing page
  132. #@param: beautifulsoup object that is using the correct html page (listing page)
  133. #return: list of description links from a listing page
  134. def thiefworld_links_parser(soup):
  135. # Returning all links that should be visited by the Crawler
  136. href = []
  137. listing = soup.find('div', {"class": "row tile__list tileitems_filter pad15 tileproduct__list"}).findAll('div', {"class": "desc"})
  138. for a in listing:
  139. bae = a.find('div', {"class": "title"}).find('a', href=True)
  140. link = bae['href']
  141. href.append(link)
  142. return href