This parser is based on the CalSysLab project.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

231 lines
9.7 KiB

  1. __author__ = 'DarkWeb'
  2. # Here, we are importing the auxiliary functions to clean or convert data
  3. from MarketPlaces.Utilities.utilities import *
  4. def darkdock_description_parser(soup):
  5. """Parses the description pages of a DarkDock marketplace.
  6. It takes a BeautifulSoup object that represents the HTML page of a description page, and
  7. extracts various information such as vendor name, product name, etc.
  8. Args:
  9. soup: A BeautifulSoup object that represents the HTML page of a description page.
  10. Returns:
  11. The row of a description item as a tuple containing the information fields extracted from the description page.
  12. """
  13. vendor = "-1" # 0 Vendor_Name
  14. success = "-1" # 1 Vendor_Successful_Transactions
  15. rating_vendor = "-1" # 2 Vendor_Rating
  16. name = "-1" # 3 Product_Name
  17. describe = "-1" # 4 Product_Description
  18. CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  19. MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
  20. category = "-1" # 7 Product_Category
  21. views = "-1" # 8 Product_Number_Of_Views
  22. reviews = "-1" # 9 Product_Number_Of_Reviews
  23. rating_item = "-1" # 10 Product_Rating
  24. addDate = "-1" # 11 Product_AddedDate
  25. BTC = "-1" # 12 Product_BTC_SellingPrice
  26. USD = "-1" # 13 Product_USD_SellingPrice
  27. EURO = "-1" # 14 Product_EURO_SellingPrice
  28. sold = "-1" # 15 Product_QuantitySold
  29. left = "-1" # 16 Product_QuantityLeft
  30. shipFrom = "-1" # 17 Product_ShippedFrom
  31. shipTo = "-1" # 18 Product_ShippedTo
  32. image = "-1" # 19 Product_Image
  33. vendor_image = "-1" # 20 Vendor_Image
  34. # Finding Vendor
  35. vendor = soup.select_one('table tr:nth-of-type(2) td:nth-of-type(3) a u').text
  36. vendor = cleanString(vendor)
  37. vendor = vendor.strip()
  38. # Finding Product Name
  39. headings = soup.find('div', {'class': 'main'}).find_all('div', {'class': 'heading'})
  40. name = headings[0].text
  41. name = cleanString(name)
  42. name = name.strip()
  43. # Finding the Product description
  44. describe = soup.find('div', {'class': 'tab1'}).text
  45. describe = cleanString(describe)
  46. describe = describe.strip()
  47. # Finding the Product category
  48. category = soup.select_one('table tr:nth-of-type(6) td:nth-of-type(3)').text
  49. category = cleanString(category)
  50. category = category.strip()
  51. # Finding Number of Product Reviews
  52. reviews = headings[1].text
  53. match = re.search(r'\((\d+)\)', reviews).group(1)
  54. reviews = cleanNumbers(reviews)
  55. reviews = reviews.strip()
  56. # Finding Prices
  57. USD = soup.select_one('table tr:nth-of-type(1) td:nth-of-type(3)').text
  58. USD = cleanNumbers(USD)
  59. USD = USD.strip()
  60. # Finding the Product Quantity Available
  61. left = soup.select_one('table tr:nth-of-type(7) td:nth-of-type(3)').text
  62. left = cleanNumbers(left)
  63. left = left.strip()
  64. # Finding Product Shipped From
  65. shipFrom = soup.select_one('table tr:nth-of-type(3) td:nth-of-type(3)').text
  66. shipFrom = cleanString(shipFrom)
  67. shipFrom = shipFrom.strip()
  68. # Finding Product Shipped To
  69. shipTo = soup.select_one('table tr:nth-of-type(5) td:nth-of-type(3)').text
  70. shipTo = cleanString(shipTo)
  71. shipTo = shipTo.strip()
  72. # Finding Product Image
  73. image = soup.find('img', {'class': 'bigthumbnail'}).get('src')
  74. image = image.split('base64,')[-1]
  75. # Populating the final variable (this should be a list with all fields scraped)
  76. row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
  77. BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
  78. # Sending the results
  79. return row
  80. def darkdock_listing_parser(soup):
  81. """Parses the listing pages of a DarkDock marketplace.
  82. It takes a BeautifulSoup object that represents the HTML page of a listing page,
  83. and extracts various information such as vendor name, product name, etc. It then
  84. removes and cleans the extracted information by passing it to the organizeProducts
  85. function.
  86. Args:
  87. soup: A BeautifulSoup object that represents the HTML page of a listing page.
  88. Returns:
  89. The row of a description item as a tuple containing the information fields extracted from the listing page.
  90. """
  91. # Fields to be parsed
  92. nm = 0 # Total_Products (Should be Integer)
  93. mktName = "DarkDock" # 0 Marketplace_Name
  94. vendor = [] # 1 Vendor
  95. rating_vendor = [] # 2 Vendor_Rating
  96. success = [] # 3 Vendor_Successful_Transactions
  97. name = [] # 4 Product_Name
  98. CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
  99. MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
  100. category = [] # 7 Product_Category
  101. describe = [] # 8 Product_Description
  102. views = [] # 9 Product_Number_Of_Views
  103. reviews = [] # 10 Product_Number_Of_Reviews
  104. rating_item = [] # 11 Product_Rating
  105. addDate = [] # 12 Product_AddDate
  106. BTC = [] # 13 Product_BTC_SellingPrice
  107. USD = [] # 14 Product_USD_SellingPrice
  108. EURO = [] # 15 Product_EURO_SellingPrice
  109. sold = [] # 16 Product_QuantitySold
  110. qLeft = [] # 17 Product_QuantityLeft
  111. shipFrom = [] # 18 Product_ShippedFrom
  112. shipTo = [] # 19 Product_ShippedTo
  113. image = [] # 20 Product_Image
  114. image_vendor = [] # 21 Vendor_Image
  115. href = [] # 22 Product_Links
  116. listings = soup.findAll('div', {'class': 'item'})
  117. # Populating the Number of Products
  118. nm = len(listings)
  119. cat = soup.find('div', {'class': 'heading'}).text
  120. cat = cleanString(cat)
  121. cat = cat.strip()
  122. for listing in listings:
  123. # Finding the Vendor
  124. vendor_name = listing.find('div', {'class': 'seller'}).text
  125. vendor.append(vendor_name)
  126. # Finding the Product
  127. product = listing.find('div', {'class': 'title'}).text
  128. product = cleanString(product)
  129. product = product.strip()
  130. name.append(product)
  131. # Finding the Category
  132. category.append(cat)
  133. # Finding description
  134. description = listing.find('div', {'class': 'description'}).text
  135. description = cleanString(description)
  136. description = description.strip()
  137. describe.append(description)
  138. # Finding product views
  139. num_view = listing.select_one('.stats table tr:nth-of-type(3) td:nth-of-type(1)').text
  140. num_view = cleanNumbers(num_view)
  141. num_view = num_view.strip()
  142. views.append(num_view)
  143. # Finding product reviews
  144. num_reviews = listing.select_one('.stats table tr:nth-of-type(3) td:nth-of-type(3)').text
  145. num_reviews = cleanNumbers(num_reviews)
  146. num_reviews = num_reviews.strip()
  147. reviews.append(num_reviews)
  148. # Finding product rating based on width style
  149. rating = listing.find('div', {'class': 'stars2'}).get('style')
  150. rating = re.findall(r"\d+\.\d+|\d+", rating)[0]
  151. rating = cleanNumbers(rating)
  152. rating = rating.strip()
  153. rating_item.append(rating)
  154. # Finding Prices
  155. price = listing.find('div', {'class': 'price'}).text
  156. price = price.strip()
  157. USD.append(price)
  158. # Finding number of times product is sold
  159. num_sold = listing.select_one('.stats table tr:nth-of-type(3) td:nth-of-type(2)').text
  160. num_sold = cleanNumbers(num_sold)
  161. num_sold = num_sold.strip()
  162. sold.append(num_sold)
  163. # Finding shipping locations
  164. shipping = listing.find('div',{'class': 'shipping'}).text
  165. shippedFrom, shippedTo = cleanString(shipping).split(' > ')
  166. shipTo.append(shippedTo)
  167. shipFrom.append(shippedFrom)
  168. # Adding the url to the list of urls
  169. link = listing.find('a', recursive=False).get('href')
  170. href.append(link)
  171. image_vendor.append("-1")
  172. return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
  173. reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
  174. def darkdock_links_parser(soup):
  175. """Returns a list of description links from a listing page.
  176. It takes a BeautifulSoup object that represents the HTML page of a listing page, and
  177. extracts all the description links from the page.
  178. Args:
  179. soup: A BeautifulSoup object that represents the HTML page of a listing page.
  180. Returns:
  181. A list of description links from a listing page.
  182. """
  183. # Returning all links that should be visited by the Crawler
  184. href = []
  185. listing = soup.find_all('a', href=lambda href: href and '/product/' in href)
  186. for a in listing:
  187. href.append(a['href'])
  188. return href