This file is based on the CalSysLab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

188 lines
8.9 KiB

  1. __author__ = 'DarkWeb'
  2. # Here, we are importing the auxiliary functions to clean or convert data
  3. from MarketPlaces.Utilities.utilities import *
  4. # Here, we are importing BeautifulSoup to search through the HTML tree
  5. from bs4 import BeautifulSoup
  6. import re
  7. #parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
  8. #stores info it needs in different lists, these lists are returned after being organized
  9. #@param: soup object looking at html page of description page
  10. #return: 'row' that contains a variety of lists that each hold info on the description page
  11. def tormarket_description_parser(soup):
  12. # Fields to be parsed
  13. vendor = "-1" # 0 *Vendor_Name
  14. success = "-1" # 1 Vendor_Successful_Transactions
  15. rating_vendor = "-1" # 2 Vendor_Rating
  16. name = "-1" # 3 *Product_Name
  17. describe = "-1" # 4 Product_Description
  18. CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  19. MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
  20. category = "-1" # 7 Product_Category
  21. views = "-1" # 8 Product_Number_Of_Views
  22. reviews = "-1" # 9 Product_Number_Of_Reviews
  23. rating_item = "-1" # 10 Product_Rating
  24. addDate = "-1" # 11 Product_AddedDate
  25. BTC = "-1" # 12 Product_BTC_SellingPrice
  26. USD = "-1" # 13 Product_USD_SellingPrice
  27. EURO = "-1" # 14 Product_EURO_SellingPrice
  28. sold = "-1" # 15 Product_QuantitySold
  29. left = "-1" # 16 Product_QuantityLeft
  30. shipFrom = "-1" # 17 Product_ShippedFrom
  31. shipTo = "-1" # 18 Product_ShippedTo
  32. image = "-1" # 19 Product_Image
  33. vendor_image = "-1" # 20 Vendor_Image
  34. #finding the name of the product
  35. name_of_product = soup.find("h1", {"class": "product_title entry-title"}).find("a").text
  36. name = cleanString(name_of_product.strip())
  37. #finding the description of the product
  38. description_of_product = soup.find("div", {"class": "woocommerce-product-details__short-description"}).text
  39. describe = cleanString(description_of_product.strip())
  40. #finding the name of the vendor
  41. name_of_vendor = soup.find("div", {"class": "wcfmmp_sold_by_store"})
  42. if name_of_vendor is not None:
  43. name_of_vendor = name_of_vendor.find("a").text
  44. vendor = cleanString(name_of_vendor.strip())
  45. else:
  46. vendor = "TorMarket"
  47. #finding the price of the item
  48. price = soup.find("p", {"class": "price"}).find("bdi").text
  49. price_cleaned = price[1:]
  50. USD = price_cleaned.strip()
  51. category = soup.find('span', {"class": "posted_in"}).text
  52. category = category.split(':')[-1]
  53. category = category.replace(',', '/')
  54. category = cleanString(category.strip())
  55. #everything else gets a -1 because they are not found
  56. # Populating the final variable (this should be a list with all fields scraped)
  57. row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
  58. BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
  59. # Sending the results
  60. return row
  61. #parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
  62. #stores info it needs in different lists, these lists are returned after being organized
  63. #@param: soup object looking at html page of listing page
  64. #return: 'row' that contains a variety of lists that each hold info on the listing page
  65. def tormarket_listing_parser(soup):
  66. # Fields to be parsed
  67. nm = 0 # *Total_Products (Should be Integer)
  68. mktName = "TorMarket" # 0 *Marketplace_Name
  69. vendor = [] # 1 *Vendor y
  70. rating_vendor = [] # 2 Vendor_Rating
  71. success = [] # 3 Vendor_Successful_Transactions
  72. name = [] # 4 *Product_Name y
  73. CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
  74. MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
  75. category = [] # 7 Product_Category y
  76. describe = [] # 8 Product_Description
  77. views = [] # 9 Product_Number_Of_Views
  78. reviews = [] # 10 Product_Number_Of_Reviews
  79. rating_item = [] # 11 Product_Rating
  80. addDate = [] # 12 Product_AddDate
  81. BTC = [] # 13 Product_BTC_SellingPrice
  82. USD = [] # 14 Product_USD_SellingPrice y
  83. EURO = [] # 15 Product_EURO_SellingPrice
  84. sold = [] # 16 Product_QuantitySold
  85. qLeft = [] # 17 Product_QuantityLeft
  86. shipFrom = [] # 18 Product_ShippedFrom
  87. shipTo = [] # 19 Product_ShippedTo
  88. image = [] # 20 Product_Image
  89. image_vendor = [] # 21 Vendor_Image
  90. href = [] # 22 Product_Links
  91. products_list = soup.find('ul', {"class": "products columns-3 tablet-columns-2 mobile-columns-1"}).find_all('li')
  92. nm = len(products_list)
  93. for product in products_list:
  94. # Finding the name of the product
  95. name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
  96. name_of_product_cleaned = cleanString(name_of_product.strip())
  97. # print(name_of_product_cleaned)
  98. name.append(name_of_product_cleaned)
  99. #finding the URL
  100. try:
  101. url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href")
  102. # print(url)
  103. href.append(url)
  104. except AttributeError as e:
  105. print("I can't find the link")
  106. raise e
  107. #finding the rating of the product
  108. rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text
  109. rating_item.append(cleanString(rating_score_of_product.strip()))
  110. # print("done")
  111. #finding the rating of the vendors
  112. rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"})
  113. if rating_score_of_vendor is not None:
  114. rating_score_of_vendor = rating_score_of_vendor.find("strong").text
  115. rating_vendor.append(cleanString(rating_score_of_vendor.strip()))
  116. else:
  117. rating_vendor.append('-1')
  118. # print("done")
  119. #finding the cost in USD
  120. cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text
  121. USD.append(cost)
  122. # print("done")
  123. #finding the name of the vendor
  124. vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"})
  125. if vendor_name is not None:
  126. vendor_name = vendor_name.find("a").text
  127. vendor.append(cleanString(vendor_name.strip()))
  128. else:
  129. vendor.append(mktName)
  130. # print("done")
  131. #everything else appends a -1
  132. success.append("-1")
  133. CVE.append("-1")
  134. MS.append("-1")
  135. category.append("-1")
  136. describe.append("-1")
  137. views.append("-1")
  138. reviews.append("-1")
  139. addDate.append("-1")
  140. BTC.append("-1")
  141. EURO.append("-1")
  142. sold.append("-1")
  143. qLeft.append("-1")
  144. shipFrom.append("-1")
  145. shipTo.append("-1")
  146. # print("Done! moving onto the next product!")
  147. # print(len(shipTo))
  148. # Populate the final variable (this should be a list with all fields scraped)
  149. return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
  150. reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
  151. #called by the crawler to get description links on a listing page
  152. #@param: beautifulsoup object that is using the correct html page (listing page)
  153. #return: list of description links from a listing page
  154. def tormarket_links_parser(soup):
  155. # Returning all links that should be visited by the Crawler
  156. href = []
  157. listing = soup.findAll('div', {"class": "product-loop-content text-center"})
  158. for a in listing:
  159. bae = a.find('h2', {"class": "woocommerce-loop-product__title"}).find('a', href=True)
  160. link = bae['href']
  161. href.append(link)
  162. return href