this is based on calsyslab project
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

202 lines
8.6 KiB

  1. __author__ = 'DarkWeb'
  2. # Here, we are importing the auxiliary functions to clean or convert data
  3. from MarketPlaces.Utilities.utilities import *
  4. # Here, we are importing BeautifulSoup to search through the HTML tree
  5. from bs4 import BeautifulSoup
  6. import re
  7. #parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
  8. #stores info it needs in different lists, these lists are returned after being organized
  9. #@param: soup object looking at html page of description page
  10. #return: 'row' that contains a variety of lists that each hold info on the description page
  11. def tormarket_description_parser(soup):
  12. # Fields to be parsed
  13. name = "-1" # 0 Product_Name
  14. describe = "-1" # 1 Product_Description
  15. lastSeen = "-1" # 2 Product_LastViewDate
  16. rules = "-1" # 3 NOT USED ...
  17. CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  18. MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
  19. review = "-1" # 6 Product_Number_Of_Reviews
  20. category = "-1" # 7 Product_Category
  21. shipFrom = "-1" # 8 Product_ShippedFrom
  22. shipTo = "-1" # 9 Product_ShippedTo
  23. left = "-1" # 10 Product_QuantityLeft
  24. escrow = "-1" # 11 Vendor_Warranty
  25. terms = "-1" # 12 Vendor_TermsAndConditions
  26. vendor = "-1" # 13 Vendor_Name
  27. sold = "-1" # 14 Product_QuantitySold
  28. addDate = "-1" # 15 Product_AddedDate
  29. available = "-1" # 16 NOT USED ...
  30. endDate = "-1" # 17 NOT USED ...
  31. BTC = "-1" # 18 Product_BTC_SellingPrice
  32. USD = "-1" # 19 Product_USD_SellingPrice
  33. rating = "-1" # 20 Vendor_Rating
  34. success = "-1" # 21 Vendor_Successful_Transactions
  35. EURO = "-1" # 22 Product_EURO_SellingPrice
  36. #finding the name of the product
  37. name_of_product = soup.find("h1", {"class": "product_title entry-title"}).find("a").text
  38. name = cleanString(name_of_product.strip())
  39. #finding the description of the product
  40. description_of_product = soup.find("div", {"class": "woocommerce-product-details__short-description"}).text
  41. describe = cleanString(description_of_product.strip())
  42. #finding the replies
  43. inquires_about_product = soup.find("div", {"class": "woocommerce-Tabs-panel woocommerce-Tabs-panel--wcfm_enquiry_tab panel entry-content wc-tab"}).find("p").text
  44. if inquires_about_product == "There are no inquiries yet.":
  45. review = 0
  46. else:
  47. review = -1 #fix later pls
  48. #finding the terms and conditions
  49. terms_and_conditions = soup.find("div", {"class": "woocommerce-Tabs-panel woocommerce-Tabs-panel--wcfm_enquiry_tab panel entry-content wc-tab"}).find("p").text
  50. term = cleanString(terms_and_conditions)
  51. #finding the name of the vendor
  52. name_of_vendor = soup.find("div", {"class": "wcfmmp_sold_by_store"}).find("a").text
  53. vendor = cleanString(name_of_vendor)
  54. #finding the price of the item
  55. price = soup.find("p", {"class": "price"}).find("bdi").text
  56. price_cleaned = price[1:]
  57. USD = price_cleaned.strip()
  58. #everything else gets a -1 because they are not found
  59. # Populating the final variable (this should be a list with all fields scraped)
  60. row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
  61. sold, addDate, available, endDate, BTC, USD, rating, success, EURO)
  62. # Sending the results
  63. return row
  64. #parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
  65. #stores info it needs in different lists, these lists are returned after being organized
  66. #@param: soup object looking at html page of listing page
  67. #return: 'row' that contains a variety of lists that each hold info on the listing page
  68. def tormarket_listing_parser(soup):
  69. # Fields to be parsed
  70. nm = 0 # *Total_Products (Should be Integer)
  71. mktName = "TorMarket" # 0 *Marketplace_Name
  72. vendor = [] # 1 *Vendor y
  73. rating_vendor = [] # 2 Vendor_Rating
  74. success = [] # 3 Vendor_Successful_Transactions
  75. name = [] # 4 *Product_Name y
  76. CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  77. MS = [] # 6 Product_MS_Classification (Microsoft Security)
  78. category = [] # 7 Product_Category y
  79. describe = [] # 8 Product_Description
  80. views = [] # 9 Product_Number_Of_Views
  81. reviews = [] # 10 Product_Number_Of_Reviews
  82. rating_item = [] # 11 Product_Rating
  83. addDate = [] # 12 Product_AddDate
  84. BTC = [] # 13 Product_BTC_SellingPrice
  85. USD = [] # 14 Product_USD_SellingPrice y
  86. EURO = [] # 15 Product_EURO_SellingPrice
  87. sold = [] # 16 Product_QuantitySold
  88. qLeft = [] # 17 Product_QuantityLeft
  89. shipFrom = [] # 18 Product_ShippedFrom
  90. shipTo = [] # 19 Product_ShippedTo
  91. href = [] # 20 Product_Links
  92. products_list = soup.find('ul', {"class": "products columns-3 tablet-columns-2 mobile-columns-1"}).find_all('li')
  93. nm = len(products_list)
  94. for product in products_list:
  95. # Finding the name of the product
  96. name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
  97. name_of_product_cleaned = cleanString(name_of_product.strip())
  98. # print(name_of_product_cleaned)
  99. name.append(name_of_product_cleaned)
  100. #finding the URL
  101. try:
  102. url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href")
  103. # print(url)
  104. href.append(url)
  105. except AttributeError as e:
  106. print("I can't find the link")
  107. raise e
  108. #finding the rating of the product
  109. rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text
  110. rating_item.append(cleanString(rating_score_of_product.strip()))
  111. # print("done")
  112. #finding the rating of the vendors
  113. rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"}).find("strong").text
  114. rating_vendor.append(cleanString(rating_score_of_vendor.strip()))
  115. # print("done")
  116. #finding the cost in USD
  117. cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text
  118. USD.append(cost)
  119. # print("done")
  120. #finding the name of the vendor
  121. vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"}).find("a").text
  122. vendor.append(cleanString(vendor_name.strip()))
  123. # print("done")
  124. #everything else appends a -1
  125. success.append("-1")
  126. CVE.append("-1")
  127. MS.append("-1")
  128. category.append("-1")
  129. describe.append("-1")
  130. views.append("-1")
  131. reviews.append("-1")
  132. addDate.append("-1")
  133. BTC.append("-1")
  134. EURO.append("-1")
  135. sold.append("-1")
  136. qLeft.append("-1")
  137. shipFrom.append("-1")
  138. shipTo.append("-1")
  139. # print("Done! moving onto the next product!")
  140. # print(len(shipTo))
  141. # Populate the final variable (this should be a list with all fields scraped)
  142. return organizeProducts(
  143. marketplace = "TorMarket",
  144. nm = nm,
  145. vendor = vendor,
  146. rating_vendor = rating_vendor,
  147. success_vendor = success,
  148. nombre = name,
  149. CVE = CVE,
  150. MS = MS,
  151. category = category,
  152. describe = describe,
  153. views = views,
  154. reviews = reviews,
  155. rating_item = rating_item,
  156. addDate = addDate,
  157. BTC = BTC,
  158. USD = USD,
  159. EURO = EURO,
  160. sold = sold,
  161. qLeft = qLeft,
  162. shipFrom = shipFrom,
  163. shipTo = shipTo,
  164. href = href
  165. )
  166. #called by the crawler to get description links on a listing page
  167. #@param: beautifulsoup object that is using the correct html page (listing page)
  168. #return: list of description links from a listing page
  169. def tormarket_links_parser(soup):
  170. # Returning all links that should be visited by the Crawler
  171. href = []
  172. listing = soup.findAll('div', {"class": "product-loop-content text-center"})
  173. for a in listing:
  174. bae = a.find('h2', {"class": "woocommerce-loop-product__title"}).find('a', href=True)
  175. link = bae['href']
  176. href.append(link)
  177. return href