This is based on the CALSysLab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

205 lines
8.8 KiB

  1. __author__ = 'DarkWeb'
  2. # Here, we are importing the auxiliary functions to clean or convert data
  3. from MarketPlaces.Utilities.utilities import *
  4. # Here, we are importing BeautifulSoup to search through the HTML tree
  5. from bs4 import BeautifulSoup
  6. import re
  7. #parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
  8. #stores info it needs in different lists, these lists are returned after being organized
  9. #@param: soup object looking at html page of description page
  10. #return: 'row' that contains a variety of lists that each hold info on the description page
  11. def tormarket_description_parser(soup):
  12. # Fields to be parsed
  13. name = "-1" # 0 Product_Name
  14. describe = "-1" # 1 Product_Description
  15. lastSeen = "-1" # 2 Product_LastViewDate
  16. rules = "-1" # 3 NOT USED ...
  17. CVE = "-1" # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  18. MS = "-1" # 5 Product_MS_Classification (Microsoft Security)
  19. review = "-1" # 6 Product_Number_Of_Reviews
  20. category = "-1" # 7 Product_Category
  21. shipFrom = "-1" # 8 Product_ShippedFrom
  22. shipTo = "-1" # 9 Product_ShippedTo
  23. left = "-1" # 10 Product_QuantityLeft
  24. escrow = "-1" # 11 Vendor_Warranty
  25. terms = "-1" # 12 Vendor_TermsAndConditions
  26. vendor = "-1" # 13 Vendor_Name
  27. sold = "-1" # 14 Product_QuantitySold
  28. addDate = "-1" # 15 Product_AddedDate
  29. available = "-1" # 16 NOT USED ...
  30. endDate = "-1" # 17 NOT USED ...
  31. BTC = "-1" # 18 Product_BTC_SellingPrice
  32. USD = "-1" # 19 Product_USD_SellingPrice
  33. rating = "-1" # 20 Vendor_Rating
  34. success = "-1" # 21 Vendor_Successful_Transactions
  35. EURO = "-1" # 22 Product_EURO_SellingPrice
  36. #finding the name of the product
  37. name_of_product = soup.find("h1", {"class": "product_title entry-title"}).find("a").text
  38. name = cleanString(name_of_product.strip())
  39. #finding the description of the product
  40. description_of_product = soup.find("div", {"class": "woocommerce-product-details__short-description"}).text
  41. describe = cleanString(description_of_product.strip())
  42. #finding the replies
  43. inquires_about_product = soup.find("div", {"class": "woocommerce-Tabs-panel woocommerce-Tabs-panel--wcfm_enquiry_tab panel entry-content wc-tab"}).find("p").text
  44. if inquires_about_product == "There are no inquiries yet.":
  45. review = 0
  46. else:
  47. review = -1 #fix later pls
  48. #finding the terms and conditions
  49. terms_and_conditions = soup.find("div", {"class": "woocommerce-Tabs-panel woocommerce-Tabs-panel--wcfm_enquiry_tab panel entry-content wc-tab"}).find("p").text
  50. term = cleanString(terms_and_conditions)
  51. #finding the name of the vendor
  52. name_of_vendor = soup.find("div", {"class": "wcfmmp_sold_by_store"}).find("a").text
  53. vendor = cleanString(name_of_vendor)
  54. #finding the price of the item
  55. price = soup.find("p", {"class": "price"}).find("bdi").text
  56. price_cleaned = price[1:]
  57. USD = price_cleaned.strip()
  58. #everything else gets a -1 because they are not found
  59. # Populating the final variable (this should be a list with all fields scraped)
  60. row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
  61. sold, addDate, available, endDate, BTC, USD, rating, success, EURO)
  62. # Sending the results
  63. return row
  64. #parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
  65. #stores info it needs in different lists, these lists are returned after being organized
  66. #@param: soup object looking at html page of listing page
  67. #return: 'row' that contains a variety of lists that each hold info on the listing page
  68. def tormarket_listing_parser(soup):
  69. # Fields to be parsed
  70. nm = 0 # *Total_Products (Should be Integer)
  71. mktName = "TorMarket" # 0 *Marketplace_Name
  72. vendor = [] # 1 *Vendor y
  73. rating_vendor = [] # 2 Vendor_Rating
  74. success = [] # 3 Vendor_Successful_Transactions
  75. name = [] # 4 *Product_Name y
  76. CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  77. MS = [] # 6 Product_MS_Classification (Microsoft Security)
  78. category = [] # 7 Product_Category y
  79. describe = [] # 8 Product_Description
  80. views = [] # 9 Product_Number_Of_Views
  81. reviews = [] # 10 Product_Number_Of_Reviews
  82. rating_item = [] # 11 Product_Rating
  83. addDate = [] # 12 Product_AddDate
  84. BTC = [] # 13 Product_BTC_SellingPrice
  85. USD = [] # 14 Product_USD_SellingPrice y
  86. EURO = [] # 15 Product_EURO_SellingPrice
  87. sold = [] # 16 Product_QuantitySold
  88. qLeft = [] # 17 Product_QuantityLeft
  89. shipFrom = [] # 18 Product_ShippedFrom
  90. shipTo = [] # 19 Product_ShippedTo
  91. href = [] # 20 Product_Links
  92. products_list = soup.find_all('li')
  93. nm = 0
  94. for product in products_list:
  95. try:
  96. # Finding the name of the product
  97. name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
  98. name_of_product_cleaned = cleanString(name_of_product.strip())
  99. print(name_of_product_cleaned)
  100. name.append(name_of_product_cleaned)
  101. #finding the URL
  102. try:
  103. url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href")
  104. print(url)
  105. href.append(url)
  106. except AttributeError as e:
  107. print("I can't find the link")
  108. raise e
  109. #finding the rating of the product
  110. rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text
  111. rating_item.append(cleanString(rating_score_of_product.strip()))
  112. print("done")
  113. #finding the rating of the vendors
  114. rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"}).find("strong").text
  115. rating_vendor.append(cleanString(rating_score_of_vendor.strip()))
  116. print("done")
  117. #finding the cost in USD
  118. cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text
  119. USD.append(cost)
  120. print("done")
  121. #finding the name of the vendor
  122. vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"}).find("a").text
  123. vendor.append(cleanString(vendor_name.strip()))
  124. print("done")
  125. #everything else appends a -1
  126. success.append("-1")
  127. CVE.append("-1")
  128. MS.append("-1")
  129. category.append("-1")
  130. describe.append("-1")
  131. views.append("-1")
  132. reviews.append("-1")
  133. addDate.append("-1")
  134. BTC.append("-1")
  135. EURO.append("-1")
  136. sold.append("-1")
  137. qLeft.append("-1")
  138. shipFrom.append("-1")
  139. shipTo.append("-1")
  140. print("Done! moving onto the next product!")
  141. print(len(shipTo))
  142. nm += 1
  143. except AttributeError as e:
  144. print("I'm somewhere I don't belong. I'm going to leave")
  145. continue
  146. # Populate the final variable (this should be a list with all fields scraped)
  147. return organizeProducts(
  148. marketplace = "TorMarket",
  149. nm = nm,
  150. vendor = vendor,
  151. rating_vendor = rating_vendor,
  152. success_vendor = success,
  153. nombre = name,
  154. CVE = CVE,
  155. MS = MS,
  156. category = category,
  157. describe = describe,
  158. views = views,
  159. reviews = reviews,
  160. rating_item = rating_item,
  161. addDate = addDate,
  162. BTC = BTC,
  163. USD = USD,
  164. EURO = EURO,
  165. sold = sold,
  166. qLeft = qLeft,
  167. shipFrom = shipFrom,
  168. shipTo = shipTo,
  169. href = href
  170. )
  171. #called by the crawler to get description links on a listing page
  172. #@param: beautifulsoup object that is using the correct html page (listing page)
  173. #return: list of description links from a listing page
  174. def tormarket_links_parser(soup):
  175. # Returning all links that should be visited by the Crawler
  176. href = []
  177. listing = soup.findAll('div', {"class": "product-loop-content text-center"})
  178. for a in listing:
  179. bae = a.find('h2', {"class": "woocommerce-loop-product__title"}).find('a', href=True)
  180. link = bae['href']
  181. href.append(link)
  182. return href