This is based on the CalSysLab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

180 lines
8.9 KiB

  1. __author__ = 'DarkWeb'
  2. # Here, we are importing the auxiliary functions to clean or convert data
  3. from MarketPlaces.Utilities.utilities import *
  4. # Here, we are importing BeautifulSoup to search through the HTML tree
  5. from bs4 import BeautifulSoup
  6. import re
  7. #parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
  8. #stores info it needs in different lists, these lists are returned after being organized
  9. #@param: soup object looking at html page of description page
  10. #return: 'row' that contains a variety of lists that each hold info on the description page
  11. def tormarket_description_parser(soup):
  12. # Fields to be parsed
  13. vendor = "-1" # 0 *Vendor_Name
  14. success = "-1" # 1 Vendor_Successful_Transactions
  15. rating_vendor = "-1" # 2 Vendor_Rating
  16. name = "-1" # 3 *Product_Name
  17. describe = "-1" # 4 Product_Description
  18. CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  19. MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
  20. category = "-1" # 7 Product_Category
  21. views = "-1" # 8 Product_Number_Of_Views
  22. reviews = "-1" # 9 Product_Number_Of_Reviews
  23. rating_item = "-1" # 10 Product_Rating
  24. addDate = "-1" # 11 Product_AddedDate
  25. BTC = "-1" # 12 Product_BTC_SellingPrice
  26. USD = "-1" # 13 Product_USD_SellingPrice
  27. EURO = "-1" # 14 Product_EURO_SellingPrice
  28. sold = "-1" # 15 Product_QuantitySold
  29. left = "-1" # 16 Product_QuantityLeft
  30. shipFrom = "-1" # 17 Product_ShippedFrom
  31. shipTo = "-1" # 18 Product_ShippedTo
  32. image = "-1" # 19 Product_Image
  33. vendor_image = "-1" # 20 Vendor_Image
  34. #finding the name of the product
  35. name_of_product = soup.find("h1", {"class": "product_title entry-title"}).find("a").text
  36. name = cleanString(name_of_product.strip())
  37. #finding the description of the product
  38. description_of_product = soup.find("div", {"class": "woocommerce-product-details__short-description"}).text
  39. describe = cleanString(description_of_product.strip())
  40. #finding the replies
  41. inquires_about_product = soup.find("div", {"class": "woocommerce-Tabs-panel woocommerce-Tabs-panel--wcfm_enquiry_tab panel entry-content wc-tab"}).find("p").text
  42. if inquires_about_product == "There are no inquiries yet.":
  43. review = 0
  44. else:
  45. review = "-1" #fix later pls
  46. #finding the terms and conditions
  47. terms_and_conditions = soup.find("div", {"class": "woocommerce-Tabs-panel woocommerce-Tabs-panel--wcfm_enquiry_tab panel entry-content wc-tab"}).find("p").text
  48. term = cleanString(terms_and_conditions)
  49. #finding the name of the vendor
  50. name_of_vendor = soup.find("div", {"class": "wcfmmp_sold_by_store"}).find("a").text
  51. vendor = cleanString(name_of_vendor)
  52. #finding the price of the item
  53. price = soup.find("p", {"class": "price"}).find("bdi").text
  54. price_cleaned = price[1:]
  55. USD = price_cleaned.strip()
  56. #everything else gets a -1 because they are not found
  57. # Populating the final variable (this should be a list with all fields scraped)
  58. row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
  59. BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
  60. # Sending the results
  61. return row
  62. #parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
  63. #stores info it needs in different lists, these lists are returned after being organized
  64. #@param: soup object looking at html page of listing page
  65. #return: 'row' that contains a variety of lists that each hold info on the listing page
  66. def tormarket_listing_parser(soup):
  67. # Fields to be parsed
  68. nm = 0 # *Total_Products (Should be Integer)
  69. mktName = "TorMarket" # 0 *Marketplace_Name
  70. vendor = [] # 1 *Vendor y
  71. rating_vendor = [] # 2 Vendor_Rating
  72. success = [] # 3 Vendor_Successful_Transactions
  73. name = [] # 4 *Product_Name y
  74. CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
  75. MS = [] # 6 Product_MS_Classification (Microsoft Security) dont worry about this
  76. category = [] # 7 Product_Category y
  77. describe = [] # 8 Product_Description
  78. views = [] # 9 Product_Number_Of_Views
  79. reviews = [] # 10 Product_Number_Of_Reviews
  80. rating_item = [] # 11 Product_Rating
  81. addDate = [] # 12 Product_AddDate
  82. BTC = [] # 13 Product_BTC_SellingPrice
  83. USD = [] # 14 Product_USD_SellingPrice y
  84. EURO = [] # 15 Product_EURO_SellingPrice
  85. sold = [] # 16 Product_QuantitySold
  86. qLeft = [] # 17 Product_QuantityLeft
  87. shipFrom = [] # 18 Product_ShippedFrom
  88. shipTo = [] # 19 Product_ShippedTo
  89. image = [] # 20 Product_Image
  90. image_vendor = [] # 21 Vendor_Image
  91. href = [] # 22 Product_Links
  92. products_list = soup.find('ul', {"class": "products columns-3 tablet-columns-2 mobile-columns-1"}).find_all('li')
  93. nm = len(products_list)
  94. for product in products_list:
  95. # Finding the name of the product
  96. name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
  97. name_of_product_cleaned = cleanString(name_of_product.strip())
  98. # print(name_of_product_cleaned)
  99. name.append(name_of_product_cleaned)
  100. #finding the URL
  101. try:
  102. url = product.find("div", {"class": "product-loop-content text-center"}).find("a").get("href")
  103. # print(url)
  104. href.append(url)
  105. except AttributeError as e:
  106. print("I can't find the link")
  107. raise e
  108. #finding the rating of the product
  109. rating_score_of_product = product.find("div", {"class": "product-loop-content text-center"}).find("div").find("span").text
  110. rating_item.append(cleanString(rating_score_of_product.strip()))
  111. # print("done")
  112. #finding the rating of the vendors
  113. rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"}).find("strong").text
  114. rating_vendor.append(cleanString(rating_score_of_vendor.strip()))
  115. # print("done")
  116. #finding the cost in USD
  117. cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text
  118. USD.append(cost)
  119. # print("done")
  120. #finding the name of the vendor
  121. vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"}).find("a").text
  122. vendor.append(cleanString(vendor_name.strip()))
  123. # print("done")
  124. #everything else appends a -1
  125. success.append("-1")
  126. CVE.append("-1")
  127. MS.append("-1")
  128. category.append("-1")
  129. describe.append("-1")
  130. views.append("-1")
  131. reviews.append("-1")
  132. addDate.append("-1")
  133. BTC.append("-1")
  134. EURO.append("-1")
  135. sold.append("-1")
  136. qLeft.append("-1")
  137. shipFrom.append("-1")
  138. shipTo.append("-1")
  139. # print("Done! moving onto the next product!")
  140. # print(len(shipTo))
  141. # Populate the final variable (this should be a list with all fields scraped)
  142. return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
  143. reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
  144. #called by the crawler to get description links on a listing page
  145. #@param: beautifulsoup object that is using the correct html page (listing page)
  146. #return: list of description links from a listing page
  147. def tormarket_links_parser(soup):
  148. # Returning all links that should be visited by the Crawler
  149. href = []
  150. listing = soup.findAll('div', {"class": "product-loop-content text-center"})
  151. for a in listing:
  152. bae = a.find('h2', {"class": "woocommerce-loop-product__title"}).find('a', href=True)
  153. link = bae['href']
  154. href.append(link)
  155. return href