this is based on calsyslab project
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

174 lines
7.0 KiB

  1. __author__ = 'DarkWeb'
  2. # Here, we are importing the auxiliary functions to clean or convert data
  3. from MarketPlaces.Utilities.utilities import *
  4. # Here, we are importing BeautifulSoup to search through the HTML tree
  5. from bs4 import BeautifulSoup, ResultSet, Tag
  6. def anonymousMarketplace_description_parser(soup: Tag):
  7. # Fields to be parsed
  8. vendor = "-1" # 0 *Vendor_Name
  9. success = "-1" # 1 Vendor_Successful_Transactions
  10. rating_vendor = "-1" # 2 Vendor_Rating
  11. name = "-1" # 3 *Product_Name
  12. describe = "-1" # 4 Product_Description
  13. CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  14. MS = "-1" # 6 Product_MS_Classification (Microsoft Security)
  15. category = "-1" # 7 Product_Category
  16. views = "-1" # 8 Product_Number_Of_Views
  17. reviews = "-1" # 9 Product_Number_Of_Reviews
  18. rating_item = "-1" # 10 Product_Rating
  19. addDate = "-1" # 11 Product_AddedDate
  20. BTC = "-1" # 12 Product_BTC_SellingPrice
  21. USD = "-1" # 13 Product_USD_SellingPrice
  22. EURO = "-1" # 14 Product_EURO_SellingPrice
  23. sold = "-1" # 15 Product_QuantitySold
  24. left = "-1" # 16 Product_QuantityLeft
  25. shipFrom = "-1" # 17 Product_ShippedFrom
  26. shipTo = "-1" # 18 Product_ShippedTo
  27. product_name = soup.find("h1", {"class": "product_title entry-title"}).text
  28. name = cleanString(product_name.strip())
  29. product_description_list: ResultSet[Tag] = soup.find("div", {"id": "tab-description"}).find_all("div")
  30. describe_output = ""
  31. for div in product_description_list:
  32. describe_output += div.text
  33. describe = cleanString(describe_output.strip())
  34. product_ratings: Tag = soup.find("div", {"class": "woocommerce-product-rating"})
  35. product_reviews = product_ratings.find("span", {"class": "rating"}).text
  36. reviews = cleanString(product_reviews.strip())
  37. product_star_rating = product_ratings.find("strong", {"class": "rating"}).text
  38. rating_item = cleanString(product_star_rating.strip())
  39. product_price = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text
  40. USD = cleanString(product_price.replace("$", "").strip())
  41. # Populating the final variable (this should be a list with all fields scraped)
  42. row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
  43. BTC, USD, EURO, sold, left, shipFrom, shipTo)
  44. # Sending the results
  45. return row
  46. def anonymousMarketplace_listing_parser(soup: Tag):
  47. # Fields to be parsed
  48. nm = 0 # *Total_Products (Should be Integer)
  49. mktName = "AnonymousMarketplace" # 0 *Marketplace_Name
  50. vendor = [] # 1 *Vendor y
  51. rating_vendor = [] # 2 Vendor_Rating
  52. success = [] # 3 Vendor_Successful_Transactions
  53. name = [] # 4 *Product_Name y
  54. CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
  55. MS = [] # 6 Product_MS_Classification (Microsoft Security)
  56. category = [] # 7 Product_Category y
  57. describe = [] # 8 Product_Description
  58. views = [] # 9 Product_Number_Of_Views
  59. reviews = [] # 10 Product_Number_Of_Reviews
  60. rating_item = [] # 11 Product_Rating
  61. addDate = [] # 12 Product_AddDate
  62. BTC = [] # 13 Product_BTC_SellingPrice
  63. USD = [] # 14 Product_USD_SellingPrice y
  64. EURO = [] # 15 Product_EURO_SellingPrice
  65. sold = [] # 16 Product_QuantitySold
  66. qLeft =[] # 17 Product_QuantityLeft
  67. shipFrom = [] # 18 Product_ShippedFrom
  68. shipTo = [] # 19 Product_ShippedTo
  69. href = [] # 20 Product_Links
  70. woo = soup.find('div', {"class": "woocommerce"})
  71. product_list = woo.find('ul', {"class": "products columns-4"}, recursive=False).find_all('li')
  72. for item in product_list:
  73. item_href = item.find("a", recursive=False).get("href")
  74. href.append(item_href)
  75. item_name = item.find("h2").text
  76. name.append(cleanString(item_name.strip()))
  77. item_rating = item.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
  78. rating_item.append(cleanString(item_rating.strip()))
  79. try:
  80. item_price = item.find("span", {"class": "woocommerce-Price-amount amount"}).text
  81. item_price = item_price.replace("$", "").strip()
  82. USD.append(cleanNumbers(item_price))
  83. except AttributeError:
  84. USD.append("-1")
  85. vendor.append("AnonymousMarketplace")
  86. rating_vendor.append("-1")
  87. success.append("-1")
  88. CVE.append("-1")
  89. MS.append("-1")
  90. category.append("-1")
  91. describe.append("-1")
  92. views.append("-1")
  93. reviews.append("-1")
  94. addDate.append("-1")
  95. BTC.append("-1")
  96. EURO.append("-1")
  97. sold.append("-1")
  98. qLeft.append("-1")
  99. shipFrom.append("-1")
  100. shipTo.append("-1")
  101. nm += 1
  102. return organizeProducts(
  103. marketplace=mktName,
  104. nm=nm,
  105. vendor=vendor,
  106. rating_vendor=rating_vendor,
  107. success_vendor=success,
  108. nombre=name,
  109. CVE=CVE,
  110. MS=MS,
  111. category=category,
  112. describe=describe,
  113. views=views,
  114. reviews=reviews,
  115. rating_item=rating_item,
  116. addDate=addDate,
  117. BTC=BTC,
  118. USD=USD,
  119. EURO=EURO,
  120. sold=sold,
  121. qLeft=qLeft,
  122. shipFrom=shipFrom,
  123. shipTo=shipTo,
  124. href=href
  125. )
  126. #called by the crawler to get description links on a listing page
  127. #@param: beautifulsoup object that is using the correct html page (listing page)
  128. #return: list of description links from a listing page
  129. def anonymous_links_parser(soup):
  130. # Returning all links that should be visited by the Crawler
  131. href = []
  132. woo = soup.find('div', {"class": "woocommerce"})
  133. listing = woo.find('ul', {"class": "products columns-4"}, recursive=False).find_all('li')
  134. for a in listing:
  135. bae = a.find('a', href=True, recursive=False)
  136. link = bae['href']
  137. href.append(link)
  138. return href