# This parser is based on the calsyslab project.
__author__ = 'DarkWeb'

# Here, we are importing re for the regular-expression searches used below
import re
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup


# parses description pages: takes the html page of a description page as a soup object, and parses it for the info it needs
# stores the info in individual fields, which are organized and returned as one record
# @param: soup object looking at the html page of a description page
# @return: 'row', a tuple holding every field of info scraped from the description page
def darkmatter_description_parser(soup):

    # Fields to be parsed
    vendor = "-1"          # 0 *Vendor_Name
    success = "-1"         # 1 Vendor_Successful_Transactions
    rating_vendor = "-1"   # 2 Vendor_Rating
    name = "-1"            # 3 *Product_Name
    describe = "-1"        # 4 Product_Description
    CVE = "-1"             # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"              # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"        # 7 Product_Category
    views = "-1"           # 8 Product_Number_Of_Views
    reviews = "-1"         # 9 Product_Number_Of_Reviews
    rating_item = "-1"     # 10 Product_Rating
    addDate = "-1"         # 11 Product_AddedDate
    BTC = "-1"             # 12 Product_BTC_SellingPrice
    USD = "-1"             # 13 Product_USD_SellingPrice
    EURO = "-1"            # 14 Product_EURO_SellingPrice
    sold = "-1"            # 15 Product_QuantitySold
    left = "-1"            # 16 Product_QuantityLeft
    shipFrom = "-1"        # 17 Product_ShippedFrom
    shipTo = "-1"          # 18 Product_ShippedTo

    # product name
    try:
        name = soup.find('head').find('title').text
        name = cleanString(name.strip())
    except:
        print("name")

    # product description
    try:
        temp = soup.find('pre', {'class': 'description'}).text
        temp = temp.replace('\n', ' ')
        describe = cleanString(temp.strip())
    except:
        print("description")

    # Populating the final variable (this is a tuple with all fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
           BTC, USD, EURO, sold, left, shipFrom, shipTo)

    # Sending the results
    return row
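

# A minimal usage sketch for the description parser above. The file name
# 'darkmatter_description.html' is a hypothetical local snapshot (the real
# crawler supplies the saved pages); the row indices follow the tuple order
# built in darkmatter_description_parser.
def _example_description_usage():
    with open('darkmatter_description.html', 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    row = darkmatter_description_parser(soup)
    vendor, name, describe = row[0], row[3], row[4]
    print(vendor, name, describe)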


# parses listing pages: takes the html page of a listing page as a soup object, and parses it for the info it needs
# stores the info it needs in different lists; these lists are returned after being organized
# @param: soup object looking at the html page of a listing page
# @return: 'row' that contains a variety of lists that each hold info on the listing page
def darkmatter_listing_parser(soup):

    # Fields to be parsed
    nm = 0                     # Total_Products (Should be Integer)
    mktName = "DarkMatter"     # 0 Marketplace_Name
    name = []                  # 1 Product_Name
    CVE = []                   # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                    # 3 Product_MS_Classification (Microsoft Security)
    category = []              # 4 Product_Category
    describe = []              # 5 Product_Description
    escrow = []                # 6 Vendor_Warranty
    views = []                 # 7 Product_Number_Of_Views
    reviews = []               # 8 Product_Number_Of_Reviews
    addDate = []               # 9 Product_AddDate
    lastSeen = []              # 10 Product_LastViewDate
    BTC = []                   # 11 Product_BTC_SellingPrice
    USD = []                   # 12 Product_USD_SellingPrice
    EURO = []                  # 13 Product_EURO_SellingPrice
    sold = []                  # 14 Product_QuantitySold
    qLeft = []                 # 15 Product_QuantityLeft
    shipFrom = []              # 16 Product_ShippedFrom
    shipTo = []                # 17 Product_ShippedTo
    vendor = []                # 18 Vendor
    rating = []                # 19 Vendor_Rating
    success = []               # 20 Vendor_Successful_Transactions
    href = []                  # 23 Product_Links (Urls)

    listing = soup.findAll('div', {"class": "card"})

    # Populating the Number of Products
    nm = len(listing)

    for a in listing:
        bae = a.findAll('a', href=True)

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding the Product
        product = bae[1].find('p').text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.replace("...", "")
        product = product.strip()
        name.append(product)

        bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')

        if len(bae) >= 5:
            # Finding Prices
            price = bae[0].text
            ud = price.replace(" USD", " ")
            # u = ud.replace("$","")
            u = ud.replace(",", "")
            u = u.strip()
            USD.append(u)
            # bc = (prc[1]).strip(' BTC')
            # BTC.append(bc)

            # Finding the Vendor
            vendor_name = bae[1].find('a').text
            vendor_name = vendor_name.replace(",", "")
            vendor_name = vendor_name.strip()
            vendor.append(vendor_name)

            # Finding the Category
            cat = bae[2].find('small').text
            cat = cat.replace("Category: ", "")
            cat = cat.replace(",", "")
            cat = cat.strip()
            category.append(cat)

            # Finding Number Sold and Quantity Left
            num = bae[3].text
            num = num.replace("Sold: ", "")
            num = num.strip()
            sold.append(num)

            quant = bae[4].find('small').text
            quant = quant.replace("In stock: ", "")
            quant = quant.strip()
            qLeft.append(quant)

            # Finding Successful Transactions
            freq = bae[1].text
            freq = freq.replace(vendor_name, "")
            freq = re.sub(r'Vendor Level \d+', "", freq)
            freq = freq.replace("(", "")
            freq = freq.replace(")", "")
            freq = freq.strip()
            success.append(freq)

        # Searching for CVE and MS categories
        cve = a.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += idx
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = a.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += im
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
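

# A small, self-contained sketch of the CVE/MS regular expressions used above,
# run on made-up sample text (the sample is illustrative only). Note that
# r'CVE-\d{4}-\d{4}' only captures four-digit sequence numbers, so longer
# modern CVE ids would be matched partially.
def _example_cve_ms_regex():
    sample = "Exploit for CVE-2017-0144, see also bulletin MS17-010."
    print(re.findall(r'CVE-\d{4}-\d{4}', sample))  # ['CVE-2017-0144']
    print(re.findall(r'MS\d{2}-\d{3}', sample))    # ['MS17-010']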


# called by the crawler to get description links on a listing page
# @param: soup object that is using the correct html page (listing page)
# @return: list of description links from a listing page
def darkmatter_links_parser(soup):

    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", 'colspan': '3'})

    for a in listing:
        bae = a.find('a', href=True)
        link = bae['href']
        href.append(link)

    return href
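

# A minimal end-to-end sketch of how the three parsers fit together. The
# read_page callable and the 'listing.html' name are hypothetical stand-ins
# for the project's crawler, which is what actually fetches and saves pages.
def _example_crawl_flow(read_page):
    # read_page is assumed to take a path/URL and return raw HTML as a string
    listing_soup = BeautifulSoup(read_page('listing.html'), 'html.parser')
    links = darkmatter_links_parser(listing_soup)   # description URLs to visit
    listing_rows = darkmatter_listing_parser(listing_soup)
    description_rows = [
        darkmatter_description_parser(BeautifulSoup(read_page(link), 'html.parser'))
        for link in links
    ]
    return listing_rows, description_rows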