This parser is based on the calsyslab project.

__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup


# Parses description pages: takes the HTML page of a description page as a soup
# object and scrapes the info it needs.
# The scraped values are organized into a 'row' tuple, which is returned.
# @param: soup object looking at the HTML page of a description page
# @return: 'row' tuple that holds the info scraped from the description page
def darkmatter_description_parser(soup):
    # Fields to be parsed
    vendor = "-1"         # 0 *Vendor_Name
    success = "-1"        # 1 Vendor_Successful_Transactions
    rating_vendor = "-1"  # 2 Vendor_Rating
    name = "-1"           # 3 *Product_Name
    describe = "-1"       # 4 Product_Description
    CVE = "-1"            # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"             # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"       # 7 Product_Category
    views = "-1"          # 8 Product_Number_Of_Views
    reviews = "-1"        # 9 Product_Number_Of_Reviews
    rating_item = "-1"    # 10 Product_Rating
    addDate = "-1"        # 11 Product_AddedDate
    BTC = "-1"            # 12 Product_BTC_SellingPrice
    USD = "-1"            # 13 Product_USD_SellingPrice
    EURO = "-1"           # 14 Product_EURO_SellingPrice
    sold = "-1"           # 15 Product_QuantitySold
    left = "-1"           # 16 Product_QuantityLeft
    shipFrom = "-1"       # 17 Product_ShippedFrom
    shipTo = "-1"         # 18 Product_ShippedTo

    # 0 *Vendor_Name (the vendor row's position in the table varies, so try row 3, then row 4)
    try:
        temp = soup.find('table', {'class': 'vtable'})
        temp = temp.findAll('tr')
        temp2 = temp[3].find('a').text
        vendor = cleanString(temp2.strip())
    except:
        try:
            temp = soup.find('table', {'class': 'vtable'})
            temp = temp.findAll('tr')
            temp2 = temp[4].find('a').text
            vendor = cleanString(temp2.strip())
        except:
            print("vendor")

    # 3 *Product_Name
    try:
        name = soup.find('div', {'class': 'title-h2'}).text
        name = cleanString(name.strip())
    except:
        print("name")

    # 4 Product_Description
    try:
        temp = soup.find('pre', {'class': 'description'}).text
        temp = temp.replace('\n', ' ')
        describe = cleanString(temp.strip())
    except:
        print("description")

    # 7 Product_Category (the "Category" row may be the 5th or 6th table row)
    try:
        temp = soup.find('table', {'class': 'vtable'})
        temp = temp.findAll('tr')
        temp2 = temp[4].find('th').text
        temp2 = cleanString(temp2)
        if temp2 == "Category":
            temp2 = temp[4].find('a').text
            category = cleanString(temp2.strip())
    except:
        try:
            temp = soup.find('table', {'class': 'vtable'})
            temp = temp.findAll('tr')
            temp2 = temp[5].find('th').text
            temp2 = cleanString(temp2.strip())
            if temp2 == "Category":
                temp2 = temp[5].find('a').text
                category = cleanString(temp2.strip())
        except:
            print('category')

    # 13 Product_USD_SellingPrice
    try:
        temp = soup.find('table', {'class': 'vtable'})
        temp = temp.findAll('tr')
        temp2 = temp[1].find('td').text
        temp2 = temp2.replace(' USD', '')
        USD = cleanString(temp2)
    except:
        print('USD')

    # 15 Product_QuantitySold (the "Sold" row may be the 6th or 7th table row)
    try:
        temp = soup.find('table', {'class': 'vtable'})
        temp = temp.findAll('tr')
        temp2 = temp[5].find('th').text
        temp2 = cleanString(temp2)
        temp3 = temp[6].find('th').text
        temp3 = cleanString(temp3)
        if temp2 == "Sold":
            temp2 = temp[5].find('td').text
            sold = cleanString(temp2.strip())
        elif temp3 == "Sold":
            temp2 = temp[6].find('td').text
            sold = cleanString(temp2.strip())
    except:
        print('sold')

    # Populating the final variable (this should be a tuple with all the fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
           BTC, USD, EURO, sold, left, shipFrom, shipTo)

    # Sending the results
    return row
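

# A minimal usage sketch (an illustrative assumption, not part of the original
# crawler): read a saved description page from disk and run it through the
# parser above. The file path below is hypothetical.
def _demo_description_parse(html_path='DarkMatter_description.html'):
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    row = darkmatter_description_parser(soup)
    # row[0] is the vendor name, row[3] the product name, row[13] the USD price
    return row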


# Parses listing pages: takes the HTML page of a listing page as a soup object
# and scrapes the info it needs.
# The scraped info is stored in per-field lists, which are organized and
# returned after all products on the page have been processed.
# @param: soup object looking at the HTML page of a listing page
# @return: 'row' that contains the lists of info scraped from the listing page
def darkmatter_listing_parser(soup):
    # Fields to be parsed
    nm = 0                  # Total_Products (should be an integer)
    mktName = "DarkMatter"  # 0 Marketplace_Name
    name = []               # 1 Product_Name
    CVE = []                # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                 # 3 Product_MS_Classification (Microsoft Security)
    category = []           # 4 Product_Category
    describe = []           # 5 Product_Description
    escrow = []             # 6 Vendor_Warranty
    views = []              # 7 Product_Number_Of_Views
    reviews = []            # 8 Product_Number_Of_Reviews
    addDate = []            # 9 Product_AddDate
    lastSeen = []           # 10 Product_LastViewDate
    rating_item = []        # 11 Product_Rating
    BTC = []                # 12 Product_BTC_SellingPrice
    USD = []                # 13 Product_USD_SellingPrice
    EURO = []               # 14 Product_EURO_SellingPrice
    sold = []               # 15 Product_QuantitySold
    qLeft = []              # 16 Product_QuantityLeft
    shipFrom = []           # 17 Product_ShippedFrom
    shipTo = []             # 18 Product_ShippedTo
    vendor = []             # 19 Vendor
    rating = []             # 20 Vendor_Rating
    success = []            # 21 Vendor_Successful_Transactions
    href = []               # 22 Product_Links (URLs)

    names = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", "colspan": "3"})
    left = soup.find('div', {"class": "content"}).findAll('table', {"class": "vtable"})
    right = soup.find('div', {"class": "content"}).findAll('td', {"class": "vtop centered"})

    count = 0

    # Populating the Number of Products
    nm = len(names)

    for a in names:
        # 1 Product_Name (strip a leading "<n> pcs x " / "<n> pks x " quantity prefix if present)
        try:
            temp = a.find('a').text
            if "pcs x " in temp:
                index = temp.index("pcs x ")
                result = temp[index + len("pcs x "):]
                name.append(cleanString(result))
            elif "pks x " in temp:
                index = temp.index("pks x ")
                result = temp[index + len("pks x "):]
                name.append(cleanString(result))
            else:
                name.append(cleanString(temp))
        except Exception as e:
            print("product name", e)
        CVE.append("-1")
        MS.append("-1")

        temp2 = left[count].findAll('tr')
        length_2 = len(temp2) - 1

        # 4 Product_Category
        try:
            temp = temp2[1].find('td').text
            category.append(cleanString(temp.strip()))
        except:
            print('category')

        describe.append("-1")
        escrow.append("-1")
        views.append("-1")
        reviews.append("-1")
        addDate.append("-1")
        lastSeen.append("-1")
        BTC.append("-1")

        # 13 Product_USD_SellingPrice (the 'vtop centered' cells come in pairs per product;
        # the first of each pair holds the price span)
        try:
            temp3 = right[count * 2].find('span').text
            temp = temp3.replace(' USD', '')
            USD.append(cleanString(temp))
        except:
            print('USD')

        EURO.append("-1")

        # 15 Product_QuantitySold (only present when the last table row is labeled "Sold:")
        try:
            temp3 = temp2[length_2].find('th').text
            temp3 = cleanString(temp3)
            if temp3 == "Sold:":
                temp = temp2[length_2].find('td').text
                sold.append(cleanString(temp.strip()))
            else:
                sold.append("-1")
        except Exception as e:
            sold.append("-1")
            print('sold', e)

        qLeft.append("-1")
        shipFrom.append("-1")

        # 18 Product_ShippedTo (only present when the last table row is labeled "Ship To:")
        try:
            temp3 = temp2[length_2].find('th').text
            temp3 = cleanString(temp3)
            if temp3 == "Ship To:":
                temp = temp2[length_2].find('td').text
                shipTo.append(cleanString(temp.strip()))
            else:
                shipTo.append("-1")
        except Exception as e:
            shipTo.append("-1")
            print('shipTo', e)

        # 19 Vendor
        try:
            temp = temp2[0].find('a').text
            vendor.append(cleanString(temp.strip()))
        except:
            print('vendor')

        rating.append("-1")
        success.append("-1")
        rating_item.append("-1")

        # 22 Product_Links
        try:
            temp = a.find('a').get('href')
            href.append(temp)
        except:
            print('href')

        count += 1

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, vendor, rating, success, name, CVE, MS, category, describe, views,
                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
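

# A minimal usage sketch (an illustrative assumption, not from the original
# project): parse a saved listing page and pair the scraped products with
# their description links. The file path is hypothetical; organizeProducts
# comes from the wildcard utilities import above.
def _demo_listing_parse(html_path='DarkMatter_listing.html'):
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    rows = darkmatter_listing_parser(soup)
    links = darkmatter_links_parser(soup)
    return rows, links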


# Called by the crawler to get the description links on a listing page
# @param: soup object looking at the HTML page of a listing page
# @return: list of description links from a listing page
def darkmatter_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find('div', {"class": "content"}).findAll('td', {"class": "lefted", "colspan": "3"})

    for a in listing:
        bae = a.find('a', href=True)
        link = bae['href']
        href.append(link)

    return href
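

# A minimal end-to-end sketch (illustrative assumptions: the listing page has
# already been fetched and saved by the crawler under the hypothetical name
# 'DarkMatter_listing.html'; page retrieval itself lives in the crawler, not
# in this parser module).
if __name__ == '__main__':
    with open('DarkMatter_listing.html', 'r', encoding='utf-8') as f:
        _soup = BeautifulSoup(f.read(), 'html.parser')
    # Print every description link the crawler would visit from this page
    for _link in darkmatter_links_parser(_soup):
        print(_link)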