# Based on the calsyslab project.
__author__ = 'DarkWeb'

# Auxiliary functions used to clean or convert the scraped data
from MarketPlaces.Utilities.utilities import *
# BeautifulSoup is used to search through the HTML tree
from bs4 import BeautifulSoup


# Parses a description page: takes the BeautifulSoup object of a description
# page and extracts the information it needs into separate fields, which are
# returned together after being organized.
# @param: soup object of the HTML page of a description page
# return: 'row', a tuple of fields each holding info from the description page
def gofish_description_parser(soup):
    # Fields to be parsed
    vendor = "-1"                # 0 *Vendor_Name
    success = "-1"               # 1 Vendor_Successful_Transactions
    rating_vendor = "-1"         # 2 Vendor_Rating
    name = "-1"                  # 3 *Product_Name
    describe = "-1"              # 4 Product_Description
    CVE = "-1"                   # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"                    # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"              # 7 Product_Category
    views = "-1"                 # 8 Product_Number_Of_Views
    reviews = "-1"               # 9 Product_Number_Of_Reviews
    rating_item = "-1"           # 10 Product_Rating
    addDate = "-1"               # 11 Product_AddedDate
    BTC = "-1"                   # 12 Product_BTC_SellingPrice
    USD = "-1"                   # 13 Product_USD_SellingPrice
    EURO = "-1"                  # 14 Product_EURO_SellingPrice
    sold = "-1"                  # 15 Product_QuantitySold
    left = "-1"                  # 16 Product_QuantityLeft
    shipFrom = "-1"              # 17 Product_ShippedFrom
    shipTo = "-1"                # 18 Product_ShippedTo
    image = "-1"                 # 19 Product_Image
    vendor_image = "-1"          # 20 Vendor_Image
    temp = soup.find('div', {'class': 'col-lg-5'})

    # Finding the vendor name; each field keeps its "-1" default if the tag is missing
    tag = temp.find('a', {'class': 'text-decoration-none fw-bold'})
    if tag is None:
        print('vendor')
    else:
        vendor = tag.text.strip()

    # Finding the product name (the active item in the breadcrumb)
    temp2 = soup.find('nav', {'aria-label': 'breadcrumb'}).findAll('li', {'class': 'breadcrumb-item'})
    tag = soup.find('li', {'class': 'breadcrumb-item active text-truncate'})
    if tag is None:
        print('name')
    else:
        name = tag.text.strip()

    # Finding the product description
    tag = soup.find('div', {'class': 'p-3 mb-3 overflow-auto border border-2 bg-white rounded'})
    if tag is None:
        print('describe')
    else:
        describe = cleanString(tag.text)

    # Finding the product category (the third breadcrumb item)
    if len(temp2) > 2:
        category = temp2[2].text
    else:
        print('category')

    # Finding the USD price
    tag = soup.find('td', {'class': 'text-end text-nowrap'})
    if tag is None:
        print('USD')
    else:
        USD = tag.text

    # Finding the shipping origin and destination; both appear in spans of the
    # same class, so they are taken in page order
    temp3 = soup.findAll('span', {'class': 'lh-1 me-2 fs-4'})
    if len(temp3) > 0:
        shipFrom = temp3[0].text.strip()
    if len(temp3) > 1:
        shipTo = temp3[1].text.strip()
        if not shipTo.isalnum():
            shipTo = 'Worldwide'

    # Finding the product image; images are embedded as base64 data URIs, so
    # only the payload after 'base64,' is kept
    image = soup.find('figure', {'class': 'image-feature'}).find('img')
    if image is not None:
        image = image.get('src')
        image = image.split('base64,')[-1]
    else:
        print('img')
        image = "-1"

    # Populating the final variable (this should be a tuple with all fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)

    # Sending the results
    return row
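

# Illustrative helper (a sketch, not part of the original project): the image
# fields above keep only the base64 payload of an embedded data URI, so the
# raw bytes can be recovered later roughly like this. The function name is
# hypothetical.
def gofish_decode_image(image_field):
    import base64

    # "-1" is the sentinel used throughout this module for missing fields
    if image_field == "-1":
        return None
    return base64.b64decode(image_field)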


# Parses a listing page: takes the BeautifulSoup object of a listing page and
# extracts the information it needs into separate lists, which are returned
# after being organized.
# @param: soup object of the HTML page of a listing page
# return: 'row', the organized lists holding info on the listing page
def gofish_listing_parser(soup):

    # Fields to be parsed
    nm = 0                       # *Total_Products (should be an integer)
    mktName = "GoFish"           # 0 *Marketplace_Name
    vendor = []                  # 1 *Vendor y
    rating_vendor = []           # 2 Vendor_Rating
    success = []                 # 3 Vendor_Successful_Transactions
    name = []                    # 4 *Product_Name y
    CVE = []                     # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures); not applicable here
    MS = []                      # 6 Product_MS_Classification (Microsoft Security); not applicable here
    category = []                # 7 Product_Category y
    describe = []                # 8 Product_Description
    views = []                   # 9 Product_Number_Of_Views
    reviews = []                 # 10 Product_Number_Of_Reviews
    rating_item = []             # 11 Product_Rating
    addDate = []                 # 12 Product_AddDate
    BTC = []                     # 13 Product_BTC_SellingPrice
    USD = []                     # 14 Product_USD_SellingPrice y
    EURO = []                    # 15 Product_EURO_SellingPrice
    sold = []                    # 16 Product_QuantitySold
    qLeft = []                   # 17 Product_QuantityLeft
    shipFrom = []                # 18 Product_ShippedFrom
    shipTo = []                  # 19 Product_ShippedTo
    image = []                   # 20 Product_Image
    image_vendor = []            # 21 Vendor_Image
    href = []                    # 22 Product_Links
    temp = soup.find('div', {'class': 'col-9'})

    # Finding the category shown in the breadcrumb (shared by every product on the page)
    cat = temp.find('nav', {'aria-label': 'breadcrumb'}).find('li', {'class': 'breadcrumb-item active'}).text.strip()
    cat = cleanString(cat)

    listing = temp.find('tbody', {'class': 'border border-2 align-middle'}).findAll('tr')

    # Populating the number of products
    nm = len(listing)

    for a in listing:
        category.append(cat)

        # Adding the url to the list of urls
        link = a.find('a').get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding the product name
        product = a.find('a', {'class': 'text-decoration-none'}).text
        product = product.replace('\n', ' ')
        product = product.replace(',', '')
        product = product.replace('...', '')
        product = product.strip()
        name.append(product)

        # Finding the product image (kept as the base64 payload of the data URI)
        product_image = a.find('img')
        product_image = product_image.get('src')
        product_image = product_image.split('base64,')[-1]
        image.append(product_image)

        # Finding the vendor
        vendor_name = a.find('a', {'class': 'text-decoration-none fw-bold'}).text
        vendor_name = vendor_name.replace(',', '')
        vendor_name = vendor_name.strip()
        vendor.append(vendor_name)

        # Vendor images are not shown on listing pages
        image_vendor.append('-1')

        # Finding the USD price
        usd = a.find('div', {'class': 'text-nowrap'}).find('span', {'class': 'fw-bold text-nowrap'}).text.strip()
        USD.append(usd)

        # Finding the shipping origin and destination (two spans of the same class, in page order)
        temp = a.findAll('span', {'class': 'fs-4 lh-1'})
        shipF = temp[0].text.strip()
        shipFrom.append(shipF)
        shipT = temp[1].text.strip()
        if shipT.isalnum():
            shipTo.append(shipT)
        else:
            shipTo.append('Worldwide')

        # Fields not present on listing pages
        rating_vendor.append('-1')
        success.append('-1')
        CVE.append('-1')
        MS.append('-1')
        describe.append('-1')
        views.append('-1')
        reviews.append('-1')
        rating_item.append('-1')
        addDate.append('-1')
        BTC.append('-1')
        EURO.append('-1')
        sold.append('-1')
        qLeft.append('-1')

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href,
                            image, image_vendor)


# Called by the crawler to get the description links from a listing page
# @param: BeautifulSoup object of the HTML page of a listing page
# return: list of description links found on the listing page
def gofish_links_parser(soup):

    # Returning all links that should be visited by the crawler
    href = []

    listing = soup.find('div', {'class': 'col-9'}).find('tbody', {'class': 'border border-2 align-middle'}).findAll('tr')

    for a in listing:
        bae = a.findAll('a', href=True)

        # Adding the url to the list of urls
        link = bae[0].get('href')
        href.append(link)

    return href
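

# A minimal smoke test, assuming locally saved copies of a GoFish listing page
# and a description page; the file names below are placeholders, not part of
# the crawler itself.
if __name__ == '__main__':
    with open('gofish_listing.html', 'r', encoding='utf-8') as f:
        listing_soup = BeautifulSoup(f.read(), 'html.parser')
    print(gofish_links_parser(listing_soup))
    print(gofish_listing_parser(listing_soup))

    with open('gofish_description.html', 'r', encoding='utf-8') as f:
        description_soup = BeautifulSoup(f.read(), 'html.parser')
    print(gofish_description_parser(description_soup))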