This is based on the calsyslab project.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

362 lines
11 KiB

1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
  1. __author__ = 'DarkWeb'
  2. import string
  3. import time
  4. import re
  5. import hashlib
  6. import base64
  7. import io
  8. import configparser
  9. import json
  10. import keras
  11. import cv2
  12. import numpy as np
  13. from keras.preprocessing import image
  14. from keras.applications.imagenet_utils import preprocess_input
  15. from keras.models import Model
  16. from datetime import datetime, timedelta
  17. from lxml import html as lxml
  18. from selenium.webdriver.common.by import By
  19. from Crypto.Cipher import AES
  20. from Crypto.Util.Padding import pad, unpad
  21. from PIL import Image
  22. from urllib.parse import urlsplit, urljoin
  23. def generate_aes_key():
  24. config = configparser.ConfigParser()
  25. config.read('../../setup.ini')
  26. secret = config.get('Encryption', 'secret')
  27. secret_bytes = bytes(secret, encoding="utf-8")
  28. # Derive a key from the seed using PBKDF2
  29. key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)
  30. # Use the first 16 bytes of the derived key as the AES key
  31. aes_key = key[:16]
  32. # print("key: ", aes_key)
  33. return aes_key
  34. BLOCK_SIZE = 32
  35. aes_key = generate_aes_key()
  36. encryptCipher = AES.new(aes_key, AES.MODE_ECB)
  37. decryptCipher = AES.new(aes_key, AES.MODE_ECB)
  38. model = keras.applications.ResNet50(weights='imagenet', include_top=True)
  39. feat_extractor = Model(inputs=model.input, outputs=model.get_layer('avg_pool').output)
  40. sift = cv2.SIFT_create(
  41. nfeatures=0, # Number of features, 0 for unlimited
  42. nOctaveLayers=3, # Number of layers per octave
  43. contrastThreshold=0.09, # Contrast threshold
  44. edgeThreshold=10, # Edge threshold
  45. sigma=1.6 # Initial Gaussian blur sigma
  46. )
  47. def generate_image_hash(image_string):
  48. image_bytes = bytes(image_string, encoding='utf-8')
  49. image_bytes = base64.b64decode(image_bytes)
  50. return hashlib.sha256(image_bytes).hexdigest()
  51. def extract_hidden_layer_output(image_string):
  52. image_bytes = bytes(image_string, encoding='utf-8')
  53. image_bytes = base64.b64decode(image_bytes)
  54. im = Image.open(io.BytesIO(image_bytes)).convert('RGB')
  55. x = image.img_to_array(im)
  56. x = image.smart_resize(x, size=model.input_shape[1:3], interpolation='nearest')
  57. x = np.expand_dims(x, axis=0)
  58. x = preprocess_input(x)
  59. return json.dumps(feat_extractor.predict(x)[0].tolist())
  60. def extract_keypoints(image_string):
  61. image_bytes = bytes(image_string, encoding='utf-8')
  62. image_bytes = base64.b64decode(image_bytes)
  63. image_array = np.asarray(bytearray(image_bytes), dtype=np.uint8)
  64. img = cv2.imdecode(image_array, cv2.IMREAD_GRAYSCALE)
  65. keypoints, descriptors = sift.detectAndCompute(img, None)
  66. if len(keypoints) == 0:
  67. return None, None
  68. return json.dumps(wrap_keypoints(keypoints)), json.dumps(descriptors.tolist())
  69. def wrap_keypoints(keypoints):
  70. keypoints_list = []
  71. for i in range(len(keypoints)):
  72. temp = {
  73. 'pt': keypoints[i].pt,
  74. 'size': keypoints[i].size,
  75. 'angle': keypoints[i].angle,
  76. 'octave': keypoints[i].octave,
  77. 'response': keypoints[i].response,
  78. 'class_id': keypoints[i].class_id
  79. }
  80. keypoints_list.append(temp)
  81. return keypoints_list
  82. def unwrap_keypoints(keypoints_list):
  83. keypoints = []
  84. for temp in keypoints_list:
  85. point = cv2.KeyPoint(
  86. x=temp['pt'][0],
  87. y=temp['pt'][1],
  88. size=temp['size'],
  89. angle=temp['angle'],
  90. octave=temp['octave'],
  91. response=temp['response'],
  92. class_id=temp['class_id']
  93. )
  94. keypoints.append(point)
  95. return tuple(keypoints)
  96. def cleanText(originalText):
  97. safe_chars = string.ascii_letters + string.digits + " " + "_" + "/" + "&" + "$" + "#" "@" + "+" + "-" + "*" + "=" \
  98. ":" + ";" + "." "," + "?" + "!" + "{" + "}" + "[" + "]" + "(" + ")" + "%" + "`" + "~" + "^" + "|" + "<" + ">"
  99. for index, text in enumerate(originalText):
  100. originalText[index] = ''.join([char if char in safe_chars else '' for char in text])
  101. return originalText
  102. def cleanLink(originalLink):
  103. safe_chars = string.ascii_letters + string.digits
  104. originalLink = ''.join([char if char in safe_chars else '' for char in originalLink])
  105. return originalLink
  106. def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nombre, CVE, MS, category, describe,
  107. views, reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor):
  108. rw = []
  109. current_time = datetime.now()
  110. day = current_time.strftime("%m/%d/%Y")
  111. ahora = current_time.strftime("%I:%M:%S")
  112. for n in range(nm):
  113. lne = marketplace # 0
  114. lne += ","
  115. lne += "-1" if len(vendor) == 0 else vendor[n] # 1
  116. lne += ","
  117. lne += "-1" if len(rating_vendor) == 0 else rating_vendor[n] # 2
  118. lne += ","
  119. lne += "-1" if len(success_vendor) == 0 else success_vendor[n] # 3
  120. lne += ","
  121. lne += nombre[n] # 4
  122. lne += ','
  123. lne += "-1" if len(describe) == 0 else describe[n] # 5
  124. lne += ","
  125. lne += "-1" if len(CVE) == 0 else CVE[n] # 6
  126. lne += ","
  127. lne += "-1" if len(MS) == 0 else MS[n] # 7
  128. lne += ","
  129. lne += "-1" if len(category) == 0 else category[n] # 8
  130. lne += ","
  131. lne += "-1" if len(views) == 0 else views[n] # 9
  132. lne += ","
  133. lne += "-1" if len(reviews) == 0 else reviews[n] # 10
  134. lne += ","
  135. lne += "-1" if len(rating_item) == 0 else rating_item[n] # 11
  136. lne += ","
  137. lne += "-1" if len(addDate) == 0 else addDate[n] # 12
  138. lne += ","
  139. lne += "-1" if len(BTC) == 0 else BTC[n] # 13
  140. lne += ","
  141. lne += "-1" if len(USD) == 0 else USD[n] # 14
  142. lne += ","
  143. lne += "-1" if len(EURO) == 0 else EURO[n] # 15
  144. lne += ","
  145. lne += "-1" if len(sold) == 0 else sold[n] # 16
  146. lne += ","
  147. lne += "-1" if len(qLeft) == 0 else qLeft[n] # 17
  148. lne += ","
  149. lne += "-1" if len(shipFrom) == 0 else shipFrom[n] # 18
  150. lne += ","
  151. lne += "-1" if len(shipTo) == 0 else shipTo[n] # 19
  152. lne += ","
  153. lne += "-1" if len(image) == 0 else image[n] # 20
  154. lne += ","
  155. lne += "-1" if len(image_vendor) == 0 else image_vendor[n] # 21
  156. lne += ","
  157. lne += "-1" if len(href) == 0 else href[n] # 22
  158. lne += ","
  159. lne += day + " " + ahora # 23
  160. rw.append(lne)
  161. return rw
  162. def cleanString(originalString):
  163. updated_string = originalString.replace(",", "") #replace all commas
  164. updated_string = updated_string.replace("\n", "") #replace all newlines
  165. updated_string = updated_string.replace("\t", "") #replace all tabs
  166. updated_string = updated_string.replace("\r", "") #replace all carriage returns
  167. updated_string = updated_string.replace("'", "^") #replace all semicolons
  168. updated_string = updated_string.replace(u"»", '') #replace all arrows
  169. updated_string = updated_string.replace("!", "") #replace all exclamation points
  170. updated_string = updated_string.replace(";", "") #replace all exclamations
  171. return updated_string
  172. def checkDateFormat(myString):
  173. isDate = re.match('[0-1][0-9]\/[0-3][0-9]\/[1-2][0-9]{3}', myString)
  174. return isDate
  175. def cleanNumbers(inputString):
  176. reg_ex = re.compile(r'[^\d.]+')
  177. updated_string = reg_ex.sub('', inputString)
  178. return updated_string
  179. def aes_encryption(data_bytes):
  180. encrypted_bytes = encryptCipher.encrypt(pad(data_bytes, BLOCK_SIZE))
  181. return encrypted_bytes
  182. def aes_decryption(data_bytes):
  183. decrypted_bytes = decryptCipher.decrypt(data_bytes)
  184. return unpad(decrypted_bytes, BLOCK_SIZE)
  185. def encrypt_encode_image_to_base64(driver, xpath):
  186. try:
  187. img_element = driver.find_element(by=By.XPATH, value=xpath)
  188. image_data = img_element.screenshot_as_png
  189. encrypted_image = aes_encryption(image_data)
  190. base64_image = base64.b64encode(encrypted_image)
  191. enc_image_string = base64_image.decode('utf-8')
  192. return enc_image_string
  193. except Exception as e:
  194. print(e)
  195. pass
  196. return None
  197. def decode_decrypt_image_in_base64(image_string):
  198. try:
  199. image_bytes = bytes(image_string, encoding='utf-8')
  200. encrypted_bytes = base64.b64decode(image_bytes)
  201. decrypted_image = aes_decryption(encrypted_bytes)
  202. base64_image = base64.b64encode(decrypted_image)
  203. dec_image_string = base64_image.decode('utf-8')
  204. return dec_image_string
  205. except Exception as e:
  206. print(e)
  207. pass
  208. return None
  209. def replace_image_sources(driver, html_content):
  210. tree = lxml.fromstring(html_content)
  211. for picture_tag in tree.findall('.//picture'):
  212. for source_tag in picture_tag.findall('.//source'):
  213. picture_tag.remove(source_tag)
  214. for img_tag in tree.findall('.//img'):
  215. img_xpath = tree.getroottree().getpath(img_tag)
  216. string_image = encrypt_encode_image_to_base64(driver, img_xpath)
  217. if string_image:
  218. img_tag.set('src', f'data:image/png;base64,{string_image}')
  219. else:
  220. img_tag.getparent().remove(img_tag)
  221. modified_html = lxml.tostring(tree, encoding='utf-8').decode('utf-8')
  222. return modified_html
  223. def cleanHTML(driver, html):
  224. clean_html = replace_image_sources(driver, html)
  225. formats = [
  226. "jpg", "jpeg", "jfif", "pjpeg", "pjp",
  227. "png", "apng", "svg", "bmp", "gif",
  228. "avif", "webp", "ico", "cur", "tiff"
  229. ]
  230. # remove images
  231. clean_html = re.sub(r"<svg[\s\S]*?svg>", "", clean_html)
  232. for fmat in formats:
  233. clean_html = re.sub(r"<object.*" + fmat + "[\s\S]*?object>", "", clean_html)
  234. clean_html = re.sub(r"<canvas[\s\S]*?canvas>", "", clean_html)
  235. # remove JavaScript
  236. clean_html = re.sub(r"<script[\s\S]*?script>", "", clean_html)
  237. clean_html = re.sub(r"<iframe[\s\S]*?iframe>", "", clean_html)
  238. clean_html = re.sub(r"<object.*javascript[\s\S]*?object>", "", clean_html)
  239. clean_html = re.sub(r"<aplet.*mayscript[\s\S]*?aplet>", "", clean_html)
  240. clean_html = re.sub(r"<embed.*scriptable[\s\S]*?embed>", "", clean_html)
  241. # image and JavaScript
  242. clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html)
  243. return clean_html
  244. def get_relative_url(target_url):
  245. # Use a dummy base URL to handle both absolute and relative URLs
  246. base_url = "http://dummybaseurl.com/"
  247. absolute_url = urljoin(base_url, target_url)
  248. # Parse the absolute URL
  249. parsed_absolute_url = urlsplit(absolute_url)
  250. # Extract the path and query from the absolute URL as the relative URL
  251. return parsed_absolute_url.path + '?' + parsed_absolute_url.query \
  252. if parsed_absolute_url.query else parsed_absolute_url.path