This is based on the CALSysLab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

349 lines
11 KiB

1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
  1. __author__ = 'DarkWeb'
  2. import string
  3. import time
  4. import re
  5. import hashlib
  6. import base64
  7. import io
  8. import configparser
  9. import json
  10. import keras
  11. import cv2
  12. import numpy as np
  13. from keras.preprocessing import image
  14. from keras.applications.imagenet_utils import preprocess_input
  15. from keras.models import Model
  16. from datetime import datetime, timedelta
  17. from lxml import html as lxml
  18. from selenium.webdriver.common.by import By
  19. from Crypto.Cipher import AES
  20. from Crypto.Util.Padding import pad, unpad
  21. from PIL import Image
  22. from urllib.parse import urlsplit, urljoin
  23. def generate_aes_key():
  24. config = configparser.ConfigParser()
  25. config.read('../../setup.ini')
  26. secret = config.get('Encryption', 'secret')
  27. secret_bytes = bytes(secret, encoding="utf-8")
  28. # Derive a key from the seed using PBKDF2
  29. key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)
  30. # Use the first 16 bytes of the derived key as the AES key
  31. aes_key = key[:16]
  32. # print("key: ", aes_key)
  33. return aes_key
  34. BLOCK_SIZE = 32
  35. aes_key = generate_aes_key()
  36. encryptCipher = AES.new(aes_key, AES.MODE_ECB)
  37. decryptCipher = AES.new(aes_key, AES.MODE_ECB)
  38. model = keras.applications.ResNet50(weights='imagenet', include_top=True)
  39. feat_extractor = Model(inputs=model.input, outputs=model.get_layer('avg_pool').output)
  40. sift = cv2.SIFT_create(
  41. nfeatures=0, # Number of features, 0 for unlimited
  42. nOctaveLayers=3, # Number of layers per octave
  43. contrastThreshold=0.09, # Contrast threshold
  44. edgeThreshold=10, # Edge threshold
  45. sigma=1.6 # Initial Gaussian blur sigma
  46. )
  47. def generate_image_hash(image_string):
  48. image_bytes = bytes(image_string, encoding='utf-8')
  49. image_bytes = base64.b64decode(image_bytes)
  50. return hashlib.sha256(image_bytes).hexdigest()
  51. def extract_hidden_layer_output(image_string):
  52. image_bytes = bytes(image_string, encoding='utf-8')
  53. image_bytes = base64.b64decode(image_bytes)
  54. im = Image.open(io.BytesIO(image_bytes)).convert('RGB')
  55. x = image.img_to_array(im)
  56. x = image.smart_resize(x, size=model.input_shape[1:3], interpolation='nearest')
  57. x = np.expand_dims(x, axis=0)
  58. x = preprocess_input(x)
  59. return json.dumps(feat_extractor.predict(x)[0].tolist())
  60. def extract_keypoints(image_string):
  61. image_bytes = bytes(image_string, encoding='utf-8')
  62. image_bytes = base64.b64decode(image_bytes)
  63. image_array = np.asarray(bytearray(image_bytes), dtype=np.uint8)
  64. img = cv2.imdecode(image_array, cv2.IMREAD_GRAYSCALE)
  65. keypoints, descriptors = sift.detectAndCompute(img, None)
  66. if len(keypoints) == 0:
  67. return None, None
  68. return json.dumps(wrap_keypoints(keypoints)), json.dumps(descriptors.tolist())
  69. def wrap_keypoints(keypoints):
  70. keypoints_list = []
  71. for i in range(len(keypoints)):
  72. temp = {
  73. 'pt': keypoints[i].pt,
  74. 'size': keypoints[i].size,
  75. 'angle': keypoints[i].angle,
  76. 'octave': keypoints[i].octave,
  77. 'response': keypoints[i].response,
  78. 'class_id': keypoints[i].class_id
  79. }
  80. keypoints_list.append(temp)
  81. return keypoints_list
  82. def unwrap_keypoints(keypoints_list):
  83. keypoints = []
  84. for temp in keypoints_list:
  85. point = cv2.KeyPoint(
  86. x=temp['pt'][0],
  87. y=temp['pt'][1],
  88. size=temp['size'],
  89. angle=temp['angle'],
  90. octave=temp['octave'],
  91. response=temp['response'],
  92. class_id=temp['class_id']
  93. )
  94. keypoints.append(point)
  95. return tuple(keypoints)
  96. def cleanText(originalText):
  97. safe_chars = string.ascii_letters + string.digits + " " + "_" + "/" + "&" + "$" + "#" "@" + "+" + "-" + "*" + "=" \
  98. ":" + ";" + "." "," + "?" + "!" + "{" + "}" + "[" + "]" + "(" + ")" + "%" + "`" + "~" + "^" + "|" + "<" + ">"
  99. for index, text in enumerate(originalText):
  100. originalText[index] = ''.join([char if char in safe_chars else '' for char in text])
  101. return originalText
  102. def cleanLink(originalLink):
  103. safe_chars = string.ascii_letters + string.digits
  104. originalLink = ''.join([char if char in safe_chars else '' for char in originalLink])
  105. return originalLink
  106. def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author):
  107. rw = []
  108. current_time = datetime.now()
  109. day = current_time.strftime("%m/%d/%Y")
  110. ahora = current_time.strftime("%I:%M:%S")
  111. for n in range(nm):
  112. lne = forum # 0
  113. lne += ","
  114. lne += board # 1 board_topic
  115. lne += ","
  116. lne += author[n] # 2
  117. lne += ","
  118. lne += topic[n] # 3 topic_title
  119. lne += ","
  120. lne += "-1" if len(views) == 0 else views[n] # 4 views_topic
  121. lne += ","
  122. lne += "-1" if len(posts) == 0 else posts[n] # 5 posts_topic
  123. lne += ","
  124. lne += "-1" if len(href) == 0 else href[n] # 6 href_topic
  125. lne += ","
  126. lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7 dateadded_topic
  127. lne += ","
  128. lne += day + " " + ahora # 8 dateinserted_topic
  129. lne += ","
  130. lne += "-1" if len(image_author) == 0 else str(image_author[n]) # 9 image_user
  131. lne += ","
  132. lne += "-1" # 10 name_user
  133. lne += ","
  134. lne += "-1" # 11 status_user
  135. lne += ","
  136. lne += "-1" # 12 reputation_user
  137. lne += ","
  138. lne += "-1" # 13 interest_user
  139. lne += ","
  140. lne += "-1" # 14 signature_user
  141. lne += ","
  142. lne += "-1" # 15 content_post
  143. lne += ","
  144. lne += "-1" # 16 feedback_post
  145. lne += ","
  146. lne += "-1" # 17 dateadded_post
  147. lne += ","
  148. lne += "-1" # 18 image_post
  149. lne += ","
  150. lne += "-1" # 19 classification_post
  151. rw.append(lne)
  152. return rw
  153. def cleanString(originalString):
  154. updated_string = originalString.replace(",", "") #replace all commas
  155. updated_string = updated_string.replace("\n", "") #replace all newlines
  156. updated_string = updated_string.replace("\t", "") #replace all tabs
  157. updated_string = updated_string.replace("\r", "") #replace all carriage returns
  158. updated_string = updated_string.replace("'", "^") #replace all semicolons
  159. updated_string = updated_string.replace(u"»", '') #replace all arrows
  160. updated_string = updated_string.replace("!", "") #replace all exclamation points
  161. updated_string = updated_string.replace(";", "") #replace all exclamations
  162. return updated_string
  163. def cleanNumbers(inputString):
  164. reg_ex = re.compile(r'[^\d.]+')
  165. updated_string = reg_ex.sub('', inputString)
  166. return updated_string
  167. def aes_encryption(data_bytes):
  168. encrypted_bytes = encryptCipher.encrypt(pad(data_bytes, BLOCK_SIZE))
  169. return encrypted_bytes
  170. def aes_decryption(data_bytes):
  171. decrypted_bytes = decryptCipher.decrypt(data_bytes)
  172. return unpad(decrypted_bytes, BLOCK_SIZE)
  173. def encrypt_encode_image_to_base64(driver, xpath):
  174. try:
  175. img_element = driver.find_element(by=By.XPATH, value=xpath)
  176. image_data = img_element.screenshot_as_png
  177. encrypted_image = aes_encryption(image_data)
  178. base64_image = base64.b64encode(encrypted_image)
  179. enc_image_string = base64_image.decode('utf-8')
  180. return enc_image_string
  181. except Exception as e:
  182. print(e)
  183. pass
  184. return None
  185. def decode_decrypt_image_in_base64(image_string):
  186. try:
  187. image_bytes = bytes(image_string, encoding='utf-8')
  188. encrypted_bytes = base64.b64decode(image_bytes)
  189. decrypted_image = aes_decryption(encrypted_bytes)
  190. base64_image = base64.b64encode(decrypted_image)
  191. dec_image_string = base64_image.decode('utf-8')
  192. return dec_image_string
  193. except Exception as e:
  194. print(e)
  195. pass
  196. return None
  197. def replace_image_sources(driver, html_content):
  198. tree = lxml.fromstring(html_content)
  199. for picture_tag in tree.findall('.//picture'):
  200. for source_tag in picture_tag.findall('.//source'):
  201. picture_tag.remove(source_tag)
  202. for img_tag in tree.findall('.//img'):
  203. img_xpath = tree.getroottree().getpath(img_tag)
  204. string_image = encrypt_encode_image_to_base64(driver, img_xpath)
  205. if string_image:
  206. img_tag.set('src', f'data:image/png;base64,{string_image}')
  207. else:
  208. img_tag.getparent().remove(img_tag)
  209. modified_html = lxml.tostring(tree, encoding='utf-8').decode('utf-8')
  210. return modified_html
  211. def cleanHTML(driver, html):
  212. clean_html = replace_image_sources(driver, html)
  213. # decode_decrypt_image_in_base64(clean_html)
  214. formats = [
  215. "jpg", "jpeg", "jfif", "pjpeg", "pjp",
  216. "png", "apng", "svg", "bmp", "gif",
  217. "avif", "webp", "ico", "cur", "tiff"
  218. ]
  219. # remove images
  220. clean_html = re.sub(r"<svg[\s\S]*?svg>", "", clean_html)
  221. for fmat in formats:
  222. clean_html = re.sub(r"<object.*" + fmat + "[\s\S]*?object>", "", clean_html)
  223. clean_html = re.sub(r"<canvas[\s\S]*?canvas>", "", clean_html)
  224. # remove JavaScript
  225. clean_html = re.sub(r"<script[\s\S]*?script>", "", clean_html)
  226. clean_html = re.sub(r"<iframe[\s\S]*?iframe>", "", clean_html)
  227. clean_html = re.sub(r"<object.*javascript[\s\S]*?object>", "", clean_html)
  228. clean_html = re.sub(r"<aplet.*mayscript[\s\S]*?aplet>", "", clean_html)
  229. clean_html = re.sub(r"<embed.*scriptable[\s\S]*?embed>", "", clean_html)
  230. # image and JavaScript
  231. clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html)
  232. return clean_html
  233. def get_relative_url(target_url):
  234. # Use a dummy base URL to handle both absolute and relative URLs
  235. base_url = "http://dummybaseurl.com/"
  236. absolute_url = urljoin(base_url, target_url)
  237. # Parse the absolute URL
  238. parsed_absolute_url = urlsplit(absolute_url)
  239. # Extract the path and query from the absolute URL as the relative URL
  240. return parsed_absolute_url.path + '?' + parsed_absolute_url.query \
  241. if parsed_absolute_url.query else parsed_absolute_url.path