# This module is based on the calsyslab project.
__author__ = 'DarkWeb'

import string
import time
import re
import hashlib
import base64
import io
import configparser
from datetime import datetime, timedelta
import datetime as fulldatetime
from bs4 import BeautifulSoup
from lxml import html as lxml
from selenium.webdriver.common.by import By
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from PIL import Image


def generate_aes_key():
    config = configparser.ConfigParser()
    config.read('../../setup.ini')
    secret = config.get('Encryption', 'secret')
    secret_bytes = bytes(secret, encoding="utf-8")

    # Derive a key from the secret using PBKDF2
    key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)

    # Use the first 16 bytes of the derived key as the AES key
    aes_key = key[:16]
    # print("key: ", aes_key)
    return aes_key


BLOCK_SIZE = 32
aes_key = generate_aes_key()
encryptCipher = AES.new(aes_key, AES.MODE_ECB)
decryptCipher = AES.new(aes_key, AES.MODE_ECB)


def cleanText(originalText):

    safe_chars = string.ascii_letters + string.digits + " " + "_" + "/" + "&" + "$" + "#" + "@" + "+" + "-" + \
                 "*" + "=" + ":" + ";" + "." + "," + "?" + "!" + "{" + "}" + "[" + "]" + "(" + ")" + "%" + \
                 "`" + "~" + "^" + "|" + "<" + ">"

    for index, text in enumerate(originalText):
        originalText[index] = ''.join([char if char in safe_chars else '' for char in text])

    return originalText


def convertDate(sdate, language, crawlerDate):

    if language == "english":

        todaysday = crawlerDate.strftime("%m/%d/%Y")

        sdate = sdate.replace(u"January", "01")
        sdate = sdate.replace(u"February", "02")
        sdate = sdate.replace(u"March", "03")
        sdate = sdate.replace(u"April", "04")
        sdate = sdate.replace(u"May", "05")
        sdate = sdate.replace(u"June", "06")
        sdate = sdate.replace(u"July", "07")
        sdate = sdate.replace(u"August", "08")
        sdate = sdate.replace(u"September", "09")
        sdate = sdate.replace(u"October", "10")
        sdate = sdate.replace(u"November", "11")
        sdate = sdate.replace(u"December", "12")
        sdate = sdate.replace(u"Jan", "01")
        sdate = sdate.replace(u"Feb", "02")
        sdate = sdate.replace(u"Mar", "03")
        sdate = sdate.replace(u"Apr", "04")
        sdate = sdate.replace(u"May", "05")
        sdate = sdate.replace(u"Jun", "06")
        sdate = sdate.replace(u"Jul", "07")
        sdate = sdate.replace(u"Aug", "08")
        sdate = sdate.replace(u"Sep", "09")
        sdate = sdate.replace(u"Oct", "10")
        sdate = sdate.replace(u"Nov", "11")
        sdate = sdate.replace(u"Dec", "12")
        sdate = sdate.replace(u".", "")

        if sdate == "Today at":
            sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%m %d %Y')

        sdate = datetime.strptime(str(sdate), '%m %d %Y').strftime('%m/%d/%Y')
    elif language == "french":

        todaysday = crawlerDate.strftime("%m/%d/%Y")

        sdate = sdate.replace(u"janvier", "01")
        sdate = sdate.replace(u"jan", "01")
        sdate = sdate.replace(u"février", "02")
        sdate = sdate.replace(u"juin", "06")
        sdate = sdate.replace(u"juillet", "07")
        sdate = sdate.replace(u"juil", "07")
        sdate = sdate.replace(u"août", "08")
        sdate = sdate.replace(u"septembre", "09")
        sdate = sdate.replace(u"sept", "09")
        sdate = sdate.replace(u"octobre", "10")
        sdate = sdate.replace(u"oct", "10")
        sdate = sdate.replace(u"novembre", "11")
        sdate = sdate.replace(u"nov", "11")
        sdate = sdate.replace(u"décembre", "12")
        sdate = sdate.replace(u"déc", "12")
        sdate = sdate.replace(u".", "")

        if sdate == u"Aujourd'hui":
            sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%d %m %Y')

        if "mar" in sdate:
            print("Add March to the IBM Black Market")
            raise SystemExit
        elif "avr" in sdate:
            print("Add April to the IBM Black Market")
            raise SystemExit
        elif "mai" in sdate:
            print("Add May to the IBM Black Market")
            raise SystemExit

        sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
    elif language == "swedish":

        sdate = sdate.replace(u"jan", "01")
        sdate = sdate.replace(u"feb", "02")
        sdate = sdate.replace(u"mar", "03")
        sdate = sdate.replace(u"apr", "04")
        sdate = sdate.replace(u"maj", "05")
        sdate = sdate.replace(u"jun", "06")
        sdate = sdate.replace(u"jul", "07")
        sdate = sdate.replace(u"aug", "08")
        sdate = sdate.replace(u"sep", "09")
        sdate = sdate.replace(u"okt", "10")
        sdate = sdate.replace(u"nov", "11")
        sdate = sdate.replace(u"dec", "12")
        sdate = sdate.replace(u".", "")

        sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
    elif language == "russian":

        # "Вчера" (yesterday): take the day before the crawl date
        if sdate == u'\u0412\u0447\u0435\u0440\u0430':
            sdate = crawlerDate.today() - timedelta(1)
            sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
        # "Создано" ("created") rendered in a broken encoding: no usable date
        elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
            return ""

        sdate = sdate.replace(u"января", "01")
        sdate = sdate.replace(u"янв", "01")
        sdate = sdate.replace(u"февраля", "02")
        sdate = sdate.replace(u"Февраль", "02")
        sdate = sdate.replace(u"фев", "02")
        sdate = sdate.replace(u"марта", "03")
        sdate = sdate.replace(u"апреля", "04")
        sdate = sdate.replace(u"апр", "04")
        sdate = sdate.replace(u"мар", "03")  # мар = March
        sdate = sdate.replace(u"май", "05")
        sdate = sdate.replace(u"мая", "05")
        sdate = sdate.replace(u"июня", "06")
        sdate = sdate.replace(u"июн", "06")
        sdate = sdate.replace(u"июля", "07")
        sdate = sdate.replace(u"июл", "07")
        sdate = sdate.replace(u"августа", "08")
        sdate = sdate.replace(u"авг", "08")
        sdate = sdate.replace(u"сентября", "09")
        sdate = sdate.replace(u"сен", "09")
        sdate = sdate.replace(u"октября", "10")
        sdate = sdate.replace(u"Октябрь", "10")
        sdate = sdate.replace(u"окт", "10")
        sdate = sdate.replace(u"ноября", "11")
        sdate = sdate.replace(u"ноя", "11")
        sdate = sdate.replace(u"декабря", "12")
        sdate = sdate.replace(u"дек", "12")
        sdate = sdate.replace(u".", "")

        sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    return sdate
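

# Hedged usage sketch (illustrative only; the date string below is made up). Month names or
# abbreviations in the supported language are rewritten to numbers and the result is
# normalized to MM/DD/YYYY:
#
#   convertDate("March 14 2023", "english", datetime.now())   # -> "03/14/2023"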


def cleanLink(originalLink):
    safe_chars = string.ascii_letters + string.digits
    originalLink = ''.join([char if char in safe_chars else '' for char in originalLink])
    return originalLink


def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author):
    rw = []

    current_time = datetime.now()
    day = current_time.strftime("%m/%d/%Y")

    for n in range(nm):
        current_time += timedelta(seconds=2)
        ahora = current_time.strftime("%I:%M:%S")

        lne = forum                                                       # 0
        lne += ","
        lne += board                                                      # 1
        lne += ","
        lne += author[n]                                                  # 2
        lne += ","
        lne += topic[n]                                                   # 3
        lne += ","
        lne += "-1" if len(views) == 0 else views[n]                      # 4
        lne += ","
        lne += "-1" if len(posts) == 0 else posts[n]                      # 5
        lne += ","
        lne += "-1" if len(href) == 0 else href[n]                        # 6
        lne += ","
        lne += "-1" if len(addDate) == 0 else str(addDate[n])             # 7
        lne += ","
        lne += day + " " + ahora                                          # 8
        lne += ","
        lne += "-1" if len(image_author) == 0 else str(image_author[n])   # 9 image_user
        lne += ","
        lne += "-1"                                                       # 10 name_user
        lne += ","
        lne += "-1"                                                       # 11 status_user
        lne += ","
        lne += "-1"                                                       # 12 reputation_user
        lne += ","
        lne += "-1"                                                       # 13 interest_user
        lne += ","
        lne += "-1"                                                       # 14 signature_user
        lne += ","
        lne += "-1"                                                       # 15 content_post
        lne += ","
        lne += "-1"                                                       # 16 feedback_post
        lne += ","
        lne += "-1"                                                       # 17 dateadded_post
        lne += ","
        lne += "-1"                                                       # 18 image_post

        rw.append(lne)

    return rw
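

# Hedged usage sketch (illustrative only; every literal below is made up). Each element of
# the returned list is one comma-separated topic record whose columns follow the numbered
# comments above, with "-1" filling the fields this function does not populate:
#
#   rows = organizeTopics(forum="ExampleForum", nm=1, board="General",
#                         author=["alice"], topic=["Hello world"], views=["10"],
#                         posts=["2"], href=["/thread/1"], addDate=["03/14/2023"],
#                         image_author=["-1"])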


def cleanString(originalString):
    updated_string = originalString.replace(",", "")     # remove all commas
    updated_string = updated_string.replace("\n", "")    # remove all newlines
    updated_string = updated_string.replace("\t", "")    # remove all tabs
    updated_string = updated_string.replace("\r", "")    # remove all carriage returns
    updated_string = updated_string.replace("'", "^")    # replace all single quotes
    updated_string = updated_string.replace(u"»", '')    # remove all right-pointing guillemets
    updated_string = updated_string.replace("!", "")     # remove all exclamation marks
    updated_string = updated_string.replace(";", "")     # remove all semicolons
    return updated_string


# function to convert a long informal date string (e.g. "2 weeks 3 days") to a formal date
def convertFromLongDate(longDate, crawlerdate):

    list_of_words = longDate.split()

    day = 0
    week = 0
    hour = 0
    second = 0
    minute = 0
    year = 0
    total_days = 0

    if 'days' in list_of_words:
        index = list_of_words.index('days')
        day = float(list_of_words[index - 1])

    if 'weeks' in list_of_words:
        index = list_of_words.index('weeks')
        week = float(list_of_words[index - 1])

    if 'hours' in list_of_words:
        index = list_of_words.index('hours')
        hour = float(list_of_words[index - 1])

    if 'seconds' in list_of_words:
        index = list_of_words.index('seconds')
        second = float(list_of_words[index - 1])

    if 'minutes' in list_of_words:
        index = list_of_words.index('minutes')
        minute = float(list_of_words[index - 1])

    if 'years' in list_of_words:
        index = list_of_words.index('years')
        year = float(list_of_words[index - 1])

    # years are approximated as 365 days each
    total_days = day + 365 * year

    # today = datetime.date.today()
    timeDelta = fulldatetime.timedelta(days=total_days, weeks=week, hours=hour, seconds=second, minutes=minute)
    date = crawlerdate - timeDelta

    correct_date = str(date.strftime('%m/%d/%Y'))
    return correct_date
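

# Hedged usage sketch (illustrative only; the values are made up). The relative age is
# subtracted from the crawl datetime and returned as MM/DD/YYYY:
#
#   convertFromLongDate("2 weeks 3 days", datetime(2023, 3, 14))   # -> "02/25/2023"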


def cleanNumbers(inputString):
    reg_ex = re.compile(r'[^\d.]+')
    updated_string = reg_ex.sub('', inputString)
    return updated_string


def aes_encryption(item):
    to_bytes = bytes(item)
    encrypted_bytes = encryptCipher.encrypt(pad(to_bytes, BLOCK_SIZE))
    return encrypted_bytes


def aes_decryption(item):
    to_bytes = bytes(item)
    decrypted_bytes = decryptCipher.decrypt(to_bytes)
    return unpad(decrypted_bytes, BLOCK_SIZE)
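

# Hedged usage sketch (illustrative only; the payload below is made up). Because both
# module-level ciphers share the same key, a bytes payload should round-trip through the
# two helpers unchanged:
#
#   payload = b"example image bytes"
#   assert aes_decryption(aes_encryption(payload)) == payload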


def encrypt_encode_image_to_base64(driver, xpath):
    try:
        img_element = driver.find_element(by=By.XPATH, value=xpath)
        image_data = img_element.screenshot_as_png

        encrypted_image = aes_encryption(image_data)
        base64_image = base64.b64encode(encrypted_image)
        string_image = base64_image.decode('utf-8')

        return string_image
    except:
        pass

    return None


def decode_decrypt_image_in_base64(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    for img_tag in soup.find_all('img'):
        src_attr = img_tag.get('src')

        if src_attr and src_attr.startswith('data:image'):
            try:
                string_image = src_attr.split('base64,')[-1]
                base64_image = bytes(string_image, encoding='utf-8')
                encrypted_image = base64.b64decode(base64_image)
                decrypted_image = aes_decryption(encrypted_image)

                im = Image.open(io.BytesIO(decrypted_image))
                im.show()
            except Exception as e:
                print(e)
                pass


def replace_image_sources(driver, html_content):
    tree = lxml.fromstring(html_content)

    for picture_tag in tree.findall('.//picture'):
        for source_tag in picture_tag.findall('.//source'):
            picture_tag.remove(source_tag)

    for img_tag in tree.findall('.//img'):
        img_xpath = tree.getroottree().getpath(img_tag)

        string_image = encrypt_encode_image_to_base64(driver, img_xpath)

        if string_image:
            img_tag.set('src', f'data:image/png;base64,{string_image}')
        else:
            img_tag.getparent().remove(img_tag)

    modified_html = lxml.tostring(tree, encoding='utf-8').decode('utf-8')
    return modified_html


def cleanHTML(driver, html):
    clean_html = replace_image_sources(driver, html)
    # decode_decrypt_image_in_base64(clean_html)

    formats = [
        "jpg", "jpeg", "jfif", "pjpeg", "pjp",
        "png", "apng", "svg", "bmp", "gif",
        "avif", "webp", "ico", "cur", "tiff"
    ]

    # remove images
    clean_html = re.sub(r"<svg[\s\S]*?svg>", "", clean_html)
    for fmat in formats:
        clean_html = re.sub(r"<object.*" + fmat + r"[\s\S]*?object>", "", clean_html)
    clean_html = re.sub(r"<canvas[\s\S]*?canvas>", "", clean_html)

    # remove JavaScript
    clean_html = re.sub(r"<script[\s\S]*?script>", "", clean_html)
    clean_html = re.sub(r"<iframe[\s\S]*?iframe>", "", clean_html)
    clean_html = re.sub(r"<object.*javascript[\s\S]*?object>", "", clean_html)
    clean_html = re.sub(r"<applet.*mayscript[\s\S]*?applet>", "", clean_html)
    clean_html = re.sub(r"<embed.*scriptable[\s\S]*?embed>", "", clean_html)

    # image and JavaScript
    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html)

    return clean_html
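

# Hedged usage sketch (illustrative only; 'driver' stands for any live Selenium WebDriver
# on the page being crawled). A crawler would typically pass the rendered page source
# through cleanHTML so that <img> elements are replaced by AES-encrypted, base64-encoded
# screenshots and scripts, iframes, and embedded objects are stripped before storage:
#
#   clean = cleanHTML(driver, driver.page_source)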