This is based on the calsyslab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

428 lines
13 KiB

1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
  1. __author__ = 'DarkWeb'
  2. import string
  3. import time
  4. import re
  5. import hashlib
  6. import imghdr
  7. import base64
  8. import requests
  9. import io
  10. import urllib.parse as urlparse
  11. from datetime import datetime, timedelta
  12. import datetime as fulldatetime
  13. from bs4 import BeautifulSoup
  14. from lxml import html as lxml
  15. from selenium.webdriver.common.by import By
  16. from Crypto.Cipher import AES
  17. from Crypto.Util.Padding import pad, unpad
  18. from PIL import Image
  19. def generate_aes_key():
  20. from Forums.Initialization.forums_mining import config
  21. password = "password"
  22. password_bytes = bytes(password, encoding="utf-8")
  23. # Derive a key from the seed using PBKDF2
  24. key = hashlib.pbkdf2_hmac(hash_name='sha256', password=password_bytes, salt=bytes(), iterations=1)
  25. # Use the first 16 bytes of the derived key as the AES key
  26. aes_key = key[:16]
  27. # print("key: ", aes_key)
  28. return aes_key
  29. BLOCK_SIZE = 32
  30. aes_key = generate_aes_key()
  31. encryptCipher = AES.new(aes_key, AES.MODE_ECB)
  32. decryptCipher = AES.new(aes_key, AES.MODE_ECB)
  33. def cleanText(originalText):
  34. safe_chars = string.ascii_letters + string.digits + " " + "_" + "/" + "&" + "$" + "#" "@" + "+" + "-" + "*" + "=" \
  35. ":" + ";" + "." "," + "?" + "!" + "{" + "}" + "[" + "]" + "(" + ")" + "%" + "`" + "~" + "^" + "|" + "<" + ">"
  36. for index, text in enumerate(originalText):
  37. originalText[index] = ''.join([char if char in safe_chars else '' for char in text])
  38. return originalText
  39. def convertDate(sdate, language, crawlerDate):
  40. if language == "english":
  41. todaysday = crawlerDate.strftime("%m/%d/%Y")
  42. sdate = sdate.replace(u"January","01")
  43. sdate = sdate.replace(u"February","02")
  44. sdate = sdate.replace(u"March","03")
  45. sdate = sdate.replace(u"April","04")
  46. sdate = sdate.replace(u"May","05")
  47. sdate = sdate.replace(u"June","06")
  48. sdate = sdate.replace(u"July","07")
  49. sdate = sdate.replace(u"August","08")
  50. sdate = sdate.replace(u"September","09")
  51. sdate = sdate.replace(u"October","10")
  52. sdate = sdate.replace(u"November","11")
  53. sdate = sdate.replace(u"December","12")
  54. sdate = sdate.replace(u"Jan","01")
  55. sdate = sdate.replace(u"Feb","02")
  56. sdate = sdate.replace(u"Mar","03")
  57. sdate = sdate.replace(u"Apr","04")
  58. sdate = sdate.replace(u"May","05")
  59. sdate = sdate.replace(u"Jun","06")
  60. sdate = sdate.replace(u"Jul","07")
  61. sdate = sdate.replace(u"Aug","08")
  62. sdate = sdate.replace(u"Sep","09")
  63. sdate = sdate.replace(u"Oct","10")
  64. sdate = sdate.replace(u"Nov","11")
  65. sdate = sdate.replace(u"Dec","12")
  66. sdate = sdate.replace(u".","")
  67. if sdate == "Today at":
  68. sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%m %d %Y')
  69. sdate = datetime.strptime(str(sdate), '%m %d %Y').strftime('%m/%d/%Y')
  70. elif language == "french":
  71. todaysday = crawlerDate.strftime("%m/%d/%Y")
  72. sdate = sdate.replace(u"janvier","01")
  73. sdate = sdate.replace(u"jan","01")
  74. sdate = sdate.replace(u"février","02")
  75. sdate = sdate.replace(u"juin","06")
  76. sdate = sdate.replace(u"juillet","07")
  77. sdate = sdate.replace(u"juil","07")
  78. sdate = sdate.replace(u"août","08")
  79. sdate = sdate.replace(u"septembre","09")
  80. sdate = sdate.replace(u"sept","09")
  81. sdate = sdate.replace(u"octobre","10")
  82. sdate = sdate.replace(u"oct","10")
  83. sdate = sdate.replace(u"novembre","11")
  84. sdate = sdate.replace(u"nov","11")
  85. sdate = sdate.replace(u"décembre","12")
  86. sdate = sdate.replace(u"déc","12")
  87. sdate = sdate.replace(u".","")
  88. if sdate == u"Aujourd'hui":
  89. sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%d %m %Y')
  90. if "mar" in sdate:
  91. print ("Add March to the IBM Black Market")
  92. raise SystemExit
  93. elif "avr" in sdate:
  94. print ("Add April to the IBM Black Market")
  95. raise SystemExit
  96. elif "mai" in sdate:
  97. print ("Add May to the IBM Black Market")
  98. raise SystemExit
  99. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  100. elif language == "swedish":
  101. sdate = sdate.replace(u"jan","01")
  102. sdate = sdate.replace(u"feb","02")
  103. sdate = sdate.replace(u"mar","03")
  104. sdate = sdate.replace(u"apr","04")
  105. sdate = sdate.replace(u"maj","05")
  106. sdate = sdate.replace(u"jun","06")
  107. sdate = sdate.replace(u"jul","07")
  108. sdate = sdate.replace(u"aug","08")
  109. sdate = sdate.replace(u"sep","09")
  110. sdate = sdate.replace(u"okt","10")
  111. sdate = sdate.replace(u"nov","11")
  112. sdate = sdate.replace(u"dec","12")
  113. sdate = sdate.replace(u".","")
  114. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  115. elif language == "russian":
  116. if sdate == u'\u0412\u0447\u0435\u0440\u0430':
  117. sdate = crawlerDate.today() - timedelta(1)
  118. sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
  119. elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
  120. return ""
  121. sdate = sdate.replace(u"января","01")
  122. sdate = sdate.replace(u"янв","01")
  123. sdate = sdate.replace(u"февраля","02")
  124. sdate = sdate.replace(u"Февраль", "02")
  125. sdate = sdate.replace(u"фев","02")
  126. sdate = sdate.replace(u"марта","03")
  127. sdate = sdate.replace(u"апреля","04")
  128. sdate = sdate.replace(u"апр","04")
  129. sdate = sdate.replace(u"мар","05")
  130. sdate = sdate.replace(u"май","05")
  131. sdate = sdate.replace(u"мая","05")
  132. sdate = sdate.replace(u"июня","06")
  133. sdate = sdate.replace(u"июн","06")
  134. sdate = sdate.replace(u"июля","07")
  135. sdate = sdate.replace(u"июл","07")
  136. sdate = sdate.replace(u"августа","08")
  137. sdate = sdate.replace(u"авг","08")
  138. sdate = sdate.replace(u"сентября","09")
  139. sdate = sdate.replace(u"сен","09")
  140. sdate = sdate.replace(u"октября","10")
  141. sdate = sdate.replace(u"Октябрь","10")
  142. sdate = sdate.replace(u"окт","10")
  143. sdate = sdate.replace(u"ноября","11")
  144. sdate = sdate.replace(u"ноя","11")
  145. sdate = sdate.replace(u"декабря","12")
  146. sdate = sdate.replace(u"дек","12")
  147. sdate = sdate.replace(u".","")
  148. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  149. return sdate
  150. def cleanLink(originalLink):
  151. safe_chars = string.ascii_letters + string.digits
  152. originalLink = ''.join([char if char in safe_chars else '' for char in originalLink])
  153. return originalLink
  154. def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate):
  155. day = time.strftime("%m/%d/%Y")
  156. ahora = time.strftime("%I:%M:%S")
  157. rw = []
  158. for n in range(nm):
  159. lne = forum # 0
  160. lne += ","
  161. lne += board # 1
  162. lne += ","
  163. lne += author[n] # 2
  164. lne += ","
  165. lne += topic[n] # 3
  166. lne += ","
  167. lne += "-1" if len(views) == 0 else views[n] # 4
  168. lne += ","
  169. lne += "-1" if len(posts) == 0 else posts[n] # 5
  170. lne += ","
  171. lne += "-1" if len(href) == 0 else href[n] # 6
  172. lne += ","
  173. lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7
  174. lne += ","
  175. lne += day + " " + ahora # 8
  176. lne += ","
  177. lne += "-1" # 9 name_user
  178. lne += ","
  179. lne += "-1" # 10 status_user
  180. lne += ","
  181. lne += "-1" # 11 reputation_user
  182. lne += ","
  183. lne += "-1" # 12 interest_user
  184. lne += ","
  185. lne += "-1" # 13 signature_user
  186. lne += ","
  187. lne += "-1" # 14 content_post
  188. lne += ","
  189. lne += "-1" # 15 feedback_post
  190. lne += ","
  191. lne += "-1" # 16 dateadded_post
  192. rw.append(lne)
  193. return rw
  194. def cleanString(originalString):
  195. updated_string = originalString.replace(",", "") #replace all commas
  196. updated_string = updated_string.replace("\n", "") #replace all newlines
  197. updated_string = updated_string.replace("\t", "") #replace all tabs
  198. updated_string = updated_string.replace("\r", "") #replace all carriage returns
  199. updated_string = updated_string.replace("'", "^") #replace all semicolons
  200. updated_string = updated_string.replace(u"»", '') #replace all arrows
  201. updated_string = updated_string.replace("!", "")
  202. updated_string = updated_string.replace(";", "") #replace all exclamations
  203. return updated_string
  204. #function to convert long informal date string to formal date
  205. def convertFromLongDate(longDate, crawlerdate):
  206. list_of_words = []
  207. list_of_words = longDate.split()
  208. day = 0
  209. week = 0
  210. hour = 0
  211. second = 0
  212. minute = 0
  213. year = 0
  214. total_days = 0
  215. if 'days' in list_of_words:
  216. index = list_of_words.index('days')
  217. day = float(list_of_words[index - 1])
  218. if 'weeks' in list_of_words:
  219. index = list_of_words.index('weeks')
  220. week = float(list_of_words[index - 1])
  221. if 'hours' in list_of_words:
  222. index = list_of_words.index('hours')
  223. hour = float(list_of_words[index - 1])
  224. if 'seconds' in list_of_words:
  225. index = list_of_words.index('seconds')
  226. second = float(list_of_words[index - 1])
  227. if 'minutes' in list_of_words:
  228. index = list_of_words.index('minutes')
  229. minute = float(list_of_words[index - 1])
  230. if 'years' in list_of_words:
  231. index = list_of_words.index('years')
  232. year = float(list_of_words[index - 1])
  233. if year != 0:
  234. total_days = day + 365 * year
  235. #today = datetime.date.today()
  236. timeDelta = fulldatetime.timedelta(days=total_days, weeks=week, hours=hour, seconds=second, minutes=minute)
  237. date = crawlerdate - timeDelta
  238. correct_date = str(date.strftime('%m/%d/%Y'))
  239. return correct_date
  240. def aes_encryption(item):
  241. to_bytes = bytes(item)
  242. encrypted_bytes = encryptCipher.encrypt(pad(to_bytes, BLOCK_SIZE))
  243. return encrypted_bytes
  244. def aes_decryption(item):
  245. to_bytes = bytes(item)
  246. decrypted_bytes = decryptCipher.decrypt(to_bytes)
  247. return unpad(decrypted_bytes, BLOCK_SIZE)
  248. def encrypt_encode_image_to_base64(driver, xpath):
  249. try:
  250. img_element = driver.find_element(by=By.XPATH, value=xpath)
  251. image_data = img_element.screenshot_as_png
  252. encrypted_image = aes_encryption(image_data)
  253. base64_image = base64.b64encode(encrypted_image)
  254. string_image = base64_image.decode('utf-8')
  255. return string_image
  256. except:
  257. pass
  258. return None
  259. def decode_decrypt_image_in_base64(html_content):
  260. soup = BeautifulSoup(html_content, 'html.parser')
  261. for img_tag in soup.find_all('img'):
  262. src_attr = img_tag.get('src')
  263. if src_attr and src_attr.startswith('data:image'):
  264. try:
  265. string_image = src_attr.split('base64,')[-1]
  266. base64_image = bytes(string_image, encoding='utf-8')
  267. encrypted_image = base64.b64decode(base64_image)
  268. decrypted_image = aes_decryption(encrypted_image)
  269. im = Image.open(io.BytesIO(decrypted_image))
  270. im.show()
  271. except Exception as e:
  272. print(e)
  273. pass
  274. def replace_image_sources(driver, html_content):
  275. tree = lxml.fromstring(html_content)
  276. for picture_tag in tree.findall('.//picture'):
  277. for source_tag in picture_tag.findall('.//source'):
  278. picture_tag.remove(source_tag)
  279. for img_tag in tree.findall('.//img'):
  280. img_xpath = tree.getroottree().getpath(img_tag)
  281. string_image = encrypt_encode_image_to_base64(driver, img_xpath)
  282. if string_image:
  283. img_tag.set('src', f'data:image/png;base64,{string_image}')
  284. else:
  285. img_tag.getparent().remove(img_tag)
  286. modified_html = lxml.tostring(tree, encoding='utf-8').decode('utf-8')
  287. return modified_html
  288. def cleanHTML(driver, html):
  289. clean_html = replace_image_sources(driver, html)
  290. # decode_decrypt_image_in_base64(clean_html)
  291. formats = [
  292. "jpg", "jpeg", "jfif", "pjpeg", "pjp",
  293. "png", "apng", "svg", "bmp", "gif",
  294. "avif", "webp", "ico", "cur", "tiff"
  295. ]
  296. # remove images
  297. clean_html = re.sub(r"<svg.*?>", "", clean_html)
  298. for fmat in formats:
  299. clean_html = re.sub(r"<object.*" + fmat + ".*?>", "", clean_html)
  300. clean_html = re.sub(r"<canvas.*?>", "", clean_html)
  301. # remove JavaScript
  302. clean_html = re.sub(r"<script.*?>", "", clean_html)
  303. clean_html = re.sub(r"<object.*javascript.*?>", "", clean_html)
  304. clean_html = re.sub(r"<aplet.*mayscript?>", "", clean_html)
  305. clean_html = re.sub(r"<embed.*scriptable?>", "", clean_html)
  306. # image and JavaScript
  307. clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);", "", clean_html)
  308. return clean_html