This is based on the calsyslab project.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

453 lines
16 KiB

1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
  1. __author__ = 'DarkWeb'
  2. import string
  3. import time
  4. import re
  5. import hashlib
  6. import base64
  7. import io
  8. import configparser
  9. from datetime import datetime, timedelta
  10. from lxml import html as lxml
  11. from selenium.webdriver.common.by import By
  12. from Crypto.Cipher import AES
  13. from Crypto.Util.Padding import pad, unpad
  14. from PIL import Image
  15. from urllib.parse import urlsplit, urljoin
  16. def generate_aes_key():
  17. config = configparser.ConfigParser()
  18. config.read('../../setup.ini')
  19. secret = config.get('Encryption', 'secret')
  20. secret_bytes = bytes(secret, encoding="utf-8")
  21. # Derive a key from the seed using PBKDF2
  22. key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)
  23. # Use the first 16 bytes of the derived key as the AES key
  24. aes_key = key[:16]
  25. # print("key: ", aes_key)
  26. return aes_key
  27. BLOCK_SIZE = 32
  28. aes_key = generate_aes_key()
  29. encryptCipher = AES.new(aes_key, AES.MODE_ECB)
  30. decryptCipher = AES.new(aes_key, AES.MODE_ECB)
  31. def convertDate(sdate, language, crawlerDate):
  32. if language == "english":
  33. today = crawlerDate.strftime("%m/%d/%Y")
  34. yesterday = (crawlerDate - timedelta(1)).strftime("%m/%d/%Y")
  35. sdate = sdate.replace(u"January","01")
  36. sdate = sdate.replace(u"February","02")
  37. sdate = sdate.replace(u"March","03")
  38. sdate = sdate.replace(u"April","04")
  39. sdate = sdate.replace(u"May","05")
  40. sdate = sdate.replace(u"June","06")
  41. sdate = sdate.replace(u"July","07")
  42. sdate = sdate.replace(u"August","08")
  43. sdate = sdate.replace(u"September","09")
  44. sdate = sdate.replace(u"October","10")
  45. sdate = sdate.replace(u"November","11")
  46. sdate = sdate.replace(u"December","12")
  47. sdate = sdate.replace(u"Jan","01")
  48. sdate = sdate.replace(u"Feb","02")
  49. sdate = sdate.replace(u"Mar","03")
  50. sdate = sdate.replace(u"Apr","04")
  51. sdate = sdate.replace(u"May","05")
  52. sdate = sdate.replace(u"Jun","06")
  53. sdate = sdate.replace(u"Jul","07")
  54. sdate = sdate.replace(u"Aug","08")
  55. sdate = sdate.replace(u"Sep","09")
  56. sdate = sdate.replace(u"Oct","10")
  57. sdate = sdate.replace(u"Nov","11")
  58. sdate = sdate.replace(u"Dec","12")
  59. sdate = sdate.replace(u".","")
  60. if "Today" in sdate:
  61. sdate = datetime.strptime(str(today), '%m/%d/%Y').strftime('%m %d %Y')
  62. elif "Yesterday" in sdate:
  63. sdate = datetime.strptime(str(yesterday), '%m/%d/%Y').strftime('%m %d %Y')
  64. sdate = datetime.strptime(str(sdate), '%m %d %Y').strftime('%m/%d/%Y')
  65. elif language == "british":
  66. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  67. elif language == "french":
  68. todaysday = crawlerDate.strftime("%m/%d/%Y")
  69. sdate = sdate.replace(u"janvier","01")
  70. sdate = sdate.replace(u"jan","01")
  71. sdate = sdate.replace(u"février","02")
  72. sdate = sdate.replace(u"juin","06")
  73. sdate = sdate.replace(u"juillet","07")
  74. sdate = sdate.replace(u"juil","07")
  75. sdate = sdate.replace(u"août","08")
  76. sdate = sdate.replace(u"septembre","09")
  77. sdate = sdate.replace(u"sept","09")
  78. sdate = sdate.replace(u"octobre","10")
  79. sdate = sdate.replace(u"oct","10")
  80. sdate = sdate.replace(u"novembre","11")
  81. sdate = sdate.replace(u"nov","11")
  82. sdate = sdate.replace(u"décembre","12")
  83. sdate = sdate.replace(u"déc","12")
  84. sdate = sdate.replace(u".","")
  85. if sdate == u"Aujourd'hui" or "Today" in sdate:
  86. sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%d %m %Y')
  87. if "mar" in sdate:
  88. print ("Add March to the IBM Black Market")
  89. raise SystemExit
  90. elif "avr" in sdate:
  91. print ("Add April to the IBM Black Market")
  92. raise SystemExit
  93. elif "mai" in sdate:
  94. print ("Add May to the IBM Black Market")
  95. raise SystemExit
  96. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  97. elif language == "swedish":
  98. sdate = sdate.replace(u"jan","01")
  99. sdate = sdate.replace(u"feb","02")
  100. sdate = sdate.replace(u"mar","03")
  101. sdate = sdate.replace(u"apr","04")
  102. sdate = sdate.replace(u"maj","05")
  103. sdate = sdate.replace(u"jun","06")
  104. sdate = sdate.replace(u"jul","07")
  105. sdate = sdate.replace(u"aug","08")
  106. sdate = sdate.replace(u"sep","09")
  107. sdate = sdate.replace(u"okt","10")
  108. sdate = sdate.replace(u"nov","11")
  109. sdate = sdate.replace(u"dec","12")
  110. sdate = sdate.replace(u".","")
  111. if sdate == u"Ig\xe5r" or sdate == u"Idag" or "minuter sedan" in sdate:
  112. sdate = crawlerDate
  113. sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
  114. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  115. elif language == "russian":
  116. if sdate == u'\u0412\u0447\u0435\u0440\u0430' or u"Вчера" in sdate:
  117. sdate = crawlerDate - timedelta(1)
  118. sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
  119. elif sdate == u'\u0421\u0435\u0433\u043e\u0434\u043d\u044f':
  120. sdate = crawlerDate
  121. sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
  122. elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
  123. return ""
  124. sdate = sdate.replace(u"Январь","01")
  125. sdate = sdate.replace(u"января","01")
  126. sdate = sdate.replace(u"янв","01")
  127. sdate = sdate.replace(u"January","01")
  128. sdate = sdate.replace(u"Jan","01")
  129. sdate = sdate.replace(u"фев","02")
  130. sdate = sdate.replace(u"февраля","02")
  131. sdate = sdate.replace(u"Февраль", "02")
  132. sdate = sdate.replace(u"February", "02")
  133. sdate = sdate.replace(u"Feb", "02")
  134. sdate = sdate.replace(u"Март","03")
  135. sdate = sdate.replace(u"марта","03")
  136. sdate = sdate.replace(u"March","03")
  137. sdate = sdate.replace(u"Mar","03")
  138. sdate = sdate.replace(u"Апрель","04")
  139. sdate = sdate.replace(u"апреля","04")
  140. sdate = sdate.replace(u"апр","04")
  141. sdate = sdate.replace(u"April","04")
  142. sdate = sdate.replace(u"Apr","04")
  143. sdate = sdate.replace(u"май","05")
  144. sdate = sdate.replace(u"Май","05")
  145. sdate = sdate.replace(u"мар","05")
  146. sdate = sdate.replace(u"май","05")
  147. sdate = sdate.replace(u"мая","05")
  148. sdate = sdate.replace(u"May","05")
  149. sdate = sdate.replace(u"Июнь","06")
  150. sdate = sdate.replace(u"июня","06")
  151. sdate = sdate.replace(u"июн","06")
  152. sdate = sdate.replace(u"June","06")
  153. sdate = sdate.replace(u"Jun","06")
  154. sdate = sdate.replace(u"Июль","07")
  155. sdate = sdate.replace(u"июля","07")
  156. sdate = sdate.replace(u"июл","07")
  157. sdate = sdate.replace(u"July","07")
  158. sdate = sdate.replace(u"Jul","07")
  159. sdate = sdate.replace(u"августа","08")
  160. sdate = sdate.replace(u"Август","08")
  161. sdate = sdate.replace(u"авг","08")
  162. sdate = sdate.replace(u"August","08")
  163. sdate = sdate.replace(u"Aug","08")
  164. sdate = sdate.replace(u"Сентябрь","09")
  165. sdate = sdate.replace(u"сентября","09")
  166. sdate = sdate.replace(u"сен","09")
  167. sdate = sdate.replace(u"September","09")
  168. sdate = sdate.replace(u"Sep","09")
  169. sdate = sdate.replace(u"октября","10")
  170. sdate = sdate.replace(u"Октябрь","10")
  171. sdate = sdate.replace(u"October","10")
  172. sdate = sdate.replace(u"Oct","10")
  173. sdate = sdate.replace(u"окт","10")
  174. sdate = sdate.replace(u"Ноябрь","11")
  175. sdate = sdate.replace(u"ноября","11")
  176. sdate = sdate.replace(u"ноя","11")
  177. sdate = sdate.replace(u"November","11")
  178. sdate = sdate.replace(u"Nov","11")
  179. sdate = sdate.replace(u"Декабрь","12")
  180. sdate = sdate.replace(u"декабря","12")
  181. sdate = sdate.replace(u"дек","12")
  182. sdate = sdate.replace(u"December","12")
  183. sdate = sdate.replace(u"Dec","12")
  184. sdate = sdate.replace(u".","")
  185. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  186. return sdate
  187. def cleanText(originalText):
  188. safe_chars = string.ascii_letters + string.digits + " " + "_" + "/" + "&" + "$" + "#" "@" + "+" + "-" + "*" + "=" \
  189. ":" + ";" + "." "," + "?" + "!" + "{" + "}" + "[" + "]" + "(" + ")" + "%" + "`" + "~" + "^" + "|" + "<" + ">"
  190. for index, text in enumerate(originalText):
  191. originalText[index] = ''.join([char if char in safe_chars else '' for char in text])
  192. return originalText
  193. def cleanLink(originalLink):
  194. safe_chars = string.ascii_letters + string.digits
  195. originalLink = ''.join([char if char in safe_chars else '' for char in originalLink])
  196. return originalLink
  197. def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nombre, CVE, MS, category, describe,
  198. views, reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor):
  199. rw = []
  200. current_time = datetime.now()
  201. day = current_time.strftime("%m/%d/%Y")
  202. ahora = current_time.strftime("%I:%M:%S")
  203. for n in range(nm):
  204. lne = marketplace # 0
  205. lne += ","
  206. lne += "-1" if len(vendor) == 0 else vendor[n] # 1
  207. lne += ","
  208. lne += "-1" if len(rating_vendor) == 0 else rating_vendor[n] # 2
  209. lne += ","
  210. lne += "-1" if len(success_vendor) == 0 else success_vendor[n] # 3
  211. lne += ","
  212. lne += nombre[n] # 4
  213. lne += ','
  214. lne += "-1" if len(describe) == 0 else describe[n] # 5
  215. lne += ","
  216. lne += "-1" if len(CVE) == 0 else CVE[n] # 6
  217. lne += ","
  218. lne += "-1" if len(MS) == 0 else MS[n] # 7
  219. lne += ","
  220. lne += "-1" if len(category) == 0 else category[n] # 8
  221. lne += ","
  222. lne += "-1" if len(views) == 0 else views[n] # 9
  223. lne += ","
  224. lne += "-1" if len(reviews) == 0 else reviews[n] # 10
  225. lne += ","
  226. lne += "-1" if len(rating_item) == 0 else rating_item[n] # 11
  227. lne += ","
  228. lne += "-1" if len(addDate) == 0 else addDate[n] # 12
  229. lne += ","
  230. lne += "-1" if len(BTC) == 0 else BTC[n] # 13
  231. lne += ","
  232. lne += "-1" if len(USD) == 0 else USD[n] # 14
  233. lne += ","
  234. lne += "-1" if len(EURO) == 0 else EURO[n] # 15
  235. lne += ","
  236. lne += "-1" if len(sold) == 0 else sold[n] # 16
  237. lne += ","
  238. lne += "-1" if len(qLeft) == 0 else qLeft[n] # 17
  239. lne += ","
  240. lne += "-1" if len(shipFrom) == 0 else shipFrom[n] # 18
  241. lne += ","
  242. lne += "-1" if len(shipTo) == 0 else shipTo[n] # 19
  243. lne += ","
  244. lne += "-1" if len(image) == 0 else image[n] # 20
  245. lne += ","
  246. lne += "-1" if len(image_vendor) == 0 else image_vendor[n] # 21
  247. lne += ","
  248. lne += "-1" if len(href) == 0 else href[n] # 22
  249. lne += ","
  250. lne += day + " " + ahora # 23
  251. rw.append(lne)
  252. return rw
  253. def cleanString(originalString):
  254. updated_string = originalString.replace(",", "") #replace all commas
  255. updated_string = updated_string.replace("\n", "") #replace all newlines
  256. updated_string = updated_string.replace("\t", "") #replace all tabs
  257. updated_string = updated_string.replace("\r", "") #replace all carriage returns
  258. updated_string = updated_string.replace("'", "^") #replace all semicolons
  259. updated_string = updated_string.replace(u"»", '') #replace all arrows
  260. updated_string = updated_string.replace("!", "") #replace all exclamation points
  261. updated_string = updated_string.replace(";", "") #replace all exclamations
  262. return updated_string
  263. def checkDateFormat(myString):
  264. isDate = re.match('[0-1][0-9]\/[0-3][0-9]\/[1-2][0-9]{3}', myString)
  265. return isDate
  266. def cleanNumbers(inputString):
  267. reg_ex = re.compile(r'[^\d.]+')
  268. updated_string = reg_ex.sub('', inputString)
  269. return updated_string
  270. def aes_encryption(item):
  271. to_bytes = bytes(item)
  272. encrypted_bytes = encryptCipher.encrypt(pad(to_bytes, BLOCK_SIZE))
  273. return encrypted_bytes
  274. def aes_decryption(item):
  275. to_bytes = bytes(item)
  276. decrypted_bytes = decryptCipher.decrypt(to_bytes)
  277. return unpad(decrypted_bytes, BLOCK_SIZE)
  278. def encrypt_encode_image_to_base64(driver, xpath):
  279. try:
  280. img_element = driver.find_element(by=By.XPATH, value=xpath)
  281. image_data = img_element.screenshot_as_png
  282. encrypted_image = aes_encryption(image_data)
  283. base64_image = base64.b64encode(encrypted_image)
  284. string_image = base64_image.decode('utf-8')
  285. return string_image
  286. except:
  287. pass
  288. return None
  289. def decode_decrypt_image_in_base64(string_image):
  290. try:
  291. base64_image = bytes(string_image, encoding='utf-8')
  292. encrypted_image = base64.b64decode(base64_image)
  293. decrypted_image = aes_decryption(encrypted_image)
  294. im = Image.open(io.BytesIO(decrypted_image))
  295. im.show()
  296. return decrypted_image
  297. except Exception as e:
  298. print(e)
  299. pass
  300. return None
  301. def replace_image_sources(driver, html_content):
  302. tree = lxml.fromstring(html_content)
  303. for picture_tag in tree.findall('.//picture'):
  304. for source_tag in picture_tag.findall('.//source'):
  305. picture_tag.remove(source_tag)
  306. for img_tag in tree.findall('.//img'):
  307. img_xpath = tree.getroottree().getpath(img_tag)
  308. string_image = encrypt_encode_image_to_base64(driver, img_xpath)
  309. if string_image:
  310. img_tag.set('src', f'data:image/png;base64,{string_image}')
  311. else:
  312. img_tag.getparent().remove(img_tag)
  313. modified_html = lxml.tostring(tree, encoding='utf-8').decode('utf-8')
  314. return modified_html
  315. def cleanHTML(driver, html):
  316. clean_html = replace_image_sources(driver, html)
  317. formats = [
  318. "jpg", "jpeg", "jfif", "pjpeg", "pjp",
  319. "png", "apng", "svg", "bmp", "gif",
  320. "avif", "webp", "ico", "cur", "tiff"
  321. ]
  322. # remove images
  323. clean_html = re.sub(r"<svg[\s\S]*?svg>", "", clean_html)
  324. for fmat in formats:
  325. clean_html = re.sub(r"<object.*" + fmat + "[\s\S]*?object>", "", clean_html)
  326. clean_html = re.sub(r"<canvas[\s\S]*?canvas>", "", clean_html)
  327. # remove JavaScript
  328. clean_html = re.sub(r"<script[\s\S]*?script>", "", clean_html)
  329. clean_html = re.sub(r"<iframe[\s\S]*?iframe>", "", clean_html)
  330. clean_html = re.sub(r"<object.*javascript[\s\S]*?object>", "", clean_html)
  331. clean_html = re.sub(r"<aplet.*mayscript[\s\S]*?aplet>", "", clean_html)
  332. clean_html = re.sub(r"<embed.*scriptable[\s\S]*?embed>", "", clean_html)
  333. # image and JavaScript
  334. clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html)
  335. return clean_html
  336. def get_relative_url(target_url):
  337. # Use a dummy base URL to handle both absolute and relative URLs
  338. base_url = "http://dummybaseurl.com/"
  339. absolute_url = urljoin(base_url, target_url)
  340. # Parse the absolute URL
  341. parsed_absolute_url = urlsplit(absolute_url)
  342. # Extract the path and query from the absolute URL as the relative URL
  343. return parsed_absolute_url.path + '?' + parsed_absolute_url.query \
  344. if parsed_absolute_url.query else parsed_absolute_url.path