This is based on the calsyslab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

442 lines
15 KiB

1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
  1. __author__ = 'DarkWeb'
  2. import string
  3. import time
  4. import re
  5. import hashlib
  6. import base64
  7. import io
  8. from datetime import datetime, timedelta
  9. import datetime as fulldatetime
  10. from bs4 import BeautifulSoup
  11. from lxml import html as lxml
  12. from selenium.webdriver.common.by import By
  13. from Crypto.Cipher import AES
  14. from Crypto.Util.Padding import pad, unpad
  15. from PIL import Image
  16. def generate_aes_key():
  17. from MarketPlaces.Initialization.markets_mining import config
  18. secret = config.get('Encryption', 'secret')
  19. secret_bytes = bytes(secret, encoding="utf-8")
  20. # Derive a key from the seed using PBKDF2
  21. key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)
  22. # Use the first 16 bytes of the derived key as the AES key
  23. aes_key = key[:16]
  24. # print("key: ", aes_key)
  25. return aes_key
# Block size, in bytes, passed to pad()/unpad() by aes_encryption() and
# aes_decryption() below.
BLOCK_SIZE = 32
# The key is derived once, when this module is imported.
aes_key = generate_aes_key()
# Module-level AES ciphers in ECB mode, one used for encrypting and one for
# decrypting. NOTE(review): ECB uses no IV, so equal plaintext blocks produce
# equal ciphertext blocks — confirm this is acceptable for the stored images.
encryptCipher = AES.new(aes_key, AES.MODE_ECB)
decryptCipher = AES.new(aes_key, AES.MODE_ECB)
  30. def convertDate(sdate, language, crawlerDate):
  31. if language == "english":
  32. today = crawlerDate.strftime("%m/%d/%Y")
  33. yesterday = (crawlerDate - timedelta(1)).strftime("%m/%d/%Y")
  34. sdate = sdate.replace(u"January","01")
  35. sdate = sdate.replace(u"February","02")
  36. sdate = sdate.replace(u"March","03")
  37. sdate = sdate.replace(u"April","04")
  38. sdate = sdate.replace(u"May","05")
  39. sdate = sdate.replace(u"June","06")
  40. sdate = sdate.replace(u"July","07")
  41. sdate = sdate.replace(u"August","08")
  42. sdate = sdate.replace(u"September","09")
  43. sdate = sdate.replace(u"October","10")
  44. sdate = sdate.replace(u"November","11")
  45. sdate = sdate.replace(u"December","12")
  46. sdate = sdate.replace(u"Jan","01")
  47. sdate = sdate.replace(u"Feb","02")
  48. sdate = sdate.replace(u"Mar","03")
  49. sdate = sdate.replace(u"Apr","04")
  50. sdate = sdate.replace(u"May","05")
  51. sdate = sdate.replace(u"Jun","06")
  52. sdate = sdate.replace(u"Jul","07")
  53. sdate = sdate.replace(u"Aug","08")
  54. sdate = sdate.replace(u"Sep","09")
  55. sdate = sdate.replace(u"Oct","10")
  56. sdate = sdate.replace(u"Nov","11")
  57. sdate = sdate.replace(u"Dec","12")
  58. sdate = sdate.replace(u".","")
  59. if "Today" in sdate:
  60. sdate = datetime.strptime(str(today), '%m/%d/%Y').strftime('%m %d %Y')
  61. elif "Yesterday" in sdate:
  62. sdate = datetime.strptime(str(yesterday), '%m/%d/%Y').strftime('%m %d %Y')
  63. sdate = datetime.strptime(str(sdate), '%m %d %Y').strftime('%m/%d/%Y')
  64. elif language == "british":
  65. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  66. elif language == "french":
  67. todaysday = crawlerDate.strftime("%m/%d/%Y")
  68. sdate = sdate.replace(u"janvier","01")
  69. sdate = sdate.replace(u"jan","01")
  70. sdate = sdate.replace(u"février","02")
  71. sdate = sdate.replace(u"juin","06")
  72. sdate = sdate.replace(u"juillet","07")
  73. sdate = sdate.replace(u"juil","07")
  74. sdate = sdate.replace(u"août","08")
  75. sdate = sdate.replace(u"septembre","09")
  76. sdate = sdate.replace(u"sept","09")
  77. sdate = sdate.replace(u"octobre","10")
  78. sdate = sdate.replace(u"oct","10")
  79. sdate = sdate.replace(u"novembre","11")
  80. sdate = sdate.replace(u"nov","11")
  81. sdate = sdate.replace(u"décembre","12")
  82. sdate = sdate.replace(u"déc","12")
  83. sdate = sdate.replace(u".","")
  84. if sdate == u"Aujourd'hui" or "Today" in sdate:
  85. sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%d %m %Y')
  86. if "mar" in sdate:
  87. print ("Add March to the IBM Black Market")
  88. raise SystemExit
  89. elif "avr" in sdate:
  90. print ("Add April to the IBM Black Market")
  91. raise SystemExit
  92. elif "mai" in sdate:
  93. print ("Add May to the IBM Black Market")
  94. raise SystemExit
  95. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  96. elif language == "swedish":
  97. sdate = sdate.replace(u"jan","01")
  98. sdate = sdate.replace(u"feb","02")
  99. sdate = sdate.replace(u"mar","03")
  100. sdate = sdate.replace(u"apr","04")
  101. sdate = sdate.replace(u"maj","05")
  102. sdate = sdate.replace(u"jun","06")
  103. sdate = sdate.replace(u"jul","07")
  104. sdate = sdate.replace(u"aug","08")
  105. sdate = sdate.replace(u"sep","09")
  106. sdate = sdate.replace(u"okt","10")
  107. sdate = sdate.replace(u"nov","11")
  108. sdate = sdate.replace(u"dec","12")
  109. sdate = sdate.replace(u".","")
  110. if sdate == u"Ig\xe5r" or sdate == u"Idag" or "minuter sedan" in sdate:
  111. sdate = crawlerDate
  112. sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
  113. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  114. elif language == "russian":
  115. if sdate == u'\u0412\u0447\u0435\u0440\u0430' or u"Вчера" in sdate:
  116. sdate = crawlerDate - timedelta(1)
  117. sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
  118. elif sdate == u'\u0421\u0435\u0433\u043e\u0434\u043d\u044f':
  119. sdate = crawlerDate
  120. sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
  121. elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
  122. return ""
  123. sdate = sdate.replace(u"Январь","01")
  124. sdate = sdate.replace(u"января","01")
  125. sdate = sdate.replace(u"янв","01")
  126. sdate = sdate.replace(u"January","01")
  127. sdate = sdate.replace(u"Jan","01")
  128. sdate = sdate.replace(u"фев","02")
  129. sdate = sdate.replace(u"февраля","02")
  130. sdate = sdate.replace(u"Февраль", "02")
  131. sdate = sdate.replace(u"February", "02")
  132. sdate = sdate.replace(u"Feb", "02")
  133. sdate = sdate.replace(u"Март","03")
  134. sdate = sdate.replace(u"марта","03")
  135. sdate = sdate.replace(u"March","03")
  136. sdate = sdate.replace(u"Mar","03")
  137. sdate = sdate.replace(u"Апрель","04")
  138. sdate = sdate.replace(u"апреля","04")
  139. sdate = sdate.replace(u"апр","04")
  140. sdate = sdate.replace(u"April","04")
  141. sdate = sdate.replace(u"Apr","04")
  142. sdate = sdate.replace(u"май","05")
  143. sdate = sdate.replace(u"Май","05")
  144. sdate = sdate.replace(u"мар","05")
  145. sdate = sdate.replace(u"май","05")
  146. sdate = sdate.replace(u"мая","05")
  147. sdate = sdate.replace(u"May","05")
  148. sdate = sdate.replace(u"Июнь","06")
  149. sdate = sdate.replace(u"июня","06")
  150. sdate = sdate.replace(u"июн","06")
  151. sdate = sdate.replace(u"June","06")
  152. sdate = sdate.replace(u"Jun","06")
  153. sdate = sdate.replace(u"Июль","07")
  154. sdate = sdate.replace(u"июля","07")
  155. sdate = sdate.replace(u"июл","07")
  156. sdate = sdate.replace(u"July","07")
  157. sdate = sdate.replace(u"Jul","07")
  158. sdate = sdate.replace(u"августа","08")
  159. sdate = sdate.replace(u"Август","08")
  160. sdate = sdate.replace(u"авг","08")
  161. sdate = sdate.replace(u"August","08")
  162. sdate = sdate.replace(u"Aug","08")
  163. sdate = sdate.replace(u"Сентябрь","09")
  164. sdate = sdate.replace(u"сентября","09")
  165. sdate = sdate.replace(u"сен","09")
  166. sdate = sdate.replace(u"September","09")
  167. sdate = sdate.replace(u"Sep","09")
  168. sdate = sdate.replace(u"октября","10")
  169. sdate = sdate.replace(u"Октябрь","10")
  170. sdate = sdate.replace(u"October","10")
  171. sdate = sdate.replace(u"Oct","10")
  172. sdate = sdate.replace(u"окт","10")
  173. sdate = sdate.replace(u"Ноябрь","11")
  174. sdate = sdate.replace(u"ноября","11")
  175. sdate = sdate.replace(u"ноя","11")
  176. sdate = sdate.replace(u"November","11")
  177. sdate = sdate.replace(u"Nov","11")
  178. sdate = sdate.replace(u"Декабрь","12")
  179. sdate = sdate.replace(u"декабря","12")
  180. sdate = sdate.replace(u"дек","12")
  181. sdate = sdate.replace(u"December","12")
  182. sdate = sdate.replace(u"Dec","12")
  183. sdate = sdate.replace(u".","")
  184. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  185. return sdate
  186. def cleanText(originalText):
  187. safe_chars = string.ascii_letters + string.digits + " " + "_" + "/" + "&" + "$" + "#" "@" + "+" + "-" + "*" + "=" \
  188. ":" + ";" + "." "," + "?" + "!" + "{" + "}" + "[" + "]" + "(" + ")" + "%" + "`" + "~" + "^" + "|" + "<" + ">"
  189. for index, text in enumerate(originalText):
  190. originalText[index] = ''.join([char if char in safe_chars else '' for char in text])
  191. return originalText
  192. def cleanLink(originalLink):
  193. safe_chars = string.ascii_letters + string.digits
  194. originalLink = ''.join([char if char in safe_chars else '' for char in originalLink])
  195. return originalLink
  196. def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nombre, CVE, MS, category, describe,
  197. views, reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href):
  198. rw = []
  199. day = time.strftime("%m/%d/%Y")
  200. ahora = time.strftime("%I:%M:%S")
  201. for n in range(nm):
  202. lne = marketplace # 0
  203. lne += ","
  204. lne += vendor[n] # 1
  205. lne += ","
  206. lne += "-1" if len(rating_vendor) == 0 else rating_vendor[n] # 2
  207. lne += ","
  208. lne += "-1" if len(success_vendor) == 0 else success_vendor[n] # 3
  209. lne += ","
  210. lne += nombre[n] # 4
  211. lne += ','
  212. lne += "-1" if len(describe) == 0 else describe[n] # 5
  213. lne += ","
  214. lne += "-1" if len(CVE) == 0 else CVE[n] # 6
  215. lne += ","
  216. lne += "-1" if len(MS) == 0 else MS[n] # 7
  217. lne += ","
  218. lne += "-1" if len(category) == 0 else category[n] # 8
  219. lne += ","
  220. lne += "-1" if len(views) == 0 else views[n] # 9
  221. lne += ","
  222. lne += "-1" if len(reviews) == 0 else reviews[n] # 10
  223. lne += ","
  224. lne += "-1" if len(rating_item) == 0 else rating_item[n] # 11
  225. lne += ","
  226. lne += "-1" if len(addDate) == 0 else addDate[n] # 12
  227. lne += ","
  228. lne += "-1" if len(BTC) == 0 else BTC[n] # 13
  229. lne += ","
  230. lne += "-1" if len(USD) == 0 else USD[n] # 14
  231. lne += ","
  232. lne += "-1" if len(EURO) == 0 else EURO[n] # 15
  233. lne += ","
  234. lne += "-1" if len(sold) == 0 else sold[n] # 16
  235. lne += ","
  236. lne += "-1" if len(qLeft) == 0 else qLeft[n] # 17
  237. lne += ","
  238. lne += "-1" if len(shipFrom) == 0 else shipFrom[n] # 18
  239. lne += ","
  240. lne += "-1" if len(shipTo) == 0 else shipTo[n] # 19
  241. lne += ","
  242. lne += "-1" if len(href) == 0 else href[n] # 20
  243. lne += ","
  244. lne += day + " " + ahora # 21
  245. rw.append(lne)
  246. return rw
  247. def cleanString(originalString):
  248. updated_string = originalString.replace(",", "") #replace all commas
  249. updated_string = updated_string.replace("\n", "") #replace all newlines
  250. updated_string = updated_string.replace("\t", "") #replace all tabs
  251. updated_string = updated_string.replace("\r", "") #replace all carriage returns
  252. updated_string = updated_string.replace("'", "^") #replace all semicolons
  253. updated_string = updated_string.replace(u"»", '') #replace all arrows
  254. updated_string = updated_string.replace("!", "") #replace all exclamation points
  255. updated_string = updated_string.replace(";", "") #replace all exclamations
  256. return updated_string
  257. def checkDateFormat(myString):
  258. isDate = re.match('[0-1][0-9]\/[0-3][0-9]\/[1-2][0-9]{3}', myString)
  259. return isDate
  260. def cleanNumbers(inputString):
  261. reg_ex = re.compile(r'[^\d.]+')
  262. updated_string = reg_ex.sub('', inputString)
  263. return updated_string
  264. def aes_encryption(item):
  265. to_bytes = bytes(item)
  266. encrypted_bytes = encryptCipher.encrypt(pad(to_bytes, BLOCK_SIZE))
  267. return encrypted_bytes
  268. def aes_decryption(item):
  269. to_bytes = bytes(item)
  270. decrypted_bytes = decryptCipher.decrypt(to_bytes)
  271. return unpad(decrypted_bytes, BLOCK_SIZE)
  272. def encrypt_encode_image_to_base64(driver, xpath):
  273. try:
  274. img_element = driver.find_element(by=By.XPATH, value=xpath)
  275. image_data = img_element.screenshot_as_png
  276. encrypted_image = aes_encryption(image_data)
  277. base64_image = base64.b64encode(encrypted_image)
  278. string_image = base64_image.decode('utf-8')
  279. return string_image
  280. except:
  281. pass
  282. return None
  283. def decode_decrypt_image_in_base64(html_content):
  284. soup = BeautifulSoup(html_content, 'html.parser')
  285. for img_tag in soup.find_all('img'):
  286. src_attr = img_tag.get('src')
  287. if src_attr and src_attr.startswith('data:image'):
  288. try:
  289. string_image = src_attr.split('base64,')[-1]
  290. base64_image = bytes(string_image, encoding='utf-8')
  291. encrypted_image = base64.b64decode(base64_image)
  292. decrypted_image = aes_decryption(encrypted_image)
  293. im = Image.open(io.BytesIO(decrypted_image))
  294. im.show()
  295. except Exception as e:
  296. print(e)
  297. pass
  298. def replace_image_sources(driver, html_content):
  299. tree = lxml.fromstring(html_content)
  300. for picture_tag in tree.findall('.//picture'):
  301. for source_tag in picture_tag.findall('.//source'):
  302. picture_tag.remove(source_tag)
  303. for img_tag in tree.findall('.//img'):
  304. img_xpath = tree.getroottree().getpath(img_tag)
  305. string_image = encrypt_encode_image_to_base64(driver, img_xpath)
  306. if string_image:
  307. img_tag.set('src', f'data:image/png;base64,{string_image}')
  308. else:
  309. img_tag.getparent().remove(img_tag)
  310. modified_html = lxml.tostring(tree, encoding='utf-8').decode('utf-8')
  311. return modified_html
  312. def cleanHTML(driver, html):
  313. clean_html = replace_image_sources(driver, html)
  314. # decode_decrypt_image_in_base64(clean_html)
  315. formats = [
  316. "jpg", "jpeg", "jfif", "pjpeg", "pjp",
  317. "png", "apng", "svg", "bmp", "gif",
  318. "avif", "webp", "ico", "cur", "tiff"
  319. ]
  320. # remove images
  321. clean_html = re.sub(r"<svg.*?>", "", clean_html)
  322. for fmat in formats:
  323. clean_html = re.sub(r"<object.*" + fmat + ".*?>", "", clean_html)
  324. clean_html = re.sub(r"<canvas.*?>", "", clean_html)
  325. # remove JavaScript
  326. clean_html = re.sub(r"<script.*?>", "", clean_html)
  327. clean_html = re.sub(r"<object.*javascript.*?>", "", clean_html)
  328. clean_html = re.sub(r"<aplet.*mayscript?>", "", clean_html)
  329. clean_html = re.sub(r"<embed.*scriptable?>", "", clean_html)
  330. # image and JavaScript
  331. clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);", "", clean_html)
  332. return clean_html