|
|
@ -3,7 +3,39 @@ __author__ = 'DarkWeb' |
|
|
|
import string |
|
|
|
import time |
|
|
|
import re |
|
|
|
import hashlib |
|
|
|
import base64 |
|
|
|
import io |
|
|
|
from datetime import datetime, timedelta |
|
|
|
import datetime as fulldatetime |
|
|
|
from bs4 import BeautifulSoup |
|
|
|
from lxml import html as lxml |
|
|
|
from selenium.webdriver.common.by import By |
|
|
|
from Crypto.Cipher import AES |
|
|
|
from Crypto.Util.Padding import pad, unpad |
|
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
|
|
|
def generate_aes_key():
    """Derive the 16-byte AES key from the configured encryption secret.

    The secret is read from the ``secret`` option of the ``Encryption``
    section of the project configuration, encoded as UTF-8 and passed
    through PBKDF2-HMAC-SHA256.

    Returns:
        bytes: the first 16 bytes of the derived key (an AES-128 key).
    """
    # Imported inside the function rather than at module top
    # (presumably to avoid a circular import -- TODO confirm).
    from MarketPlaces.Initialization.markets_mining import config

    passphrase = bytes(config.get('Encryption', 'secret'), "utf-8")

    # NOTE(review): an empty salt and a single iteration make this a plain
    # deterministic transform of the secret. Changing either value changes
    # the derived key, so data encrypted with the current key would no
    # longer decrypt.
    derived = hashlib.pbkdf2_hmac('sha256', passphrase, b'', 1)

    # AES-128 needs 16 bytes; SHA-256 yields 32, so keep the first half.
    return derived[:16]
|
|
|
|
|
|
|
|
|
|
|
# Width, in bytes, that aes_encryption() pads to and aes_decryption()
# unpads from. Both sides must use the same value.
BLOCK_SIZE = 32

# Key is derived once, when this module is imported.
aes_key = generate_aes_key()

# One cipher object per direction, both in ECB mode with the same key.
# NOTE(review): ECB encrypts identical plaintext blocks to identical
# ciphertext blocks; switching modes would make existing data unreadable.
encryptCipher = AES.new(key=aes_key, mode=AES.MODE_ECB)
decryptCipher = AES.new(key=aes_key, mode=AES.MODE_ECB)
|
|
|
|
|
|
|
|
|
|
|
def convertDate(sdate, language, crawlerDate): |
|
|
@ -292,7 +324,96 @@ def cleanNumbers(inputString): |
|
|
|
return updated_string |
|
|
|
|
|
|
|
|
|
|
|
def cleanHTML(html): |
|
|
|
def aes_encryption(item):
    """Pad *item* to a multiple of BLOCK_SIZE and encrypt it.

    Args:
        item: data to encrypt; it is converted with ``bytes(item)`` first.

    Returns:
        The ciphertext produced by the module-level ``encryptCipher``.
    """
    padded = pad(bytes(item), BLOCK_SIZE)
    return encryptCipher.encrypt(padded)
|
|
|
|
|
|
|
|
|
|
|
def aes_decryption(item):
    """Decrypt *item* and strip the padding added by aes_encryption().

    Args:
        item: ciphertext; it is converted with ``bytes(item)`` first.

    Returns:
        The decrypted data with its BLOCK_SIZE padding removed.
    """
    still_padded = decryptCipher.decrypt(bytes(item))
    return unpad(still_padded, BLOCK_SIZE)
|
|
|
|
|
|
|
|
|
|
|
def encrypt_encode_image_to_base64(driver, xpath):
    """Screenshot one page element and return it encrypted and base64-encoded.

    Args:
        driver: Selenium WebDriver used to look the element up.
        xpath: XPath expression locating the element.

    Returns:
        str | None: base64 text of the AES-encrypted PNG screenshot, or
        ``None`` when the element cannot be found, captured or encrypted.
    """
    try:
        img_element = driver.find_element(by=By.XPATH, value=xpath)
        image_data = img_element.screenshot_as_png

        encrypted_image = aes_encryption(image_data)
        return base64.b64encode(encrypted_image).decode('utf-8')
    except Exception:
        # Best effort: a missing or unrenderable image must not abort the
        # caller. Catching Exception rather than using a bare ``except``
        # lets KeyboardInterrupt and SystemExit propagate.
        return None
|
|
|
|
|
|
|
|
|
|
|
def decode_decrypt_image_in_base64(html_content):
    """Decrypt and display every inline ``data:image`` picture in the HTML.

    For each ``<img>`` whose ``src`` starts with ``data:image``, the text
    after ``base64,`` is base64-decoded, decrypted with aes_decryption()
    and opened with PIL's ``Image.show()``. Failures on one image are
    printed and do not stop the loop.

    Args:
        html_content: HTML text to scan.

    Returns:
        None.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    for img_tag in soup.find_all('img'):
        src_attr = img_tag.get('src')

        # Only inline data URIs carry an encrypted payload.
        if not src_attr or not src_attr.startswith('data:image'):
            continue

        try:
            payload = src_attr.rpartition('base64,')[2]
            ciphertext = base64.b64decode(bytes(payload, encoding='utf-8'))
            picture_bytes = aes_decryption(ciphertext)

            Image.open(io.BytesIO(picture_bytes)).show()
        except Exception as e:
            print(e)
|
|
|
|
|
|
|
|
|
|
|
def replace_image_sources(driver, html_content):
    """Replace every ``<img>`` source with an encrypted inline screenshot.

    ``<source>`` elements inside ``<picture>`` are dropped. Each ``<img>``
    is then located in the live page by XPath, screenshotted, encrypted and
    embedded as a ``data:image/png;base64,...`` URI. Images that cannot be
    captured are removed from the document.

    Args:
        driver: Selenium WebDriver with the page loaded.
        html_content: HTML text of that page.

    Returns:
        str: the modified HTML.
    """
    tree = lxml.fromstring(html_content)

    for picture_tag in tree.findall('.//picture'):
        for source_tag in picture_tag.findall('.//source'):
            # drop_tree() works at any nesting depth, whereas
            # picture_tag.remove() raises ValueError for a <source> that is
            # not a direct child.
            source_tag.drop_tree()

    # Resolve every XPath before the tree is mutated. Removing a failed
    # <img> renumbers its same-tag siblings, so a path computed afterwards
    # would point at a different element in the browser's DOM.
    # NOTE(review): assumes lxml's parse of the HTML mirrors the live DOM
    # closely enough for these positional paths to match -- confirm.
    root_tree = tree.getroottree()
    img_targets = [
        (img_tag, root_tree.getpath(img_tag))
        for img_tag in tree.findall('.//img')
    ]

    for img_tag, img_xpath in img_targets:
        string_image = encrypt_encode_image_to_base64(driver, img_xpath)

        if string_image:
            img_tag.set('src', f'data:image/png;base64,{string_image}')
        else:
            # drop_tree() keeps the tail text that follows the element;
            # parent.remove() would delete that text with the image.
            img_tag.drop_tree()

    return lxml.tostring(tree, encoding='utf-8').decode('utf-8')
|
|
|
|
|
|
|
|
|
|
|
def cleanHTML(driver, html): |
|
|
|
|
|
|
|
clean_html = replace_image_sources(driver, html) |
|
|
|
# decode_decrypt_image_in_base64(clean_html) |
|
|
|
|
|
|
|
formats = [ |
|
|
|
"jpg", "jpeg", "jfif", "pjpeg", "pjp", |
|
|
@ -301,8 +422,6 @@ def cleanHTML(html): |
|
|
|
] |
|
|
|
|
|
|
|
# remove images |
|
|
|
clean_html = re.sub(r"<img.*?>", "", html) |
|
|
|
clean_html = re.sub(r"<picture.*?>", "", clean_html) |
|
|
|
clean_html = re.sub(r"<svg.*?>", "", clean_html) |
|
|
|
for fmat in formats: |
|
|
|
clean_html = re.sub(r"<object.*" + fmat + ".*?>", "", clean_html) |
|
|
|