__author__ = 'DarkWeb'
|
|
|
|
import string
|
|
import time
|
|
import re
|
|
import hashlib
|
|
import base64
|
|
import io
|
|
import configparser
|
|
import json
|
|
import keras
|
|
import cv2
|
|
import numpy as np
|
|
from keras.preprocessing import image
|
|
from keras.applications.imagenet_utils import preprocess_input
|
|
from keras.models import Model
|
|
from datetime import datetime, timedelta
|
|
from lxml import html as lxml
|
|
from selenium.webdriver.common.by import By
|
|
from Crypto.Cipher import AES
|
|
from Crypto.Util.Padding import pad, unpad
|
|
from PIL import Image
|
|
from urllib.parse import urlsplit, urljoin
|
|
|
|
|
|
def generate_aes_key():
    """Derive the 16-byte AES key from the secret stored in ``setup.ini``.

    The ``secret`` option of the ``[Encryption]`` section is read from
    ``../../setup.ini`` (relative to the current working directory), encoded
    as UTF-8 and passed through PBKDF2-HMAC-SHA256 with an empty salt and a
    single iteration.

    Returns:
        bytes: the first 16 bytes of the derived digest.

    Raises:
        configparser.Error: if the section or the option is missing.
    """
    parser = configparser.ConfigParser()
    parser.read('../../setup.ini')
    password = parser.get('Encryption', 'secret').encode('utf-8')

    # Salt and iteration count are part of the stored-data format: changing
    # either would yield a different key for the same secret.
    derived = hashlib.pbkdf2_hmac('sha256', password, b'', 1)

    # Only the leading 16 bytes are used as the AES key.
    return derived[:16]
|
|
|
|
|
|
# Block size, in bytes, handed to pad()/unpad() by aes_encryption() and
# aes_decryption().
BLOCK_SIZE = 32

# A single key drives two separate ECB cipher objects: one used for
# encrypting, the other for decrypting.
aes_key = generate_aes_key()
encryptCipher = AES.new(aes_key, AES.MODE_ECB)
decryptCipher = AES.new(aes_key, AES.MODE_ECB)

# ResNet50 loaded with ImageNet weights; feat_extractor exposes the output
# of its 'avg_pool' layer.
model = keras.applications.ResNet50(include_top=True, weights='imagenet')
feat_extractor = Model(inputs=model.input, outputs=model.get_layer('avg_pool').output)

# SIFT detector used by extract_keypoints().
#   nfeatures=0            number of features, 0 for unlimited
#   nOctaveLayers=3        number of layers per octave
#   contrastThreshold=0.09 contrast threshold
#   edgeThreshold=10       edge threshold
#   sigma=1.6              initial Gaussian blur sigma
sift = cv2.SIFT_create(nfeatures=0, nOctaveLayers=3, contrastThreshold=0.09, edgeThreshold=10, sigma=1.6)
|
|
|
|
|
|
def generate_image_hash(image_string):
    """Return the SHA-256 hex digest of a base64-encoded image.

    Args:
        image_string (str): base64 text of the image bytes.

    Returns:
        str: hexadecimal SHA-256 digest of the decoded bytes.
    """
    raw = base64.b64decode(bytes(image_string, 'utf-8'))
    digest = hashlib.sha256()
    digest.update(raw)
    return digest.hexdigest()
|
|
|
|
|
|
def extract_hidden_layer_output(image_string):
    """Run a base64-encoded image through ``feat_extractor``.

    The image is decoded, converted to RGB, resized (nearest-neighbour) to
    the height/width taken from ``model.input_shape``, preprocessed, and fed
    to the extractor as a batch of one.

    Args:
        image_string (str): base64 text of the image bytes.

    Returns:
        str: JSON array holding the first row of the prediction.
    """
    raw = base64.b64decode(bytes(image_string, 'utf-8'))
    rgb = Image.open(io.BytesIO(raw)).convert('RGB')

    target_size = model.input_shape[1:3]
    pixels = image.smart_resize(image.img_to_array(rgb), size=target_size, interpolation='nearest')

    # The model expects a leading batch axis.
    batch = preprocess_input(np.expand_dims(pixels, axis=0))

    features = feat_extractor.predict(batch)[0]
    return json.dumps(features.tolist())
|
|
|
|
|
|
def extract_keypoints(image_string):
    """Detect SIFT keypoints and descriptors in a base64-encoded image.

    The image is decoded as grayscale before detection.

    Args:
        image_string (str): base64 text of the encoded image bytes.

    Returns:
        tuple: ``(keypoints_json, descriptors_json)`` as JSON strings, or
        ``(None, None)`` when no keypoints are found.
    """
    raw = base64.b64decode(bytes(image_string, 'utf-8'))
    encoded = np.asarray(bytearray(raw), dtype=np.uint8)
    gray = cv2.imdecode(encoded, cv2.IMREAD_GRAYSCALE)

    keypoints, descriptors = sift.detectAndCompute(gray, None)

    if len(keypoints) > 0:
        return json.dumps(wrap_keypoints(keypoints)), json.dumps(descriptors.tolist())

    return None, None
|
|
|
|
|
|
def wrap_keypoints(keypoints):
    """Convert keypoint objects into plain dictionaries.

    Args:
        keypoints: sequence of objects exposing ``pt``, ``size``, ``angle``,
            ``octave``, ``response`` and ``class_id`` attributes.

    Returns:
        list[dict]: one dictionary per keypoint, keyed by those attribute
        names in that order.
    """
    attribute_names = ('pt', 'size', 'angle', 'octave', 'response', 'class_id')
    return [
        {name: getattr(keypoint, name) for name in attribute_names}
        for keypoint in keypoints
    ]
|
|
|
|
|
|
def unwrap_keypoints(keypoints_list):
    """Rebuild ``cv2.KeyPoint`` objects from their dictionary form.

    Args:
        keypoints_list: iterable of dictionaries with the keys ``pt``
            (indexable, x at 0 and y at 1), ``size``, ``angle``, ``octave``,
            ``response`` and ``class_id``.

    Returns:
        tuple: the reconstructed keypoints, in input order.
    """
    return tuple(
        cv2.KeyPoint(
            x=entry['pt'][0],
            y=entry['pt'][1],
            size=entry['size'],
            angle=entry['angle'],
            octave=entry['octave'],
            response=entry['response'],
            class_id=entry['class_id'],
        )
        for entry in keypoints_list
    )
|
|
|
|
|
|
def cleanText(originalText):
    """Strip non-whitelisted characters from every entry, in place.

    Only ASCII letters, digits, the space and a fixed set of punctuation
    symbols are kept; every other character is dropped.

    Args:
        originalText (list[str]): texts to clean; the list is modified.

    Returns:
        list[str]: the same list object, with each entry cleaned.
    """
    allowed = string.ascii_letters + string.digits + " _/&$#@+-*=:;.,?!{}[]()%`~^|<>"

    for position, entry in enumerate(originalText):
        originalText[position] = ''.join(ch for ch in entry if ch in allowed)

    return originalText
|
|
|
|
|
|
def cleanLink(originalLink):
    """Reduce a link to its ASCII letters and digits.

    Args:
        originalLink (str): the link text.

    Returns:
        str: the input with every non-alphanumeric character removed.
    """
    allowed = string.ascii_letters + string.digits
    return ''.join(filter(lambda ch: ch in allowed, originalLink))
|
|
|
|
|
|
def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nombre, CVE, MS, category, describe,
                     views, reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href,
                     image, image_vendor):
    """Build one comma-separated record per product.

    Every column argument is a sequence indexed by product number. An empty
    sequence means the column is unavailable, and ``"-1"`` is written for
    every product. ``nombre`` is mandatory and always indexed.

    Column order of each record:
        0 marketplace, 1 vendor, 2 rating_vendor, 3 success_vendor,
        4 nombre, 5 describe, 6 CVE, 7 MS, 8 category, 9 views, 10 reviews,
        11 rating_item, 12 addDate, 13 BTC, 14 USD, 15 EURO, 16 sold,
        17 qLeft, 18 shipFrom, 19 shipTo, 20 image, 21 image_vendor,
        22 href, 23 timestamp (``MM/DD/YYYY HH:MM:SS``, 12-hour clock
        without an AM/PM marker).

    Args:
        marketplace (str): value of column 0 for every record.
        nm (int): number of products to emit.

    Returns:
        list[str]: ``nm`` records sharing one timestamp.
    """
    stamp = datetime.now().strftime("%m/%d/%Y %I:%M:%S")

    # Optional columns, in output order, on either side of the mandatory
    # product name.
    before_name = (vendor, rating_vendor, success_vendor)
    after_name = (describe, CVE, MS, category, views, reviews, rating_item, addDate, BTC, USD, EURO,
                  sold, qLeft, shipFrom, shipTo, image, image_vendor, href)

    def pick(values, row):
        # An empty column is reported as the "-1" placeholder.
        return "-1" if len(values) == 0 else values[row]

    records = []
    for row in range(nm):
        fields = [marketplace]
        fields.extend(pick(column, row) for column in before_name)
        fields.append(nombre[row])
        fields.extend(pick(column, row) for column in after_name)
        fields.append(stamp)
        records.append(",".join(fields))

    return records
|
|
|
|
|
|
def cleanString(originalString):
    """Remove or replace characters that would break a CSV-style record.

    Commas, newlines, tabs, carriage returns, the ``»`` arrow, exclamation
    points and semicolons are deleted; apostrophes are replaced by ``^``.

    Args:
        originalString (str): text to sanitise.

    Returns:
        str: the sanitised text.
    """
    substitutions = str.maketrans({
        ",": None,    # commas
        "\n": None,   # newlines
        "\t": None,   # tabs
        "\r": None,   # carriage returns
        "'": "^",     # apostrophes become carets
        "»": None,    # arrows
        "!": None,    # exclamation points
        ";": None,    # semicolons
    })
    return originalString.translate(substitutions)
|
|
|
|
|
|
def checkDateFormat(myString):
    """Test whether *myString* begins with an ``MM/DD/YYYY``-shaped date.

    The check is purely positional (digit ranges per position); it does not
    validate that the date exists, and text after the date is ignored
    because the pattern is anchored only at the start.

    Args:
        myString (str): text to inspect.

    Returns:
        re.Match | None: the match object (truthy) when the prefix has the
        expected shape, otherwise ``None``.
    """
    # Raw string, and no backslash before "/": the slash needs no escaping in
    # a regex, and "\/" in a normal string literal is an invalid escape
    # sequence that newer Python versions warn about.
    return re.match(r'[0-1][0-9]/[0-3][0-9]/[1-2][0-9]{3}', myString)
|
|
|
|
|
|
def cleanNumbers(inputString):
    """Keep only digits and dots from *inputString*.

    Args:
        inputString (str): text containing a number.

    Returns:
        str: the input with every run of other characters removed.
    """
    return re.sub(r'[^\d.]+', '', inputString)
|
|
|
|
|
|
def aes_encryption(data_bytes):
    """Pad *data_bytes* to ``BLOCK_SIZE`` and encrypt with ``encryptCipher``.

    Args:
        data_bytes (bytes): plaintext.

    Returns:
        bytes: the ciphertext.
    """
    padded = pad(data_bytes, BLOCK_SIZE)
    return encryptCipher.encrypt(padded)
|
|
|
|
|
|
def aes_decryption(data_bytes):
    """Decrypt *data_bytes* with ``decryptCipher`` and strip the padding.

    Args:
        data_bytes (bytes): ciphertext.

    Returns:
        bytes: the plaintext with the ``BLOCK_SIZE`` padding removed.
    """
    padded_plaintext = decryptCipher.decrypt(data_bytes)
    return unpad(padded_plaintext, BLOCK_SIZE)
|
|
|
|
|
|
def encrypt_encode_image_to_base64(driver, xpath):
    """Screenshot an element, encrypt the PNG and return it as base64 text.

    Args:
        driver: object exposing ``find_element(by=..., value=...)``.
        xpath (str): XPath locating the element to capture.

    Returns:
        str | None: base64 text of the encrypted PNG, or ``None`` when any
        step fails (the exception is printed, not raised).
    """
    try:
        png_bytes = driver.find_element(by=By.XPATH, value=xpath).screenshot_as_png
        return base64.b64encode(aes_encryption(png_bytes)).decode('utf-8')
    except Exception as error:
        # Best effort: report the failure and fall back to None.
        print(error)
        return None
|
|
|
|
|
|
def decode_decrypt_image_in_base64(image_string):
    """Decrypt a base64-encoded encrypted image and re-encode it as base64.

    Args:
        image_string (str): base64 text of the encrypted image bytes.

    Returns:
        str | None: base64 text of the decrypted bytes, or ``None`` when any
        step fails (the exception is printed, not raised).
    """
    try:
        ciphertext = base64.b64decode(bytes(image_string, 'utf-8'))
        plaintext = aes_decryption(ciphertext)
        return base64.b64encode(plaintext).decode('utf-8')
    except Exception as error:
        # Best effort: report the failure and fall back to None.
        print(error)
        return None
|
|
|
|
|
|
def replace_image_sources(driver, html_content):
    """Inline every ``<img>`` of *html_content* as an encrypted data URI.

    For each ``<img>`` the corresponding element is screenshotted through
    *driver*, encrypted and base64-encoded by
    ``encrypt_encode_image_to_base64`` and written into the ``src``
    attribute. Images that cannot be captured are removed from the markup,
    and ``<source>`` elements inside ``<picture>`` are always removed.

    Args:
        driver: browser driver showing the page *html_content* came from.
        html_content (str): HTML markup of that page.

    Returns:
        str: the modified HTML.
    """
    tree = lxml.fromstring(html_content)

    # Snapshot each image's XPath BEFORE touching the tree. The paths are
    # positional (e.g. ".../img[2]"), so computing them after an earlier
    # <img> has been removed would renumber its siblings and make the driver
    # capture a different element than the one being rewritten.
    root_tree = tree.getroottree()
    image_targets = [(img_tag, root_tree.getpath(img_tag)) for img_tag in tree.findall('.//img')]

    for picture_tag in tree.findall('.//picture'):
        for source_tag in picture_tag.findall('.//source'):
            # Remove through the actual parent: the <source> may be nested
            # deeper than a direct child of <picture>.
            source_tag.getparent().remove(source_tag)

    for img_tag, img_xpath in image_targets:
        string_image = encrypt_encode_image_to_base64(driver, img_xpath)

        if string_image:
            img_tag.set('src', f'data:image/png;base64,{string_image}')
        else:
            img_tag.getparent().remove(img_tag)

    return lxml.tostring(tree, encoding='utf-8').decode('utf-8')
|
|
|
|
|
|
def cleanHTML(driver, html):
    """Inline images as encrypted data and strip active/embedded content.

    The markup first goes through ``replace_image_sources``; afterwards
    regex substitutions remove ``<svg>``, image-bearing ``<object>``,
    ``<canvas>``, ``<script>``, ``<iframe>``, scriptable
    ``<object>``/``<applet>``/``<embed>`` elements and ``<div>`` elements
    whose inline style sets a ``background-image``.

    Args:
        driver: browser driver showing the page *html* came from.
        html (str): HTML markup of that page.

    Returns:
        str: the cleaned HTML.
    """
    clean_html = replace_image_sources(driver, html)

    formats = [
        "jpg", "jpeg", "jfif", "pjpeg", "pjp",
        "png", "apng", "svg", "bmp", "gif",
        "avif", "webp", "ico", "cur", "tiff",
    ]

    # remove images
    clean_html = re.sub(r"<svg[\s\S]*?svg>", "", clean_html)
    for fmat in formats:
        # Both halves are raw strings so "\s"/"\S" reach the regex engine
        # untouched instead of being invalid string escapes.
        clean_html = re.sub(r"<object.*" + fmat + r"[\s\S]*?object>", "", clean_html)
    clean_html = re.sub(r"<canvas[\s\S]*?canvas>", "", clean_html)

    # remove JavaScript
    clean_html = re.sub(r"<script[\s\S]*?script>", "", clean_html)
    clean_html = re.sub(r"<iframe[\s\S]*?iframe>", "", clean_html)
    clean_html = re.sub(r"<object.*javascript[\s\S]*?object>", "", clean_html)
    # The HTML element is spelled "applet".
    clean_html = re.sub(r"<applet.*mayscript[\s\S]*?applet>", "", clean_html)
    clean_html = re.sub(r"<embed.*scriptable[\s\S]*?embed>", "", clean_html)

    # image and JavaScript
    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html)

    return clean_html
|
|
|
|
|
|
def get_relative_url(target_url):
    """Return the path (plus query string, if any) of *target_url*.

    The URL is first resolved against a dummy base so that absolute and
    relative inputs are handled the same way; scheme, host and fragment are
    discarded.

    Args:
        target_url (str): absolute or relative URL.

    Returns:
        str: ``path?query`` when a query string is present, else ``path``.
    """
    parts = urlsplit(urljoin("http://dummybaseurl.com/", target_url))

    if parts.query:
        return parts.path + '?' + parts.query
    return parts.path
|