|
|
- __author__ = 'DarkWeb'
-
- import string
- import time
- import re
- import hashlib
- import base64
- import io
- import configparser
- from datetime import datetime, timedelta
- import datetime as fulldatetime
- from bs4 import BeautifulSoup
- from lxml import html as lxml
- from selenium.webdriver.common.by import By
- from Crypto.Cipher import AES
- from Crypto.Util.Padding import pad, unpad
- from PIL import Image
-
-
def generate_aes_key():
    """Derive the 16-byte AES key from the secret stored in setup.ini.

    The secret is read from the ``secret`` option of the ``[Encryption]``
    section of ``../../setup.ini`` (resolved against the current working
    directory), UTF-8 encoded and run through PBKDF2-HMAC-SHA256.

    Returns:
        bytes: the first 16 bytes of the derived key.

    Raises:
        configparser.NoSectionError / configparser.NoOptionError: when the
            file is missing or does not define ``[Encryption] secret``.

    NOTE(review): an empty salt and a single iteration give almost no
    key-stretching. They are kept as-is because changing either would
    change the key and make previously encrypted data unreadable.
    """
    parser = configparser.ConfigParser()
    parser.read('../../setup.ini')

    passphrase = parser.get('Encryption', 'secret').encode("utf-8")

    # PBKDF2 yields 32 bytes for SHA-256; only the leading half is used.
    derived = hashlib.pbkdf2_hmac('sha256', passphrase, b"", 1)
    return derived[:16]
-
-
# Padding granularity handed to pad()/unpad() by the AES helpers below.
# 32 is a multiple of AES's 16-byte block, so padded data always encrypts.
BLOCK_SIZE = 32

# The key is derived once, at import time, which means importing this module
# reads the configuration file.
aes_key = generate_aes_key()

# Separate cipher objects for each direction, both in ECB mode.
# NOTE(review): ECB leaks repeated plaintext blocks; it is kept so that data
# encrypted earlier stays readable.
encryptCipher = AES.new(key=aes_key, mode=AES.MODE_ECB)
decryptCipher = AES.new(key=aes_key, mode=AES.MODE_ECB)
-
-
# Month-name -> month-number tables, applied in order with plain substring
# replacement. Inside each month the longer spellings come first so that an
# abbreviation never clobbers part of a longer form (e.g. "фев" inside
# "февраля").
_ENGLISH_MONTHS = (
    (u"January", "01"), (u"February", "02"), (u"March", "03"),
    (u"April", "04"), (u"May", "05"), (u"June", "06"),
    (u"July", "07"), (u"August", "08"), (u"September", "09"),
    (u"October", "10"), (u"November", "11"), (u"December", "12"),
    (u"Jan", "01"), (u"Feb", "02"), (u"Mar", "03"), (u"Apr", "04"),
    (u"Jun", "06"), (u"Jul", "07"), (u"Aug", "08"), (u"Sep", "09"),
    (u"Oct", "10"), (u"Nov", "11"), (u"Dec", "12"),
)

_FRENCH_MONTHS = (
    (u"janvier", "01"), (u"janv", "01"), (u"jan", "01"),
    (u"février", "02"), (u"févr", "02"), (u"fév", "02"),
    (u"mars", "03"),
    (u"avril", "04"), (u"avr", "04"),
    (u"mai", "05"),
    (u"juin", "06"),
    (u"juillet", "07"), (u"juil", "07"),
    (u"août", "08"),
    (u"septembre", "09"), (u"sept", "09"),
    (u"octobre", "10"), (u"oct", "10"),
    (u"novembre", "11"), (u"nov", "11"),
    (u"décembre", "12"), (u"déc", "12"),
)

_SWEDISH_MONTHS = (
    (u"jan", "01"), (u"feb", "02"), (u"mar", "03"), (u"apr", "04"),
    (u"maj", "05"), (u"jun", "06"), (u"jul", "07"), (u"aug", "08"),
    (u"sep", "09"), (u"okt", "10"), (u"nov", "11"), (u"dec", "12"),
)

_RUSSIAN_MONTHS = (
    (u"Январь", "01"), (u"января", "01"), (u"янв", "01"),
    (u"January", "01"), (u"Jan", "01"),
    (u"Февраль", "02"), (u"февраля", "02"), (u"фев", "02"),
    (u"February", "02"), (u"Feb", "02"),
    (u"Март", "03"), (u"марта", "03"), (u"мар", "03"),
    (u"March", "03"), (u"Mar", "03"),
    (u"Апрель", "04"), (u"апреля", "04"), (u"апр", "04"),
    (u"April", "04"), (u"Apr", "04"),
    (u"Май", "05"), (u"мая", "05"), (u"май", "05"), (u"May", "05"),
    (u"Июнь", "06"), (u"июня", "06"), (u"июн", "06"),
    (u"June", "06"), (u"Jun", "06"),
    (u"Июль", "07"), (u"июля", "07"), (u"июл", "07"),
    (u"July", "07"), (u"Jul", "07"),
    (u"Август", "08"), (u"августа", "08"), (u"авг", "08"),
    (u"August", "08"), (u"Aug", "08"),
    (u"Сентябрь", "09"), (u"сентября", "09"), (u"сен", "09"),
    (u"September", "09"), (u"Sep", "09"),
    (u"Октябрь", "10"), (u"октября", "10"), (u"окт", "10"),
    (u"October", "10"), (u"Oct", "10"),
    (u"Ноябрь", "11"), (u"ноября", "11"), (u"ноя", "11"),
    (u"November", "11"), (u"Nov", "11"),
    (u"Декабрь", "12"), (u"декабря", "12"), (u"дек", "12"),
    (u"December", "12"), (u"Dec", "12"),
)

_OUTPUT_FORMAT = '%m/%d/%Y'


def _months_to_numbers(text, table):
    """Replace each month name from *table* in *text* and drop all periods."""
    for name, number in table:
        text = text.replace(name, number)
    return text.replace(u".", "")


def _reformat(text, layout):
    """Parse *text* with the strptime *layout* and render it as MM/DD/YYYY."""
    return datetime.strptime(str(text), layout).strftime(_OUTPUT_FORMAT)


def convertDate(sdate, language, crawlerDate):
    """Normalise a scraped date string to ``MM/DD/YYYY``.

    Args:
        sdate: the date text as scraped, e.g. ``"January 5 2023"``,
            ``"12 mars 2022"`` or a relative word such as ``"Today"``.
        language: one of ``"english"`` (month day year), ``"british"``
            (numeric day month year), ``"french"``, ``"swedish"`` or
            ``"russian"`` (all day month year).
        crawlerDate: the ``date`` or ``datetime`` of the crawl; relative
            words ("Today", "Yesterday", "Idag", "Вчера", ...) are resolved
            against it.

    Returns:
        The date as an ``MM/DD/YYYY`` string. For Russian input containing
        the mis-decoded marker checked below, an empty string. For an
        unrecognised *language*, *sdate* unchanged.

    Raises:
        ValueError: when the text cannot be parsed after month substitution.

    Fixes relative to the previous implementation:
      * Russian "фев" was substituted before "февраля", leaving "02раля".
      * Russian "мар" was mapped to 05 instead of 03.
      * French March/April/May used to terminate the process with
        SystemExit; they are now converted like every other month.
      * Swedish "Igår" (yesterday) was resolved to the crawl day itself.
      * Relative dates are formatted straight from *crawlerDate*, so a
        ``datetime`` works as well as a ``date``.
    """
    if language == "english":
        sdate = _months_to_numbers(sdate, _ENGLISH_MONTHS)
        if "Today" in sdate:
            return crawlerDate.strftime(_OUTPUT_FORMAT)
        if "Yesterday" in sdate:
            return (crawlerDate - timedelta(1)).strftime(_OUTPUT_FORMAT)
        return _reformat(sdate, '%m %d %Y')

    if language == "british":
        return _reformat(sdate, '%d %m %Y')

    if language == "french":
        sdate = _months_to_numbers(sdate, _FRENCH_MONTHS)
        if sdate == u"Aujourd'hui" or "Today" in sdate:
            return crawlerDate.strftime(_OUTPUT_FORMAT)
        return _reformat(sdate, '%d %m %Y')

    if language == "swedish":
        sdate = _months_to_numbers(sdate, _SWEDISH_MONTHS)
        if sdate == u"Ig\xe5r":
            return (crawlerDate - timedelta(1)).strftime(_OUTPUT_FORMAT)
        if sdate == u"Idag" or "minuter sedan" in sdate:
            return crawlerDate.strftime(_OUTPUT_FORMAT)
        return _reformat(sdate, '%d %m %Y')

    if language == "russian":
        # Relative words are checked before substitution, as before.
        if u"Вчера" in sdate:
            return (crawlerDate - timedelta(1)).strftime(_OUTPUT_FORMAT)
        if u"Сегодня" in sdate:
            return crawlerDate.strftime(_OUTPUT_FORMAT)
        # Presumably "Создано" decoded with the wrong code page; such
        # entries carry no usable date.
        if u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
            return ""
        sdate = _months_to_numbers(sdate, _RUSSIAN_MONTHS)
        return _reformat(sdate, '%d %m %Y')

    return sdate
-
-
def cleanText(originalText):
    """Strip every character outside a fixed whitelist from each entry.

    Args:
        originalText: a mutable sequence of strings. It is modified in
            place: each element is replaced by its filtered version.

    Returns:
        The same sequence object that was passed in.

    Kept characters are ASCII letters, digits, the space and the
    punctuation listed below. Quotes, backslashes and all non-ASCII
    characters are dropped.
    """
    allowed = string.ascii_letters + string.digits + " _/&$#@+-*=:;.,?!{}[]()%`~^|<>"

    for position, entry in enumerate(originalText):
        kept = (symbol for symbol in entry if symbol in allowed)
        originalText[position] = ''.join(kept)

    return originalText
-
-
def cleanLink(originalLink):
    """Return *originalLink* reduced to its ASCII letters and digits.

    Every other character (punctuation, whitespace, non-ASCII) is removed.
    """
    alphanumerics = string.ascii_letters + string.digits
    kept = [symbol for symbol in originalLink if symbol in alphanumerics]
    return ''.join(kept)
-
-
def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nombre, CVE, MS, category, describe,
                     views, reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor):
    """Build one comma-separated record per product.

    Args:
        marketplace: marketplace name, written as the first field of every row.
        nm: number of products, i.e. how many rows to produce.
        vendor, nombre: per-product string lists; always indexed.
        all other list arguments: per-product string lists. An empty list
            means the attribute was not collected and yields ``"-1"``.

    Returns:
        A list of ``nm`` strings, each with 24 comma-separated fields in
        this order: marketplace, vendor, rating_vendor, success_vendor,
        nombre, describe, CVE, MS, category, views, reviews, rating_item,
        addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, image,
        image_vendor, href, and a ``MM/DD/YYYY HH:MM:SS`` timestamp taken
        once per call (12-hour clock, no AM/PM marker).

    Values are joined verbatim, so callers must already have removed any
    commas from them.
    """
    stamp = time.strftime("%m/%d/%Y") + " " + time.strftime("%I:%M:%S")

    # Optional columns, in output order, that follow the product name.
    optional_columns = (
        describe, CVE, MS, category, views, reviews, rating_item, addDate,
        BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, image, image_vendor,
        href,
    )

    def pick(values, position):
        # An empty list means "not collected" and is encoded as "-1".
        return values[position] if len(values) else "-1"

    rows = []
    for n in range(nm):
        fields = [
            marketplace,
            vendor[n],
            pick(rating_vendor, n),
            pick(success_vendor, n),
            nombre[n],
        ]
        fields.extend(pick(column, n) for column in optional_columns)
        fields.append(stamp)
        rows.append(",".join(fields))

    return rows
-
-
# Single-pass translation table used by cleanString: characters mapped to
# None are deleted, the apostrophe is swapped for a caret.
_CLEAN_STRING_TABLE = str.maketrans({
    ",": None,     # commas
    "\n": None,    # newlines
    "\t": None,    # tabs
    "\r": None,    # carriage returns
    "'": "^",      # apostrophes become carets
    u"»": None,    # right-pointing double angle quotes
    "!": None,     # exclamation points
    ";": None,     # semicolons
})


def cleanString(originalString):
    """Return *originalString* made safe for a comma-separated record.

    Commas, newlines, tabs, carriage returns, ``»``, ``!`` and ``;`` are
    removed and every apostrophe is replaced by ``^``.
    """
    return originalString.translate(_CLEAN_STRING_TABLE)
-
-
def checkDateFormat(myString):
    """Check whether *myString* starts with an ``MM/DD/YYYY``-shaped date.

    Only the shape is checked (digit ranges per position), not calendar
    validity, and only the start of the string is anchored, so trailing
    text is allowed.

    Returns:
        The ``re.Match`` object when the prefix matches, otherwise ``None``.
    """
    # Raw string: the previous non-raw literal contained "\/", an invalid
    # string escape that raises a SyntaxWarning on recent Python versions.
    isDate = re.match(r'[0-1][0-9]/[0-3][0-9]/[1-2][0-9]{3}', myString)
    return isDate
-
-
def cleanNumbers(inputString):
    """Return only the digits and periods of *inputString*, in order.

    Everything else (currency symbols, letters, commas, whitespace) is
    dropped; the result may be empty.
    """
    numeric_runs = re.findall(r'[\d.]+', inputString)
    return ''.join(numeric_runs)
-
-
def aes_encryption(item):
    """Encrypt *item* with the module-level AES cipher.

    *item* is converted with ``bytes(item)``, padded to a multiple of
    ``BLOCK_SIZE`` and encrypted with ``encryptCipher``.

    Returns:
        bytes: the ciphertext.
    """
    padded = pad(bytes(item), BLOCK_SIZE)
    return encryptCipher.encrypt(padded)
-
-
def aes_decryption(item):
    """Decrypt ciphertext produced by ``aes_encryption``.

    *item* is converted with ``bytes(item)``, decrypted with
    ``decryptCipher`` and the ``BLOCK_SIZE`` padding is removed.

    Returns:
        bytes: the original plaintext.
    """
    padded_plaintext = decryptCipher.decrypt(bytes(item))
    return unpad(padded_plaintext, BLOCK_SIZE)
-
-
def encrypt_encode_image_to_base64(driver, xpath):
    """Screenshot one page element and return it encrypted and base64-encoded.

    Args:
        driver: a Selenium WebDriver positioned on the page.
        xpath: XPath of the element to capture.

    Returns:
        The AES-encrypted PNG screenshot as a base64 ``str``, or ``None``
        when the element cannot be located, captured or encrypted.

    This is deliberately best-effort: any ordinary failure yields ``None``
    so callers can drop the image. Only ``Exception`` subclasses are
    swallowed, so ``KeyboardInterrupt`` and ``SystemExit`` still propagate
    (the previous bare ``except`` swallowed those too).
    """
    try:
        img_element = driver.find_element(by=By.XPATH, value=xpath)
        image_data = img_element.screenshot_as_png

        encrypted_image = aes_encryption(image_data)
        return base64.b64encode(encrypted_image).decode('utf-8')
    except Exception:
        return None
-
-
def decode_decrypt_image_in_base64(string_image):
    """Decode, decrypt and display an image stored as encrypted base64 text.

    Args:
        string_image: base64 text of an AES-encrypted image.

    Returns:
        The decrypted image bytes, or ``None`` when any step fails; the
        error is printed in that case.

    Side effect: on success the image is opened with ``Image.show()``.
    """
    try:
        encrypted_image = base64.b64decode(bytes(string_image, encoding='utf-8'))
        decrypted_image = aes_decryption(encrypted_image)

        # Opening the bytes also validates that they are a readable image.
        Image.open(io.BytesIO(decrypted_image)).show()
    except Exception as e:
        print(e)
        return None

    return decrypted_image
-
-
def replace_image_sources(driver, html_content):
    """Inline every ``<img>`` as an encrypted screenshot of the live element.

    Args:
        driver: a Selenium WebDriver showing the page *html_content* came from.
        html_content: the page's HTML source.

    Returns:
        The HTML as a ``str`` in which
          * every ``<source>`` inside a ``<picture>`` is removed,
          * each ``<img>`` has its ``src`` replaced by a
            ``data:image/png;base64,...`` URI holding the encrypted
            screenshot, and
          * each ``<img>`` that could not be captured is removed.

    NOTE(review): element XPaths are derived from lxml's parse of the HTML
    and then evaluated in the browser; this assumes both see the same
    element structure.
    """
    tree = lxml.fromstring(html_content)

    # Resolve every image's XPath before the tree is modified. Paths are
    # positional (e.g. ".../img[2]"), so computing them after an earlier
    # sibling <img> was dropped would point at the wrong live element.
    root_tree = tree.getroottree()
    image_paths = [(img_tag, root_tree.getpath(img_tag)) for img_tag in tree.findall('.//img')]

    for picture_tag in tree.findall('.//picture'):
        for source_tag in picture_tag.findall('.//source'):
            # drop_tree() removes the element via its real parent (the
            # search is recursive, so it need not be a direct child of
            # <picture>) and keeps the text that follows it.
            source_tag.drop_tree()

    for img_tag, img_xpath in image_paths:

        string_image = encrypt_encode_image_to_base64(driver, img_xpath)

        if string_image:
            img_tag.set('src', f'data:image/png;base64,{string_image}')
        else:
            # Keep the text following the image; parent.remove() would
            # discard it together with the element.
            img_tag.drop_tree()

    modified_html = lxml.tostring(tree, encoding='utf-8').decode('utf-8')

    return modified_html
-
-
def cleanHTML(driver, html):
    """Return *html* with images inlined and image/script tags stripped.

    Args:
        driver: a Selenium WebDriver showing the page *html* came from.
        html: the page's HTML source.

    Returns:
        The cleaned HTML as a ``str``.

    The page is first passed through ``replace_image_sources``; the result
    is then filtered with the regular expressions below, applied in order.
    Each pattern removes the matched tag text only (``.`` does not cross
    newlines), so element content and closing tags are left in place.
    """
    clean_html = replace_image_sources(driver, html)

    formats = [
        "jpg", "jpeg", "jfif", "pjpeg", "pjp",
        "png", "apng", "svg", "bmp", "gif",
        "avif", "webp", "ico", "cur", "tiff"
    ]

    patterns = []

    # remove images
    patterns.append(r"<svg.*?>")
    patterns.extend(r"<object.*" + fmat + ".*?>" for fmat in formats)
    patterns.append(r"<canvas.*?>")

    # remove JavaScript
    patterns.append(r"<script.*?>")
    patterns.append(r"<object.*javascript.*?>")
    # "applet" was previously misspelled "aplet", so this never matched.
    patterns.append(r"<applet.*mayscript?>")
    patterns.append(r"<embed.*scriptable?>")

    # image and JavaScript
    patterns.append(r"<div[^>]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);")

    for pattern in patterns:
        clean_html = re.sub(pattern, "", clean_html)

    return clean_html
-
-
-
|