"""Shared helpers for the crawlers/parsers: date normalisation, text
cleaning, CSV row assembly, AES image encryption and HTML sanitising."""

__author__ = 'DarkWeb'

import string
import time
import re
import hashlib
import base64
import io
import configparser
from datetime import datetime, timedelta
import datetime as fulldatetime
from bs4 import BeautifulSoup
from lxml import html as lxml
from selenium.webdriver.common.by import By
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from PIL import Image


def generate_aes_key():
    """Derive the 16-byte AES key from the secret stored in ``setup.ini``.

    Reads ``[Encryption] secret`` from ``../../setup.ini`` (relative to the
    current working directory) and runs it through PBKDF2-HMAC-SHA256.

    Returns:
        bytes: the first 16 bytes of the derived key.

    Raises:
        configparser.Error: if the section or option is missing (including
            when the file cannot be found, since ``read`` skips silently).
    """
    config = configparser.ConfigParser()
    config.read('../../setup.ini')

    secret = config.get('Encryption', 'secret')
    secret_bytes = bytes(secret, encoding="utf-8")

    # Derive a key from the seed using PBKDF2.
    # NOTE(review): an empty salt and a single iteration give almost no
    # key-stretching. Left unchanged because altering either parameter would
    # make previously encrypted data undecryptable.
    key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)

    # Use the first 16 bytes of the derived key as the AES key
    return key[:16]


# Padding block size passed to pad()/unpad(); a multiple of the 16-byte AES block.
BLOCK_SIZE = 32
aes_key = generate_aes_key()
# NOTE(review): ECB mode leaks patterns between identical plaintext blocks.
# Kept as-is for compatibility with data that is already stored.
encryptCipher = AES.new(aes_key, AES.MODE_ECB)
decryptCipher = AES.new(aes_key, AES.MODE_ECB)


# Month-name -> two-digit month tables. They are applied in order with
# str.replace, so a longer form must precede any shorter form it contains.
_ENGLISH_MONTHS = (
    (u"January", "01"), (u"February", "02"), (u"March", "03"),
    (u"April", "04"), (u"May", "05"), (u"June", "06"),
    (u"July", "07"), (u"August", "08"), (u"September", "09"),
    (u"October", "10"), (u"November", "11"), (u"December", "12"),
    (u"Jan", "01"), (u"Feb", "02"), (u"Mar", "03"), (u"Apr", "04"),
    (u"Jun", "06"), (u"Jul", "07"), (u"Aug", "08"), (u"Sep", "09"),
    (u"Oct", "10"), (u"Nov", "11"), (u"Dec", "12"),
)

# March, April and May are not mapped here; convertDate aborts when it sees them.
_FRENCH_MONTHS = (
    (u"janvier", "01"), (u"jan", "01"),
    (u"février", "02"),
    (u"juin", "06"),
    (u"juillet", "07"), (u"juil", "07"),
    (u"août", "08"),
    (u"septembre", "09"), (u"sept", "09"),
    (u"octobre", "10"), (u"oct", "10"),
    (u"novembre", "11"), (u"nov", "11"),
    (u"décembre", "12"), (u"déc", "12"),
)

_SWEDISH_MONTHS = (
    (u"jan", "01"), (u"feb", "02"), (u"mar", "03"), (u"apr", "04"),
    (u"maj", "05"), (u"jun", "06"), (u"jul", "07"), (u"aug", "08"),
    (u"sep", "09"), (u"okt", "10"), (u"nov", "11"), (u"dec", "12"),
)

_RUSSIAN_MONTHS = (
    (u"Январь", "01"), (u"января", "01"), (u"янв", "01"),
    (u"January", "01"), (u"Jan", "01"),
    # "февраля" must come before "фев", otherwise it turns into "02раля".
    (u"февраля", "02"), (u"Февраль", "02"), (u"фев", "02"),
    (u"February", "02"), (u"Feb", "02"),
    # "мар" is the March abbreviation (it used to be mapped to 05).
    (u"Март", "03"), (u"марта", "03"), (u"мар", "03"),
    (u"March", "03"), (u"Mar", "03"),
    (u"Апрель", "04"), (u"апреля", "04"), (u"апр", "04"),
    (u"April", "04"), (u"Apr", "04"),
    (u"май", "05"), (u"Май", "05"), (u"мая", "05"), (u"May", "05"),
    (u"Июнь", "06"), (u"июня", "06"), (u"июн", "06"),
    (u"June", "06"), (u"Jun", "06"),
    (u"Июль", "07"), (u"июля", "07"), (u"июл", "07"),
    (u"July", "07"), (u"Jul", "07"),
    (u"августа", "08"), (u"Август", "08"), (u"авг", "08"),
    (u"August", "08"), (u"Aug", "08"),
    (u"Сентябрь", "09"), (u"сентября", "09"), (u"сен", "09"),
    (u"September", "09"), (u"Sep", "09"),
    (u"октября", "10"), (u"Октябрь", "10"),
    (u"October", "10"), (u"Oct", "10"), (u"окт", "10"),
    (u"Ноябрь", "11"), (u"ноября", "11"), (u"ноя", "11"),
    (u"November", "11"), (u"Nov", "11"),
    (u"Декабрь", "12"), (u"декабря", "12"), (u"дек", "12"),
    (u"December", "12"), (u"Dec", "12"),
)


def _replace_all(text, replacements):
    """Apply each ``(old, new)`` pair to *text* in order and return the result."""
    for old, new in replacements:
        text = text.replace(old, new)
    return text


def convertDate(sdate, language, crawlerDate):
    """Normalise a scraped date string to ``MM/DD/YYYY``.

    Args:
        sdate: the scraped date text. After month names are replaced it must
            read ``month day year`` for "english" and ``day month year`` for
            every other supported language, separated by single spaces.
        language: one of "english", "british", "french", "swedish",
            "russian". Any other value returns *sdate* unchanged.
        crawlerDate: ``date`` or ``datetime`` of the crawl, used to resolve
            relative values such as "Today" / "Yesterday".

    Returns:
        str: the date as ``MM/DD/YYYY``; ``""`` for the Russian marker that
        carries no date; the untouched input for an unknown language.

    Raises:
        ValueError: if the text cannot be parsed after normalisation.
        SystemExit: for French dates in March, April or May, which have no
            mapping yet.
    """
    if language == "english":
        sdate = _replace_all(sdate, _ENGLISH_MONTHS).replace(u".", "")
        if "Today" in sdate:
            sdate = crawlerDate.strftime('%m %d %Y')
        elif "Yesterday" in sdate:
            sdate = (crawlerDate - timedelta(1)).strftime('%m %d %Y')
        sdate = datetime.strptime(str(sdate), '%m %d %Y').strftime('%m/%d/%Y')

    elif language == "british":
        sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    elif language == "french":
        sdate = _replace_all(sdate, _FRENCH_MONTHS).replace(u".", "")
        if sdate == u"Aujourd'hui" or "Today" in sdate:
            sdate = crawlerDate.strftime('%d %m %Y')
        # These months are not in the table yet; stop loudly instead of
        # storing a wrong date.
        if "mar" in sdate:
            print ("Add March to the IBM Black Market")
            raise SystemExit
        elif "avr" in sdate:
            print ("Add April to the IBM Black Market")
            raise SystemExit
        elif "mai" in sdate:
            print ("Add May to the IBM Black Market")
            raise SystemExit
        sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    elif language == "swedish":
        sdate = _replace_all(sdate, _SWEDISH_MONTHS).replace(u".", "")
        if sdate == u"Ig\xe5r":
            # "Igår" means yesterday, so step back one day from the crawl date.
            sdate = (crawlerDate - timedelta(1)).strftime('%d %m %Y')
        elif sdate == u"Idag" or "minuter sedan" in sdate:
            # "today" / "minutes ago"
            sdate = crawlerDate.strftime('%d %m %Y')
        sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    elif language == "russian":
        if u"Вчера" in sdate:  # "yesterday"
            sdate = (crawlerDate - timedelta(1)).strftime('%d %m %Y')
        elif sdate == u"Сегодня":  # "today"
            sdate = crawlerDate.strftime('%d %m %Y')
        elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
            # NOTE(review): looks like a mis-decoded Cyrillic label with no
            # date attached -- confirm; the caller gets an empty string.
            return ""
        sdate = _replace_all(sdate, _RUSSIAN_MONTHS).replace(u".", "")
        sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    return sdate


# Characters kept by cleanText: ASCII letters, digits, space and this punctuation.
_SAFE_TEXT_CHARS = frozenset(
    string.ascii_letters + string.digits + " _/&$#@+-*=:;.,?!{}[]()%`~^|<>"
)
# Characters kept by cleanLink: ASCII letters and digits only.
_SAFE_LINK_CHARS = frozenset(string.ascii_letters + string.digits)


def cleanText(originalText):
    """Strip every character outside the safe set from each string, in place.

    Args:
        originalText: mutable sequence (e.g. list) of strings.

    Returns:
        The same sequence object, with each entry filtered.
    """
    for index, text in enumerate(originalText):
        originalText[index] = ''.join(char for char in text if char in _SAFE_TEXT_CHARS)
    return originalText


def cleanLink(originalLink):
    """Return *originalLink* reduced to its ASCII letters and digits."""
    return ''.join(char for char in originalLink if char in _SAFE_LINK_CHARS)


def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nombre, CVE, MS, category, describe,
                     views, reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href):
    """Build one comma-separated row per product.

    ``vendor`` and ``nombre`` are required for every product; any other
    column may be passed empty, in which case the field is written as "-1".

    Column order: marketplace, vendor, rating_vendor, success_vendor, nombre,
    describe, CVE, MS, category, views, reviews, rating_item, addDate, BTC,
    USD, EURO, sold, qLeft, shipFrom, shipTo, href, timestamp.

    Args:
        marketplace: marketplace name written in the first column.
        nm: number of products, i.e. rows to emit.
        Remaining arguments: per-product sequences of strings.

    Returns:
        list[str]: ``nm`` rows.
    """
    # One strftime call so the date and time parts cannot straddle midnight.
    # NOTE(review): %I is the 12-hour clock and no AM/PM marker is written.
    timestamp = time.strftime("%m/%d/%Y %I:%M:%S")

    optional_columns = (describe, CVE, MS, category, views, reviews, rating_item, addDate,
                        BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)

    def value_or_default(column, n):
        return "-1" if len(column) == 0 else column[n]

    rw = []
    for n in range(nm):
        fields = [
            marketplace,                           # 0
            vendor[n],                             # 1
            value_or_default(rating_vendor, n),    # 2
            value_or_default(success_vendor, n),   # 3
            nombre[n],                             # 4
        ]
        fields.extend(value_or_default(column, n) for column in optional_columns)  # 5-20
        fields.append(timestamp)                   # 21
        rw.append(",".join(fields))

    return rw


# Single-pass translation used by cleanString.
_CLEAN_STRING_TABLE = str.maketrans({
    ",": None,    # remove commas (they are the CSV delimiter)
    "\n": None,   # remove newlines
    "\t": None,   # remove tabs
    "\r": None,   # remove carriage returns
    "'": "^",     # replace apostrophes with a caret
    u"»": None,   # remove arrows
    "!": None,    # remove exclamation points
    ";": None,    # remove semicolons
})


def cleanString(originalString):
    """Return *originalString* with delimiter and control characters removed.

    Commas, newlines, tabs, carriage returns, "»", "!" and ";" are dropped;
    apostrophes are replaced with "^".
    """
    return originalString.translate(_CLEAN_STRING_TABLE)


def checkDateFormat(myString):
    """Match *myString* against an ``MM/DD/YYYY``-shaped prefix.

    Returns:
        re.Match or None. Only the digit ranges are checked, not whether the
        date exists, and trailing text after the date is allowed.
    """
    isDate = re.match(r'[0-1][0-9]\/[0-3][0-9]\/[1-2][0-9]{3}', myString)
    return isDate


_NON_NUMERIC = re.compile(r'[^\d.]+')


def cleanNumbers(inputString):
    """Return *inputString* with everything except digits and "." removed."""
    return _NON_NUMERIC.sub('', inputString)


def aes_encryption(item):
    """Encrypt bytes-like *item* with the module AES cipher.

    The data is padded to ``BLOCK_SIZE`` before encryption.

    Returns:
        bytes: the ciphertext.
    """
    to_bytes = bytes(item)
    return encryptCipher.encrypt(pad(to_bytes, BLOCK_SIZE))


def aes_decryption(item):
    """Decrypt bytes-like *item* produced by ``aes_encryption``.

    Returns:
        bytes: the plaintext with the ``BLOCK_SIZE`` padding removed.

    Raises:
        ValueError: if the length or padding is invalid.
    """
    to_bytes = bytes(item)
    decrypted_bytes = decryptCipher.decrypt(to_bytes)
    return unpad(decrypted_bytes, BLOCK_SIZE)


def encrypt_encode_image_to_base64(driver, xpath):
    """Screenshot the element at *xpath*, encrypt it and base64-encode it.

    Args:
        driver: Selenium WebDriver positioned on the page.
        xpath: XPath of the image element.

    Returns:
        str: base64 text of the encrypted PNG, or None when the element
        cannot be found or captured (best effort).
    """
    try:
        img_element = driver.find_element(by=By.XPATH, value=xpath)
        image_data = img_element.screenshot_as_png

        encrypted_image = aes_encryption(image_data)
        return base64.b64encode(encrypted_image).decode('utf-8')
    except Exception:
        # Best effort: the caller treats None as "image unavailable".
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        return None


def decode_decrypt_image_in_base64(html_content):
    """Debug helper: decrypt and display every embedded ``data:image`` source.

    Failures on an individual image are printed and skipped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    for img_tag in soup.find_all('img'):
        src_attr = img_tag.get('src')

        if src_attr and src_attr.startswith('data:image'):
            try:
                string_image = src_attr.split('base64,')[-1]
                base64_image = bytes(string_image, encoding='utf-8')
                encrypted_image = base64.b64decode(base64_image)
                decrypted_image = aes_decryption(encrypted_image)

                im = Image.open(io.BytesIO(decrypted_image))
                im.show()
            except Exception as e:
                print(e)


def replace_image_sources(driver, html_content):
    """Swap every ``<img>`` source for an encrypted, base64-embedded screenshot.

    ``<source>`` elements inside ``<picture>`` are removed. Each ``<img>``
    gets a ``data:image/png;base64,...`` source built from a screenshot of
    the matching element in *driver*; images that cannot be captured are
    dropped from the document.

    Args:
        driver: Selenium WebDriver showing the page *html_content* came from.
        html_content: HTML text of that page.

    Returns:
        str: the modified HTML.
    """
    tree = lxml.fromstring(html_content)

    for picture_tag in tree.findall('.//picture'):
        for source_tag in picture_tag.findall('.//source'):
            # drop_tree() works for nested descendants too (parent.remove()
            # only accepts direct children) and keeps the tail text.
            source_tag.drop_tree()

    # Resolve all XPaths before touching any <img>: removing one changes the
    # positional index of its later siblings, which would no longer match
    # the live DOM held by the driver.
    root_tree = tree.getroottree()
    img_tags = tree.findall('.//img')
    img_xpaths = [root_tree.getpath(img_tag) for img_tag in img_tags]

    for img_tag, img_xpath in zip(img_tags, img_xpaths):
        string_image = encrypt_encode_image_to_base64(driver, img_xpath)

        if string_image:
            img_tag.set('src', f'data:image/png;base64,{string_image}')
        else:
            # Keep any text that follows the image; a plain remove() would
            # discard the element's tail along with it.
            img_tag.drop_tree()

    modified_html = lxml.tostring(tree, encoding='utf-8').decode('utf-8')

    return modified_html


def cleanHTML(driver, html):
    """Embed images as encrypted data and strip unwanted markup from *html*.

    Args:
        driver: Selenium WebDriver showing the page.
        html: HTML text of the page.

    Returns:
        str: the cleaned HTML.
    """
    clean_html = replace_image_sources(driver, html)
    # decode_decrypt_image_in_base64(clean_html)

    formats = [
        "jpg", "jpeg", "jfif", "pjpeg", "pjp",
        "png", "apng", "svg", "bmp", "gif",
        "avif", "webp", "ico", "cur", "tiff"
    ]

    # NOTE(review): most patterns below are empty strings in this copy of the
    # file, which makes those substitutions no-ops (and leaves `fmat`
    # unused). Presumably the tag-matching patterns were lost along the way --
    # restore them from the upstream source.

    # remove images
    clean_html = re.sub(r"", "", clean_html)
    for fmat in formats:
        clean_html = re.sub(r"", "", clean_html)
        clean_html = re.sub(r"", "", clean_html)

    # remove JavaScript
    clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)

    # image and JavaScript
    clean_html = re.sub(r"]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);", "", clean_html)

    return clean_html