__author__ = 'DarkWeb'

import string
import time
import re
import hashlib
import base64
import io
import configparser
from datetime import datetime, timedelta
import datetime as fulldatetime
from bs4 import BeautifulSoup
from lxml import html as lxml
from selenium.webdriver.common.by import By
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from PIL import Image


def generate_aes_key():
    """Derive the 16-byte AES key from the secret stored in ``setup.ini``.

    The secret is read from the ``[Encryption]`` section of
    ``../../setup.ini`` (relative to the current working directory) and run
    through PBKDF2-HMAC-SHA256; the first 16 bytes of the digest are returned.

    NOTE(review): the derivation uses an empty salt and a single iteration,
    which provides essentially no key stretching. It is left unchanged on
    purpose because altering it would make previously encrypted data
    undecryptable.
    """
    config = configparser.ConfigParser()
    config.read('../../setup.ini')

    secret = config.get('Encryption', 'secret')
    secret_bytes = bytes(secret, encoding="utf-8")

    # Derive a key from the seed using PBKDF2
    key = hashlib.pbkdf2_hmac(hash_name='sha256', password=secret_bytes, salt=bytes(), iterations=1)

    # Use the first 16 bytes of the derived key as the AES key
    return key[:16]


# Padding granularity passed to pad()/unpad(). It is a multiple of the AES
# block size (16), so the padded data is still valid cipher input.
BLOCK_SIZE = 32
aes_key = generate_aes_key()
# NOTE(review): ECB mode leaks plaintext patterns; kept for compatibility
# with data that has already been encrypted with these ciphers.
encryptCipher = AES.new(aes_key, AES.MODE_ECB)
decryptCipher = AES.new(aes_key, AES.MODE_ECB)

# Characters cleanText() keeps; everything else is dropped.
_SAFE_TEXT_CHARS = string.ascii_letters + string.digits + " _/&$#@+-*=:;.,?!{}[]()%`~^|<>"

# Characters cleanLink() keeps.
_SAFE_LINK_CHARS = string.ascii_letters + string.digits

# Month-name -> month-number replacements, applied in order. Longer names
# come before their abbreviations so that e.g. "January" is not turned into
# "01uary" by the "Jan" rule.
_ENGLISH_MONTHS = (
    (u"January", "01"), (u"February", "02"), (u"March", "03"),
    (u"April", "04"), (u"May", "05"), (u"June", "06"),
    (u"July", "07"), (u"August", "08"), (u"September", "09"),
    (u"October", "10"), (u"November", "11"), (u"December", "12"),
    (u"Jan", "01"), (u"Feb", "02"), (u"Mar", "03"), (u"Apr", "04"),
    (u"Jun", "06"), (u"Jul", "07"), (u"Aug", "08"), (u"Sep", "09"),
    (u"Oct", "10"), (u"Nov", "11"), (u"Dec", "12"),
)

# March, April and May are intentionally absent: convertDate() aborts when it
# meets them (see the guard in the french branch).
_FRENCH_MONTHS = (
    (u"janvier", "01"), (u"jan", "01"),
    (u"février", "02"),
    (u"juin", "06"),
    (u"juillet", "07"), (u"juil", "07"),
    (u"août", "08"),
    (u"septembre", "09"), (u"sept", "09"),
    (u"octobre", "10"), (u"oct", "10"),
    (u"novembre", "11"), (u"nov", "11"),
    (u"décembre", "12"), (u"déc", "12"),
)

_SWEDISH_MONTHS = (
    (u"jan", "01"), (u"feb", "02"), (u"mar", "03"), (u"apr", "04"),
    (u"maj", "05"), (u"jun", "06"), (u"jul", "07"), (u"aug", "08"),
    (u"sep", "09"), (u"okt", "10"), (u"nov", "11"), (u"dec", "12"),
)

_RUSSIAN_MONTHS = (
    (u"января", "01"), (u"янв", "01"),
    (u"февраля", "02"), (u"Февраль", "02"), (u"фев", "02"),
    (u"марта", "03"),
    (u"апреля", "04"), (u"апр", "04"),
    (u"мар", "03"),  # March abbreviation (was wrongly mapped to "05")
    (u"май", "05"), (u"мая", "05"),
    (u"июня", "06"), (u"июн", "06"),
    (u"июля", "07"), (u"июл", "07"),
    (u"августа", "08"), (u"авг", "08"),
    (u"сентября", "09"), (u"сен", "09"),
    (u"октября", "10"), (u"Октябрь", "10"), (u"окт", "10"),
    (u"ноября", "11"), (u"ноя", "11"),
    (u"декабря", "12"), (u"дек", "12"),
)

# Single-pass equivalent of the character replacements done by cleanString().
_CLEAN_STRING_TABLE = str.maketrans({
    ",": None,    # commas
    "\n": None,   # newlines
    "\t": None,   # tabs
    "\r": None,   # carriage returns
    "'": "^",     # apostrophes become carets
    u"»": None,   # arrows
    "!": None,    # exclamation marks
    ";": None,    # semicolons
})

_NON_NUMERIC = re.compile(r'[^\d.]+')


def cleanText(originalText):
    """Strip every character outside the safe set from each string in a list.

    The list is modified in place and also returned.
    """
    for index, text in enumerate(originalText):
        originalText[index] = ''.join(char for char in text if char in _SAFE_TEXT_CHARS)
    return originalText


def _replace_months(sdate, month_table):
    """Apply the (name, number) replacements in order and drop all periods."""
    for name, number in month_table:
        sdate = sdate.replace(name, number)
    return sdate.replace(u".", "")


def convertDate(sdate, language, crawlerDate):
    """Convert a localized date string to ``MM/DD/YYYY``.

    :param sdate: date text as scraped from the page.
    :param language: one of ``"english"``, ``"french"``, ``"swedish"`` or
        ``"russian"``; for any other value ``sdate`` is returned unchanged.
    :param crawlerDate: date (or datetime) of the crawl, used to resolve
        relative values such as "Today at", "Aujourd'hui" and "Вчера".
    :return: the date formatted as ``MM/DD/YYYY``, or ``""`` for the russian
        marker string that carries no date.
    :raises ValueError: if the text does not match the expected layout after
        month substitution.
    :raises SystemExit: for french dates containing a month that has no
        mapping yet (March, April, May).
    """
    if language == "english":
        sdate = _replace_months(sdate, _ENGLISH_MONTHS)

        if sdate == "Today at":
            sdate = crawlerDate.strftime('%m %d %Y')

        sdate = datetime.strptime(str(sdate), '%m %d %Y').strftime('%m/%d/%Y')

    elif language == "french":
        sdate = _replace_months(sdate, _FRENCH_MONTHS)

        if sdate == u"Aujourd'hui":
            sdate = crawlerDate.strftime('%d %m %Y')

        # These months have no mapping yet; stop loudly rather than emit a
        # wrong date.
        if "mar" in sdate:
            print ("Add March to the IBM Black Market")
            raise SystemExit
        elif "avr" in sdate:
            print ("Add April to the IBM Black Market")
            raise SystemExit
        elif "mai" in sdate:
            print ("Add May to the IBM Black Market")
            raise SystemExit

        sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    elif language == "swedish":
        sdate = _replace_months(sdate, _SWEDISH_MONTHS)
        sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    elif language == "russian":
        if sdate == u'\u0412\u0447\u0435\u0440\u0430':
            # "Yesterday": resolve relative to the crawl date. Formatting the
            # date object directly works for both date and datetime inputs.
            sdate = (crawlerDate - timedelta(1)).strftime('%d %m %Y')
        elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
            # Marker text that carries no date.
            return ""

        sdate = _replace_months(sdate, _RUSSIAN_MONTHS)
        sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    return sdate


def cleanLink(originalLink):
    """Return the link with everything but ASCII letters and digits removed."""
    return ''.join(char for char in originalLink if char in _SAFE_LINK_CHARS)


def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author):
    """Build one comma-separated row per topic.

    :param forum: forum name, written as column 0 of every row.
    :param nm: number of topics (rows) to produce.
    :param board: board name, written as column 1 of every row.
    :param author, topic: per-topic string lists (columns 2 and 3).
    :param views, posts, href, addDate, image_author: per-topic lists; when a
        list is empty the column is filled with ``"-1"``.
    :return: list of ``nm`` strings with 19 columns each. Columns 10-18 are
        user/post fields that are always ``"-1"`` here.
    """
    current_time = datetime.now()
    # Column 8: timestamp at which the row was produced (12-hour clock).
    added_on = current_time.strftime("%m/%d/%Y") + " " + current_time.strftime("%I:%M:%S")

    rows = []
    for n in range(nm):
        fields = [
            forum,                                                      # 0
            board,                                                      # 1
            author[n],                                                  # 2
            topic[n],                                                   # 3
            "-1" if len(views) == 0 else views[n],                      # 4
            "-1" if len(posts) == 0 else posts[n],                      # 5
            "-1" if len(href) == 0 else href[n],                        # 6
            "-1" if len(addDate) == 0 else str(addDate[n]),             # 7
            added_on,                                                   # 8
            "-1" if len(image_author) == 0 else str(image_author[n]),   # 9 image_user
            "-1",   # 10 name_user
            "-1",   # 11 status_user
            "-1",   # 12 reputation_user
            "-1",   # 13 interest_user
            "-1",   # 14 signature_user
            "-1",   # 15 content_post
            "-1",   # 16 feedback_post
            "-1",   # 17 dateadded_post
            "-1",   # 18 image_post
        ]
        rows.append(",".join(fields))
    return rows


def cleanString(originalString):
    """Remove characters that would break the comma-separated output.

    Commas, newlines, tabs, carriage returns, ``»``, ``!`` and ``;`` are
    removed; apostrophes are replaced with ``^``.
    """
    return originalString.translate(_CLEAN_STRING_TABLE)


#function to convert long informal date string to formal date
def convertFromLongDate(longDate, crawlerdate):
    """Convert text such as ``"3 days 2 hours ago"`` to ``MM/DD/YYYY``.

    Each recognised unit (``years``, ``weeks``, ``days``, ``hours``,
    ``minutes``, ``seconds``) must be preceded by its numeric amount. The
    total is subtracted from ``crawlerdate``; a year counts as 365 days.

    :raises ValueError: if the word before a unit is not a number.
    """
    words = longDate.split()

    def amount(unit):
        # The quantity is the word immediately before the unit name.
        if unit in words:
            return float(words[words.index(unit) - 1])
        return 0

    # Days are always counted, whether or not a year component is present.
    total_days = amount('days') + 365 * amount('years')

    elapsed = timedelta(days=total_days, weeks=amount('weeks'), hours=amount('hours'),
                        seconds=amount('seconds'), minutes=amount('minutes'))

    return str((crawlerdate - elapsed).strftime('%m/%d/%Y'))


def cleanNumbers(inputString):
    """Return the string with everything except digits and periods removed."""
    return _NON_NUMERIC.sub('', inputString)


def aes_encryption(item):
    """Pad the bytes-like ``item`` to BLOCK_SIZE and encrypt it."""
    to_bytes = bytes(item)
    return encryptCipher.encrypt(pad(to_bytes, BLOCK_SIZE))


def aes_decryption(item):
    """Decrypt the bytes-like ``item`` and strip its BLOCK_SIZE padding."""
    to_bytes = bytes(item)
    decrypted_bytes = decryptCipher.decrypt(to_bytes)
    return unpad(decrypted_bytes, BLOCK_SIZE)


def encrypt_encode_image_to_base64(driver, xpath):
    """Screenshot the element at ``xpath``, encrypt it and base64-encode it.

    Best effort: returns ``None`` when the element cannot be located or
    captured, so the caller can drop the image instead.
    """
    try:
        img_element = driver.find_element(by=By.XPATH, value=xpath)
        image_data = img_element.screenshot_as_png

        encrypted_image = aes_encryption(image_data)
        return base64.b64encode(encrypted_image).decode('utf-8')
    except Exception:
        # Deliberately swallowed, but no longer hides KeyboardInterrupt or
        # SystemExit the way a bare ``except`` did.
        return None


def decode_decrypt_image_in_base64(string_image):
    """Decode and decrypt a base64 image string, display it, return its bytes.

    Returns ``None`` (after printing the error) when any step fails.
    """
    try:
        base64_image = bytes(string_image, encoding='utf-8')
        encrypted_image = base64.b64decode(base64_image)
        decrypted_image = aes_decryption(encrypted_image)

        im = Image.open(io.BytesIO(decrypted_image))
        im.show()

        return decrypted_image
    except Exception as e:
        print(e)

    return None


def replace_image_sources(driver, html_content):
    """Inline every ``<img>`` as an encrypted base64 ``data:`` URI.

    ``<source>`` children of ``<picture>`` elements are removed. Each
    ``<img>`` is captured through ``driver``; images that cannot be captured
    are removed from the document.

    :return: the modified HTML as a string.
    """
    tree = lxml.fromstring(html_content)

    for picture_tag in tree.findall('.//picture'):
        for source_tag in picture_tag.findall('.//source'):
            # The source may be nested deeper than a direct child, so remove
            # it through its own parent.
            source_tag.getparent().remove(source_tag)

    # Resolve all XPaths before mutating the tree: removing an <img> shifts
    # the positional index of its later siblings.
    root_tree = tree.getroottree()
    img_tags = tree.findall('.//img')
    img_xpaths = [root_tree.getpath(img_tag) for img_tag in img_tags]

    for img_tag, img_xpath in zip(img_tags, img_xpaths):
        string_image = encrypt_encode_image_to_base64(driver, img_xpath)

        if string_image:
            img_tag.set('src', f'data:image/png;base64,{string_image}')
        else:
            # drop_tree() keeps the text that follows the image, which
            # parent.remove() would discard together with the element.
            img_tag.drop_tree()

    return lxml.tostring(tree, encoding='utf-8').decode('utf-8')


def cleanHTML(driver, html):
    """Inline images and strip remaining image/script markup from the HTML."""
    clean_html = replace_image_sources(driver, html)

    formats = [
        "jpg", "jpeg", "jfif", "pjpeg", "pjp",
        "png", "apng", "svg", "bmp", "gif",
        "avif", "webp", "ico", "cur", "tiff"
    ]

    # NOTE(review): the patterns below are empty, so these substitutions
    # currently change nothing and ``fmat`` is unused. They look like
    # tag-stripping expressions whose text was lost; restore them from
    # version control.

    # remove images
    clean_html = re.sub(r"", "", clean_html)
    for fmat in formats:
        clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)

    # remove JavaScript
    clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)

    # image and JavaScript: drop <div> elements styled with a background
    # image. The pattern had lost its leading "<div[^>" and matched from the
    # middle of the tag, leaving a dangling "<div" behind.
    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html)

    return clean_html