__author__ = 'DarkWeb'
|
|
|
|
import string
|
|
import time
|
|
import re
|
|
import hashlib
|
|
import base64
|
|
import io
|
|
import configparser
|
|
from datetime import datetime, timedelta
|
|
import datetime as fulldatetime
|
|
from bs4 import BeautifulSoup
|
|
from lxml import html as lxml
|
|
from selenium.webdriver.common.by import By
|
|
from Crypto.Cipher import AES
|
|
from Crypto.Util.Padding import pad, unpad
|
|
from PIL import Image
|
|
|
|
|
|
def generate_aes_key():
    """Derive the 16-byte AES key from the secret stored in ``setup.ini``.

    The ``secret`` option of the ``[Encryption]`` section is read from
    ``../../setup.ini`` (a path relative to the current working directory),
    encoded as UTF-8 and run through PBKDF2-HMAC-SHA256 with an empty salt
    and a single iteration.

    Returns:
        bytes: the first 16 bytes of the derived key.

    Raises:
        configparser.Error: if the section or option cannot be found
            (``ConfigParser.read`` itself ignores a missing file).
    """
    parser = configparser.ConfigParser()
    parser.read('../../setup.ini')
    passphrase = bytes(parser.get('Encryption', 'secret'), encoding="utf-8")

    # PBKDF2 over the secret; only the leading 16 bytes are used as the key.
    derived = hashlib.pbkdf2_hmac('sha256', passphrase, bytes(), 1)
    return derived[:16]
|
|
|
|
|
|
# Block size (in bytes) handed to pad()/unpad() by the AES helpers in this file.
BLOCK_SIZE = 32

# The key is derived once, when the module is imported.
aes_key = generate_aes_key()

# Two independent ECB cipher objects built from the same key, one per direction.
decryptCipher = AES.new(aes_key, AES.MODE_ECB)
encryptCipher = AES.new(aes_key, AES.MODE_ECB)
|
|
|
|
|
|
def cleanText(originalText):
    """Strip every non-whitelisted character from each entry, in place.

    Args:
        originalText: mutable sequence of strings; each element is replaced
            by a copy that keeps only ASCII letters, digits, the space and
            the punctuation listed in the whitelist below.

    Returns:
        The same sequence object that was passed in, after modification.
    """
    allowed = (
        string.ascii_letters
        + string.digits
        + " _/&$#@+-*=:;.,?!{}[]()%`~^|<>"
    )

    for position, entry in enumerate(originalText):
        originalText[position] = ''.join(ch for ch in entry if ch in allowed)

    return originalText
|
|
|
|
|
|
# Month-name substitution tables. Pairs are applied strictly in order, so a
# longer spelling must come before any abbreviation that is a prefix of it.
_ENGLISH_MONTHS = (
    (u"January", "01"), (u"February", "02"), (u"March", "03"),
    (u"April", "04"), (u"May", "05"), (u"June", "06"),
    (u"July", "07"), (u"August", "08"), (u"September", "09"),
    (u"October", "10"), (u"November", "11"), (u"December", "12"),
    (u"Jan", "01"), (u"Feb", "02"), (u"Mar", "03"), (u"Apr", "04"),
    (u"Jun", "06"), (u"Jul", "07"), (u"Aug", "08"), (u"Sep", "09"),
    (u"Oct", "10"), (u"Nov", "11"), (u"Dec", "12"),
)

_FRENCH_MONTHS = (
    (u"janvier", "01"), (u"jan", "01"), (u"février", "02"),
    (u"juin", "06"), (u"juillet", "07"), (u"juil", "07"),
    (u"août", "08"), (u"septembre", "09"), (u"sept", "09"),
    (u"octobre", "10"), (u"oct", "10"), (u"novembre", "11"),
    (u"nov", "11"), (u"décembre", "12"), (u"déc", "12"),
)

_SWEDISH_MONTHS = (
    (u"jan", "01"), (u"feb", "02"), (u"mar", "03"), (u"apr", "04"),
    (u"maj", "05"), (u"jun", "06"), (u"jul", "07"), (u"aug", "08"),
    (u"sep", "09"), (u"okt", "10"), (u"nov", "11"), (u"dec", "12"),
)

_RUSSIAN_MONTHS = (
    (u"января", "01"), (u"янв", "01"),
    (u"февраля", "02"), (u"Февраль", "02"), (u"фев", "02"),
    (u"марта", "03"), (u"мар", "03"),
    (u"апреля", "04"), (u"апр", "04"),
    (u"май", "05"), (u"мая", "05"),
    (u"июня", "06"), (u"июн", "06"),
    (u"июля", "07"), (u"июл", "07"),
    (u"августа", "08"), (u"авг", "08"),
    (u"сентября", "09"), (u"сен", "09"),
    (u"октября", "10"), (u"Октябрь", "10"), (u"окт", "10"),
    (u"ноября", "11"), (u"ноя", "11"),
    (u"декабря", "12"), (u"дек", "12"),
)

# French months that have no mapping yet; hitting one aborts the run.
_FRENCH_UNSUPPORTED = (("mar", "March"), ("avr", "April"), ("mai", "May"))


def _substitute_months(text, table):
    """Replace each month name in *table* by its number, then drop all periods."""
    for name, number in table:
        text = text.replace(name, number)
    return text.replace(u".", "")


def convertDate(sdate, language, crawlerDate):
    """Normalise a scraped, localised date string to ``MM/DD/YYYY``.

    Month names are replaced by two-digit numbers and periods are removed;
    the result is then parsed as ``month day year`` (English) or
    ``day month year`` (French, Swedish, Russian), space separated.

    Args:
        sdate: the date text as scraped.
        language: ``"english"``, ``"french"``, ``"swedish"`` or ``"russian"``.
            Any other value returns *sdate* unchanged.
        crawlerDate: ``date``/``datetime`` of the crawl, used to resolve
            "Today at", "Aujourd'hui" and "Вчера" (yesterday).

    Returns:
        str: the date as ``MM/DD/YYYY``; ``""`` for the Russian marker
        handled below; *sdate* itself for an unknown language.

    Raises:
        ValueError: if the substituted text does not match the expected layout.
        SystemExit: for French dates containing "mar", "avr" or "mai", which
            have no mapping yet (a message is printed first).
    """
    if language == "english":
        sdate = _substitute_months(sdate, _ENGLISH_MONTHS)
        if sdate == "Today at":
            sdate = crawlerDate.strftime('%m %d %Y')
        return datetime.strptime(str(sdate), '%m %d %Y').strftime('%m/%d/%Y')

    if language == "french":
        sdate = _substitute_months(sdate, _FRENCH_MONTHS)
        if sdate == u"Aujourd'hui":
            sdate = crawlerDate.strftime('%d %m %Y')

        # NOTE(review): March/April/May are deliberately unmapped for French;
        # the process is stopped so the mapping gets added.
        for marker, month in _FRENCH_UNSUPPORTED:
            if marker in sdate:
                print("Add " + month + " to the IBM Black Market")
                raise SystemExit

        return datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    if language == "swedish":
        sdate = _substitute_months(sdate, _SWEDISH_MONTHS)
        return datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    if language == "russian":
        if sdate == u'\u0412\u0447\u0435\u0440\u0430':  # "Вчера" (yesterday)
            # Yesterday is relative to the crawl date, not to the wall clock,
            # and must work for both date and datetime objects.
            sdate = (crawlerDate - timedelta(days=1)).strftime('%d %m %Y')
        elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
            # NOTE(review): these code points look like cp1251 "Создано"
            # read as latin-1 -- confirm whether the correctly decoded word
            # should be matched as well.
            return ""

        sdate = _substitute_months(sdate, _RUSSIAN_MONTHS)
        return datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    return sdate
|
|
|
|
|
|
def cleanLink(originalLink):
    """Return *originalLink* with everything but ASCII letters and digits removed."""
    alphanumerics = string.ascii_letters + string.digits
    kept = [symbol for symbol in originalLink if symbol in alphanumerics]
    return ''.join(kept)
|
|
|
|
|
|
def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author):
    """Build one comma-separated record per topic of a forum listing page.

    Column layout of every record:
        0 forum, 1 board, 2 author, 3 topic, 4 views, 5 posts, 6 href,
        7 date added, 8 timestamp of this call, 9 image_user,
        10 name_user, 11 status_user, 12 reputation_user, 13 interest_user,
        14 signature_user, 15 content_post, 16 feedback_post,
        17 dateadded_post, 18 image_post.

    Columns 4-7 and 9 fall back to "-1" when the matching input list is
    empty; columns 10-18 are always "-1".

    Args:
        forum: forum name (str).
        nm: number of records to build; the per-topic lists are indexed
            from 0 to ``nm - 1``.
        board: board name (str).
        author, topic, views, posts, href: per-topic lists of strings.
        addDate, image_author: per-topic lists; items are passed through ``str``.

    Returns:
        list[str]: ``nm`` records.
    """
    now = datetime.now()
    # 12-hour clock without an AM/PM marker, as stored in column 8.
    stamp = now.strftime("%m/%d/%Y") + " " + now.strftime("%I:%M:%S")

    # Columns 10-18 are never known at listing level.
    unknown_tail = ["-1"] * 9

    records = []
    for i in range(nm):
        fields = [
            forum,                                                    # 0
            board,                                                    # 1
            author[i],                                                # 2
            topic[i],                                                 # 3
            "-1" if len(views) == 0 else views[i],                    # 4
            "-1" if len(posts) == 0 else posts[i],                    # 5
            "-1" if len(href) == 0 else href[i],                      # 6
            "-1" if len(addDate) == 0 else str(addDate[i]),           # 7
            stamp,                                                    # 8
            "-1" if len(image_author) == 0 else str(image_author[i]), # 9 image_user
        ]
        records.append(",".join(fields + unknown_tail))

    return records
|
|
|
|
|
|
def cleanString(originalString):
    """Remove separator-like characters so the text is safe inside a record.

    Commas, newlines, tabs, carriage returns, the "»" arrow, exclamation
    marks and semicolons are deleted; every apostrophe becomes "^".

    Returns:
        str: the cleaned copy of *originalString*.
    """
    substitutions = str.maketrans({
        ",": None,      # commas
        "\n": None,     # newlines
        "\t": None,     # tabs
        "\r": None,     # carriage returns
        "'": "^",       # apostrophes are swapped, not dropped
        u"»": None,     # arrows
        "!": None,      # exclamation marks
        ";": None,      # semicolons
    })
    return originalString.translate(substitutions)
|
|
|
|
|
|
def convertFromLongDate(longDate, crawlerdate):
    """Convert a relative age such as ``"3 days 2 hours ago"`` to ``MM/DD/YYYY``.

    The text is split on whitespace; for each of the unit words ``years``,
    ``weeks``, ``days``, ``hours``, ``minutes`` and ``seconds`` that occurs,
    the word immediately before it is read as the amount. A year counts as
    365 days. The resulting span is subtracted from *crawlerdate*.

    Args:
        longDate: the relative date text.
        crawlerdate: ``date``/``datetime`` the age is relative to.

    Returns:
        str: the resulting date formatted as ``MM/DD/YYYY``.

    Raises:
        ValueError: if the word preceding a unit is not a number.
    """
    words = longDate.split()

    def amount(unit):
        # The quantity is the token right before the unit word; 0 if absent.
        if unit not in words:
            return 0
        return float(words[words.index(unit) - 1])

    # Days must be counted whether or not a year component is present.
    total_days = amount('days') + 365 * amount('years')

    span = fulldatetime.timedelta(
        days=total_days,
        weeks=amount('weeks'),
        hours=amount('hours'),
        seconds=amount('seconds'),
        minutes=amount('minutes'),
    )

    return str((crawlerdate - span).strftime('%m/%d/%Y'))
|
|
|
|
|
|
def cleanNumbers(inputString):
    """Return *inputString* with every run of non-digit, non-period characters removed."""
    return re.sub(r'[^\d.]+', '', inputString)
|
|
|
|
|
|
def aes_encryption(item):
    """Encrypt *item* with the module-level ``encryptCipher``.

    *item* is converted with ``bytes(item)``, padded to ``BLOCK_SIZE`` and
    encrypted; the ciphertext bytes are returned.
    """
    padded = pad(bytes(item), BLOCK_SIZE)
    return encryptCipher.encrypt(padded)
|
|
|
|
|
|
def aes_decryption(item):
    """Decrypt *item* with the module-level ``decryptCipher``.

    *item* is converted with ``bytes(item)`` and decrypted; the result is
    returned with its ``BLOCK_SIZE`` padding removed.
    """
    padded_plaintext = decryptCipher.decrypt(bytes(item))
    return unpad(padded_plaintext, BLOCK_SIZE)
|
|
|
|
|
|
def encrypt_encode_image_to_base64(driver, xpath):
    """Screenshot one page element and return it encrypted and base64 encoded.

    Args:
        driver: Selenium WebDriver showing the page.
        xpath: XPath locating the element to capture.

    Returns:
        str | None: base64 text of the AES-encrypted PNG screenshot, or
        ``None`` when the element cannot be located, captured or encrypted.
    """
    try:
        img_element = driver.find_element(by=By.XPATH, value=xpath)
        image_data = img_element.screenshot_as_png

        encrypted_image = aes_encryption(image_data)
        return base64.b64encode(encrypted_image).decode('utf-8')
    except Exception:
        # Best effort: any failure is reported to the caller as None.
        # Only Exception is caught, so KeyboardInterrupt and SystemExit
        # still propagate.
        return None
|
|
|
|
|
|
def decode_decrypt_image_in_base64(string_image):
    """Decode, decrypt and display an image stored as encrypted base64 text.

    Args:
        string_image: base64 text of an AES-encrypted image.

    Returns:
        bytes | None: the decrypted image bytes, or ``None`` if any step
        raised an exception (the exception is printed).

    Side effects:
        Opens the decrypted image with PIL and calls ``show()`` on it.
    """
    try:
        ciphertext = base64.b64decode(bytes(string_image, encoding='utf-8'))
        decrypted_image = aes_decryption(ciphertext)

        Image.open(io.BytesIO(decrypted_image)).show()
    except Exception as e:
        print(e)
        return None

    return decrypted_image
|
|
|
|
|
|
def replace_image_sources(driver, html_content):
    """Inline every ``<img>`` of a page as an encrypted base64 data URI.

    The markup is parsed with ``lxml.html``. All ``<source>`` children of
    ``<picture>`` elements are removed. For each ``<img>``, its XPath in the
    parsed tree is passed to ``encrypt_encode_image_to_base64`` to capture
    the element from the live page in *driver*; on success ``src`` is
    replaced by a ``data:image/png;base64,...`` URI, otherwise the ``<img>``
    is removed.

    Args:
        driver: Selenium WebDriver currently displaying the page.
        html_content: HTML source of that page.

    Returns:
        str: the modified HTML, serialised as UTF-8 text.
    """
    tree = lxml.fromstring(html_content)
    root_tree = tree.getroottree()

    # Capture every image's XPath BEFORE touching the tree. The paths carry
    # positional indices (e.g. ".../img[2]"), and removing an earlier sibling
    # would shift them away from the element positions in the browser DOM.
    images = [(img_tag, root_tree.getpath(img_tag))
              for img_tag in tree.findall('.//img')]

    for picture_tag in tree.findall('.//picture'):
        for source_tag in picture_tag.findall('.//source'):
            # drop_tree() keeps the text that follows the element, whereas
            # parent.remove() discards it together with the element.
            source_tag.drop_tree()

    for img_tag, img_xpath in images:
        string_image = encrypt_encode_image_to_base64(driver, img_xpath)

        if string_image:
            img_tag.set('src', f'data:image/png;base64,{string_image}')
        else:
            img_tag.drop_tree()

    return lxml.tostring(tree, encoding='utf-8').decode('utf-8')
|
|
|
|
|
|
def cleanHTML(driver, html):
    """Return *html* with images inlined and active or visual content removed.

    Images are first inlined by ``replace_image_sources``. The following are
    then deleted with regular expressions: ``<svg>`` and ``<canvas>``
    elements, ``<object>`` elements mentioning an image format,
    ``<script>`` and ``<iframe>`` elements, scriptable ``<object>``,
    ``<applet>`` and ``<embed>`` elements, and ``<div>`` elements whose
    inline style sets a ``background-image``.

    Args:
        driver: Selenium WebDriver currently displaying the page.
        html: HTML source of that page.

    Returns:
        str: the cleaned HTML.
    """
    clean_html = replace_image_sources(driver, html)

    formats = [
        "jpg", "jpeg", "jfif", "pjpeg", "pjp",
        "png", "apng", "svg", "bmp", "gif",
        "avif", "webp", "ico", "cur", "tiff",
    ]

    # remove images
    clean_html = re.sub(r"<svg[\s\S]*?svg>", "", clean_html)
    for fmat in formats:
        # Both halves are raw strings so "\s" reaches the regex engine intact.
        clean_html = re.sub(r"<object.*" + fmat + r"[\s\S]*?object>", "", clean_html)
    clean_html = re.sub(r"<canvas[\s\S]*?canvas>", "", clean_html)

    # remove JavaScript
    clean_html = re.sub(r"<script[\s\S]*?script>", "", clean_html)
    clean_html = re.sub(r"<iframe[\s\S]*?iframe>", "", clean_html)
    clean_html = re.sub(r"<object.*javascript[\s\S]*?object>", "", clean_html)
    # The HTML element is spelled "applet"; "aplet" never matched anything.
    clean_html = re.sub(r"<applet.*mayscript[\s\S]*?applet>", "", clean_html)
    clean_html = re.sub(r"<embed.*scriptable[\s\S]*?embed>", "", clean_html)

    # image and JavaScript
    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image[\s\S]*?div>", "", clean_html)

    return clean_html
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|