import string
import time
import re
import hashlib
import imghdr
import base64
import requests
import io
import urllib.parse as urlparse
from datetime import datetime, timedelta
import datetime as fulldatetime
from bs4 import BeautifulSoup
from lxml import html as lxml
from selenium.webdriver.common.by import By
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from PIL import Image
def generate_aes_key():
    """Derive the 16-byte AES key used for image encryption.

    The key is the first 16 bytes of a PBKDF2-HMAC-SHA256 digest of a fixed
    passphrase.

    NOTE(review): the passphrase is hard-coded, the salt is empty and only a
    single PBKDF2 iteration is used, so the key is effectively a constant.
    Changing any of these would make previously encrypted data unreadable,
    so they are left untouched here.

    Returns:
        bytes: the 16-byte AES-128 key.
    """
    # Imported but not referenced in the visible code; kept because the
    # import itself may have side effects the project relies on.
    from Forums.Initialization.forums_mining import config

    passphrase = "password".encode("utf-8")
    # PBKDF2 yields 32 bytes for sha256; AES-128 only needs the first 16.
    derived = hashlib.pbkdf2_hmac("sha256", passphrase, b"", 1)
    return derived[:16]
# Module-level AES state, created once when this module is imported.
aes_key = generate_aes_key()
# Two independent AES cipher objects in ECB mode: one used only for
# encrypting, one only for decrypting.
# NOTE(review): presumably kept separate because a cipher object cannot be
# reused for the opposite direction once it has been used -- confirm against
# the Crypto library documentation.
encryptCipher = AES.new(aes_key, AES.MODE_ECB)
decryptCipher = AES.new(aes_key, AES.MODE_ECB)
def cleanText(originalText):
    """Drop every character outside a fixed ASCII whitelist from each entry.

    The whitelist is ASCII letters, digits, the space and a set of common
    punctuation marks.  Entries are overwritten in place.

    Args:
        originalText: mutable sequence of strings (e.g. a list).

    Returns:
        The very same ``originalText`` object, after cleaning.
    """
    allowed = string.ascii_letters + string.digits + " _/&$#@+-*=:;.,?!{}[]()%`~^|<>"
    for position, entry in enumerate(originalText):
        kept = (symbol for symbol in entry if symbol in allowed)
        originalText[position] = "".join(kept)
    return originalText
# Ordered (token, month number) pairs per language.  Order matters: a full
# month name must be replaced before its abbreviation, otherwise the
# abbreviation would match inside the full name and leave a stray suffix.
_MONTH_TOKENS = {
    "english": (
        ("January", "01"), ("February", "02"), ("March", "03"),
        ("April", "04"), ("May", "05"), ("June", "06"), ("July", "07"),
        ("August", "08"), ("September", "09"), ("October", "10"),
        ("November", "11"), ("December", "12"),
        ("Jan", "01"), ("Feb", "02"), ("Mar", "03"), ("Apr", "04"),
        ("Jun", "06"), ("Jul", "07"), ("Aug", "08"), ("Sep", "09"),
        ("Oct", "10"), ("Nov", "11"), ("Dec", "12"),
    ),
    "french": (
        ("janvier", "01"), ("jan", "01"),
        ("février", "02"),
        ("mars", "03"),
        ("avril", "04"), ("avr", "04"),
        ("mai", "05"),
        ("juin", "06"),
        ("juillet", "07"), ("juil", "07"),
        ("août", "08"),
        ("septembre", "09"), ("sept", "09"),
        ("octobre", "10"), ("oct", "10"),
        ("novembre", "11"), ("nov", "11"),
        ("décembre", "12"), ("déc", "12"),
    ),
    "swedish": (
        ("jan", "01"), ("feb", "02"), ("mar", "03"), ("apr", "04"),
        ("maj", "05"), ("jun", "06"), ("jul", "07"), ("aug", "08"),
        ("sep", "09"), ("okt", "10"), ("nov", "11"), ("dec", "12"),
    ),
    "russian": (
        ("января", "01"), ("янв", "01"),
        ("февраля", "02"), ("Февраль", "02"), ("фев", "02"),
        ("марта", "03"), ("мар", "03"),
        ("апреля", "04"), ("апр", "04"),
        ("май", "05"), ("мая", "05"),
        ("июня", "06"), ("июн", "06"),
        ("июля", "07"), ("июл", "07"),
        ("августа", "08"), ("авг", "08"),
        ("сентября", "09"), ("сен", "09"),
        ("октября", "10"), ("Октябрь", "10"), ("окт", "10"),
        ("ноября", "11"), ("ноя", "11"),
        ("декабря", "12"), ("дек", "12"),
    ),
}


def _replace_month_tokens(text, language):
    """Swap month names of ``language`` for two-digit numbers and drop dots."""
    for token, number in _MONTH_TOKENS[language]:
        text = text.replace(token, number)
    return text.replace(".", "")


def convertDate(sdate, language, crawlerDate):
    """Normalise a forum date string to ``MM/DD/YYYY``.

    Month names of the given language are replaced by their number, dots are
    removed, and the result is parsed as ``month day year`` (english) or
    ``day month year`` (french, swedish, russian).

    Args:
        sdate: date text scraped from the page.
        language: one of ``"english"``, ``"french"``, ``"swedish"``,
            ``"russian"``.  Any other value returns ``sdate`` unchanged.
        crawlerDate: date/datetime of the crawl; used to resolve the relative
            values "Today at" (english), "Aujourd'hui" (french) and
            "Вчера" / yesterday (russian).

    Returns:
        The date formatted as ``MM/DD/YYYY``; ``""`` for the russian marker
        checked below; the untouched input for an unknown language.

    Raises:
        ValueError: if the text cannot be parsed after month substitution.
    """
    if language == "english":
        sdate = _replace_month_tokens(sdate, language)
        if sdate == "Today at":
            sdate = crawlerDate.strftime("%m %d %Y")
        return datetime.strptime(str(sdate), "%m %d %Y").strftime("%m/%d/%Y")

    if language == "french":
        # March, April and May used to abort the whole process with
        # SystemExit; they are now converted like every other month.
        sdate = _replace_month_tokens(sdate, language)
        if sdate == u"Aujourd'hui":
            sdate = crawlerDate.strftime("%d %m %Y")
        return datetime.strptime(str(sdate), "%d %m %Y").strftime("%m/%d/%Y")

    if language == "swedish":
        sdate = _replace_month_tokens(sdate, language)
        return datetime.strptime(str(sdate), "%d %m %Y").strftime("%m/%d/%Y")

    if language == "russian":
        if sdate == u'\u0412\u0447\u0435\u0440\u0430':
            # "Yesterday" is relative to the crawl date, not to the wall
            # clock at the time this function happens to run.
            sdate = (crawlerDate - timedelta(days=1)).strftime("%d %m %Y")
        elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
            # NOTE(review): this looks like a cp1251 byte sequence read as
            # latin-1 -- confirm which page produces it.
            return ""
        sdate = _replace_month_tokens(sdate, language)
        return datetime.strptime(str(sdate), "%d %m %Y").strftime("%m/%d/%Y")

    return sdate
def cleanLink(originalLink):
    """Reduce a link to its ASCII letters and digits only.

    Every other character (punctuation, whitespace, non-ASCII) is removed.

    Args:
        originalLink: the link text to clean.

    Returns:
        str: the characters of ``originalLink`` that are ASCII alphanumerics,
        in their original order.
    """
    alphanumerics = string.ascii_letters + string.digits
    return "".join(symbol for symbol in originalLink if symbol in alphanumerics)
def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate):
    """Build one comma-separated record per topic of a listing page.

    Each record has 17 fields: forum, board, author, topic, views, posts,
    href, date added, crawl timestamp, followed by eight ``"-1"``
    placeholders (name_user, status_user, reputation_user, interest_user,
    signature_user, content_post, feedback_post, dateadded_post).

    Args:
        forum: forum name (str).
        nm: number of topics to emit.
        board: board name (str).
        author, topic: per-topic string sequences, indexed ``0..nm-1``.
        views, posts, href: per-topic string sequences; an empty sequence
            makes the corresponding field ``"-1"`` for every topic.
        addDate: per-topic dates (converted with ``str``); empty means
            ``"-1"``.

    Returns:
        list[str]: one record per topic, in order.
    """

    def _field(values, position):
        # An empty column means "not scraped"; it is encoded as "-1".
        return "-1" if len(values) == 0 else values[position]

    crawl_stamp = time.strftime("%m/%d/%Y") + " " + time.strftime("%I:%M:%S")
    user_and_post_placeholders = ["-1"] * 8  # fields 9..16

    rw = []
    for n in range(nm):
        fields = [
            forum,                      # 0
            board,                      # 1
            author[n],                  # 2
            topic[n],                   # 3
            _field(views, n),           # 4
            _field(posts, n),           # 5
            _field(href, n),            # 6
            "-1" if len(addDate) == 0 else str(addDate[n]),  # 7
            crawl_stamp,                # 8
        ]
        # The row used to be built and then discarded, so the function
        # always returned an empty list; collect it.
        rw.append(",".join(fields + user_and_post_placeholders))
    return rw
def cleanString(originalString):
    """Make a string safe to embed in a comma-separated record.

    Commas, newlines, tabs, carriage returns, the "»" arrow, exclamation
    marks and semicolons are removed; every apostrophe becomes "^".

    Args:
        originalString: the text to sanitise.

    Returns:
        str: the sanitised text.
    """
    # One pass: map "'" -> "^" and delete the characters in the third argument.
    substitutions = str.maketrans("'", "^", ",\n\t\r\u00bb!;")
    return originalString.translate(substitutions)
def convertFromLongDate(longDate, crawlerdate):
    """Convert an informal relative date ("3 days 2 hours ago") to a date.

    The text is split on whitespace; for each of the unit words ``years``,
    ``weeks``, ``days``, ``hours``, ``minutes`` and ``seconds`` that is
    present, the word immediately before it is read as the amount.  The
    total is subtracted from ``crawlerdate``.  A year counts as 365 days.
    Only the plural unit words are recognised.

    Args:
        longDate: the informal date text.
        crawlerdate: date/datetime the offset is subtracted from.

    Returns:
        str: the resulting date formatted as ``MM/DD/YYYY``.

    Raises:
        ValueError: if the word before a unit is not a number.
    """
    words = longDate.split()

    def _amount(unit):
        # The quantity is the token right before the unit word.
        if unit not in words:
            return 0
        return float(words[words.index(unit) - 1])

    # Days were previously only counted when a year value was also present,
    # so "3 days ago" subtracted nothing; always include them.
    total_days = _amount('days') + 365 * _amount('years')

    offset = fulldatetime.timedelta(
        days=total_days,
        weeks=_amount('weeks'),
        hours=_amount('hours'),
        minutes=_amount('minutes'),
        seconds=_amount('seconds'),
    )
    return str((crawlerdate - offset).strftime('%m/%d/%Y'))
def aes_encryption(item):
    """Encrypt ``item`` with the module-level ``encryptCipher``.

    The input is converted with ``bytes(item)``, padded to ``BLOCK_SIZE``
    and encrypted.

    NOTE(review): ``BLOCK_SIZE`` is not defined in the visible part of this
    file -- confirm it exists at module level.

    Args:
        item: a bytes-like object (anything ``bytes()`` accepts without an
            encoding).

    Returns:
        bytes: the ciphertext.
    """
    padded_plaintext = pad(bytes(item), BLOCK_SIZE)
    return encryptCipher.encrypt(padded_plaintext)
def aes_decryption(item):
    """Decrypt ``item`` with the module-level ``decryptCipher``.

    The input is converted with ``bytes(item)``, decrypted, and the padding
    (to ``BLOCK_SIZE``) is stripped.

    NOTE(review): ``BLOCK_SIZE`` is not defined in the visible part of this
    file -- confirm it exists at module level.

    Args:
        item: the ciphertext as a bytes-like object.

    Returns:
        bytes: the recovered plaintext.
    """
    padded_plaintext = decryptCipher.decrypt(bytes(item))
    return unpad(padded_plaintext, BLOCK_SIZE)
def encrypt_encode_image_to_base64(driver, xpath):
    """Screenshot one page element and return it AES-encrypted as base64 text.

    Args:
        driver: Selenium WebDriver showing the page.
        xpath: XPath locating the element to capture.

    Returns:
        str | None: base64 text of the encrypted PNG screenshot, or ``None``
        when the element cannot be located or captured.
    """
    from selenium.common.exceptions import WebDriverException

    # The trailing ``return None`` used to be unreachable, so a failed
    # lookup raised instead of yielding the None the caller checks for.
    try:
        img_element = driver.find_element(by=By.XPATH, value=xpath)
        image_data = img_element.screenshot_as_png
    except WebDriverException:
        return None

    encrypted_image = aes_encryption(image_data)
    base64_image = base64.b64encode(encrypted_image)
    return base64_image.decode('utf-8')
def decode_decrypt_image_in_base64(html_content):
    """Decode and decrypt every embedded ``data:image`` ``<img>`` in the HTML.

    For each such image the base64 payload is decoded, decrypted with
    ``aes_decryption`` and opened with PIL.  A failure on one image is
    reported and does not stop the remaining images from being processed.

    NOTE(review): the opened image is not used further in the visible code;
    this appears to be a debugging aid -- confirm the intended use.

    Args:
        html_content: HTML text to scan.

    Returns:
        None.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    for img_tag in soup.find_all('img'):
        src_attr = img_tag.get('src')
        if not (src_attr and src_attr.startswith('data:image')):
            continue

        # The ``except`` clause had no matching ``try`` and no body; the
        # whole decode/decrypt/open sequence is what can fail.
        try:
            string_image = src_attr.split('base64,')[-1]
            base64_image = bytes(string_image, encoding='utf-8')
            encrypted_image = base64.b64decode(base64_image)
            decrypted_image = aes_decryption(encrypted_image)
            im = Image.open(io.BytesIO(decrypted_image))
        except Exception as e:
            print(e)
def replace_image_sources(driver, html_content):
    """Embed every ``<img>`` of the page as an encrypted base64 data URI.

    Each ``<img>`` element is located in the live page through its XPath,
    captured with ``encrypt_encode_image_to_base64`` and its ``src`` is
    replaced by ``data:image/png;base64,<payload>``.  Images that cannot be
    captured keep their original ``src``.

    Args:
        driver: Selenium WebDriver currently showing the page.
        html_content: the page's HTML text.

    Returns:
        str: the modified HTML.
    """
    tree = lxml.fromstring(html_content)

    # The picture/source loops had no body.  Inside <picture>, browsers
    # prefer a matching <source> over the <img> fallback, so leftover
    # <source> elements would override the embedded data URI set below.
    # NOTE(review): removing them is the presumed intent -- confirm.
    for picture_tag in tree.findall('.//picture'):
        for source_tag in picture_tag.findall('.//source'):
            source_tag.getparent().remove(source_tag)

    for img_tag in tree.findall('.//img'):
        img_xpath = tree.getroottree().getpath(img_tag)
        string_image = encrypt_encode_image_to_base64(driver, img_xpath)
        if string_image:
            img_tag.set('src', f'data:image/png;base64,{string_image}')

    modified_html = lxml.tostring(tree, encoding='utf-8').decode('utf-8')
    return modified_html
def cleanHTML(driver, html):
    """Embed images as encrypted data URIs and strip image/script tags.

    First ``replace_image_sources`` rewrites every ``<img>``.  Then a series
    of regular expressions removes opening tags that can carry images or
    JavaScript.  Each pattern stops at the first ``>`` after its match and
    ``.`` does not cross newlines, so only the opening tag on a single line
    is removed -- not the element's content or closing tag.

    Args:
        driver: Selenium WebDriver currently showing the page.
        html: the page's HTML text.

    Returns:
        str: the cleaned HTML.
    """
    clean_html = replace_image_sources(driver, html)
    # decode_decrypt_image_in_base64(clean_html)

    # The list was previously left unclosed, which is a syntax error.
    formats = [
        "jpg", "jpeg", "jfif", "pjpeg", "pjp",
        "png", "apng", "svg", "bmp", "gif",
        "avif", "webp", "ico", "cur", "tiff",
    ]

    # remove images
    clean_html = re.sub(r"<svg.*?>", "", clean_html)
    for fmat in formats:
        clean_html = re.sub(r"<object.*" + fmat + ".*?>", "", clean_html)
    clean_html = re.sub(r"<canvas.*?>", "", clean_html)

    # remove JavaScript
    clean_html = re.sub(r"<script.*?>", "", clean_html)
    clean_html = re.sub(r"<object.*javascript.*?>", "", clean_html)
    # Was "<aplet.*mayscript?>": the tag name was misspelled and "t?" only
    # made the final letter optional instead of lazily matching up to ">".
    clean_html = re.sub(r"<applet.*mayscript.*?>", "", clean_html)
    # Same quantifier slip as above ("scriptable?>").
    clean_html = re.sub(r"<embed.*scriptable.*?>", "", clean_html)

    # image and JavaScript
    clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image.*?>|background-image:url\(\'(.*?)\'\);", "", clean_html)

    return clean_html