# This module is based on the CALSysLab project.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

768 lines
28 KiB

__author__ = 'DarkWeb'
import psycopg2
import traceback
from Forums.Utilities.utilities import *
from dateutil.relativedelta import relativedelta, FR
from scipy.spatial import distance
def connectDataBase():
    """Open a psycopg2 connection using the ``PostgreSQL`` section of the project config.

    The ``ip``, ``username``, ``password`` and ``database`` options are read
    from ``config`` and passed to ``psycopg2.connect``.

    Returns:
        The connection object returned by ``psycopg2.connect``.

    Raises:
        SystemExit: if reading the settings or connecting raises an ordinary
            exception (a message naming the configured database is printed first).
    """
    # Imported inside the function, as in the original code
    # (presumably to avoid a circular import -- TODO confirm).
    from Forums.Initialization.forums_mining import config

    section = 'PostgreSQL'
    try:
        return psycopg2.connect(host=config.get(section, 'ip'),
                                user=config.get(section, 'username'),
                                password=config.get(section, 'password'),
                                dbname=config.get(section, 'database'))
    except Exception:
        # Exception instead of a bare except: Ctrl-C / SystemExit must not be
        # reported as a missing database.
        print("Data base " + config.get('PostgreSQL', 'database') + " not found.")
        raise SystemExit
def verifyForum(cur, nameForum):
    """Return the id of the forum named *nameForum*, or 0 if there is none.

    Locks the ``forums`` table in ACCESS EXCLUSIVE mode before querying.

    Args:
        cur: DB cursor; rows must be indexable by column name.
        nameForum: value matched against ``name_forum``.

    Returns:
        The ``forum_id`` of the first matching row, 0 when no row matches,
        or None when the lookup raised (the traceback is printed).
    """
    try:
        cur.execute("lock table forums IN ACCESS EXCLUSIVE MODE")
        cur.execute("select forum_id from forums where name_forum = %(nameForum)s limit 1",
                    {'nameForum': nameForum})
        rows = cur.fetchall()
        return rows[0]['forum_id'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed and mistaken for "not found".
        print(traceback.format_exc())
        return None
def verifyTopic(cur, forumId, hrefTopic):
    """Return the id of the topic with *hrefTopic* in forum *forumId*, or 0.

    Locks the ``topics`` table in ACCESS EXCLUSIVE mode before querying.

    Args:
        cur: DB cursor; rows must be indexable by column name.
        forumId: value matched against ``forum_id``.
        hrefTopic: value matched against ``href_topic``.

    Returns:
        The ``topic_id`` of the first matching row, 0 when no row matches,
        or None when the lookup raised (the traceback is printed).
    """
    try:
        cur.execute("lock table topics IN ACCESS EXCLUSIVE MODE")
        cur.execute("select topic_id from topics where forum_id = %(forumId)s and href_topic = %(hrefTopic)s limit 1",
                    {'forumId': forumId, 'hrefTopic': hrefTopic})
        rows = cur.fetchall()
        return rows[0]['topic_id'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed and mistaken for "not found".
        print(traceback.format_exc())
        return None
def verifyPost(cur, topicId, userId, dateAdded):
    """Return the id of the post identified by topic, user and date added, or 0.

    Locks the ``posts`` table in ACCESS EXCLUSIVE mode before querying.

    Args:
        cur: DB cursor; rows must be indexable by column name.
        topicId: value matched against ``topic_id``.
        userId: value matched against ``user_id``.
        dateAdded: value matched against ``dateadded_post``.

    Returns:
        The ``post_id`` of the first matching row, 0 when no row matches,
        or None when the lookup raised (the traceback is printed).
    """
    query = ("select post_id from posts where topic_id = %(topicId)s and "
             "user_id = %(userId)s and dateadded_post = %(dateAdded)s limit 1")
    try:
        cur.execute("lock table posts IN ACCESS EXCLUSIVE MODE")
        cur.execute(query, {'topicId': topicId, 'userId': userId, 'dateAdded': dateAdded})
        rows = cur.fetchall()
        return rows[0]['post_id'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed and mistaken for "not found".
        print(traceback.format_exc())
        return None
def verifyUser(cur, nameUser, forumId):
    """Return the id of user *nameUser* within forum *forumId*, or 0.

    Locks the ``users`` table in ACCESS EXCLUSIVE mode before querying.

    Args:
        cur: DB cursor; rows must be indexable by column name.
        nameUser: value matched against ``name_user``.
        forumId: value matched against ``forum_id``.

    Returns:
        The ``user_id`` of the first matching row, 0 when no row matches,
        or None when the lookup raised (the traceback is printed).
    """
    try:
        cur.execute("lock table users IN ACCESS EXCLUSIVE MODE")
        cur.execute("select user_id from users where name_user = %(nameUser)s and forum_id = %(forumId)s limit 1",
                    {'nameUser': nameUser, 'forumId': forumId})
        rows = cur.fetchall()
        return rows[0]['user_id'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed and mistaken for "not found".
        print(traceback.format_exc())
        return None
def verifyImage(cur, base64Image):
    """Return the id of the stored image whose hash matches *base64Image*, or 0.

    Locks the ``forum_images`` table in ACCESS EXCLUSIVE mode, then looks up
    ``hash_image`` using ``generate_image_hash(base64Image)``.

    Args:
        cur: DB cursor; rows must be indexable by column name.
        base64Image: image data handed unchanged to ``generate_image_hash``.

    Returns:
        The ``image_id`` of the first matching row, 0 when no row matches,
        or None when hashing or the lookup raised (the traceback is printed).
    """
    try:
        cur.execute("lock table forum_images IN ACCESS EXCLUSIVE MODE")
        cur.execute("select image_id from forum_images where hash_image = %(hashImage)s limit 1",
                    {'hashImage': generate_image_hash(base64Image)})
        rows = cur.fetchall()
        return rows[0]['image_id'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed and mistaken for "not found".
        print(traceback.format_exc())
        return None
def getLastForum(cur):
    """Return the highest ``forum_id`` stored in ``forums``.

    Args:
        cur: DB cursor; rows must be indexable by column name.

    Returns:
        The largest ``forum_id``, 0 when the table has no rows, or None when
        the query raised (the traceback is printed).
    """
    try:
        cur.execute("select forum_id from forums order by forum_id desc limit 1")
        rows = cur.fetchall()
        return rows[0]['forum_id'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit propagate.
        print(traceback.format_exc())
        return None
def getLastTopic(cur):
    """Return the highest ``topic_id`` stored in ``topics``.

    Args:
        cur: DB cursor; rows must be indexable by column name.

    Returns:
        The largest ``topic_id``, 0 when the table has no rows, or None when
        the query raised (the traceback is printed).
    """
    try:
        cur.execute("select topic_id from topics order by topic_id desc limit 1")
        rows = cur.fetchall()
        return rows[0]['topic_id'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit propagate.
        print(traceback.format_exc())
        return None
def getLastUser(cur):
    """Return the highest ``user_id`` stored in ``users``.

    Args:
        cur: DB cursor; rows must be indexable by column name.

    Returns:
        The largest ``user_id``, 0 when the table has no rows, or None when
        the query raised (the traceback is printed).
    """
    try:
        cur.execute("select user_id from users order by user_id desc limit 1")
        rows = cur.fetchall()
        return rows[0]['user_id'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit propagate.
        print(traceback.format_exc())
        return None
def getLastUserVersion(cur, userId):
    """Return the highest ``version_user`` recorded in ``users_history`` for *userId*.

    Args:
        cur: DB cursor; rows must be indexable by column name.
        userId: value matched against ``user_id``.

    Returns:
        The largest ``version_user``, 0 when the user has no history rows,
        or None when the query raised (the traceback is printed).
    """
    try:
        cur.execute("select version_user from users_history where user_id = %(userId)s order by version_user desc limit 1",
                    {'userId': userId})
        rows = cur.fetchall()
        return rows[0]['version_user'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit propagate.
        print(traceback.format_exc())
        return None
def getLastTopicVersion(cur, topicId):
    """Return the highest ``version_topic`` recorded in ``topics_history`` for *topicId*.

    Args:
        cur: DB cursor; rows must be indexable by column name.
        topicId: value matched against ``topic_id``.

    Returns:
        The largest ``version_topic``, 0 when the topic has no history rows,
        or None when the query raised (the traceback is printed).
    """
    try:
        cur.execute("select version_topic from topics_history where topic_id = %(topicId)s order by version_topic desc limit 1",
                    {'topicId': topicId})
        rows = cur.fetchall()
        return rows[0]['version_topic'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit propagate.
        print(traceback.format_exc())
        return None
def getLastPostVersion(cur, postId):
    """Return the highest ``version_post`` recorded in ``posts_history`` for *postId*.

    Args:
        cur: DB cursor; rows must be indexable by column name.
        postId: value matched against ``post_id``.

    Returns:
        The largest ``version_post``, 0 when the post has no history rows,
        or None when the query raised (the traceback is printed).
    """
    try:
        cur.execute("select version_post from posts_history where post_id = %(postId)s order by version_post desc limit 1",
                    {'postId': postId})
        rows = cur.fetchall()
        return rows[0]['version_post'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit propagate.
        print(traceback.format_exc())
        return None
def getLastPost(cur):
    """Return the highest ``post_id`` stored in ``posts``.

    Args:
        cur: DB cursor; rows must be indexable by column name.

    Returns:
        The largest ``post_id``, 0 when the table has no rows, or None when
        the query raised (the traceback is printed).
    """
    try:
        cur.execute("select post_id from posts order by post_id desc limit 1")
        rows = cur.fetchall()
        return rows[0]['post_id'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit propagate.
        print(traceback.format_exc())
        return None
def getLastImage(cur):
    """Return the highest ``image_id`` stored in ``forum_images``.

    Args:
        cur: DB cursor; rows must be indexable by column name.

    Returns:
        The largest ``image_id``, 0 when the table has no rows, or None when
        the query raised (the traceback is printed).
    """
    try:
        cur.execute("select image_id from forum_images order by image_id desc limit 1")
        rows = cur.fetchall()
        return rows[0]['image_id'] if rows else 0
    except Exception:
        # Exception rather than a bare except so KeyboardInterrupt/SystemExit propagate.
        print(traceback.format_exc())
        return None
def create_forum(cur, row, url):
    """Return the id of the forum named ``row[0]``, inserting it first if needed.

    Args:
        cur: DB cursor.
        row: parsed record; ``row[0]`` is used as the forum name and
            ``row[8]`` as ``dateinserted_forum``.
        url: stored as ``url_forum`` for a newly inserted forum.

    Returns:
        The existing or newly assigned ``forum_id``.
    """
    existingId = verifyForum(cur, row[0])
    if existingId:
        return existingId

    # No forum with that name yet: take the next id after the current maximum.
    forumId = int(getLastForum(cur) + 1)
    cur.execute(
        "Insert into forums (forum_id, name_forum, url_forum, dateinserted_forum) Values (%s, %s, %s, %s)",
        [forumId, row[0], url, row[8]],
    )
    return forumId
def create_topic(cur, forumId, row, authorId):
    """Insert the topic described by *row*, or record changes to an existing one.

    The topic is identified by *forumId* and ``get_relative_url(row[6])``.
    For a known topic, one of three things happens:

    * the stored classification is empty and ``row[19]`` carries one: only
      ``classification_topic`` is updated;
    * otherwise, if author, title, board, views, posts or classification
      differ from the stored row: the stored row is copied to
      ``topics_history`` under the next version number and ``topics`` is
      updated with the incoming values;
    * otherwise nothing is written.

    The string ``'-1'`` in *row* is treated as "no value".

    Args:
        cur: DB cursor; rows must be indexable by column name.
        forumId: id of the forum the topic belongs to.
        row: parsed record (indices 1, 3, 4, 5, 6, 7, 8 and 19 are read).
        authorId: id of the topic author.

    Returns:
        The existing or newly assigned ``topic_id``.
    """
    def _or_none(value):
        # '-1' is the parser's marker for a missing value.
        return value if value != '-1' else None

    hrefTopic = get_relative_url(row[6])
    topicId = verifyTopic(cur, forumId, hrefTopic)

    if not topicId:
        # Unknown topic: allocate the next id and insert it.
        topicId = int(getLastTopic(cur) + 1)
        insertSql = ("Insert into topics (topic_id, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, "
                     "href_topic, dateadded_topic, dateinserted_topic, classification_topic) Values (%s, %s, %s, %s, %s, %s, "
                     "%s, %s, %s, %s, %s)")
        cur.execute(insertSql, [topicId, forumId, authorId,
                                row[3],
                                row[1],
                                _or_none(row[4]),
                                _or_none(row[5]),
                                hrefTopic,
                                _or_none(row[7]),
                                row[8],
                                _or_none(row[19])])
        return topicId

    # Tracking potential topic changes
    cur.execute("select * from topics where topic_Id = %(topicId)s", {'topicId': topicId})
    stored = cur.fetchall()[0]
    classification = row[19]

    if classification != '-1' and str(stored['classification_topic']) == str(None):
        # First classification for this topic: set it without writing history.
        cur.execute("Update topics set classification_topic = %(classification_topic)s where topic_id = %(topicId)s",
                    {'classification_topic': classification,
                     'topicId': topicId})
        return topicId

    # Values are compared through str(), as the stored and incoming types may differ.
    expected = (
        ('author_id', authorId),
        ('title_topic', row[3]),
        ('board_topic', row[1]),
        ('views_topic', _or_none(row[4])),
        ('posts_topic', _or_none(row[5])),
        # A missing incoming classification never counts as a change.
        ('classification_topic',
         classification if classification != '-1' else stored['classification_topic']),
    )
    if not any(str(stored[column]) != str(value) for column, value in expected):
        return topicId

    # there was a change in the topic information
    topicVersionId = int(getLastTopicVersion(cur, topicId) + 1)
    historySql = ("Insert into topics_history (topic_id, version_topic, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, "
                  "href_topic, dateadded_topic, dateinserted_topic, classification_topic) Values (%s, %s, %s, %s, %s, "
                  "%s, %s, %s, %s, %s, %s, %s)")
    cur.execute(historySql, [topicId, topicVersionId, forumId,
                             stored['author_id'],
                             stored['title_topic'],
                             stored['board_topic'],
                             stored['views_topic'],
                             stored['posts_topic'],
                             stored['href_topic'],
                             stored['dateadded_topic'],
                             stored['dateinserted_topic'],
                             stored['classification_topic']])

    updateSql = ("Update topics set author_id = %(author_id)s, title_topic = %(title_topic)s, board_topic = %(board_topic)s, "
                 "views_topic = %(views_topic)s, posts_topic = %(posts_topic)s, dateinserted_topic = %(dateinserted_topic)s, "
                 "classification_topic = %(classification_topic)s where topic_id = %(topicId)s")
    cur.execute(updateSql, {'author_id': authorId,
                            'title_topic': _or_none(row[3]),
                            'board_topic': _or_none(row[1]),
                            'views_topic': _or_none(row[4]),
                            'posts_topic': _or_none(row[5]),
                            'dateinserted_topic': row[8],
                            'classification_topic': _or_none(classification),
                            'topicId': topicId})
    return topicId
def create_author(cur, row, forumId):
    """Return the id of the user named ``row[2]`` in *forumId*, inserting a placeholder if new.

    A new user row is filled with ``'Nan'`` for status, reputation, interest
    and signature and with no image. ``create_user`` recognises that pattern
    and does not write a history entry for it.

    Args:
        cur: DB cursor.
        row: parsed record; ``row[2]`` is the user name and ``row[8]`` is
            stored as ``dateinserted_user``.
        forumId: id of the forum the user belongs to.

    Returns:
        The existing or newly assigned ``user_id``.
    """
    knownId = verifyUser(cur, row[2], forumId)
    if knownId:
        return knownId

    userId = int(getLastUser(cur) + 1)
    insertSql = ("Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, "
                 "signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    # 'Nan' placeholders tell the create_user function to not track changes here
    placeholderFields = ['Nan', 'Nan', 'Nan', 'Nan', None]
    cur.execute(insertSql, [userId, forumId, row[2]] + placeholderFields + [row[8]])
    return userId
def create_user(cur, row, forumId, index):
    """Insert the user at position *index* of *row*, or record changes to an existing one.

    For a known user the stored row is compared (via ``str``) with the
    incoming status, reputation, interest, signature and image. When anything
    differs, the stored row is copied to ``users_history`` unless it still
    holds the ``'Nan'``/no-image placeholder pattern, and ``users`` is then
    updated with the incoming values. The string ``'-1'`` in *row* is treated
    as "no value".

    Args:
        cur: DB cursor; rows must be indexable by column name.
        row: parsed record; ``row[9]``..``row[14]`` are per-user sequences
            (image, name, status, reputation, interest, signature) and
            ``row[8]`` is the insertion date.
        forumId: id of the forum the user belongs to.
        index: position of the user inside the per-user sequences.

    Returns:
        The existing or newly assigned ``user_id``.
    """
    def _or_none(value):
        # '-1' is the parser's marker for a missing value.
        return value if value != '-1' else None

    userName = row[10][index]
    userId = verifyUser(cur, userName, forumId)

    if not userId:
        userId = int(getLastUser(cur) + 1)
        imageId = create_image(cur, row[9][index], row[8])
        insertSql = ("Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, "
                     "signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
        cur.execute(insertSql, [userId, forumId,
                                userName,
                                _or_none(row[11][index]),
                                _or_none(row[12][index]),
                                _or_none(row[13][index]),
                                _or_none(row[14][index]),
                                imageId,
                                row[8]])
        return userId

    # Tracking potential user changes
    cur.execute("select * from users where user_id = %(userId)s", {'userId': userId})
    stored = cur.fetchall()[0]

    # Keep the stored image id unless the incoming image is a different one.
    imageId = stored['image_user']
    if not is_same_image(cur, row[9][index], stored['image_user']):
        imageId = create_image(cur, row[9][index], row[8])

    incoming = {
        'status_user': _or_none(row[11][index]),
        'reputation_user': _or_none(row[12][index]),
        'interest_user': _or_none(row[13][index]),
        'signature_user': _or_none(row[14][index]),
        'image_user': imageId,
    }
    tracked = ('status_user', 'reputation_user', 'interest_user', 'signature_user', 'image_user')

    if all(str(stored[column]) == str(incoming[column]) for column in tracked):
        return userId

    # there was a change in the user information
    placeholder = {
        'status_user': 'Nan',
        'reputation_user': 'Nan',
        'interest_user': 'Nan',
        'signature_user': 'Nan',
        'image_user': str(None),
    }
    if any(str(stored[column]) != placeholder[column] for column in tracked):
        # The stored row holds real data, so keep a versioned copy of it.
        userVersionId = int(getLastUserVersion(cur, userId) + 1)
        historySql = ("Insert into users_history (user_id, version_user, forum_id, name_user, status_user, reputation_user, interest_user, "
                      "signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        cur.execute(historySql, [userId, userVersionId, forumId,
                                 stored['name_user'],
                                 stored['status_user'],
                                 stored['reputation_user'],
                                 stored['interest_user'],
                                 stored['signature_user'],
                                 stored['image_user'],
                                 stored['dateinserted_user']])

    updateSql = ("Update users set status_user = %(status_user)s, reputation_user = %(reputation_user)s, "
                 "interest_user = %(interest_user)s, signature_user = %(signature_user)s, image_user = %(image_user)s, "
                 "dateinserted_user = %(dateinserted_user)s where user_id = %(userId)s")
    updateParams = dict(incoming)
    updateParams['dateinserted_user'] = row[8]
    updateParams['userId'] = userId
    cur.execute(updateSql, updateParams)
    return userId
def create_posts(cur, row, forumId, topicId):
    """Insert or update every post of the topic described by *row*.

    Does nothing when ``row[10]`` equals ``"-1"``. Otherwise, for each
    position in ``row[10]`` the user is created or updated through
    ``create_user`` and the post is looked up by topic, user and
    ``row[17][i]``. New posts are inserted. For existing posts whose content,
    feedback or image differ (compared via ``str``), the stored row is copied
    to ``posts_history`` under the next version number and ``posts`` is
    updated.

    Args:
        cur: DB cursor; rows must be indexable by column name.
        row: parsed record; ``row[10]`` and ``row[15]``..``row[18]`` are
            per-post sequences (user name, content, feedback, date added,
            image) and ``row[8]`` is the insertion date.
        forumId: id of the forum the topic belongs to.
        topicId: id of the topic the posts belong to.

    Returns:
        None.
    """
    def _or_none(value):
        # '-1' is the parser's marker for a missing value.
        return value if value != '-1' else None

    if row[10] == "-1":
        return

    insertSql = ("Insert into posts (post_id, topic_id, user_id, content_post, feedback_post, image_post,"
                 "dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s, %s)")
    historySql = ("Insert into posts_history (post_id, version_post, topic_id, user_id, content_post, feedback_post, "
                  "image_post, dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    updateSql = ("Update posts set content_post = %(content_post)s, feedback_post = %(feedback_post)s, "
                 "image_post = %(image_post)s, dateinserted_post = %(dateinserted_post)s where post_id = %(postId)s")

    for i in range(len(row[10])):
        userId = create_user(cur, row, forumId, i)
        postId = verifyPost(cur, topicId, userId, row[17][i])

        if not postId:
            postId = int(getLastPost(cur) + 1)
            imageId = create_image(cur, row[18][i], row[8])
            cur.execute(insertSql, [postId, topicId, userId,
                                    _or_none(row[15][i]),
                                    _or_none(row[16][i]),
                                    imageId,
                                    row[17][i],
                                    row[8]])
            continue

        # Tracking potential post changes
        cur.execute("select * from posts where post_id = %(postId)s", {'postId': postId})
        stored = cur.fetchall()[0]

        # Keep the stored image id unless the incoming image is a different one.
        imageId = stored['image_post']
        if not is_same_image(cur, row[18][i], stored['image_post']):
            imageId = create_image(cur, row[18][i], row[8])

        unchanged = (str(stored['content_post']) == str(row[15][i]) and
                     str(stored['feedback_post']) == str(_or_none(row[16][i])) and
                     str(stored['image_post']) == str(imageId))
        if unchanged:
            continue

        # there was a change in the post information
        postVersionId = int(getLastPostVersion(cur, postId) + 1)
        cur.execute(historySql, [postId, postVersionId, topicId, userId,
                                 stored['content_post'],
                                 stored['feedback_post'],
                                 stored['image_post'],
                                 stored['dateadded_post'],
                                 stored['dateinserted_post']])
        cur.execute(updateSql, {'content_post': _or_none(row[15][i]),
                                'feedback_post': _or_none(row[16][i]),
                                'image_post': imageId,
                                'dateinserted_post': row[8],
                                'postId': postId})
def is_same_image(cur, newBase64Image, oldImageId):
    """Tell whether an incoming image matches the image already stored.

    Args:
        cur: DB cursor; rows must be indexable by column name. Only used
            when both an incoming and a stored image exist.
        newBase64Image: incoming image payload, or ``"-1"`` when there is none.
        oldImageId: id of the stored image, or None when there is none.

    Returns:
        True when both images are absent, when their hashes are equal, or
        when the cosine similarity of their feature vectors is at least 0.8;
        False otherwise (including when exactly one of them is absent).
    """
    newMissing = newBase64Image == "-1"
    # Fixed: the original used `is` here, which compares object identity and
    # so did not reliably detect a missing stored image.
    oldMissing = str(oldImageId) == str(None)

    if newMissing and oldMissing:
        return True
    if newMissing or oldMissing:
        return False

    cur.execute("select * from forum_images where image_id = %(imageId)s", {'imageId': oldImageId})
    recset = cur.fetchall()

    decImage = decode_decrypt_image_in_base64(newBase64Image)
    if generate_image_hash(decImage) == recset[0]['hash_image']:
        return True

    # Hashes differ: fall back to comparing the stored and freshly extracted
    # feature vectors (both are JSON-encoded).
    features1 = json.loads(extract_hidden_layer_output(decImage))
    features2 = json.loads(recset[0]['resnet50_image'])
    similarity = 1 - distance.cosine(features1, features2)
    return similarity >= 0.8
def create_image(cur, base64Image, date):
    """Store an image if it is not stored yet and return its id.

    Args:
        cur: DB cursor.
        base64Image: image payload, or ``"-1"`` when there is no image.
        date: stored as ``dateinserted_image`` for a newly inserted image.

    Returns:
        None when *base64Image* is ``"-1"``; otherwise the id of the matching
        stored image, or the id assigned to the newly inserted one.
    """
    if base64Image == "-1":
        return None

    decoded = decode_decrypt_image_in_base64(base64Image)
    knownId = verifyImage(cur, decoded)
    if knownId:
        return knownId

    imageId = int(getLastImage(cur) + 1)
    hashImage = generate_image_hash(decoded)
    hloFeatures = extract_hidden_layer_output(decoded)
    keypoints, descriptors = extract_keypoints(decoded)

    insertSql = ("Insert into forum_images(image_id, hash_image, base64_image, "
                 "resnet50_image, siftkey_image, siftdesc_image, dateinserted_image) "
                 "Values (%s, %s, %s, %s, %s, %s, %s)")
    # The payload is stored as received (base64Image), not the decoded form.
    cur.execute(insertSql, [imageId, hashImage, base64Image, hloFeatures, keypoints, descriptors, date])
    return imageId
def create_status(cur, forumId, date, listings, descriptions, status):
    """Insert or update the ``forums_status`` row for a forum and date.

    Args:
        cur: DB cursor.
        forumId: id of the forum.
        date: date string in ``%m%d%Y`` format; parsed and used as ``date_inserted``.
        listings: stored in the ``listings`` column.
        descriptions: stored in the ``descriptions`` column.
        status: stored in the ``status`` column.

    Returns:
        None.
    """
    insertedOn = datetime.strptime(date, "%m%d%Y")
    # getting last Friday as a reference date
    referenceDate = insertedOn + relativedelta(weekday=FR(-1))

    # checking if status already exists
    cur.execute("select * from forums_status where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s",
                {'forum_id': forumId, 'date_inserted': insertedOn})

    if cur.fetchall():
        updateSql = ("Update forums_status set listings = %(listings)s, descriptions = %(descriptions)s, status = %(status)s, date_reference = %(date_reference)s "
                     "where forum_id = %(forum_id)s and date_inserted = %(date_inserted)s")
        cur.execute(updateSql, {'listings': listings,
                                'descriptions': descriptions,
                                'status': status,
                                'date_reference': referenceDate,
                                'forum_id': forumId,
                                'date_inserted': insertedOn})
        return

    insertSql = "Insert into forums_status (forum_id, date_inserted, listings, descriptions, status, date_reference) Values (%s, %s, %s, %s, %s, %s)"
    cur.execute(insertSql, [forumId, insertedOn, listings, descriptions, status, referenceDate])
def create_database(cur, con):
    """Create all forum tables and unique indexes, then commit.

    The statements run in dependency order (images, forums, forum status,
    users, topics, posts, each followed by its history table / unique index
    where one exists). On any failure the transaction is rolled back. If the
    traceback text contains "already exists" the failure is silently ignored;
    otherwise a message and the traceback are printed and SystemExit is raised.

    Args:
        cur: DB cursor used to execute the DDL statements.
        con: DB connection; ``commit``/``rollback`` are called on it.

    Raises:
        SystemExit: when creation fails for a reason other than "already exists".
    """
    statements = (
        ("create table forum_images(image_id integer not null, hash_image character varying(64) not null, base64_image text not null, "
         "resnet50_image text null, siftkey_image text null, siftdesc_image text null, "
         "dateinserted_image timestamp(6) with time zone not null, "
         "constraint forum_images_pk primary key (image_id))"),

        "create unique index unique_forum_image ON forum_images USING btree (hash_image ASC NULLS LAST)",

        ("create table forums (forum_id integer NOT NULL, name_forum character varying(255) NOT NULL, url_forum "
         "character varying(255) NOT null, dateinserted_forum timestamp(6) with time zone NOT NULL, "
         "constraint forums_pk primary key (forum_id))"),

        "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)",

        ("Create table forums_status (forum_id integer NOT NULL, date_inserted date NOT NULL, "
         "listings integer NOT NULL, descriptions integer NOT NULL, status bit(1) NOT NULL, date_reference date NOT NULL, "
         "constraint forums_status_pk PRIMARY KEY (forum_id, date_inserted), "
         "constraint forums_status_fk FOREIGN KEY (forum_id) REFERENCES forums (forum_id))"),

        ("create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying("
         "255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, "
         "interest_user character varying(10000) null, signature_user character varying(10000) null, "
         "image_user integer null, dateinserted_user timestamp(6) with time zone NOT NULL, "
         "constraint users_pk primary key (user_id), "
         "constraint users_forum_id_fk foreign key (forum_id) references forums (forum_id), "
         "constraint users_image_id_fk foreign key (image_user) references forum_images (image_id))"),

        "create unique index unique_user ON users USING btree (forum_id ASC NULLS LAST, name_user ASC NULLS LAST)",

        ("create table users_history(user_id integer NOT NULL, version_user integer not null, forum_id integer NOT NULL, "
         "name_user character varying(255) NOT NULL, status_user character varying(255) null, "
         "reputation_user character varying(255) null, interest_user character varying(10000) null, "
         "signature_user character varying(10000) null, image_user integer null, "
         "dateinserted_user timestamp(6) with time zone NOT NULL, "
         "constraint users_history_pk primary key (user_id, version_user), "
         "constraint users_history_user_id_fk foreign key (user_id) references users (user_id), "
         "constraint users_history_forum_id_fk foreign key (forum_id) references forums (forum_id), "
         "constraint users_history_image_id_fk foreign key (image_user) references forum_images (image_id))"),

        ("create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, "
         "title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer null, "
         "posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with time zone null, "
         "dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision null, "
         "constraint topics_pk primary key (topic_id), "
         "constraint topics_author_id_fk foreign key (author_id) references users (user_id), "
         "constraint topics_forum_id_fk foreign key (forum_id) references forums (forum_id))"),

        "create unique index unique_topic ON topics USING btree (forum_id ASC NULLS LAST, href_topic ASC NULLS LAST)",

        ("create table topics_history(topic_id integer NOT NULL, version_topic integer not null, forum_id integer NOT NULL, "
         "author_id integer NOT NULL, title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, "
         "views_topic integer null, posts_topic integer null, href_topic character varying(255) NOT null, "
         "dateadded_topic timestamp(6) with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, "
         "classification_topic double precision null, "
         "constraint topics_history_pk primary key (topic_id, version_topic), "
         "constraint topics_history_topic_id_fk foreign key (topic_id) references topics (topic_id), "
         "constraint topics_history_author_id_f foreign key (author_id) references users (user_id), "
         "constraint topics_history_board_id_fk foreign key (forum_id) references forums (forum_id))"),

        ("create table posts(post_id integer NOT NULL, topic_id integer NOT NULL, "
         "user_id integer NOT NULL, content_post character varying(1000000) NOT null, feedback_post integer null, "
         "image_post integer null, dateadded_post timestamp(6) with time zone NOT NULL, "
         "dateinserted_post timestamp(6) with time zone NOT NULL, "
         "constraint posts_pk primary key (post_id), "
         "constraint posts_user_id_fk foreign key (user_id) references users (user_id), "
         "constraint posts_topic_id_fk foreign key (topic_id) references topics (topic_id), "
         "constraint posts_image_id_fk foreign key (image_post) references forum_images (image_id))"),

        ("create unique index unique_post ON posts USING btree (topic_id ASC NULLS LAST, user_id ASC NULLS LAST, "
         "dateadded_post ASC NULLS LAST)"),

        ("create table posts_history(post_id integer NOT NULL, version_post integer not null, topic_id integer NOT NULL, "
         "user_id integer NOT NULL, content_post character varying(1000000) NOT null, feedback_post integer null, "
         "image_post integer null, dateadded_post timestamp(6) with time zone NOT NULL, "
         "dateinserted_post timestamp(6) with time zone NOT NULL, "
         "constraint posts_history_pk primary key (post_id, version_post), "
         "constraint posts_history_user_id_fk foreign key (user_id) references users (user_id), "
         "constraint posts_history_topic_id_fk foreign key (topic_id) references topics (topic_id), "
         "constraint posts_history_post_id_fk foreign key (post_id) references posts (post_id), "
         "constraint posts_history_image_id_fk foreign key (image_post) references forum_images (image_id))"),
    )

    try:
        for statement in statements:
            cur.execute(statement)
        con.commit()
    except BaseException:  # catch-all, same reach as a bare except: always roll back
        con.rollback()
        failure = traceback.format_exc()
        # An existing schema is the expected outcome on every run after the first.
        if "already exists" not in failure:
            print ("There was a problem during the database creation." )
            traceback.print_exc()
            raise SystemExit