diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index 340abfc..3fb53ff 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -7,9 +7,9 @@ import configparser def connectDataBase(): - try: + from Forums.Initialization.forums_mining import config - from Forums.Initialization.forums_mining import config + try: ip = config.get('PostgreSQL', 'ip') username = config.get('PostgreSQL', 'username') @@ -20,7 +20,7 @@ def connectDataBase(): except: - print ("Data base (darkweb_forums) not found.") + print ("Data base " + config.get('PostgreSQL', 'database') + " not found.") raise SystemExit @@ -63,15 +63,34 @@ def verifyBoard(cur, forum, nameBoard): print (trace) -def verifyTopic(cur, forumId, boardId, authorId, titleTopic): +def verifyTopic(cur, forumId, authorId, titleTopic): try: - cur.execute("select topic_id from topics where forum_id = %(forumId)s and board_id = %(boardId)s and " - "author_id = %(authorId)s and title_topic = %(titleTopic)s limit 1", {'forumId': forumId, - 'boardId': boardId, - 'authorId': authorId, - 'titleTopic': titleTopic}) + cur.execute("select topic_id from topics where forum_id = %(forumId)s and author_id = %(authorId)s and title_topic = %(titleTopic)s limit 1", + {'forumId': forumId, 'authorId': authorId, 'titleTopic': titleTopic}) + + recset = cur.fetchall() + + if recset: + return recset[0][0] + else: + return 0 + + except: + + trace = traceback.format_exc() + print (trace) + + +def verifyPost(cur, topicId, userId, dateAdded): + + try: + + cur.execute("select post_id from posts where topic_id = %(topicId)s and " + "user_id = %(userId)s and dateadded_post = %(dateAdded)s limit 1", {'topicId': topicId, + 'userId': userId, + 'dateAdded': dateAdded}) recset = cur.fetchall() @@ -167,7 +186,7 @@ def getLastUser(cur): try: - cur.execute("select user_id from users order by user_id desc") + cur.execute("select user_id from users order by user_id desc limit 1") recset = cur.fetchall() @@ -182,12 +201,11 @@ def getLastUser(cur): print (trace) -''' def getLastPost(cur): try: - cur.execute("select id from Posts order by id desc limit 1") + cur.execute("select post_id from posts order by post_id desc limit 1") recset = cur.fetchall() @@ -200,7 +218,6 @@ def getLastPost(cur): trace = traceback.format_exc() print (trace) -''' def create_forum(cur, row, url): @@ -220,46 +237,86 @@ def create_forum(cur, row, url): return forumId -def create_board(cur, row, forumId): +def create_topic(cur, forumId, row, authorId): + + topicId = verifyTopic(cur, forumId, authorId, row[3]) + + if not topicId: + topicId = int(getLastTopic(cur) + 1) + newTopic = True + else: + newTopic = False - boardId = verifyBoard(cur, forumId, row[1]) + if newTopic: - if not boardId: + sql = "Insert into topics (topic_id, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, " \ + "href_topic, dateadded_topic, dateinserted_topic, classification_topic) Values (%s, %s, %s, %s, %s, %s, " \ + "%s, %s, %s, %s, %s)" - boardId = int(getLastBoard(cur) + 1) + recset = [topicId, forumId, authorId, + row[3], row[1], + row[4] if row[4] != '-1' else None, + row[5] if row[5] != '-1' else None, + row[6] if row[6] != '-1' else None, + row[7] if row[7] != '-1' else None, + row[8], + row[17]] + cur.execute(sql, recset) - sql = "Insert into boards (board_id, forum_id, name_board, dateinserted_board) Values (%s, %s, %s, %s)" + else: - recset = [boardId, forumId, row[1], row[8]] + # Tracking potential topic changes + sql = "select * from topics where topic_Id = %(topicId)s" + cur.execute(sql, {'topicId': topicId}) - cur.execute(sql, recset) + recset = cur.fetchall() - return boardId + if (str(recset[0][4]) != str(row[1]) or str(recset[0][5]) != str(row[4]) or str(recset[0][6]) != str(row[5])): # there was a change in the topic information + + sql = "Insert into topics_history (topic_id, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, " \ + "href_topic, dateadded_topic, dateinserted_topic, classification_topic) Values (%s, %s, %s, %s, %s, " \ + "%s, %s, %s, %s, %s, %s)" + + recset = [topicId, forumId, authorId, + recset[0][3], + recset[0][4], + recset[0][5], + recset[0][6], + recset[0][7], + recset[0][8], + recset[0][9], + recset[0][10]] + cur.execute(sql, recset) + + sql = "Update topics set board_topic = %(board_topic)s, views_topic = %(views_topic)s, posts_topic = %(posts_topic)s, " \ + "dateinserted_topic = %(dateinserted_topic)s where topic_id = %(topicId)s" + cur.execute(sql, {'board_topic': row[1], + 'views_topic': row[4] if row[4] != '-1' else None, + 'posts_topic': row[5] if row[5] != '-1' else None, + 'dateinserted_topic': row[8], + 'topicId': topicId}) + return topicId -def create_topic(cur, row, forumId, boardId, authorId): - topicId = verifyTopic(cur, forumId, boardId, authorId, row[3]) +def create_author(cur, row, forumId): - if not topicId: + userId = verifyUser(cur, row[2], forumId) - topicId = int(getLastTopic(cur) + 1) + if not userId: - sql = "Insert into topics (topic_id, forum_id, board_id, author_id, title_topic, views_topic, posts_topic, " \ - "href_topic, dateadded_topic, dateinserted_topic, classification_topic) Values (%s, %s, %s, %s, %s, %s, " \ - "%s, %s, %s, %s, %s)" + userId = int(getLastUser(cur) + 1) + + sql = "Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \ + "signature_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s)" + + recset = [userId, forumId, + row[2], 'Nan', 'Nan', 'Nan', 'Nan', #telling the create_posts function to not track changes here + row[8]] - recset = [topicId, forumId, boardId, authorId, - row[3], - row[4] if row[4] != '-1' else None, - row[5] if row[5] != '-1' else None, - row[6] if row[6] != '-1' else None, - row[7] if row[7] != '-1' else None, - row[8], - row[17]] cur.execute(sql, recset) - return topicId + return userId def create_user(cur, row, forumId, index): @@ -267,8 +324,12 @@ def create_user(cur, row, forumId, index): userId = verifyUser(cur, row[9][index], forumId) if not userId: - userId = int(getLastUser(cur) + 1) + newUser = True + else: + newUser = False + + if newUser: sql = "Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \ "signature_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s)" @@ -283,30 +344,97 @@ def create_user(cur, row, forumId, index): cur.execute(sql, recset) + else: + + # Tracking potential user changes + sql = "select * from users where user_id = %(userId)s" + cur.execute(sql, {'userId': userId}) + + recset = cur.fetchall() + + if (str(recset[0][3]) != str(row[10][index]) or str(recset[0][4]) != str(row[11][index]) or + str(recset[0][5]) != str(row[12][index] if row[12][index] != '-1' else None) or str(recset[0][6]) != str(row[13][index] if row[13][index] != '-1' else None)): # there was a change in the user information + + if (str(recset[0][3]) != 'Nan' or str(recset[0][4]) != 'Nan' or str(recset[0][5]) != 'Nan' or str(recset[0][6]) != 'Nan'): + + sql = "Insert into users_history (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \ + "signature_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s)" + + recset = [userId, forumId, + recset[0][2], + recset[0][3], + recset[0][4], + recset[0][5], + recset[0][6], + recset[0][7]] + + cur.execute(sql, recset) + + sql = "Update users set status_user = %(status_user)s, reputation_user = %(reputation_user)s, " \ + "interest_user = %(interest_user)s, signature_user = %(signature_user)s, dateinserted_user = %(dateinserted_user)s " \ + "where user_id = %(userId)s" + cur.execute(sql, {'status_user': row[10][index] if row[10][index] != '-1' else None, + 'reputation_user': row[11][index] if row[11][index] != '-1' else None, + 'interest_user': row[12][index] if row[12][index] != '-1' else None, + 'signature_user': row[13][index] if row[13][index] != '-1' else None, + 'dateinserted_user': row[8] if row[8] != '-1' else None, + 'userId': userId}) + return userId -def create_posts(cur, row, forumId, boardId, topicId): +def create_posts(cur, row, forumId, topicId): if row[9] != "-1": for i in range(len(row[9])): - if i != 0: - userId = create_user(cur, row, forumId, i) + userId = create_user(cur, row, forumId, i) + + postId = verifyPost(cur, topicId, userId, row[16][i]) + + if not postId: + + postId = int(getLastPost(cur) + 1) + + sql = "Insert into posts (post_id, topic_id, user_id, content_post, feedback_post, " \ + "dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s)" + + recset = [postId, topicId, userId, + row[14][i] if row[14][i] != '-1' else None, + row[15][i] if row[15][i] != '-1' else None, + row[16][i] if row[16][i] != '-1' else None, + row[8]] + + cur.execute(sql, recset) + else: - userId = verifyUser(cur, row[2], forumId) - sql = "Insert into posts (forum_id, board_id, topic_id, user_id, content_post, feedback_post, " \ - "dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s, %s)" + # Tracking potential post changes + sql = "select * from posts where post_id = %(postId)s" + cur.execute(sql, {'postId': postId}) + + recset = cur.fetchall() + + if (str(recset[0][3]) != str(row[14][i]) or str(recset[0][4]) != str(row[15][i] if row[15][i] != '-1' else None)): # there was a change in the post information + + sql = "Insert into posts_history (post_id, topic_id, user_id, content_post, feedback_post, " \ + "dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s)" + + recset = [postId, topicId, userId, + recset[0][3], + recset[0][4], + recset[0][5], + recset[0][6]] - recset = [forumId, boardId, topicId, userId, - row[14][i] if row[14][i] != '-1' else None, - row[15][i] if row[15][i] != '-1' else None, - row[16][i] if row[16][i] != '-1' else None, - row[8]] + cur.execute(sql, recset) - cur.execute(sql, recset) + sql = "Update posts set content_post = %(content_post)s, feedback_post = %(feedback_post)s, dateinserted_post = %(dateinserted_post)s " \ + "where post_id = %(postId)s" + cur.execute(sql, {'content_post': row[14][i] if row[14][i] != '-1' else None, + 'feedback_post': row[15][i] if row[15][i] != '-1' else None, + 'dateinserted_post': row[8], + 'postId': postId}) def create_database(cur, con): @@ -314,14 +442,11 @@ def create_database(cur, con): try: sql = "create table forums (forum_id integer NOT NULL, name_forum character varying(255) NOT NULL, url_forum " \ - "character varying(255) null, dateinserted_forum timestamp(6) with time zone NOT NULL, constraint " \ + "character varying(255) NOT null, dateinserted_forum timestamp(6) with time zone NOT NULL, constraint " \ "forums_pk primary key (forum_id))" cur.execute(sql) - sql = "create table boards (board_id integer NOT NULL, forum_id integer NOT NULL, name_board character " \ - "varying(255) NOT NULL, dateinserted_board timestamp(6) with time zone NOT NULL, constraint boards_pk " \ - "primary key (board_id), constraint boards_forum_id_fkey foreign key (forum_id) references forums (" \ - "forum_id))" + sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)" cur.execute(sql) sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \ @@ -331,6 +456,9 @@ def create_database(cur, con): "constraint users_forum_id_fkey foreign key (forum_id) references forums (forum_id))" cur.execute(sql) + sql = "create unique index unique_user ON users USING btree (forum_id ASC NULLS LAST, name_user ASC NULLS LAST)" + cur.execute(sql) + sql = "create table users_history(user_id integer NOT NULL, forum_id integer NOT NULL, name_user character " \ "varying(255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) " \ "null, interest_user character varying(5000) null, signature_user character varying(1000) null, " \ @@ -340,37 +468,50 @@ def create_database(cur, con): "forum_id))" cur.execute(sql) - sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, board_id integer NOT NULL, " \ - "author_id integer NOT NULL, title_topic character varying(255) NOT NULL, views_topic integer null, " \ - "posts_topic integer null, href_topic character varying(255) null, dateadded_topic timestamp(6) with " \ + sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \ + "title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer null, " \ + "posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with " \ "time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double " \ "precision NOT NULL, constraint topics_pk primary key (topic_id), constraint topics_author_id_fkey " \ - "foreign key (author_id) references users (user_id), constraint topics_board_id_fkey foreign key (" \ - "board_id) references boards (board_id), constraint topics_forum_id_fkey foreign key (forum_id) " \ - "references forums (forum_id))" + "foreign key (author_id) references users (user_id), constraint topics_forum_id_fkey foreign key (" \ + "forum_id) references forums (forum_id))" + cur.execute(sql) + + sql = "create unique index unique_topic ON topics USING btree (forum_id ASC NULLS LAST, author_id ASC NULLS LAST, " \ + "title_topic ASC NULLS LAST)" cur.execute(sql) - sql = "create table topics_history(topic_id integer NOT NULL, forum_id integer NOT NULL, board_id integer NOT " \ - "NULL, author_id integer NOT NULL, title_topic character varying(255) NOT NULL, views_topic integer " \ - "null, posts_topic integer null, href_topic character varying(255) null, dateadded_topic timestamp(6) " \ + sql = "create table topics_history(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \ + "title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer " \ + "null, posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) " \ "with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic " \ "double precision NOT NULL, constraint topics_history_pk primary key (topic_id, dateinserted_topic), " \ "constraint topics_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \ "constraint topics_history_author_id_fkey foreign key (author_id) references users (user_id), " \ - "constraint topics_history_board_id_fkey foreign key (board_id) references boards (board_id), " \ - "constraint topics_history_forum_id_fkey foreign key (forum_id) references forums (forum_id))" + "constraint topics_history_board_id_fkey foreign key (forum_id) references forums (forum_id))" cur.execute(sql) - sql = "create table posts(forum_id integer NOT NULL, board_id integer NOT NULL, topic_id integer NOT NULL, " \ - "user_id integer NOT NULL, content_post character varying(100000) null, feedback_post integer null, " \ + sql = "create table posts(post_id integer NOT NULL, topic_id integer NOT NULL, " \ + "user_id integer NOT NULL, content_post character varying(100000) NOT null, feedback_post integer null, " \ "dateadded_post timestamp(6) with time zone NOT NULL, dateinserted_post timestamp(6) with time zone NOT " \ - "NULL, constraint posts_pk primary key (forum_id, board_id, topic_id, user_id, dateadded_post), " \ - "constraint posts_author_id_fkey foreign key (user_id) references users (user_id), constraint " \ - "posts_board_id_fkey foreign key (board_id) references boards (board_id), constraint " \ - "posts_forum_id_fkey foreign key (forum_id) references forums (forum_id), constraint " \ + "NULL, constraint posts_pk primary key (post_id), " \ + "constraint posts_user_id_fkey foreign key (user_id) references users (user_id), constraint " \ "posts_topic_id_fkey foreign key (topic_id) references topics (topic_id))" cur.execute(sql) + sql = "create unique index unique_post ON posts USING btree (topic_id ASC NULLS LAST, user_id ASC NULLS LAST, " \ + "dateadded_post ASC NULLS LAST)" + cur.execute(sql) + + sql = "create table posts_history(post_id integer NOT NULL, topic_id integer NOT NULL, " \ + "user_id integer NOT NULL, content_post character varying(100000) NOT null, feedback_post integer null, " \ + "dateadded_post timestamp(6) with time zone NOT NULL, dateinserted_post timestamp(6) with time zone NOT " \ + "NULL, constraint posts_history_pk primary key (post_id, dateinserted_post), " \ + "constraint posts_history_user_id_fkey foreign key (user_id) references users (user_id), " \ + "constraint posts_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \ + "constraint posts_history_post_id_fkey foreign key (post_id) references posts (post_id))" + cur.execute(sql) + con.commit() except: