diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py index 5b83ab5..d725a98 100644 --- a/Forums/CryptBB/parser.py +++ b/Forums/CryptBB/parser.py @@ -164,9 +164,13 @@ def cryptBB_description_parser(soup): img = "-1" image_post.append(img) - img = ipost.find('div', {"class": "author_avatar"}).find('img') - if img is not None: - img = img.get('src').split('base64,')[-1] + avatar = ipost.find('div', {"class": "author_avatar"}) + if avatar is not None: + img = avatar.find('img') + if img is not None: + img = img.get('src').split('base64,')[-1] + else: + img = "-1" else: img = "-1" image_user.append(img) diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index 32ee205..eebc0da 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -27,12 +27,14 @@ def verifyForum(cur, nameForum): try: + cur.execute("lock table forums IN ACCESS EXCLUSIVE MODE;") + cur.execute("select forum_id from forums where name_forum = %(nameForum)s limit 1", {'nameForum': nameForum}) recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['forum_id'] else: return 0 @@ -46,13 +48,15 @@ def verifyTopic(cur, forumId, authorId, titleTopic): try: + cur.execute("lock table topics IN ACCESS EXCLUSIVE MODE;") + cur.execute("select topic_id from topics where forum_id = %(forumId)s and author_id = %(authorId)s and title_topic = %(titleTopic)s limit 1", {'forumId': forumId, 'authorId': authorId, 'titleTopic': titleTopic}) recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['topic_id'] else: return 0 @@ -66,6 +70,8 @@ def verifyPost(cur, topicId, userId, dateAdded): try: + cur.execute("lock table posts IN ACCESS EXCLUSIVE MODE;") + cur.execute("select post_id from posts where topic_id = %(topicId)s and " "user_id = %(userId)s and dateadded_post = %(dateAdded)s limit 1", {'topicId': topicId, 'userId': userId, @@ -74,7 +80,7 @@ def verifyPost(cur, topicId, userId, dateAdded): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['post_id'] else: return 0 @@ -88,13 +94,15 @@ def verifyUser(cur, nameUser, forumId): try: + cur.execute("lock table users IN ACCESS EXCLUSIVE MODE;") + cur.execute("select user_id from users where name_user = %(nameUser)s and forum_id = %(forumId)s limit 1", {'nameUser': nameUser, 'forumId': forumId}) recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['user_id'] else: return 0 @@ -113,7 +121,7 @@ def getLastForum(cur): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['forum_id'] else: return 0 @@ -132,7 +140,7 @@ def getLastTopic(cur): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['topic_id'] else: return 0 @@ -151,7 +159,7 @@ def getLastUser(cur): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['user_id'] else: return 0 @@ -169,7 +177,7 @@ def getLastUserVersion(cur, userId): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['version_user'] else: return 0 @@ -187,7 +195,7 @@ def getLastTopicVersion(cur, topicId): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['version_topic'] else: return 0 @@ -205,7 +213,7 @@ def getLastPostVersion(cur, postId): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['version_post'] else: return 0 @@ -223,7 +231,7 @@ def getLastPost(cur): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['post_id'] else: return 0 @@ -232,7 +240,6 @@ def getLastPost(cur): trace = traceback.format_exc() print (trace) - def create_forum(cur, row, url): forumId = verifyForum(cur, row[0]) @@ -284,8 +291,9 @@ def create_topic(cur, forumId, row, authorId): recset = cur.fetchall() - if (str(recset[0][4]) != str(row[1]) or str(recset[0][5]) != str(row[4] if row[4] != '-1' else None) or # there was a change in the topic information - str(recset[0][6]) != str(row[5] if row[5] != '-1' else None)): + if (str(recset[0]['board_topic']) != str(row[1]) or + str(recset[0]['views_topic']) != str(row[4] if row[4] != '-1' else None) or # there was a change in the topic information + str(recset[0]['posts_topic']) != str(row[5] if row[5] != '-1' else None)): topicVersionId = int(getLastTopicVersion(cur, topicId) + 1) @@ -294,14 +302,14 @@ def create_topic(cur, forumId, row, authorId): "%s, %s, %s, %s, %s, %s, %s)" recset = [topicId, topicVersionId, forumId, authorId, - recset[0][3], - recset[0][4], - recset[0][5], - recset[0][6], - recset[0][7], - recset[0][8], - recset[0][9], - recset[0][10]] + recset[0]['title_topic'], + recset[0]['board_topic'], + recset[0]['views_topic'], + recset[0]['posts_topic'], + recset[0]['href_topic'], + recset[0]['dateadded_topic'], + recset[0]['dateinserted_topic'], + recset[0]['classification_topic']] cur.execute(sql, recset) sql = "Update topics set board_topic = %(board_topic)s, views_topic = %(views_topic)s, posts_topic = %(posts_topic)s, " \ @@ -371,11 +379,17 @@ def create_user(cur, row, forumId, index): # decode_decrypt_image_in_base64(recset[0][7]) - if (str(recset[0][3]) != str(row[11][index] if row[11][index] != '-1' else None) or str(recset[0][4]) != str(row[12][index] if row[12][index] != '-1' else None) or - str(recset[0][5]) != str(row[13][index] if row[13][index] != '-1' else None) or str(recset[0][6]) != str(row[14][index] if row[14][index] != '-1' else None) or - str(recset[0][7]) != str(row[9][index] if row[9][index] != '-1' else None)): # there was a change in the user information + if (str(recset[0]['status_user']) != str(row[11][index] if row[11][index] != '-1' else None) or + str(recset[0]['reputation_user']) != str(row[12][index] if row[12][index] != '-1' else None) or + str(recset[0]['interest_user']) != str(row[13][index] if row[13][index] != '-1' else None) or + str(recset[0]['signature_user']) != str(row[14][index] if row[14][index] != '-1' else None) or + str(recset[0]['image_user']) != str(row[9][index] if row[9][index] != '-1' else None)): # there was a change in the user information - if (str(recset[0][3]) != 'Nan' or str(recset[0][4]) != 'Nan' or str(recset[0][5]) != 'Nan' or str(recset[0][6]) != 'Nan' or str(recset[0][7]) != 'Nan'): + if (str(recset[0]['status_user']) != 'Nan' or + str(recset[0]['reputation_user']) != 'Nan' or + str(recset[0]['interest_user']) != 'Nan' or + str(recset[0]['signature_user']) != 'Nan' or + str(recset[0]['image_user']) != 'Nan'): userVersionId = int(getLastUserVersion(cur, userId) + 1) @@ -383,13 +397,13 @@ def create_user(cur, row, forumId, index): "signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" recset = [userId, userVersionId, forumId, - recset[0][2], - recset[0][3], - recset[0][4], - recset[0][5], - recset[0][6], - recset[0][7], - recset[0][8]] + recset[0]['name_user'], + recset[0]['status_user'], + recset[0]['reputation_user'], + recset[0]['interest_user'], + recset[0]['signature_user'], + recset[0]['image_user'], + recset[0]['dateinserted_user']] cur.execute(sql, recset) @@ -441,8 +455,9 @@ def create_posts(cur, row, forumId, topicId): recset = cur.fetchall() - if (str(recset[0][3]) != str(row[15][i]) or str(recset[0][4]) != str(row[16][i] if row[16][i] != '-1' else None) or - str(recset[0][5]) != str(row[18][i] if row[18][i] != '-1' else None)): # there was a change in the post information + if (str(recset[0]['content_post']) != str(row[15][i]) or + str(recset[0]['feedback_post']) != str(row[16][i] if row[16][i] != '-1' else None) or + str(recset[0]['image_post']) != str(row[18][i] if row[18][i] != '-1' else None)): # there was a change in the post information postVersionId = int(getLastPostVersion(cur, postId) + 1) @@ -450,11 +465,11 @@ def create_posts(cur, row, forumId, topicId): "image_post, dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)" recset = [postId, postVersionId, topicId, userId, - recset[0][3], - recset[0][4], - recset[0][5], - recset[0][6], - recset[0][7]] + recset[0]['content_post'], + recset[0]['feedback_post'], + recset[0]['image_post'], + recset[0]['dateadded_post'], + recset[0]['dateinserted_post']] cur.execute(sql, recset) @@ -481,7 +496,7 @@ def create_database(cur, con): sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \ "255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \ "interest_user character varying(5000) null, signature_user character varying(1000) null, " \ - "image_user character varying(1000000) null, dateinserted_user timestamp(6) with time zone NOT NULL, " \ + "image_user character varying(10000000) null, dateinserted_user timestamp(6) with time zone NOT NULL, " \ "constraint users_pk primary key (user_id), " \ "constraint users_forum_id_fkey foreign key (forum_id) references forums (forum_id))" cur.execute(sql) @@ -492,7 +507,7 @@ def create_database(cur, con): sql = "create table users_history(user_id integer NOT NULL, version_user integer not null, forum_id integer NOT NULL, " \ "name_user character varying(255) NOT NULL, status_user character varying(255) null, " \ "reputation_user character varying(255) null, interest_user character varying(5000) null, " \ - "signature_user character varying(1000) null, image_user character varying(1000000) null, " \ + "signature_user character varying(1000) null, image_user character varying(10000000) null, " \ "dateinserted_user timestamp(6) with time zone NOT NULL, " \ "constraint users_history_pk primary key (user_id, version_user), " \ "constraint users_history_user_id_fkey foreign key (user_id) references " \ @@ -525,7 +540,7 @@ def create_database(cur, con): sql = "create table posts(post_id integer NOT NULL, topic_id integer NOT NULL, " \ "user_id integer NOT NULL, content_post character varying(100000) NOT null, feedback_post integer null, " \ - "image_post character varying(1000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \ + "image_post character varying(10000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \ "dateinserted_post timestamp(6) with time zone NOT NULL, " \ "constraint posts_pk primary key (post_id), " \ "constraint posts_user_id_fkey foreign key (user_id) references users (user_id), constraint " \ @@ -538,7 +553,7 @@ def create_database(cur, con): sql = "create table posts_history(post_id integer NOT NULL, version_post integer not null, topic_id integer NOT NULL, " \ "user_id integer NOT NULL, content_post character varying(100000) NOT null, feedback_post integer null, " \ - "image_post character varying(1000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \ + "image_post character varying(10000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \ "dateinserted_post timestamp(6) with time zone NOT NULL, " \ "constraint posts_history_pk primary key (post_id, version_post), " \ "constraint posts_history_user_id_fkey foreign key (user_id) references users (user_id), " \ diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 5251aad..267f887 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -4,6 +4,8 @@ import codecs import glob import os, re import shutil +from psycopg2.extras import RealDictCursor + from Forums.DB_Connection.db_connection import * from Forums.BestCardingWorld.parser import * from Forums.Cardingleaks.parser import * @@ -240,7 +242,7 @@ def new_parse(forum, url, createLog): # Connecting to the database con = connectDataBase() - cur = con.cursor() + cur = con.cursor(cursor_factory=RealDictCursor) # Creating the tables (The database should be created manually) create_database(cur, con) @@ -372,6 +374,9 @@ def new_parse(forum, url, createLog): if createLog: logFile.close() + cur.close() + con.close() + print("Parsing the " + forum + " forum and data classification done.")