From d15e38f989e4354bcf0e9355022ac394c5ba85b2 Mon Sep 17 00:00:00 2001 From: ericssonmarin-cpp <85146518+ericssonmarin-cpp@users.noreply.github.com> Date: Mon, 11 Sep 2023 12:57:30 -0700 Subject: [PATCH] Image tracking and version for forums. --- Forums/Altenens/parser.py | 32 ++-- Forums/DB_Connection/db_connection.py | 206 ++++++++++++++---------- Forums/Initialization/prepare_parser.py | 22 +-- Forums/Utilities/utilities.py | 42 ++--- 4 files changed, 179 insertions(+), 123 deletions(-) diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py index bdad19d..2493c96 100644 --- a/Forums/Altenens/parser.py +++ b/Forums/Altenens/parser.py @@ -1,5 +1,7 @@ __author__ = 'DarkWeb' +from cytoolz.functoolz import partial + # Here, we are importing the auxiliary functions to clean or convert data from Forums.Utilities.utilities import * from datetime import date @@ -22,7 +24,8 @@ def altenens_description_parser(soup): post = [] # 6 all messages of each post feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) addDate = [] # 8 all dates of each post - image_user = [] + image_user = [] # 9 all user avatars of each post + image_post = [] # 10 all first images of each post topic = soup.find("h1", {"class": "p-title-value"}).text topic = cleanString(topic.strip()) @@ -74,9 +77,11 @@ def altenens_description_parser(soup): img = "-1" image_user.append(img) + image_post.append("-1") + # Populate the final variable (this should be a list with all fields scraped) - row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) + row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) # Sending the results @@ -87,7 +92,7 @@ def altenens_description_parser(soup): def altenens_listing_parser(soup): nm = 0 # *this variable should receive the number of topics - forum = "Altenens" # 0 *forum name + forum = "Altenens" # 0 *forum name board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) author = [] # 2 *all authors of each topic @@ -97,11 +102,13 @@ def altenens_listing_parser(soup): href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between # Listing and Description pages) addDate = [] # 7 when the topic was created (difficult to find) + image_user = [] # 9 all user avatars used in each topic board = soup.find('h1', {"class": "p-title-value"}).text board = cleanString(board.strip()) - itopics = soup.find_all('div', {"class": "structItem-cell structItem-cell--main"}) + regex = re.compile('structItem structItem--thread.*') + itopics = soup.find_all('div', {"class": regex}) nm = len(itopics) @@ -110,7 +117,16 @@ def altenens_listing_parser(soup): topics = itopic.find('div', {"class": "structItem-title"}).text topic.append(cleanString(topics.strip())) - link = itopic.find('a').get('href') + author_icon = itopic.find('a', {"class": "avatar avatar--s"}) + if author_icon != None: + author_icon = author_icon.find('img') + author_icon = author_icon.get('src') + author_icon = author_icon.split('base64,')[-1] + else: + author_icon = "-1" + image_user.append(author_icon) + + link = itopic.find('div', {"class": "structItem-title"}).find('a').get('href') href.append(link) user = itopic.find('ul', {"class": "structItem-parts"}).find('a').text @@ -120,10 +136,6 @@ def altenens_listing_parser(soup): date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z') addDate.append(date_time_obj) - itopics = soup.find_all('div', {"class": "structItem-cell structItem-cell--meta"}) - - for itopic in itopics: - nposts = itopic.find('dl', {"class": "pairs pairs--justified"}).text nposts = nposts.replace('Replies', '') nposts = nposts.replace('K', '000') @@ -134,7 +146,7 @@ def altenens_listing_parser(soup): nviews = nviews.replace('K', '000') views.append(cleanString(nviews)) - return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) + return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_user) def altenens_links_parser(soup): diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index 8bb03bb..32ee205 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -42,12 +42,12 @@ def verifyForum(cur, nameForum): print (trace) -def verifyBoard(cur, forum, nameBoard): +def verifyTopic(cur, forumId, authorId, titleTopic): try: - cur.execute("select board_id from boards where forum_id = %(forum)s and name_board = %(nameBoard)s limit 1", - {'forum': forum, 'nameBoard': nameBoard}) + cur.execute("select topic_id from topics where forum_id = %(forumId)s and author_id = %(authorId)s and title_topic = %(titleTopic)s limit 1", + {'forumId': forumId, 'authorId': authorId, 'titleTopic': titleTopic}) recset = cur.fetchall() @@ -62,12 +62,14 @@ def verifyBoard(cur, forum, nameBoard): print (trace) -def verifyTopic(cur, forumId, authorId, titleTopic): +def verifyPost(cur, topicId, userId, dateAdded): try: - cur.execute("select topic_id from topics where forum_id = %(forumId)s and author_id = %(authorId)s and title_topic = %(titleTopic)s limit 1", - {'forumId': forumId, 'authorId': authorId, 'titleTopic': titleTopic}) + cur.execute("select post_id from posts where topic_id = %(topicId)s and " + "user_id = %(userId)s and dateadded_post = %(dateAdded)s limit 1", {'topicId': topicId, + 'userId': userId, + 'dateAdded': dateAdded}) recset = cur.fetchall() @@ -82,14 +84,12 @@ def verifyTopic(cur, forumId, authorId, titleTopic): print (trace) -def verifyPost(cur, topicId, userId, dateAdded): +def verifyUser(cur, nameUser, forumId): try: - cur.execute("select post_id from posts where topic_id = %(topicId)s and " - "user_id = %(userId)s and dateadded_post = %(dateAdded)s limit 1", {'topicId': topicId, - 'userId': userId, - 'dateAdded': dateAdded}) + cur.execute("select user_id from users where name_user = %(nameUser)s and forum_id = %(forumId)s limit 1", + {'nameUser': nameUser, 'forumId': forumId}) recset = cur.fetchall() @@ -104,12 +104,11 @@ def verifyPost(cur, topicId, userId, dateAdded): print (trace) -def verifyUser(cur, nameUser, forumId): +def getLastForum(cur): try: - cur.execute("select user_id from users where name_user = %(nameUser)s and forum_id = %(forumId)s limit 1", - {'nameUser': nameUser, 'forumId': forumId}) + cur.execute("select forum_id from forums order by forum_id desc limit 1") recset = cur.fetchall() @@ -124,11 +123,11 @@ def verifyUser(cur, nameUser, forumId): print (trace) -def getLastForum(cur): +def getLastTopic(cur): try: - cur.execute("select forum_id from forums order by forum_id desc limit 1") + cur.execute("select topic_id from topics order by topic_id desc limit 1") recset = cur.fetchall() @@ -143,11 +142,11 @@ def getLastForum(cur): print (trace) -def getLastBoard(cur): +def getLastUser(cur): try: - cur.execute("select board_id from boards order by board_id desc limit 1") + cur.execute("select user_id from users order by user_id desc limit 1") recset = cur.fetchall() @@ -161,12 +160,11 @@ def getLastBoard(cur): trace = traceback.format_exc() print (trace) - -def getLastTopic(cur): +def getLastUserVersion(cur, userId): try: - cur.execute("select topic_id from topics order by topic_id desc limit 1") + cur.execute("select version_user from users_history where user_id = %(userId)s order by version_user desc limit 1", {'userId': userId}) recset = cur.fetchall() @@ -180,12 +178,11 @@ def getLastTopic(cur): trace = traceback.format_exc() print (trace) - -def getLastUser(cur): +def getLastTopicVersion(cur, topicId): try: - cur.execute("select user_id from users order by user_id desc limit 1") + cur.execute("select version_topic from topics_history where topic_id = %(topicId)s order by version_topic desc limit 1", {'topicId': topicId}) recset = cur.fetchall() @@ -199,6 +196,23 @@ def getLastUser(cur): trace = traceback.format_exc() print (trace) +def getLastPostVersion(cur, postId): + + try: + + cur.execute("select version_post from posts_history where post_id = %(postId)s order by version_post desc limit 1", {'postId': postId}) + + recset = cur.fetchall() + + if recset: + return recset[0][0] + else: + return 0 + + except: + + trace = traceback.format_exc() + print (trace) def getLastPost(cur): @@ -259,7 +273,7 @@ def create_topic(cur, forumId, row, authorId): row[6] if row[6] != '-1' else None, row[7] if row[7] != '-1' else None, row[8], - row[17]] + row[19]] cur.execute(sql, recset) else: @@ -273,11 +287,13 @@ def create_topic(cur, forumId, row, authorId): if (str(recset[0][4]) != str(row[1]) or str(recset[0][5]) != str(row[4] if row[4] != '-1' else None) or # there was a change in the topic information str(recset[0][6]) != str(row[5] if row[5] != '-1' else None)): - sql = "Insert into topics_history (topic_id, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, " \ + topicVersionId = int(getLastTopicVersion(cur, topicId) + 1) + + sql = "Insert into topics_history (topic_id, version_topic, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, " \ "href_topic, dateadded_topic, dateinserted_topic, classification_topic) Values (%s, %s, %s, %s, %s, " \ - "%s, %s, %s, %s, %s, %s)" + "%s, %s, %s, %s, %s, %s, %s)" - recset = [topicId, forumId, authorId, + recset = [topicId, topicVersionId, forumId, authorId, recset[0][3], recset[0][4], recset[0][5], @@ -308,10 +324,10 @@ def create_author(cur, row, forumId): userId = int(getLastUser(cur) + 1) sql = "Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \ - "signature_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s)" + "signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)" recset = [userId, forumId, - row[2], 'Nan', 'Nan', 'Nan', 'Nan', #telling the create_posts function to not track changes here + row[2], 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', #telling the create_user function to not track changes here row[8]] cur.execute(sql, recset) @@ -321,7 +337,7 @@ def create_author(cur, row, forumId): def create_user(cur, row, forumId, index): - userId = verifyUser(cur, row[9][index], forumId) + userId = verifyUser(cur, row[10][index], forumId) if not userId: userId = int(getLastUser(cur) + 1) @@ -332,14 +348,15 @@ def create_user(cur, row, forumId, index): if newUser: sql = "Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \ - "signature_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s)" + "signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)" recset = [userId, forumId, - row[9][index], - row[10][index] if row[10][index] != '-1' else None, + row[10][index], row[11][index] if row[11][index] != '-1' else None, row[12][index] if row[12][index] != '-1' else None, row[13][index] if row[13][index] != '-1' else None, + row[14][index] if row[14][index] != '-1' else None, + row[9][index] if row[9][index] != '-1' else None, row[8]] cur.execute(sql, recset) @@ -352,31 +369,38 @@ def create_user(cur, row, forumId, index): recset = cur.fetchall() - if (str(recset[0][3]) != str(row[10][index] if row[10][index] != '-1' else None) or str(recset[0][4]) != str(row[11][index] if row[11][index] != '-1' else None) or - str(recset[0][5]) != str(row[12][index] if row[12][index] != '-1' else None) or str(recset[0][6]) != str(row[13][index] if row[13][index] != '-1' else None)): # there was a change in the user information + # decode_decrypt_image_in_base64(recset[0][7]) + + if (str(recset[0][3]) != str(row[11][index] if row[11][index] != '-1' else None) or str(recset[0][4]) != str(row[12][index] if row[12][index] != '-1' else None) or + str(recset[0][5]) != str(row[13][index] if row[13][index] != '-1' else None) or str(recset[0][6]) != str(row[14][index] if row[14][index] != '-1' else None) or + str(recset[0][7]) != str(row[9][index] if row[9][index] != '-1' else None)): # there was a change in the user information - if (str(recset[0][3]) != 'Nan' or str(recset[0][4]) != 'Nan' or str(recset[0][5]) != 'Nan' or str(recset[0][6]) != 'Nan'): + if (str(recset[0][3]) != 'Nan' or str(recset[0][4]) != 'Nan' or str(recset[0][5]) != 'Nan' or str(recset[0][6]) != 'Nan' or str(recset[0][7]) != 'Nan'): - sql = "Insert into users_history (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \ - "signature_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s)" + userVersionId = int(getLastUserVersion(cur, userId) + 1) - recset = [userId, forumId, + sql = "Insert into users_history (user_id, version_user, forum_id, name_user, status_user, reputation_user, interest_user, " \ + "signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" + + recset = [userId, userVersionId, forumId, recset[0][2], recset[0][3], recset[0][4], recset[0][5], recset[0][6], - recset[0][7]] + recset[0][7], + recset[0][8]] cur.execute(sql, recset) sql = "Update users set status_user = %(status_user)s, reputation_user = %(reputation_user)s, " \ - "interest_user = %(interest_user)s, signature_user = %(signature_user)s, dateinserted_user = %(dateinserted_user)s " \ - "where user_id = %(userId)s" - cur.execute(sql, {'status_user': row[10][index] if row[10][index] != '-1' else None, - 'reputation_user': row[11][index] if row[11][index] != '-1' else None, - 'interest_user': row[12][index] if row[12][index] != '-1' else None, - 'signature_user': row[13][index] if row[13][index] != '-1' else None, + "interest_user = %(interest_user)s, signature_user = %(signature_user)s, image_user = %(image_user)s, " \ + "dateinserted_user = %(dateinserted_user)s where user_id = %(userId)s" + cur.execute(sql, {'status_user': row[11][index] if row[11][index] != '-1' else None, + 'reputation_user': row[12][index] if row[12][index] != '-1' else None, + 'interest_user': row[13][index] if row[13][index] != '-1' else None, + 'signature_user': row[14][index] if row[14][index] != '-1' else None, + 'image_user': row[9][index] if row[9][index] != '-1' else None, 'dateinserted_user': row[8] if row[8] != '-1' else None, 'userId': userId}) @@ -385,25 +409,26 @@ def create_user(cur, row, forumId, index): def create_posts(cur, row, forumId, topicId): - if row[9] != "-1": + if row[10] != "-1": - for i in range(len(row[9])): + for i in range(len(row[10])): userId = create_user(cur, row, forumId, i) - postId = verifyPost(cur, topicId, userId, row[16][i]) + postId = verifyPost(cur, topicId, userId, row[17][i]) if not postId: postId = int(getLastPost(cur) + 1) - sql = "Insert into posts (post_id, topic_id, user_id, content_post, feedback_post, " \ - "dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s)" + sql = "Insert into posts (post_id, topic_id, user_id, content_post, feedback_post, image_post," \ + "dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s, %s)" recset = [postId, topicId, userId, - row[14][i] if row[14][i] != '-1' else None, row[15][i] if row[15][i] != '-1' else None, row[16][i] if row[16][i] != '-1' else None, + row[18][i] if row[18][i] != '-1' else None, + row[17][i] if row[17][i] != '-1' else None, row[8]] cur.execute(sql, recset) @@ -416,34 +441,38 @@ def create_posts(cur, row, forumId, topicId): recset = cur.fetchall() - if (str(recset[0][3]) != str(row[14][i]) or str(recset[0][4]) != str(row[15][i] if row[15][i] != '-1' else None)): # there was a change in the post information + if (str(recset[0][3]) != str(row[15][i]) or str(recset[0][4]) != str(row[16][i] if row[16][i] != '-1' else None) or + str(recset[0][5]) != str(row[18][i] if row[18][i] != '-1' else None)): # there was a change in the post information + + postVersionId = int(getLastPostVersion(cur, postId) + 1) - sql = "Insert into posts_history (post_id, topic_id, user_id, content_post, feedback_post, " \ - "dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s)" + sql = "Insert into posts_history (post_id, version_post, topic_id, user_id, content_post, feedback_post, " \ + "image_post, dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)" - recset = [postId, topicId, userId, + recset = [postId, postVersionId, topicId, userId, recset[0][3], recset[0][4], recset[0][5], - recset[0][6]] + recset[0][6], + recset[0][7]] cur.execute(sql, recset) - sql = "Update posts set content_post = %(content_post)s, feedback_post = %(feedback_post)s, dateinserted_post = %(dateinserted_post)s " \ - "where post_id = %(postId)s" - cur.execute(sql, {'content_post': row[14][i] if row[14][i] != '-1' else None, - 'feedback_post': row[15][i] if row[15][i] != '-1' else None, + sql = "Update posts set content_post = %(content_post)s, feedback_post = %(feedback_post)s, " \ + "image_post = %(image_post)s, dateinserted_post = %(dateinserted_post)s where post_id = %(postId)s" + cur.execute(sql, {'content_post': row[15][i] if row[15][i] != '-1' else None, + 'feedback_post': row[16][i] if row[16][i] != '-1' else None, + 'image_post': row[18][i] if row[18][i] != '-1' else None, 'dateinserted_post': row[8], 'postId': postId}) - def create_database(cur, con): try: sql = "create table forums (forum_id integer NOT NULL, name_forum character varying(255) NOT NULL, url_forum " \ - "character varying(255) NOT null, dateinserted_forum timestamp(6) with time zone NOT NULL, constraint " \ - "forums_pk primary key (forum_id))" + "character varying(255) NOT null, dateinserted_forum timestamp(6) with time zone NOT NULL, " \ + "constraint forums_pk primary key (forum_id))" cur.execute(sql) sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)" @@ -452,27 +481,29 @@ def create_database(cur, con): sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \ "255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \ "interest_user character varying(5000) null, signature_user character varying(1000) null, " \ - "dateinserted_user timestamp(6) with time zone NOT NULL, constraint users_pk primary key (user_id), " \ + "image_user character varying(1000000) null, dateinserted_user timestamp(6) with time zone NOT NULL, " \ + "constraint users_pk primary key (user_id), " \ "constraint users_forum_id_fkey foreign key (forum_id) references forums (forum_id))" cur.execute(sql) sql = "create unique index unique_user ON users USING btree (forum_id ASC NULLS LAST, name_user ASC NULLS LAST)" cur.execute(sql) - sql = "create table users_history(user_id integer NOT NULL, forum_id integer NOT NULL, name_user character " \ - "varying(255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) " \ - "null, interest_user character varying(5000) null, signature_user character varying(1000) null, " \ - "dateinserted_user timestamp(6) with time zone NOT NULL, constraint users_history_pk primary key (" \ - "user_id, dateinserted_user), constraint users_history_user_id_fkey foreign key (user_id) references " \ - "users (user_id), constraint users_history_forum_id_fkey foreign key (forum_id) references forums (" \ - "forum_id))" + sql = "create table users_history(user_id integer NOT NULL, version_user integer not null, forum_id integer NOT NULL, " \ + "name_user character varying(255) NOT NULL, status_user character varying(255) null, " \ + "reputation_user character varying(255) null, interest_user character varying(5000) null, " \ + "signature_user character varying(1000) null, image_user character varying(1000000) null, " \ + "dateinserted_user timestamp(6) with time zone NOT NULL, " \ + "constraint users_history_pk primary key (user_id, version_user), " \ + "constraint users_history_user_id_fkey foreign key (user_id) references " \ + "users (user_id), constraint users_history_forum_id_fkey foreign key (forum_id) references forums (forum_id))" cur.execute(sql) sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \ "title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer null, " \ - "posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with " \ - "time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double " \ - "precision NOT NULL, constraint topics_pk primary key (topic_id), constraint topics_author_id_fkey " \ + "posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with time zone null, " \ + "dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision NOT NULL, " \ + "constraint topics_pk primary key (topic_id), constraint topics_author_id_fkey " \ "foreign key (author_id) references users (user_id), constraint topics_forum_id_fkey foreign key (" \ "forum_id) references forums (forum_id))" cur.execute(sql) @@ -481,11 +512,12 @@ def create_database(cur, con): "title_topic ASC NULLS LAST)" cur.execute(sql) - sql = "create table topics_history(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \ - "title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer " \ - "null, posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) " \ - "with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic " \ - "double precision NOT NULL, constraint topics_history_pk primary key (topic_id, dateinserted_topic), " \ + sql = "create table topics_history(topic_id integer NOT NULL, version_topic integer not null, forum_id integer NOT NULL, " \ + "author_id integer NOT NULL, title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, " \ + "views_topic integer null, posts_topic integer null, href_topic character varying(255) NOT null, " \ + "dateadded_topic timestamp(6) with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, " \ + "classification_topic double precision NOT NULL, " \ + "constraint topics_history_pk primary key (topic_id, version_topic), " \ "constraint topics_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \ "constraint topics_history_author_id_fkey foreign key (author_id) references users (user_id), " \ "constraint topics_history_board_id_fkey foreign key (forum_id) references forums (forum_id))" @@ -493,8 +525,9 @@ def create_database(cur, con): sql = "create table posts(post_id integer NOT NULL, topic_id integer NOT NULL, " \ "user_id integer NOT NULL, content_post character varying(100000) NOT null, feedback_post integer null, " \ - "dateadded_post timestamp(6) with time zone NOT NULL, dateinserted_post timestamp(6) with time zone NOT " \ - "NULL, constraint posts_pk primary key (post_id), " \ + "image_post character varying(1000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \ + "dateinserted_post timestamp(6) with time zone NOT NULL, " \ + "constraint posts_pk primary key (post_id), " \ "constraint posts_user_id_fkey foreign key (user_id) references users (user_id), constraint " \ "posts_topic_id_fkey foreign key (topic_id) references topics (topic_id))" cur.execute(sql) @@ -503,10 +536,11 @@ def create_database(cur, con): "dateadded_post ASC NULLS LAST)" cur.execute(sql) - sql = "create table posts_history(post_id integer NOT NULL, topic_id integer NOT NULL, " \ + sql = "create table posts_history(post_id integer NOT NULL, version_post integer not null, topic_id integer NOT NULL, " \ "user_id integer NOT NULL, content_post character varying(100000) NOT null, feedback_post integer null, " \ - "dateadded_post timestamp(6) with time zone NOT NULL, dateinserted_post timestamp(6) with time zone NOT " \ - "NULL, constraint posts_history_pk primary key (post_id, dateinserted_post), " \ + "image_post character varying(1000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \ + "dateinserted_post timestamp(6) with time zone NOT NULL, " \ + "constraint posts_history_pk primary key (post_id, version_post), " \ "constraint posts_history_user_id_fkey foreign key (user_id) references users (user_id), " \ "constraint posts_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \ "constraint posts_history_post_id_fkey foreign key (post_id) references posts (post_id))" diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 82e08da..d4abdce 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -47,14 +47,18 @@ def mergePages(rmm, rec): # key = rec[16] print ("----------------- Matched: " + rec[3] + "--------------------") - rec[9] = rmm[1] - rec[10] = rmm[2] - rec[11] = rmm[3] - rec[12] = rmm[4] - rec[13] = rmm[5] - rec[14] = rmm[6] - rec[15] = rmm[7] - rec[16] = rmm[8] + + if rmm[9] != "-1": # image_user + rec[9] = rmm[9] + rec[10] = rmm[1] + rec[11] = rmm[2] + rec[12] = rmm[3] + rec[13] = rmm[4] + rec[14] = rmm[5] + rec[15] = rmm[6] + rec[16] = rmm[7] + rec[17] = rmm[8] + rec[18] = rmm[10] return rec @@ -327,7 +331,7 @@ def new_parse(forum, url, createLog): rec = mergePages(rmm, rec) # Append to the list the classification of the topic - rec.append(str(predict(rec[3], getPosts(rec[14]), language='sup_english'))) + rec.append(str(predict(rec[3], getPosts(rec[15]), language='sup_english'))) # Persisting the information in the database persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile) diff --git a/Forums/Utilities/utilities.py b/Forums/Utilities/utilities.py index a9165c8..14b2a1e 100644 --- a/Forums/Utilities/utilities.py +++ b/Forums/Utilities/utilities.py @@ -193,7 +193,7 @@ def cleanLink(originalLink): return originalLink -def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate): +def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author): rw = [] @@ -205,39 +205,45 @@ def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) current_time += timedelta(seconds=2) ahora = current_time.strftime("%I:%M:%S") - lne = forum # 0 + lne = forum # 0 lne += "," - lne += board # 1 + lne += board # 1 lne += "," - lne += author[n] # 2 + lne += author[n] # 2 lne += "," - lne += topic[n] # 3 + lne += topic[n] # 3 lne += "," - lne += "-1" if len(views) == 0 else views[n] # 4 + lne += "-1" if len(views) == 0 else views[n] # 4 lne += "," - lne += "-1" if len(posts) == 0 else posts[n] # 5 + lne += "-1" if len(posts) == 0 else posts[n] # 5 lne += "," - lne += "-1" if len(href) == 0 else href[n] # 6 + lne += "-1" if len(href) == 0 else href[n] # 6 lne += "," - lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7 + lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7 lne += "," - lne += day + " " + ahora # 8 + lne += day + " " + ahora # 8 lne += "," - lne += "-1" # 9 name_user + lne += "-1" if len(image_author) == 0 else str(image_author[n]) # 9 image_user + + + lne += "," + lne += "-1" # 10 name_user + lne += "," + lne += "-1" # 11 status_user lne += "," - lne += "-1" # 10 status_user + lne += "-1" # 12 reputation_user lne += "," - lne += "-1" # 11 reputation_user + lne += "-1" # 13 interest_user lne += "," - lne += "-1" # 12 interest_user + lne += "-1" # 14 signature_user lne += "," - lne += "-1" # 13 signature_user + lne += "-1" # 15 content_post lne += "," - lne += "-1" # 14 content_post + lne += "-1" # 16 feedback_post lne += "," - lne += "-1" # 15 feedback_post + lne += "-1" # 17 dateadded_post lne += "," - lne += "-1" # 16 dateadded_post + lne += "-1" # 18 image_post rw.append(lne)