Browse Source

Image tracking and version for forums.

main
ericssonmarin-cpp 1 year ago
parent
commit
d15e38f989
4 changed files with 179 additions and 123 deletions
  1. +22
    -10
      Forums/Altenens/parser.py
  2. +120
    -86
      Forums/DB_Connection/db_connection.py
  3. +13
    -9
      Forums/Initialization/prepare_parser.py
  4. +24
    -18
      Forums/Utilities/utilities.py

+ 22
- 10
Forums/Altenens/parser.py View File

@ -1,5 +1,7 @@
__author__ = 'DarkWeb'
from cytoolz.functoolz import partial
# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
@ -22,7 +24,8 @@ def altenens_description_parser(soup):
post = [] # 6 all messages of each post
feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
addDate = [] # 8 all dates of each post
image_user = []
image_user = [] # 9 all user avatars of each post
image_post = [] # 10 all first images of each post
topic = soup.find("h1", {"class": "p-title-value"}).text
topic = cleanString(topic.strip())
@ -74,9 +77,11 @@ def altenens_description_parser(soup):
img = "-1"
image_user.append(img)
image_post.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
# Sending the results
@ -87,7 +92,7 @@ def altenens_description_parser(soup):
def altenens_listing_parser(soup):
nm = 0 # *this variable should receive the number of topics
forum = "Altenens" # 0 *forum name
forum = "Altenens" # 0 *forum name
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
author = [] # 2 *all authors of each topic
@ -97,11 +102,13 @@ def altenens_listing_parser(soup):
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between
# Listing and Description pages)
addDate = [] # 7 when the topic was created (difficult to find)
image_user = [] # 9 all user avatars used in each topic
board = soup.find('h1', {"class": "p-title-value"}).text
board = cleanString(board.strip())
itopics = soup.find_all('div', {"class": "structItem-cell structItem-cell--main"})
regex = re.compile('structItem structItem--thread.*')
itopics = soup.find_all('div', {"class": regex})
nm = len(itopics)
@ -110,7 +117,16 @@ def altenens_listing_parser(soup):
topics = itopic.find('div', {"class": "structItem-title"}).text
topic.append(cleanString(topics.strip()))
link = itopic.find('a').get('href')
author_icon = itopic.find('a', {"class": "avatar avatar--s"})
if author_icon != None:
author_icon = author_icon.find('img')
author_icon = author_icon.get('src')
author_icon = author_icon.split('base64,')[-1]
else:
author_icon = "-1"
image_user.append(author_icon)
link = itopic.find('div', {"class": "structItem-title"}).find('a').get('href')
href.append(link)
user = itopic.find('ul', {"class": "structItem-parts"}).find('a').text
@ -120,10 +136,6 @@ def altenens_listing_parser(soup):
date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
addDate.append(date_time_obj)
itopics = soup.find_all('div', {"class": "structItem-cell structItem-cell--meta"})
for itopic in itopics:
nposts = itopic.find('dl', {"class": "pairs pairs--justified"}).text
nposts = nposts.replace('Replies', '')
nposts = nposts.replace('K', '000')
@ -134,7 +146,7 @@ def altenens_listing_parser(soup):
nviews = nviews.replace('K', '000')
views.append(cleanString(nviews))
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_user)
def altenens_links_parser(soup):


+ 120
- 86
Forums/DB_Connection/db_connection.py View File

@ -42,12 +42,12 @@ def verifyForum(cur, nameForum):
print (trace)
def verifyBoard(cur, forum, nameBoard):
def verifyTopic(cur, forumId, authorId, titleTopic):
try:
cur.execute("select board_id from boards where forum_id = %(forum)s and name_board = %(nameBoard)s limit 1",
{'forum': forum, 'nameBoard': nameBoard})
cur.execute("select topic_id from topics where forum_id = %(forumId)s and author_id = %(authorId)s and title_topic = %(titleTopic)s limit 1",
{'forumId': forumId, 'authorId': authorId, 'titleTopic': titleTopic})
recset = cur.fetchall()
@ -62,12 +62,14 @@ def verifyBoard(cur, forum, nameBoard):
print (trace)
def verifyTopic(cur, forumId, authorId, titleTopic):
def verifyPost(cur, topicId, userId, dateAdded):
try:
cur.execute("select topic_id from topics where forum_id = %(forumId)s and author_id = %(authorId)s and title_topic = %(titleTopic)s limit 1",
{'forumId': forumId, 'authorId': authorId, 'titleTopic': titleTopic})
cur.execute("select post_id from posts where topic_id = %(topicId)s and "
"user_id = %(userId)s and dateadded_post = %(dateAdded)s limit 1", {'topicId': topicId,
'userId': userId,
'dateAdded': dateAdded})
recset = cur.fetchall()
@ -82,14 +84,12 @@ def verifyTopic(cur, forumId, authorId, titleTopic):
print (trace)
def verifyPost(cur, topicId, userId, dateAdded):
def verifyUser(cur, nameUser, forumId):
try:
cur.execute("select post_id from posts where topic_id = %(topicId)s and "
"user_id = %(userId)s and dateadded_post = %(dateAdded)s limit 1", {'topicId': topicId,
'userId': userId,
'dateAdded': dateAdded})
cur.execute("select user_id from users where name_user = %(nameUser)s and forum_id = %(forumId)s limit 1",
{'nameUser': nameUser, 'forumId': forumId})
recset = cur.fetchall()
@ -104,12 +104,11 @@ def verifyPost(cur, topicId, userId, dateAdded):
print (trace)
def verifyUser(cur, nameUser, forumId):
def getLastForum(cur):
try:
cur.execute("select user_id from users where name_user = %(nameUser)s and forum_id = %(forumId)s limit 1",
{'nameUser': nameUser, 'forumId': forumId})
cur.execute("select forum_id from forums order by forum_id desc limit 1")
recset = cur.fetchall()
@ -124,11 +123,11 @@ def verifyUser(cur, nameUser, forumId):
print (trace)
def getLastForum(cur):
def getLastTopic(cur):
try:
cur.execute("select forum_id from forums order by forum_id desc limit 1")
cur.execute("select topic_id from topics order by topic_id desc limit 1")
recset = cur.fetchall()
@ -143,11 +142,11 @@ def getLastForum(cur):
print (trace)
def getLastBoard(cur):
def getLastUser(cur):
try:
cur.execute("select board_id from boards order by board_id desc limit 1")
cur.execute("select user_id from users order by user_id desc limit 1")
recset = cur.fetchall()
@ -161,12 +160,11 @@ def getLastBoard(cur):
trace = traceback.format_exc()
print (trace)
def getLastTopic(cur):
def getLastUserVersion(cur, userId):
try:
cur.execute("select topic_id from topics order by topic_id desc limit 1")
cur.execute("select version_user from users_history where user_id = %(userId)s order by version_user desc limit 1", {'userId': userId})
recset = cur.fetchall()
@ -180,12 +178,11 @@ def getLastTopic(cur):
trace = traceback.format_exc()
print (trace)
def getLastUser(cur):
def getLastTopicVersion(cur, topicId):
try:
cur.execute("select user_id from users order by user_id desc limit 1")
cur.execute("select version_topic from topics_history where topic_id = %(topicId)s order by version_topic desc limit 1", {'topicId': topicId})
recset = cur.fetchall()
@ -199,6 +196,23 @@ def getLastUser(cur):
trace = traceback.format_exc()
print (trace)
def getLastPostVersion(cur, postId):
    """Return the highest ``version_post`` recorded in ``posts_history`` for a post.

    Args:
        cur: Database cursor used to run the query; it must accept the
            ``%(name)s`` named-parameter style and expose ``fetchall()``.
        postId: Identifier of the post whose history rows are inspected.

    Returns:
        The latest version number found for ``postId``, ``0`` when the post
        has no history rows yet, or ``None`` when the lookup fails (the
        traceback is printed in that case).
    """
    try:
        cur.execute("select version_post from posts_history where post_id = %(postId)s order by version_post desc limit 1", {'postId': postId})

        recset = cur.fetchall()

        # Rows come back newest-first, so the first column of the first row
        # is the current version; no rows means nothing was archived yet.
        if recset:
            return recset[0][0]
        return 0

    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt / SystemExit
        # must still be able to stop the run instead of being swallowed here.
        trace = traceback.format_exc()
        print(trace)
        return None
def getLastPost(cur):
@ -259,7 +273,7 @@ def create_topic(cur, forumId, row, authorId):
row[6] if row[6] != '-1' else None,
row[7] if row[7] != '-1' else None,
row[8],
row[17]]
row[19]]
cur.execute(sql, recset)
else:
@ -273,11 +287,13 @@ def create_topic(cur, forumId, row, authorId):
if (str(recset[0][4]) != str(row[1]) or str(recset[0][5]) != str(row[4] if row[4] != '-1' else None) or # there was a change in the topic information
str(recset[0][6]) != str(row[5] if row[5] != '-1' else None)):
sql = "Insert into topics_history (topic_id, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, " \
topicVersionId = int(getLastTopicVersion(cur, topicId) + 1)
sql = "Insert into topics_history (topic_id, version_topic, forum_id, author_id, title_topic, board_topic, views_topic, posts_topic, " \
"href_topic, dateadded_topic, dateinserted_topic, classification_topic) Values (%s, %s, %s, %s, %s, " \
"%s, %s, %s, %s, %s, %s)"
"%s, %s, %s, %s, %s, %s, %s)"
recset = [topicId, forumId, authorId,
recset = [topicId, topicVersionId, forumId, authorId,
recset[0][3],
recset[0][4],
recset[0][5],
@ -308,10 +324,10 @@ def create_author(cur, row, forumId):
userId = int(getLastUser(cur) + 1)
sql = "Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \
"signature_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s)"
"signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
recset = [userId, forumId,
row[2], 'Nan', 'Nan', 'Nan', 'Nan', #telling the create_posts function to not track changes here
row[2], 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', #telling the create_user function to not track changes here
row[8]]
cur.execute(sql, recset)
@ -321,7 +337,7 @@ def create_author(cur, row, forumId):
def create_user(cur, row, forumId, index):
userId = verifyUser(cur, row[9][index], forumId)
userId = verifyUser(cur, row[10][index], forumId)
if not userId:
userId = int(getLastUser(cur) + 1)
@ -332,14 +348,15 @@ def create_user(cur, row, forumId, index):
if newUser:
sql = "Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \
"signature_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s)"
"signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
recset = [userId, forumId,
row[9][index],
row[10][index] if row[10][index] != '-1' else None,
row[10][index],
row[11][index] if row[11][index] != '-1' else None,
row[12][index] if row[12][index] != '-1' else None,
row[13][index] if row[13][index] != '-1' else None,
row[14][index] if row[14][index] != '-1' else None,
row[9][index] if row[9][index] != '-1' else None,
row[8]]
cur.execute(sql, recset)
@ -352,31 +369,38 @@ def create_user(cur, row, forumId, index):
recset = cur.fetchall()
if (str(recset[0][3]) != str(row[10][index] if row[10][index] != '-1' else None) or str(recset[0][4]) != str(row[11][index] if row[11][index] != '-1' else None) or
str(recset[0][5]) != str(row[12][index] if row[12][index] != '-1' else None) or str(recset[0][6]) != str(row[13][index] if row[13][index] != '-1' else None)): # there was a change in the user information
# decode_decrypt_image_in_base64(recset[0][7])
if (str(recset[0][3]) != str(row[11][index] if row[11][index] != '-1' else None) or str(recset[0][4]) != str(row[12][index] if row[12][index] != '-1' else None) or
str(recset[0][5]) != str(row[13][index] if row[13][index] != '-1' else None) or str(recset[0][6]) != str(row[14][index] if row[14][index] != '-1' else None) or
str(recset[0][7]) != str(row[9][index] if row[9][index] != '-1' else None)): # there was a change in the user information
if (str(recset[0][3]) != 'Nan' or str(recset[0][4]) != 'Nan' or str(recset[0][5]) != 'Nan' or str(recset[0][6]) != 'Nan'):
if (str(recset[0][3]) != 'Nan' or str(recset[0][4]) != 'Nan' or str(recset[0][5]) != 'Nan' or str(recset[0][6]) != 'Nan' or str(recset[0][7]) != 'Nan'):
sql = "Insert into users_history (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \
"signature_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s)"
userVersionId = int(getLastUserVersion(cur, userId) + 1)
recset = [userId, forumId,
sql = "Insert into users_history (user_id, version_user, forum_id, name_user, status_user, reputation_user, interest_user, " \
"signature_user, image_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
recset = [userId, userVersionId, forumId,
recset[0][2],
recset[0][3],
recset[0][4],
recset[0][5],
recset[0][6],
recset[0][7]]
recset[0][7],
recset[0][8]]
cur.execute(sql, recset)
sql = "Update users set status_user = %(status_user)s, reputation_user = %(reputation_user)s, " \
"interest_user = %(interest_user)s, signature_user = %(signature_user)s, dateinserted_user = %(dateinserted_user)s " \
"where user_id = %(userId)s"
cur.execute(sql, {'status_user': row[10][index] if row[10][index] != '-1' else None,
'reputation_user': row[11][index] if row[11][index] != '-1' else None,
'interest_user': row[12][index] if row[12][index] != '-1' else None,
'signature_user': row[13][index] if row[13][index] != '-1' else None,
"interest_user = %(interest_user)s, signature_user = %(signature_user)s, image_user = %(image_user)s, " \
"dateinserted_user = %(dateinserted_user)s where user_id = %(userId)s"
cur.execute(sql, {'status_user': row[11][index] if row[11][index] != '-1' else None,
'reputation_user': row[12][index] if row[12][index] != '-1' else None,
'interest_user': row[13][index] if row[13][index] != '-1' else None,
'signature_user': row[14][index] if row[14][index] != '-1' else None,
'image_user': row[9][index] if row[9][index] != '-1' else None,
'dateinserted_user': row[8] if row[8] != '-1' else None,
'userId': userId})
@ -385,25 +409,26 @@ def create_user(cur, row, forumId, index):
def create_posts(cur, row, forumId, topicId):
if row[9] != "-1":
if row[10] != "-1":
for i in range(len(row[9])):
for i in range(len(row[10])):
userId = create_user(cur, row, forumId, i)
postId = verifyPost(cur, topicId, userId, row[16][i])
postId = verifyPost(cur, topicId, userId, row[17][i])
if not postId:
postId = int(getLastPost(cur) + 1)
sql = "Insert into posts (post_id, topic_id, user_id, content_post, feedback_post, " \
"dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s)"
sql = "Insert into posts (post_id, topic_id, user_id, content_post, feedback_post, image_post," \
"dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s, %s)"
recset = [postId, topicId, userId,
row[14][i] if row[14][i] != '-1' else None,
row[15][i] if row[15][i] != '-1' else None,
row[16][i] if row[16][i] != '-1' else None,
row[18][i] if row[18][i] != '-1' else None,
row[17][i] if row[17][i] != '-1' else None,
row[8]]
cur.execute(sql, recset)
@ -416,34 +441,38 @@ def create_posts(cur, row, forumId, topicId):
recset = cur.fetchall()
if (str(recset[0][3]) != str(row[14][i]) or str(recset[0][4]) != str(row[15][i] if row[15][i] != '-1' else None)): # there was a change in the post information
if (str(recset[0][3]) != str(row[15][i]) or str(recset[0][4]) != str(row[16][i] if row[16][i] != '-1' else None) or
str(recset[0][5]) != str(row[18][i] if row[18][i] != '-1' else None)): # there was a change in the post information
postVersionId = int(getLastPostVersion(cur, postId) + 1)
sql = "Insert into posts_history (post_id, topic_id, user_id, content_post, feedback_post, " \
"dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s)"
sql = "Insert into posts_history (post_id, version_post, topic_id, user_id, content_post, feedback_post, " \
"image_post, dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
recset = [postId, topicId, userId,
recset = [postId, postVersionId, topicId, userId,
recset[0][3],
recset[0][4],
recset[0][5],
recset[0][6]]
recset[0][6],
recset[0][7]]
cur.execute(sql, recset)
sql = "Update posts set content_post = %(content_post)s, feedback_post = %(feedback_post)s, dateinserted_post = %(dateinserted_post)s " \
"where post_id = %(postId)s"
cur.execute(sql, {'content_post': row[14][i] if row[14][i] != '-1' else None,
'feedback_post': row[15][i] if row[15][i] != '-1' else None,
sql = "Update posts set content_post = %(content_post)s, feedback_post = %(feedback_post)s, " \
"image_post = %(image_post)s, dateinserted_post = %(dateinserted_post)s where post_id = %(postId)s"
cur.execute(sql, {'content_post': row[15][i] if row[15][i] != '-1' else None,
'feedback_post': row[16][i] if row[16][i] != '-1' else None,
'image_post': row[18][i] if row[18][i] != '-1' else None,
'dateinserted_post': row[8],
'postId': postId})
def create_database(cur, con):
try:
sql = "create table forums (forum_id integer NOT NULL, name_forum character varying(255) NOT NULL, url_forum " \
"character varying(255) NOT null, dateinserted_forum timestamp(6) with time zone NOT NULL, constraint " \
"forums_pk primary key (forum_id))"
"character varying(255) NOT null, dateinserted_forum timestamp(6) with time zone NOT NULL, " \
"constraint forums_pk primary key (forum_id))"
cur.execute(sql)
sql = "create unique index unique_forum ON forums USING btree (name_forum ASC NULLS LAST)"
@ -452,27 +481,29 @@ def create_database(cur, con):
sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \
"255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \
"interest_user character varying(5000) null, signature_user character varying(1000) null, " \
"dateinserted_user timestamp(6) with time zone NOT NULL, constraint users_pk primary key (user_id), " \
"image_user character varying(1000000) null, dateinserted_user timestamp(6) with time zone NOT NULL, " \
"constraint users_pk primary key (user_id), " \
"constraint users_forum_id_fkey foreign key (forum_id) references forums (forum_id))"
cur.execute(sql)
sql = "create unique index unique_user ON users USING btree (forum_id ASC NULLS LAST, name_user ASC NULLS LAST)"
cur.execute(sql)
sql = "create table users_history(user_id integer NOT NULL, forum_id integer NOT NULL, name_user character " \
"varying(255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) " \
"null, interest_user character varying(5000) null, signature_user character varying(1000) null, " \
"dateinserted_user timestamp(6) with time zone NOT NULL, constraint users_history_pk primary key (" \
"user_id, dateinserted_user), constraint users_history_user_id_fkey foreign key (user_id) references " \
"users (user_id), constraint users_history_forum_id_fkey foreign key (forum_id) references forums (" \
"forum_id))"
sql = "create table users_history(user_id integer NOT NULL, version_user integer not null, forum_id integer NOT NULL, " \
"name_user character varying(255) NOT NULL, status_user character varying(255) null, " \
"reputation_user character varying(255) null, interest_user character varying(5000) null, " \
"signature_user character varying(1000) null, image_user character varying(1000000) null, " \
"dateinserted_user timestamp(6) with time zone NOT NULL, " \
"constraint users_history_pk primary key (user_id, version_user), " \
"constraint users_history_user_id_fkey foreign key (user_id) references " \
"users (user_id), constraint users_history_forum_id_fkey foreign key (forum_id) references forums (forum_id))"
cur.execute(sql)
sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \
"title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer null, " \
"posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with " \
"time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double " \
"precision NOT NULL, constraint topics_pk primary key (topic_id), constraint topics_author_id_fkey " \
"posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) with time zone null, " \
"dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double precision NOT NULL, " \
"constraint topics_pk primary key (topic_id), constraint topics_author_id_fkey " \
"foreign key (author_id) references users (user_id), constraint topics_forum_id_fkey foreign key (" \
"forum_id) references forums (forum_id))"
cur.execute(sql)
@ -481,11 +512,12 @@ def create_database(cur, con):
"title_topic ASC NULLS LAST)"
cur.execute(sql)
sql = "create table topics_history(topic_id integer NOT NULL, forum_id integer NOT NULL, author_id integer NOT NULL, " \
"title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, views_topic integer " \
"null, posts_topic integer null, href_topic character varying(255) NOT null, dateadded_topic timestamp(6) " \
"with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic " \
"double precision NOT NULL, constraint topics_history_pk primary key (topic_id, dateinserted_topic), " \
sql = "create table topics_history(topic_id integer NOT NULL, version_topic integer not null, forum_id integer NOT NULL, " \
"author_id integer NOT NULL, title_topic character varying(255) NOT NULL, board_topic character varying(255) NOT NULL, " \
"views_topic integer null, posts_topic integer null, href_topic character varying(255) NOT null, " \
"dateadded_topic timestamp(6) with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, " \
"classification_topic double precision NOT NULL, " \
"constraint topics_history_pk primary key (topic_id, version_topic), " \
"constraint topics_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \
"constraint topics_history_author_id_fkey foreign key (author_id) references users (user_id), " \
"constraint topics_history_board_id_fkey foreign key (forum_id) references forums (forum_id))"
@ -493,8 +525,9 @@ def create_database(cur, con):
sql = "create table posts(post_id integer NOT NULL, topic_id integer NOT NULL, " \
"user_id integer NOT NULL, content_post character varying(100000) NOT null, feedback_post integer null, " \
"dateadded_post timestamp(6) with time zone NOT NULL, dateinserted_post timestamp(6) with time zone NOT " \
"NULL, constraint posts_pk primary key (post_id), " \
"image_post character varying(1000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \
"dateinserted_post timestamp(6) with time zone NOT NULL, " \
"constraint posts_pk primary key (post_id), " \
"constraint posts_user_id_fkey foreign key (user_id) references users (user_id), constraint " \
"posts_topic_id_fkey foreign key (topic_id) references topics (topic_id))"
cur.execute(sql)
@ -503,10 +536,11 @@ def create_database(cur, con):
"dateadded_post ASC NULLS LAST)"
cur.execute(sql)
sql = "create table posts_history(post_id integer NOT NULL, topic_id integer NOT NULL, " \
sql = "create table posts_history(post_id integer NOT NULL, version_post integer not null, topic_id integer NOT NULL, " \
"user_id integer NOT NULL, content_post character varying(100000) NOT null, feedback_post integer null, " \
"dateadded_post timestamp(6) with time zone NOT NULL, dateinserted_post timestamp(6) with time zone NOT " \
"NULL, constraint posts_history_pk primary key (post_id, dateinserted_post), " \
"image_post character varying(1000000) null, dateadded_post timestamp(6) with time zone NOT NULL, " \
"dateinserted_post timestamp(6) with time zone NOT NULL, " \
"constraint posts_history_pk primary key (post_id, version_post), " \
"constraint posts_history_user_id_fkey foreign key (user_id) references users (user_id), " \
"constraint posts_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \
"constraint posts_history_post_id_fkey foreign key (post_id) references posts (post_id))"


+ 13
- 9
Forums/Initialization/prepare_parser.py View File

@ -47,14 +47,18 @@ def mergePages(rmm, rec):
# key = rec[16]
print ("----------------- Matched: " + rec[3] + "--------------------")
rec[9] = rmm[1]
rec[10] = rmm[2]
rec[11] = rmm[3]
rec[12] = rmm[4]
rec[13] = rmm[5]
rec[14] = rmm[6]
rec[15] = rmm[7]
rec[16] = rmm[8]
if rmm[9] != "-1": # image_user
rec[9] = rmm[9]
rec[10] = rmm[1]
rec[11] = rmm[2]
rec[12] = rmm[3]
rec[13] = rmm[4]
rec[14] = rmm[5]
rec[15] = rmm[6]
rec[16] = rmm[7]
rec[17] = rmm[8]
rec[18] = rmm[10]
return rec
@ -327,7 +331,7 @@ def new_parse(forum, url, createLog):
rec = mergePages(rmm, rec)
# Append to the list the classification of the topic
rec.append(str(predict(rec[3], getPosts(rec[14]), language='sup_english')))
rec.append(str(predict(rec[3], getPosts(rec[15]), language='sup_english')))
# Persisting the information in the database
persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile)


+ 24
- 18
Forums/Utilities/utilities.py View File

@ -193,7 +193,7 @@ def cleanLink(originalLink):
return originalLink
def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate):
def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author):
rw = []
@ -205,39 +205,45 @@ def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
current_time += timedelta(seconds=2)
ahora = current_time.strftime("%I:%M:%S")
lne = forum # 0
lne = forum # 0
lne += ","
lne += board # 1
lne += board # 1
lne += ","
lne += author[n] # 2
lne += author[n] # 2
lne += ","
lne += topic[n] # 3
lne += topic[n] # 3
lne += ","
lne += "-1" if len(views) == 0 else views[n] # 4
lne += "-1" if len(views) == 0 else views[n] # 4
lne += ","
lne += "-1" if len(posts) == 0 else posts[n] # 5
lne += "-1" if len(posts) == 0 else posts[n] # 5
lne += ","
lne += "-1" if len(href) == 0 else href[n] # 6
lne += "-1" if len(href) == 0 else href[n] # 6
lne += ","
lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7
lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7
lne += ","
lne += day + " " + ahora # 8
lne += day + " " + ahora # 8
lne += ","
lne += "-1" # 9 name_user
lne += "-1" if len(image_author) == 0 else str(image_author[n]) # 9 image_user
lne += ","
lne += "-1" # 10 name_user
lne += ","
lne += "-1" # 11 status_user
lne += ","
lne += "-1" # 10 status_user
lne += "-1" # 12 reputation_user
lne += ","
lne += "-1" # 11 reputation_user
lne += "-1" # 13 interest_user
lne += ","
lne += "-1" # 12 interest_user
lne += "-1" # 14 signature_user
lne += ","
lne += "-1" # 13 signature_user
lne += "-1" # 15 content_post
lne += ","
lne += "-1" # 14 content_post
lne += "-1" # 16 feedback_post
lne += ","
lne += "-1" # 15 feedback_post
lne += "-1" # 17 dateadded_post
lne += ","
lne += "-1" # 16 dateadded_post
lne += "-1" # 18 image_post
rw.append(lne)


Loading…
Cancel
Save