diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml index 71f5e9b..7507657 100644 --- a/.idea/DW_Pipeline_Test.iml +++ b/.idea/DW_Pipeline_Test.iml @@ -2,7 +2,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index baf04e9..730b23f 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py index b6772f9..0957b76 100644 --- a/Forums/CryptBB/parser.py +++ b/Forums/CryptBB/parser.py @@ -20,7 +20,7 @@ def cryptBB_description_parser(soup): addDate = [] # all dated of each post feedback = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format) status = [] # all user's authority in each post such as (adm, member, dangerous) - reputation = [] # all users's karma in each post (usually found as a number) + reputation = [] # all user's karma in each post (usually found as a number) sign = [] # all user's signature in each post (usually a standard message after the content of the post) post = [] # all messages of each post interest = [] # all user's interest in each post @@ -34,153 +34,127 @@ def cryptBB_description_parser(soup): topic = topic.replace(",","") topic = topic.replace("\n","") topic = cleanString(topic.strip()) - print(topic) - # Finding the repeated tag that corresponds to the listing of posts - - # posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \ - # soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"}) - try: - posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all( - 'div', {"class": "post"}) - # print(len(posts)) + # Finding the repeated tag that corresponds to the listing of posts - # For each message (post), get all the fields we are interested to: + # try: + posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all( + 'div', {"class": "post"}) - for ipost in posts: + # For each message (post), get all the fields we are interested to: - # Finding a first level of the HTML page + for ipost in posts: - # post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"}) - post_wrapper = ipost.find('span', {"class": "largetext"}) - # Finding the author (user) of the post + # Finding a first level of the HTML page - # author = post_wrapper.find('h4') - author = post_wrapper.text.strip() - # print("author " + author) - user.append(cleanString(author)) # Remember to clean the problematic characters + post_wrapper = ipost.find('span', {"class": "largetext"}) - # Finding the status of the author + # Finding the author (user) of the post - smalltext = ipost.find('div', {"class": "post_author"}) + author = post_wrapper.text.strip() + user.append(cleanString(author)) # Remember to clean the problematic characters - # Testing here two possibilities to find this status and combine them - if ipost.find('div', {"class": "deleted_post_author"}): - status.append(-1) - interest.append(-1) - reputation.append(-1) - addDate.append(-1) - post.append("THIS POST HAS BEEN REMOVED!") - sign.append(-1) - feedback.append(-1) - continue + # Finding the status of the author - # CryptBB does have membergroup and postgroup + smalltext = ipost.find('div', {"class": "post_author"}) - membergroup = smalltext.find('div', {"class": "profile-rank"}) - postgroup = smalltext.find('div', {"class": "postgroup"}) - if membergroup != None: - membergroup = membergroup.text.strip() - if postgroup != None: - postgroup = postgroup.text.strip() - membergroup = membergroup + " - " + postgroup - else: - if postgroup != None: - membergroup = postgroup.text.strip() - else: - membergroup = "-1" - - status.append(cleanString(membergroup)) - # print("status " + cleanString(membergroup)) - # Finding the interest of the author - # CryptBB does not have blurb - blurb = smalltext.find('li', {"class": "blurb"}) - if blurb != None: - blurb = blurb.text.strip() - else: - blurb = "-1" - interest.append(cleanString(blurb)) - - # Finding the reputation of the user - # CryptBB does have reputation - author_stats = smalltext.find('div', {"class": "author_statistics"}) - karma = author_stats.find('strong') - if karma != None: - karma = karma.text - karma = karma.replace("Community Rating: ", "") - karma = karma.replace("Karma: ", "") - karma = karma.strip() - else: - karma = "-1" - reputation.append(cleanString(karma)) - # print("karma " + cleanString(karma)) - # Getting here another good tag to find the post date, post content and users' signature - - postarea = ipost.find('div', {"class": "post_content"}) - - dt = postarea.find('span', {"class": "post_date"}).text - # dt = dt.strip().split() - dt = dt.strip() - day=date.today() - if "Yesterday" in dt: - yesterday = day - timedelta(days=1) - yesterday = yesterday.strftime('%m-%d-%Y') - stime = dt.replace('Yesterday,','').strip() - date_time_obj = yesterday+ ', '+stime - date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p') - elif "hours ago" in dt: - day = day.strftime('%m-%d-%Y') - date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title'] - date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p') - else: - date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p') - stime = date_time_obj.strftime('%b %d, %Y') - sdate = date_time_obj.strftime('%I:%M %p') - - - addDate.append(date_time_obj) - # print("date " + str(date_time_obj)) - # Finding the date of the post - # date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p') - # smalltext = postarea.find('div', {"class": "flow_hidden"}).find('div', {"class": "keyinfo"})\ - # .find('div', {"class": "smalltext"}) - # sdatetime = smalltext.text - # sdatetime = sdatetime.replace(u"\xab","") # Removing unnecessary characters - # sdatetime = sdatetime.replace(u"\xbb","") # Removing unnecessary characters - # sdatetime = sdatetime.split("on: ") # Removing unnecessary characters - # sdatetime = sdatetime[1].strip() - # stime = sdatetime[:-12:-1] # Finding the time of the post - # stime = stime[::-1] - # sdate = sdatetime.replace(stime,"") # Finding the date of the post - # sdate = sdate.replace(",","") - # sdate = sdate.strip() - - # Covert the date of the post that can be informed as: "12 February 2016", "today", "yesterday". We need - # a date format here as "mm/dd/yyyy" - - # addDate.append(convertDate(sdate,"english", crawlerDate) + " " + stime) - - # Finding the post - - inner = postarea.find('div', {"class": "post_body scaleimages"}) - inner = inner.text.strip() - # print(inner) - post.append(cleanString(inner)) - - # Finding the users's signature - - # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) - signature = ipost.find('div', {"class": "signature scaleimages"}) - if signature != None: - signature = signature.text.strip() - # print(signature) + ''' + # Testing here two possibilities to find this status and combine them + if ipost.find('div', {"class": "deleted_post_author"}): + status.append(-1) + interest.append(-1) + reputation.append(-1) + addDate.append(-1) + post.append("THIS POST HAS BEEN REMOVED!") + sign.append(-1) + feedback.append(-1) + continue + ''' + + # CryptBB does have membergroup and postgroup + + membergroup = smalltext.find('div', {"class": "profile-rank"}) + postgroup = smalltext.find('div', {"class": "postgroup"}) + if membergroup != None: + membergroup = membergroup.text.strip() + if postgroup != None: + postgroup = postgroup.text.strip() + membergroup = membergroup + " - " + postgroup + else: + if postgroup != None: + membergroup = postgroup.text.strip() else: - signature = "-1" - sign.append(cleanString(signature)) - - # As no information about users's feedback was found, just assign "-1" to the variable - - feedback.append("-1") + membergroup = "-1" + status.append(cleanString(membergroup)) + + # Finding the interest of the author + # CryptBB does not have blurb + blurb = smalltext.find('li', {"class": "blurb"}) + if blurb != None: + blurb = blurb.text.strip() + else: + blurb = "-1" + interest.append(cleanString(blurb)) + + # Finding the reputation of the user + # CryptBB does have reputation + author_stats = smalltext.find('div', {"class": "author_statistics"}) + karma = author_stats.find('strong') + if karma != None: + karma = karma.text + karma = karma.replace("Community Rating: ", "") + karma = karma.replace("Karma: ", "") + karma = karma.strip() + else: + karma = "-1" + reputation.append(cleanString(karma)) + + # Getting here another good tag to find the post date, post content and users' signature + + postarea = ipost.find('div', {"class": "post_content"}) + + dt = postarea.find('span', {"class": "post_date"}).text + # dt = dt.strip().split() + dt = dt.strip() + day=date.today() + if "Yesterday" in dt: + yesterday = day - timedelta(days=1) + yesterday = yesterday.strftime('%m-%d-%Y') + stime = dt.replace('Yesterday,','').strip() + date_time_obj = yesterday+ ', '+stime + date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p') + elif "hours ago" in dt: + day = day.strftime('%m-%d-%Y') + date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title'] + date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p') + else: + date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p') + stime = date_time_obj.strftime('%b %d, %Y') + sdate = date_time_obj.strftime('%I:%M %p') + addDate.append(date_time_obj) + + # Finding the post + + inner = postarea.find('div', {"class": "post_body scaleimages"}) + inner = inner.text.strip() + post.append(cleanString(inner)) + + # Finding the user's signature + + # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) + signature = ipost.find('div', {"class": "signature scaleimages"}) + if signature != None: + signature = signature.text.strip() + # print(signature) + else: + signature = "-1" + sign.append(cleanString(signature)) + + # As no information about user's feedback was found, just assign "-1" to the variable + + feedback.append("-1") + + ''' except: if soup.find('td', {"class": "trow1"}).text == " You do not have permission to access this page. ": user.append("-1") @@ -191,11 +165,12 @@ def cryptBB_description_parser(soup): post.append("NO ACCESS TO THIS PAGE!") sign.append(-1) feedback.append(-1) + ''' # Populate the final variable (this should be a list with all fields scraped) - row = (topic, post, user, addDate, feedback, status, reputation, sign, interest) + row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) # Sending the results @@ -210,9 +185,9 @@ def cryptBB_listing_parser(soup): nm = 0 # this variable should receive the number of topics topic = [] # all topics - user = [] # all users of each topic - post = [] # number of posts of each topic - view = [] # number of views of each topic + author = [] # all authors of each topic + views = [] # number of views of each topic + posts = [] # number of posts of each topic addDate = [] # when the topic was created (difficult to find) href = [] # this variable should receive all cleaned urls (we will use this to do the marge between # Listing and Description pages) @@ -225,13 +200,12 @@ def cryptBB_listing_parser(soup): # Finding the repeated tag that corresponds to the listing of topics itopics = soup.find_all('tr',{"class": "inline_row"}) - index = 0 + for itopic in itopics: # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them # to don't miss any topic - # Adding the topic to the topic list try: topics = itopic.find('span', {"class": "subject_old"}).find('a').text @@ -254,93 +228,32 @@ def cryptBB_listing_parser(soup): # Finding the author of the topic ps = itopic.find('div', {"class":"author smalltext"}).find('a').text - author = ps.strip() - user.append(cleanString(author)) + user = ps.strip() + author.append(cleanString(user)) # Finding the number of replies columns = itopic.findChildren('td',recursive=False) - posts = columns[3].text + replies = columns[3].text - post.append(cleanString(posts)) + posts.append(cleanString(replies)) # Finding the number of Views tview = columns[4].text - view.append(cleanString(tview)) - - + views.append(cleanString(tview)) # If no information about when the topic was added, just assign "-1" to the variable - #dt = itopic.find('div', {"class": "responsive-hide"}).text.split('ยป')[1] - #dt = dt.strip() - #date_time_obj = datetime.strptime(dt,'%a %b %d, %Y %I:%M %p') - #addDate.append(date_time_obj) - addDate.append("-1") + addDate.append("-1") + return organizeTopics("CryptBB", nm, topic, board, author, views, posts, href, addDate) - index += 1 - return organizeTopics("CryptBB", nm, topic, board, view, post, user, addDate, href) - - # if len(tag) > 0: - # - # # Finding the topic - # - # tds = tds[0].find(tag[0]) - # topics = tds.text - # topics = topics.replace(u"\xbb","") - # topics = topics.strip() - # topic.append(cleanString(topics)) - # - # # Counting how many topics we have found so far - # - # nm = len(topic) - # - # # Adding the url to the list of urls - # - # link = tds.findAll('a', href=True) - # link = link[0].get('href') - # link = cleanLink(link) - # href.append(link) - # - # # Finding the author of the topic - # - # ps = itopic.find('td', {"class": tag[1]}).find('p').find('a') - # if ps == None: - # ps = itopic.find('td', {"class": tag[1]}).find('p') - # ps = ps.text.replace("Started by ","") - # else: - # ps = ps.text - # author = ps.strip() - # user.append(cleanString(author)) - # - # # Finding the number of replies - # - # statistics = itopic.find('td', {"class": tag[2]}) - # statistics = statistics.text - # statistics = statistics.split("Replies") - # posts = statistics[0].strip() - # post.append(cleanString(posts)) - # - # # Finding the number of Views - # - # views = statistics[1] - # views = views.replace("Views","") - # views = views.strip() - # view.append(cleanString(views)) - # - # # As no information about when the topic was added, just assign "-1" to the variable - # - # addDate.append("-1") - - #return organizeTopics("TheMajesticGarden", nm, topic, board, view, post, user, addDate, href) def cryptBB_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] - #print(soup.find('table', {"class": "tborder clear"}).find( - # 'tbody').find_all('tr', {"class": "inline_row"})) + listing = soup.find('table', {"class": "tborder clear"}).find('tbody').find_all('tr', {"class": "inline_row"}) for a in listing: diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index 55d42a2..56d578a 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -36,7 +36,7 @@ def verifyForum(cur, nameForum): try: - cur.execute("select id from forums where name = %(nameForum)s limit 1", {'nameForum': nameForum}) + cur.execute("select forum_id from forums where name_forum = %(nameForum)s limit 1", {'nameForum': nameForum}) recset = cur.fetchall() @@ -55,7 +55,7 @@ def verifyBoard(cur, forum, nameBoard): try: - cur.execute("select id from boards where forum_id = %(forum)s and name = %(nameBoard)s limit 1", + cur.execute("select board_id from boards where forum_id = %(forum)s and name_board = %(nameBoard)s limit 1", {'forum': forum, 'nameBoard': nameBoard}) recset = cur.fetchall() @@ -71,12 +71,15 @@ def verifyBoard(cur, forum, nameBoard): print (trace) -def verifyTopic(cur, forum, board, nameTopic): +def verifyTopic(cur, forumId, boardId, authorId, titleTopic): try: - cur.execute("select id from topics where forum_id = %(forum)s and board_id = %(board)s and " - "name = %(nameTopic)s limit 1",{'forum': forum, 'board': board, 'nameTopic': nameTopic}) + cur.execute("select topic_id from topics where forum_id = %(forumId)s and board_id = %(boardId)s and " + "author_id = %(authorId)s and title_topic = %(titleTopic)s limit 1", {'forumId': forumId, + 'boardId': boardId, + 'authorId': authorId, + 'titleTopic': titleTopic}) recset = cur.fetchall() @@ -91,11 +94,12 @@ def verifyTopic(cur, forum, board, nameTopic): print (trace) -def verifyUser(cur, nameUser): +def verifyUser(cur, nameUser, forumId): try: - cur.execute("select id from users where name = %(nameUser)s limit 1", {'nameUser': nameUser}) + cur.execute("select user_id from users where name_user = %(nameUser)s and forum_id = %(forumId)s limit 1", + {'nameUser': nameUser, 'forumId': forumId}) recset = cur.fetchall() @@ -114,7 +118,7 @@ def getLastForum(cur): try: - cur.execute("select id from forums order by id desc limit 1") + cur.execute("select forum_id from forums order by forum_id desc limit 1") recset = cur.fetchall() @@ -133,7 +137,7 @@ def getLastBoard(cur): try: - cur.execute("select id from boards order by id desc limit 1") + cur.execute("select board_id from boards order by board_id desc limit 1") recset = cur.fetchall() @@ -152,7 +156,7 @@ def getLastTopic(cur): try: - cur.execute("select id from topics order by id desc limit 1") + cur.execute("select topic_id from topics order by topic_id desc limit 1") recset = cur.fetchall() @@ -171,7 +175,7 @@ def getLastUser(cur): try: - cur.execute("select id from Users order by id desc") + cur.execute("select user_id from users order by user_id desc") recset = cur.fetchall() @@ -186,6 +190,7 @@ def getLastUser(cur): print (trace) +''' def getLastPost(cur): try: @@ -203,97 +208,111 @@ def getLastPost(cur): trace = traceback.format_exc() print (trace) +''' def create_forum(cur, row): - forum = verifyForum(cur, row[0]) + forumId = verifyForum(cur, row[0]) - if not forum: + if not forumId: - forum = int(getLastForum(cur) + 1) + forumId = int(getLastForum(cur) + 1) - sql = "Insert into forums (id, name, date_Inserted) Values (%s, %s, %s)" + sql = "Insert into forums (forum_id, name_forum, url_forum, dateinserted_forum) Values (%s, %s, %s, %s)" - recset = [forum, row[0], time.asctime()] + recset = [forumId, row[0], None, row[8]] cur.execute(sql, recset) - return forum + return forumId -def create_board(cur, row, forum): +def create_board(cur, row, forumId): - board = verifyBoard(cur, forum, row[2]) + boardId = verifyBoard(cur, forumId, row[1]) - if not board: + if not boardId: - board = int(getLastBoard(cur) + 1) + boardId = int(getLastBoard(cur) + 1) - sql = "Insert into boards (id, forum_id, name, date_inserted) Values (%s, %s, %s, %s)" + sql = "Insert into boards (board_id, forum_id, name_board, dateinserted_board) Values (%s, %s, %s, %s)" - recset = [board, forum, row[2], time.asctime()] + recset = [boardId, forumId, row[1], row[8]] cur.execute(sql, recset) - return board - + return boardId -def create_topic(cur, row, forum, board, user): - topic = verifyTopic(cur, board, forum, row[2]) +def create_topic(cur, row, forumId, boardId, authorId): - if not topic: + topicId = verifyTopic(cur, forumId, boardId, authorId, row[3]) - topic = int(getLastTopic(cur) + 1) + if not topicId: - sql = "Insert into topics (id, forum_id, board_id, author_id, name, classification, date_added, date_inserted) " \ - "Values (%s, %s, %s, %s, %s, %s, %s, %s)" + topicId = int(getLastTopic(cur) + 1) - recset = [topic, forum, board, user, row[1], row[17], row[6] if row[6]!= '-1' else None, time.asctime()] + sql = "Insert into topics (topic_id, forum_id, board_id, author_id, title_topic, views_topic, posts_topic, " \ + "href_topic, dateadded_topic, dateinserted_topic, classification_topic) Values (%s, %s, %s, %s, %s, %s, " \ + "%s, %s, %s, %s, %s)" + recset = [topicId, forumId, boardId, authorId, + row[3], + row[4] if row[4] != '-1' else None, + row[5] if row[5] != '-1' else None, + row[6] if row[6] != '-1' else None, + row[7] if row[7] != '-1' else None, + row[8], + row[17]] cur.execute(sql, recset) - return topic + return topicId -def create_user(cur, nameUser): +def create_user(cur, row, forumId, index): - user = verifyUser(cur, nameUser) + userId = verifyUser(cur, row[9][index], forumId) - if not user: + if not userId: - user = int(getLastUser(cur) + 1) + userId = int(getLastUser(cur) + 1) - sql = "Insert into users (id, name, date_Inserted) Values (%s, %s, %s)" + sql = "Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \ + "signature_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s)" - recset = [user, nameUser, time.asctime()] + recset = [userId, forumId, + row[9][index], + row[10][index] if row[10][index] != '-1' else None, + row[11][index] if row[11][index] != '-1' else None, + row[12][index] if row[12][index] != '-1' else None, + row[13][index] if row[13][index] != '-1' else None, + row[8]] cur.execute(sql, recset) - return user - - -def create_posts(cur, row, forum, board, topic): + return userId - if row[8] != "-1": - for i in range(len(row[8])): +def create_posts(cur, row, forumId, boardId, topicId): - id = int(getLastPost(cur) + 1) + if row[9] != "-1": - user = create_user(cur, row[9][i]) + for i in range(len(row[9])): - sql = "Insert into posts (id, forum_id, board_id, topic_id, user_id, content, rule, date_added, reputation_user, " \ - "status_user, feedback_user, interest_user, date_inserted) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" + if i != 0: + userId = create_user(cur, row, forumId, i) + else: + userId = verifyUser(cur, row[2], forumId) - recset = [id, forum, board, topic, user, row[8][i] if row[8][i]!= '-1' else None, - row[14][i] if row[14][i]!= '-1' else None, row[10][i] if row[10][i]!= '-1' else None, - row[13][i] if row[13][i]!= '-1' else None, row[12][i] if row[12][i]!= '-1' else None, - row[11][i] if row[11][i]!= '-1' else None, row[15][i] if row[15][i]!= '-1' else None, + sql = "Insert into posts (forum_id, board_id, topic_id, user_id, content_post, feedback_post, " \ + "dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s, %s)" - str("%02d" %date.today().month) + "/" + str("%02d" %date.today().day) + "/" + - str("%04d" %date.today().year) + " " + time.strftime("%I:%M:%S")] + recset = [forumId, boardId, topicId, userId, + row[14][i] if row[14][i] != '-1' else None, + row[15][i] if row[15][i] != '-1' else None, + row[16][i] if row[16][i] != '-1' else None, + row[8]] cur.execute(sql, recset) @@ -302,35 +321,62 @@ def create_database(cur, con): try: - sql = "create table forums (id integer NOT NULL, name character varying(255) NOT NULL, " \ - "date_inserted timestamp(6) with time zone NOT NULL, constraint forums_pk primary key (id))" + sql = "create table forums (forum_id integer NOT NULL, name_forum character varying(255) NOT NULL, url_forum " \ + "character varying(255) null, dateinserted_forum timestamp(6) with time zone NOT NULL, constraint " \ + "forums_pk primary key (forum_id))" + cur.execute(sql) + + sql = "create table boards (board_id integer NOT NULL, forum_id integer NOT NULL, name_board character " \ + "varying(255) NOT NULL, dateinserted_board timestamp(6) with time zone NOT NULL, constraint boards_pk " \ + "primary key (board_id), constraint boards_forum_id_fkey foreign key (forum_id) references forums (" \ + "forum_id))" + cur.execute(sql) + + sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \ + "255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \ + "interest_user character varying(5000) null, signature_user character varying(1000) null, " \ + "dateinserted_user timestamp(6) with time zone NOT NULL, constraint users_pk primary key (user_id), " \ + "constraint users_forum_id_fkey foreign key (forum_id) references forums (forum_id))" cur.execute(sql) - sql = "create table boards (id integer NOT NULL, forum_id integer NOT NULL, name character varying(255) NOT NULL," \ - "date_inserted timestamp(6) with time zone NOT NULL, constraint boards_pk primary key (id), " \ - "constraint boards_forum_id_fkey foreign key (forum_id) references forums (id))" + sql = "create table users_history(user_id integer NOT NULL, forum_id integer NOT NULL, name_user character " \ + "varying(255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) " \ + "null, interest_user character varying(5000) null, signature_user character varying(1000) null, " \ + "dateinserted_user timestamp(6) with time zone NOT NULL, constraint users_history_pk primary key (" \ + "user_id, dateinserted_user), constraint users_history_user_id_fkey foreign key (user_id) references " \ + "users (user_id), constraint users_history_forum_id_fkey foreign key (forum_id) references forums (" \ + "forum_id))" cur.execute(sql) - sql = "create table users (id integer NOT NULL, name character varying(255) NOT NULL, " \ - "date_inserted timestamp(6) with time zone NOT NULL, constraint users_pk primary key (id))" + sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, board_id integer NOT NULL, " \ + "author_id integer NOT NULL, title_topic character varying(255) NOT NULL, views_topic integer null, " \ + "posts_topic integer null, href_topic character varying(255) null, dateadded_topic timestamp(6) with " \ + "time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double " \ + "precision NOT NULL, constraint topics_pk primary key (topic_id), constraint topics_author_id_fkey " \ + "foreign key (author_id) references users (user_id), constraint topics_board_id_fkey foreign key (" \ + "board_id) references boards (board_id), constraint topics_forum_id_fkey foreign key (forum_id) " \ + "references forums (forum_id))" cur.execute(sql) - sql = "create table topics(id integer NOT NULL, forum_id integer NOT NULL, board_id integer NOT NULL, " \ - "author_id integer NOT NULL, name character varying(255) NOT NULL, classification double precision not null, " \ - "date_added timestamp(6) with time zone, date_inserted timestamp(6) with time zone NOT NULL, " \ - "constraint topics_pk primary key (id), constraint topics_author_id_fkey foreign key (author_id) references users (id), " \ - "constraint topics_board_id_fkey foreign key (board_id) references boards (id), " \ - "constraint topics_forum_id_fkey foreign key (forum_id) references forums (id))" + sql = "create table topics_history(topic_id integer NOT NULL, forum_id integer NOT NULL, board_id integer NOT " \ + "NULL, author_id integer NOT NULL, title_topic character varying(255) NOT NULL, views_topic integer " \ + "null, posts_topic integer null, href_topic character varying(255) null, dateadded_topic timestamp(6) " \ + "with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic " \ + "double precision NOT NULL, constraint topics_history_pk primary key (topic_id, dateinserted_topic), " \ + "constraint topics_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \ + "constraint topics_history_author_id_fkey foreign key (author_id) references users (user_id), " \ + "constraint topics_history_board_id_fkey foreign key (board_id) references boards (board_id), " \ + "constraint topics_history_forum_id_fkey foreign key (forum_id) references forums (forum_id))" cur.execute(sql) - sql = "create table posts(id integer NOT NULL, forum_id integer NOT NULL, board_id integer NOT NULL, " \ - "topic_id integer NOT NULL, user_id integer NOT NULL, content character varying(100000), rule character varying(5000), " \ - "reputation_user character varying(100), status_user character varying(255), feedback_user integer, " \ - "interest_user character varying(1000), date_added timestamp(6) with time zone, date_inserted timestamp(6) with time zone NOT NULL, " \ - "constraint posts_pk primary key (id), constraint posts_author_id_fkey foreign key (user_id) references users (id), " \ - "constraint posts_board_id_fkey foreign key (board_id) references boards (id), " \ - "constraint posts_forum_id_fkey foreign key (forum_id) references forums (id)," \ - "constraint posts_topic_id_fkey foreign key (topic_id) references topics (id))" + sql = "create table posts(forum_id integer NOT NULL, board_id integer NOT NULL, topic_id integer NOT NULL, " \ + "user_id integer NOT NULL, content_post character varying(100000) null, feedback_post integer null, " \ + "dateadded_post timestamp(6) with time zone NOT NULL, dateinserted_post timestamp(6) with time zone NOT " \ + "NULL, constraint posts_pk primary key (forum_id, board_id, topic_id, user_id, dateadded_post), " \ + "constraint posts_author_id_fkey foreign key (user_id) references users (user_id), constraint " \ + "posts_board_id_fkey foreign key (board_id) references boards (board_id), constraint " \ + "posts_forum_id_fkey foreign key (forum_id) references forums (forum_id), constraint " \ + "posts_topic_id_fkey foreign key (topic_id) references topics (topic_id))" cur.execute(sql) con.commit() diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 5896bc5..23d97f1 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -38,15 +38,15 @@ def mergePages(rmm, rec): # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip() # key = rec[16] - print ("----------------- Matched: " + rec[1] + "--------------------") - rec[8] = rmm[1] - rec[9] = rmm[2] - rec[10] = rmm[3] - rec[11] = rmm[4] - rec[12] = rmm[5] - rec[13] = rmm[6] - rec[14] = rmm[7] - rec[15] = rmm[8] + print ("----------------- Matched: " + rec[3] + "--------------------") + rec[9] = rmm[1] + rec[10] = rmm[2] + rec[11] = rmm[3] + rec[12] = rmm[4] + rec[13] = rmm[5] + rec[14] = rmm[6] + rec[15] = rmm[7] + rec[16] = rmm[8] return rec @@ -64,13 +64,13 @@ def getPosts(posts): #@param: row is the list of entries for this instance, cur is the db connection object def persist_data(row, cur): - user = create_user(cur, row[5]) - forum = create_forum(cur, row) board = create_board(cur, row, forum) - topic = create_topic(cur, row, forum, board, user) + author = create_user(cur, row, forum, 0) + + topic = create_topic(cur, row, forum, board, author) create_posts(cur, row, forum, board, topic) @@ -219,7 +219,7 @@ def new_parse(forum, createLog): # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip() # key = rec[16] - url = ''.join(e for e in rec[16] if e.isalnum()) + url = ''.join(e for e in rec[6] if e.isalnum()) key = u"Url:" + url if key in detPage: @@ -233,7 +233,7 @@ def new_parse(forum, createLog): # rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_russian'))) # else: # rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english'))) - rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english'))) + rec.append(str(predict(rec[3], getPosts(rec[14]), language='sup_english'))) # Persisting the information in the database try: diff --git a/Forums/Utilities/utilities.py b/Forums/Utilities/utilities.py index 7d9ada9..9d64cb6 100644 --- a/Forums/Utilities/utilities.py +++ b/Forums/Utilities/utilities.py @@ -160,19 +160,47 @@ def cleanLink(originalLink): return originalLink -def organizeTopics(forum, nm, topic, board, view, post, user, addDate, href): +def organizeTopics(forum, nm, topic, board, author, views, posts, href, addDate): + + day = time.strftime("%m/%d/%Y") + ahora = time.strftime("%I:%M:%S") rw = [] for n in range(nm): - - lne = forum + "," + topic[n] + "," + board + "," # 0, 1, 2 - lne += "-1" if len(view) == 0 else view[n] # 3 + lne = forum # 0 + lne += "," + lne += board # 1 + lne += "," + lne += author[n] # 2 + lne += "," + lne += topic[n] # 3 + lne += "," + lne += "-1" if len(views) == 0 else views[n] # 4 + lne += "," + lne += "-1" if len(posts) == 0 else posts[n] # 5 + lne += "," + lne += "-1" if len(href) == 0 else href[n] # 6 + lne += "," + lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7 + lne += "," + lne += day + " " + ahora # 8 + lne += "," + lne += "-1" # 9 name_user + lne += "," + lne += "-1" # 10 status_user + lne += "," + lne += "-1" # 11 reputation_user + lne += "," + lne += "-1" # 12 interest_user + lne += "," + lne += "-1" # 13 signature_user + lne += "," + lne += "-1" # 14 content_post + lne += "," + lne += "-1" # 15 feedback_post lne += "," - lne += "-1" if len(post) == 0 else post[n] # 4 - lne += "," + user[n] + "," + str(addDate[n]) + "," + time.asctime() # 5, 6, 7 - lne += ",-1,-1,-1,-1,-1,-1,-1,-1," # 8, 9, 10, 11, 12, 13, 14, 15 - lne += href[n] # 16 + lne += "-1" # 16 dateadded_post rw.append(lne)