Browse Source

updated forums schema

main
westernmeadow 1 year ago
parent
commit
e0b52e0ec2
6 changed files with 298 additions and 311 deletions
  1. +1
    -1
      .idea/DW_Pipeline_Test.iml
  2. +1
    -1
      .idea/misc.xml
  3. +124
    -211
      Forums/CryptBB/parser.py
  4. +122
    -76
      Forums/DB_Connection/db_connection.py
  5. +14
    -14
      Forums/Initialization/prepare_parser.py
  6. +36
    -8
      Forums/Utilities/utilities.py

+ 1
- 1
.idea/DW_Pipeline_Test.iml View File

@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4"> <module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager"> <component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" /> <content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="C:\Users\Helium\anaconda3" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="C:\ProgramData\Anaconda3" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
<component name="PyNamespacePackagesService"> <component name="PyNamespacePackagesService">


+ 1
- 1
.idea/misc.xml View File

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project version="4"> <project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="C:\Users\Helium\anaconda3" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="C:\ProgramData\Anaconda3" project-jdk-type="Python SDK" />
</project> </project>

+ 124
- 211
Forums/CryptBB/parser.py View File

@ -20,7 +20,7 @@ def cryptBB_description_parser(soup):
addDate = [] # all dates of each post addDate = [] # all dates of each post
feedback = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format) feedback = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format)
status = [] # all user's authority in each post such as (adm, member, dangerous) status = [] # all user's authority in each post such as (adm, member, dangerous)
reputation = [] # all users's karma in each post (usually found as a number)
reputation = [] # all user's karma in each post (usually found as a number)
sign = [] # all user's signature in each post (usually a standard message after the content of the post) sign = [] # all user's signature in each post (usually a standard message after the content of the post)
post = [] # all messages of each post post = [] # all messages of each post
interest = [] # all user's interest in each post interest = [] # all user's interest in each post
@ -34,153 +34,127 @@ def cryptBB_description_parser(soup):
topic = topic.replace(",","") topic = topic.replace(",","")
topic = topic.replace("\n","") topic = topic.replace("\n","")
topic = cleanString(topic.strip()) topic = cleanString(topic.strip())
print(topic)
# Finding the repeated tag that corresponds to the listing of posts
# posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \
# soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"})
try:
posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
'div', {"class": "post"})
# print(len(posts))
# Finding the repeated tag that corresponds to the listing of posts
# For each message (post), get all the fields we are interested to:
# try:
posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
'div', {"class": "post"})
for ipost in posts:
# For each message (post), get all the fields we are interested to:
# Finding a first level of the HTML page
for ipost in posts:
# post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"})
post_wrapper = ipost.find('span', {"class": "largetext"})
# Finding the author (user) of the post
# Finding a first level of the HTML page
# author = post_wrapper.find('h4')
author = post_wrapper.text.strip()
# print("author " + author)
user.append(cleanString(author)) # Remember to clean the problematic characters
post_wrapper = ipost.find('span', {"class": "largetext"})
# Finding the status of the author
# Finding the author (user) of the post
smalltext = ipost.find('div', {"class": "post_author"})
author = post_wrapper.text.strip()
user.append(cleanString(author)) # Remember to clean the problematic characters
# Testing here two possibilities to find this status and combine them
if ipost.find('div', {"class": "deleted_post_author"}):
status.append(-1)
interest.append(-1)
reputation.append(-1)
addDate.append(-1)
post.append("THIS POST HAS BEEN REMOVED!")
sign.append(-1)
feedback.append(-1)
continue
# Finding the status of the author
# CryptBB does have membergroup and postgroup
smalltext = ipost.find('div', {"class": "post_author"})
membergroup = smalltext.find('div', {"class": "profile-rank"})
postgroup = smalltext.find('div', {"class": "postgroup"})
if membergroup != None:
membergroup = membergroup.text.strip()
if postgroup != None:
postgroup = postgroup.text.strip()
membergroup = membergroup + " - " + postgroup
else:
if postgroup != None:
membergroup = postgroup.text.strip()
else:
membergroup = "-1"
status.append(cleanString(membergroup))
# print("status " + cleanString(membergroup))
# Finding the interest of the author
# CryptBB does not have blurb
blurb = smalltext.find('li', {"class": "blurb"})
if blurb != None:
blurb = blurb.text.strip()
else:
blurb = "-1"
interest.append(cleanString(blurb))
# Finding the reputation of the user
# CryptBB does have reputation
author_stats = smalltext.find('div', {"class": "author_statistics"})
karma = author_stats.find('strong')
if karma != None:
karma = karma.text
karma = karma.replace("Community Rating: ", "")
karma = karma.replace("Karma: ", "")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
# print("karma " + cleanString(karma))
# Getting here another good tag to find the post date, post content and users' signature
postarea = ipost.find('div', {"class": "post_content"})
dt = postarea.find('span', {"class": "post_date"}).text
# dt = dt.strip().split()
dt = dt.strip()
day=date.today()
if "Yesterday" in dt:
yesterday = day - timedelta(days=1)
yesterday = yesterday.strftime('%m-%d-%Y')
stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday+ ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "hours ago" in dt:
day = day.strftime('%m-%d-%Y')
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
else:
date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
stime = date_time_obj.strftime('%b %d, %Y')
sdate = date_time_obj.strftime('%I:%M %p')
addDate.append(date_time_obj)
# print("date " + str(date_time_obj))
# Finding the date of the post
# date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p')
# smalltext = postarea.find('div', {"class": "flow_hidden"}).find('div', {"class": "keyinfo"})\
# .find('div', {"class": "smalltext"})
# sdatetime = smalltext.text
# sdatetime = sdatetime.replace(u"\xab","") # Removing unnecessary characters
# sdatetime = sdatetime.replace(u"\xbb","") # Removing unnecessary characters
# sdatetime = sdatetime.split("on: ") # Removing unnecessary characters
# sdatetime = sdatetime[1].strip()
# stime = sdatetime[:-12:-1] # Finding the time of the post
# stime = stime[::-1]
# sdate = sdatetime.replace(stime,"") # Finding the date of the post
# sdate = sdate.replace(",","")
# sdate = sdate.strip()
# Covert the date of the post that can be informed as: "12 February 2016", "today", "yesterday". We need
# a date format here as "mm/dd/yyyy"
# addDate.append(convertDate(sdate,"english", crawlerDate) + " " + stime)
# Finding the post
inner = postarea.find('div', {"class": "post_body scaleimages"})
inner = inner.text.strip()
# print(inner)
post.append(cleanString(inner))
# Finding the users's signature
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
signature = ipost.find('div', {"class": "signature scaleimages"})
if signature != None:
signature = signature.text.strip()
# print(signature)
'''
# Testing here two possibilities to find this status and combine them
if ipost.find('div', {"class": "deleted_post_author"}):
status.append(-1)
interest.append(-1)
reputation.append(-1)
addDate.append(-1)
post.append("THIS POST HAS BEEN REMOVED!")
sign.append(-1)
feedback.append(-1)
continue
'''
# CryptBB does have membergroup and postgroup
membergroup = smalltext.find('div', {"class": "profile-rank"})
postgroup = smalltext.find('div', {"class": "postgroup"})
if membergroup != None:
membergroup = membergroup.text.strip()
if postgroup != None:
postgroup = postgroup.text.strip()
membergroup = membergroup + " - " + postgroup
else:
if postgroup != None:
membergroup = postgroup.text.strip()
else: else:
signature = "-1"
sign.append(cleanString(signature))
# As no information about users's feedback was found, just assign "-1" to the variable
feedback.append("-1")
membergroup = "-1"
status.append(cleanString(membergroup))
# Finding the interest of the author
# CryptBB does not have blurb
blurb = smalltext.find('li', {"class": "blurb"})
if blurb != None:
blurb = blurb.text.strip()
else:
blurb = "-1"
interest.append(cleanString(blurb))
# Finding the reputation of the user
# CryptBB does have reputation
author_stats = smalltext.find('div', {"class": "author_statistics"})
karma = author_stats.find('strong')
if karma != None:
karma = karma.text
karma = karma.replace("Community Rating: ", "")
karma = karma.replace("Karma: ", "")
karma = karma.strip()
else:
karma = "-1"
reputation.append(cleanString(karma))
# Getting here another good tag to find the post date, post content and users' signature
postarea = ipost.find('div', {"class": "post_content"})
dt = postarea.find('span', {"class": "post_date"}).text
# dt = dt.strip().split()
dt = dt.strip()
day=date.today()
if "Yesterday" in dt:
yesterday = day - timedelta(days=1)
yesterday = yesterday.strftime('%m-%d-%Y')
stime = dt.replace('Yesterday,','').strip()
date_time_obj = yesterday+ ', '+stime
date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
elif "hours ago" in dt:
day = day.strftime('%m-%d-%Y')
date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
else:
date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
stime = date_time_obj.strftime('%b %d, %Y')
sdate = date_time_obj.strftime('%I:%M %p')
addDate.append(date_time_obj)
# Finding the post
inner = postarea.find('div', {"class": "post_body scaleimages"})
inner = inner.text.strip()
post.append(cleanString(inner))
# Finding the user's signature
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
signature = ipost.find('div', {"class": "signature scaleimages"})
if signature != None:
signature = signature.text.strip()
# print(signature)
else:
signature = "-1"
sign.append(cleanString(signature))
# As no information about user's feedback was found, just assign "-1" to the variable
feedback.append("-1")
'''
except: except:
if soup.find('td', {"class": "trow1"}).text == " You do not have permission to access this page. ": if soup.find('td', {"class": "trow1"}).text == " You do not have permission to access this page. ":
user.append("-1") user.append("-1")
@ -191,11 +165,12 @@ def cryptBB_description_parser(soup):
post.append("NO ACCESS TO THIS PAGE!") post.append("NO ACCESS TO THIS PAGE!")
sign.append(-1) sign.append(-1)
feedback.append(-1) feedback.append(-1)
'''
# Populate the final variable (this should be a list with all fields scraped) # Populate the final variable (this should be a list with all fields scraped)
row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
# Sending the results # Sending the results
@ -210,9 +185,9 @@ def cryptBB_listing_parser(soup):
nm = 0 # this variable should receive the number of topics nm = 0 # this variable should receive the number of topics
topic = [] # all topics topic = [] # all topics
user = [] # all users of each topic
post = [] # number of posts of each topic
view = [] # number of views of each topic
author = [] # all authors of each topic
views = [] # number of views of each topic
posts = [] # number of posts of each topic
addDate = [] # when the topic was created (difficult to find) addDate = [] # when the topic was created (difficult to find)
href = [] # this variable should receive all cleaned urls (we will use this to do the merge between href = [] # this variable should receive all cleaned urls (we will use this to do the merge between
# Listing and Description pages) # Listing and Description pages)
@ -225,13 +200,12 @@ def cryptBB_listing_parser(soup):
# Finding the repeated tag that corresponds to the listing of topics # Finding the repeated tag that corresponds to the listing of topics
itopics = soup.find_all('tr',{"class": "inline_row"}) itopics = soup.find_all('tr',{"class": "inline_row"})
index = 0
for itopic in itopics: for itopic in itopics:
# For each topic found, the structure to get the rest of the information can be of two types. Testing all of them # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
# so as not to miss any topic # so as not to miss any topic
# Adding the topic to the topic list # Adding the topic to the topic list
try: try:
topics = itopic.find('span', {"class": "subject_old"}).find('a').text topics = itopic.find('span', {"class": "subject_old"}).find('a').text
@ -254,93 +228,32 @@ def cryptBB_listing_parser(soup):
# Finding the author of the topic # Finding the author of the topic
ps = itopic.find('div', {"class":"author smalltext"}).find('a').text ps = itopic.find('div', {"class":"author smalltext"}).find('a').text
author = ps.strip()
user.append(cleanString(author))
user = ps.strip()
author.append(cleanString(user))
# Finding the number of replies # Finding the number of replies
columns = itopic.findChildren('td',recursive=False) columns = itopic.findChildren('td',recursive=False)
posts = columns[3].text
replies = columns[3].text
post.append(cleanString(posts))
posts.append(cleanString(replies))
# Finding the number of Views # Finding the number of Views
tview = columns[4].text tview = columns[4].text
view.append(cleanString(tview))
views.append(cleanString(tview))
# If no information about when the topic was added, just assign "-1" to the variable # If no information about when the topic was added, just assign "-1" to the variable
#dt = itopic.find('div', {"class": "responsive-hide"}).text.split('»')[1]
#dt = dt.strip()
#date_time_obj = datetime.strptime(dt,'%a %b %d, %Y %I:%M %p')
#addDate.append(date_time_obj)
addDate.append("-1")
addDate.append("-1")
return organizeTopics("CryptBB", nm, topic, board, author, views, posts, href, addDate)
index += 1
return organizeTopics("CryptBB", nm, topic, board, view, post, user, addDate, href)
# if len(tag) > 0:
#
# # Finding the topic
#
# tds = tds[0].find(tag[0])
# topics = tds.text
# topics = topics.replace(u"\xbb","")
# topics = topics.strip()
# topic.append(cleanString(topics))
#
# # Counting how many topics we have found so far
#
# nm = len(topic)
#
# # Adding the url to the list of urls
#
# link = tds.findAll('a', href=True)
# link = link[0].get('href')
# link = cleanLink(link)
# href.append(link)
#
# # Finding the author of the topic
#
# ps = itopic.find('td', {"class": tag[1]}).find('p').find('a')
# if ps == None:
# ps = itopic.find('td', {"class": tag[1]}).find('p')
# ps = ps.text.replace("Started by ","")
# else:
# ps = ps.text
# author = ps.strip()
# user.append(cleanString(author))
#
# # Finding the number of replies
#
# statistics = itopic.find('td', {"class": tag[2]})
# statistics = statistics.text
# statistics = statistics.split("Replies")
# posts = statistics[0].strip()
# post.append(cleanString(posts))
#
# # Finding the number of Views
#
# views = statistics[1]
# views = views.replace("Views","")
# views = views.strip()
# view.append(cleanString(views))
#
# # As no information about when the topic was added, just assign "-1" to the variable
#
# addDate.append("-1")
#return organizeTopics("TheMajesticGarden", nm, topic, board, view, post, user, addDate, href)
def cryptBB_links_parser(soup): def cryptBB_links_parser(soup):
# Returning all links that should be visited by the Crawler # Returning all links that should be visited by the Crawler
href = [] href = []
#print(soup.find('table', {"class": "tborder clear"}).find(
# 'tbody').find_all('tr', {"class": "inline_row"}))
listing = soup.find('table', {"class": "tborder clear"}).find('tbody').find_all('tr', {"class": "inline_row"}) listing = soup.find('table', {"class": "tborder clear"}).find('tbody').find_all('tr', {"class": "inline_row"})
for a in listing: for a in listing:


+ 122
- 76
Forums/DB_Connection/db_connection.py View File

@ -36,7 +36,7 @@ def verifyForum(cur, nameForum):
try: try:
cur.execute("select id from forums where name = %(nameForum)s limit 1", {'nameForum': nameForum})
cur.execute("select forum_id from forums where name_forum = %(nameForum)s limit 1", {'nameForum': nameForum})
recset = cur.fetchall() recset = cur.fetchall()
@ -55,7 +55,7 @@ def verifyBoard(cur, forum, nameBoard):
try: try:
cur.execute("select id from boards where forum_id = %(forum)s and name = %(nameBoard)s limit 1",
cur.execute("select board_id from boards where forum_id = %(forum)s and name_board = %(nameBoard)s limit 1",
{'forum': forum, 'nameBoard': nameBoard}) {'forum': forum, 'nameBoard': nameBoard})
recset = cur.fetchall() recset = cur.fetchall()
@ -71,12 +71,15 @@ def verifyBoard(cur, forum, nameBoard):
print (trace) print (trace)
def verifyTopic(cur, forum, board, nameTopic):
def verifyTopic(cur, forumId, boardId, authorId, titleTopic):
try: try:
cur.execute("select id from topics where forum_id = %(forum)s and board_id = %(board)s and "
"name = %(nameTopic)s limit 1",{'forum': forum, 'board': board, 'nameTopic': nameTopic})
cur.execute("select topic_id from topics where forum_id = %(forumId)s and board_id = %(boardId)s and "
"author_id = %(authorId)s and title_topic = %(titleTopic)s limit 1", {'forumId': forumId,
'boardId': boardId,
'authorId': authorId,
'titleTopic': titleTopic})
recset = cur.fetchall() recset = cur.fetchall()
@ -91,11 +94,12 @@ def verifyTopic(cur, forum, board, nameTopic):
print (trace) print (trace)
def verifyUser(cur, nameUser):
def verifyUser(cur, nameUser, forumId):
try: try:
cur.execute("select id from users where name = %(nameUser)s limit 1", {'nameUser': nameUser})
cur.execute("select user_id from users where name_user = %(nameUser)s and forum_id = %(forumId)s limit 1",
{'nameUser': nameUser, 'forumId': forumId})
recset = cur.fetchall() recset = cur.fetchall()
@ -114,7 +118,7 @@ def getLastForum(cur):
try: try:
cur.execute("select id from forums order by id desc limit 1")
cur.execute("select forum_id from forums order by forum_id desc limit 1")
recset = cur.fetchall() recset = cur.fetchall()
@ -133,7 +137,7 @@ def getLastBoard(cur):
try: try:
cur.execute("select id from boards order by id desc limit 1")
cur.execute("select board_id from boards order by board_id desc limit 1")
recset = cur.fetchall() recset = cur.fetchall()
@ -152,7 +156,7 @@ def getLastTopic(cur):
try: try:
cur.execute("select id from topics order by id desc limit 1")
cur.execute("select topic_id from topics order by topic_id desc limit 1")
recset = cur.fetchall() recset = cur.fetchall()
@ -171,7 +175,7 @@ def getLastUser(cur):
try: try:
cur.execute("select id from Users order by id desc")
cur.execute("select user_id from users order by user_id desc")
recset = cur.fetchall() recset = cur.fetchall()
@ -186,6 +190,7 @@ def getLastUser(cur):
print (trace) print (trace)
'''
def getLastPost(cur): def getLastPost(cur):
try: try:
@ -203,97 +208,111 @@ def getLastPost(cur):
trace = traceback.format_exc() trace = traceback.format_exc()
print (trace) print (trace)
'''
def create_forum(cur, row): def create_forum(cur, row):
forum = verifyForum(cur, row[0])
forumId = verifyForum(cur, row[0])
if not forum:
if not forumId:
forum = int(getLastForum(cur) + 1)
forumId = int(getLastForum(cur) + 1)
sql = "Insert into forums (id, name, date_Inserted) Values (%s, %s, %s)"
sql = "Insert into forums (forum_id, name_forum, url_forum, dateinserted_forum) Values (%s, %s, %s, %s)"
recset = [forum, row[0], time.asctime()]
recset = [forumId, row[0], None, row[8]]
cur.execute(sql, recset) cur.execute(sql, recset)
return forum
return forumId
def create_board(cur, row, forum):
def create_board(cur, row, forumId):
board = verifyBoard(cur, forum, row[2])
boardId = verifyBoard(cur, forumId, row[1])
if not board:
if not boardId:
board = int(getLastBoard(cur) + 1)
boardId = int(getLastBoard(cur) + 1)
sql = "Insert into boards (id, forum_id, name, date_inserted) Values (%s, %s, %s, %s)"
sql = "Insert into boards (board_id, forum_id, name_board, dateinserted_board) Values (%s, %s, %s, %s)"
recset = [board, forum, row[2], time.asctime()]
recset = [boardId, forumId, row[1], row[8]]
cur.execute(sql, recset) cur.execute(sql, recset)
return board
return boardId
def create_topic(cur, row, forum, board, user):
topic = verifyTopic(cur, board, forum, row[2])
def create_topic(cur, row, forumId, boardId, authorId):
if not topic:
topicId = verifyTopic(cur, forumId, boardId, authorId, row[3])
topic = int(getLastTopic(cur) + 1)
if not topicId:
sql = "Insert into topics (id, forum_id, board_id, author_id, name, classification, date_added, date_inserted) " \
"Values (%s, %s, %s, %s, %s, %s, %s, %s)"
topicId = int(getLastTopic(cur) + 1)
recset = [topic, forum, board, user, row[1], row[17], row[6] if row[6]!= '-1' else None, time.asctime()]
sql = "Insert into topics (topic_id, forum_id, board_id, author_id, title_topic, views_topic, posts_topic, " \
"href_topic, dateadded_topic, dateinserted_topic, classification_topic) Values (%s, %s, %s, %s, %s, %s, " \
"%s, %s, %s, %s, %s)"
recset = [topicId, forumId, boardId, authorId,
row[3],
row[4] if row[4] != '-1' else None,
row[5] if row[5] != '-1' else None,
row[6] if row[6] != '-1' else None,
row[7] if row[7] != '-1' else None,
row[8],
row[17]]
cur.execute(sql, recset) cur.execute(sql, recset)
return topic
return topicId
def create_user(cur, nameUser):
def create_user(cur, row, forumId, index):
user = verifyUser(cur, nameUser)
userId = verifyUser(cur, row[9][index], forumId)
if not user:
if not userId:
user = int(getLastUser(cur) + 1)
userId = int(getLastUser(cur) + 1)
sql = "Insert into users (id, name, date_Inserted) Values (%s, %s, %s)"
sql = "Insert into users (user_id, forum_id, name_user, status_user, reputation_user, interest_user, " \
"signature_user, dateinserted_user) Values (%s, %s, %s, %s, %s, %s, %s, %s)"
recset = [user, nameUser, time.asctime()]
recset = [userId, forumId,
row[9][index],
row[10][index] if row[10][index] != '-1' else None,
row[11][index] if row[11][index] != '-1' else None,
row[12][index] if row[12][index] != '-1' else None,
row[13][index] if row[13][index] != '-1' else None,
row[8]]
cur.execute(sql, recset) cur.execute(sql, recset)
return user
def create_posts(cur, row, forum, board, topic):
return userId
if row[8] != "-1":
for i in range(len(row[8])):
def create_posts(cur, row, forumId, boardId, topicId):
id = int(getLastPost(cur) + 1)
if row[9] != "-1":
user = create_user(cur, row[9][i])
for i in range(len(row[9])):
sql = "Insert into posts (id, forum_id, board_id, topic_id, user_id, content, rule, date_added, reputation_user, " \
"status_user, feedback_user, interest_user, date_inserted) Values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
if i != 0:
userId = create_user(cur, row, forumId, i)
else:
userId = verifyUser(cur, row[2], forumId)
recset = [id, forum, board, topic, user, row[8][i] if row[8][i]!= '-1' else None,
row[14][i] if row[14][i]!= '-1' else None, row[10][i] if row[10][i]!= '-1' else None,
row[13][i] if row[13][i]!= '-1' else None, row[12][i] if row[12][i]!= '-1' else None,
row[11][i] if row[11][i]!= '-1' else None, row[15][i] if row[15][i]!= '-1' else None,
sql = "Insert into posts (forum_id, board_id, topic_id, user_id, content_post, feedback_post, " \
"dateadded_post, dateinserted_post) Values (%s, %s, %s, %s, %s, %s, %s, %s)"
str("%02d" %date.today().month) + "/" + str("%02d" %date.today().day) + "/" +
str("%04d" %date.today().year) + " " + time.strftime("%I:%M:%S")]
recset = [forumId, boardId, topicId, userId,
row[14][i] if row[14][i] != '-1' else None,
row[15][i] if row[15][i] != '-1' else None,
row[16][i] if row[16][i] != '-1' else None,
row[8]]
cur.execute(sql, recset) cur.execute(sql, recset)
@ -302,35 +321,62 @@ def create_database(cur, con):
try: try:
sql = "create table forums (id integer NOT NULL, name character varying(255) NOT NULL, " \
"date_inserted timestamp(6) with time zone NOT NULL, constraint forums_pk primary key (id))"
sql = "create table forums (forum_id integer NOT NULL, name_forum character varying(255) NOT NULL, url_forum " \
"character varying(255) null, dateinserted_forum timestamp(6) with time zone NOT NULL, constraint " \
"forums_pk primary key (forum_id))"
cur.execute(sql)
sql = "create table boards (board_id integer NOT NULL, forum_id integer NOT NULL, name_board character " \
"varying(255) NOT NULL, dateinserted_board timestamp(6) with time zone NOT NULL, constraint boards_pk " \
"primary key (board_id), constraint boards_forum_id_fkey foreign key (forum_id) references forums (" \
"forum_id))"
cur.execute(sql)
sql = "create table users (user_id integer NOT NULL, forum_id integer NOT NULL, name_user character varying(" \
"255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) null, " \
"interest_user character varying(5000) null, signature_user character varying(1000) null, " \
"dateinserted_user timestamp(6) with time zone NOT NULL, constraint users_pk primary key (user_id), " \
"constraint users_forum_id_fkey foreign key (forum_id) references forums (forum_id))"
cur.execute(sql) cur.execute(sql)
sql = "create table boards (id integer NOT NULL, forum_id integer NOT NULL, name character varying(255) NOT NULL," \
"date_inserted timestamp(6) with time zone NOT NULL, constraint boards_pk primary key (id), " \
"constraint boards_forum_id_fkey foreign key (forum_id) references forums (id))"
sql = "create table users_history(user_id integer NOT NULL, forum_id integer NOT NULL, name_user character " \
"varying(255) NOT NULL, status_user character varying(255) null, reputation_user character varying(255) " \
"null, interest_user character varying(5000) null, signature_user character varying(1000) null, " \
"dateinserted_user timestamp(6) with time zone NOT NULL, constraint users_history_pk primary key (" \
"user_id, dateinserted_user), constraint users_history_user_id_fkey foreign key (user_id) references " \
"users (user_id), constraint users_history_forum_id_fkey foreign key (forum_id) references forums (" \
"forum_id))"
cur.execute(sql) cur.execute(sql)
sql = "create table users (id integer NOT NULL, name character varying(255) NOT NULL, " \
"date_inserted timestamp(6) with time zone NOT NULL, constraint users_pk primary key (id))"
sql = "create table topics(topic_id integer NOT NULL, forum_id integer NOT NULL, board_id integer NOT NULL, " \
"author_id integer NOT NULL, title_topic character varying(255) NOT NULL, views_topic integer null, " \
"posts_topic integer null, href_topic character varying(255) null, dateadded_topic timestamp(6) with " \
"time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic double " \
"precision NOT NULL, constraint topics_pk primary key (topic_id), constraint topics_author_id_fkey " \
"foreign key (author_id) references users (user_id), constraint topics_board_id_fkey foreign key (" \
"board_id) references boards (board_id), constraint topics_forum_id_fkey foreign key (forum_id) " \
"references forums (forum_id))"
cur.execute(sql) cur.execute(sql)
sql = "create table topics(id integer NOT NULL, forum_id integer NOT NULL, board_id integer NOT NULL, " \
"author_id integer NOT NULL, name character varying(255) NOT NULL, classification double precision not null, " \
"date_added timestamp(6) with time zone, date_inserted timestamp(6) with time zone NOT NULL, " \
"constraint topics_pk primary key (id), constraint topics_author_id_fkey foreign key (author_id) references users (id), " \
"constraint topics_board_id_fkey foreign key (board_id) references boards (id), " \
"constraint topics_forum_id_fkey foreign key (forum_id) references forums (id))"
sql = "create table topics_history(topic_id integer NOT NULL, forum_id integer NOT NULL, board_id integer NOT " \
"NULL, author_id integer NOT NULL, title_topic character varying(255) NOT NULL, views_topic integer " \
"null, posts_topic integer null, href_topic character varying(255) null, dateadded_topic timestamp(6) " \
"with time zone null, dateinserted_topic timestamp(6) with time zone NOT NULL, classification_topic " \
"double precision NOT NULL, constraint topics_history_pk primary key (topic_id, dateinserted_topic), " \
"constraint topics_history_topic_id_fkey foreign key (topic_id) references topics (topic_id), " \
"constraint topics_history_author_id_fkey foreign key (author_id) references users (user_id), " \
"constraint topics_history_board_id_fkey foreign key (board_id) references boards (board_id), " \
"constraint topics_history_forum_id_fkey foreign key (forum_id) references forums (forum_id))"
cur.execute(sql) cur.execute(sql)
sql = "create table posts(id integer NOT NULL, forum_id integer NOT NULL, board_id integer NOT NULL, " \
"topic_id integer NOT NULL, user_id integer NOT NULL, content character varying(100000), rule character varying(5000), " \
"reputation_user character varying(100), status_user character varying(255), feedback_user integer, " \
"interest_user character varying(1000), date_added timestamp(6) with time zone, date_inserted timestamp(6) with time zone NOT NULL, " \
"constraint posts_pk primary key (id), constraint posts_author_id_fkey foreign key (user_id) references users (id), " \
"constraint posts_board_id_fkey foreign key (board_id) references boards (id), " \
"constraint posts_forum_id_fkey foreign key (forum_id) references forums (id)," \
"constraint posts_topic_id_fkey foreign key (topic_id) references topics (id))"
sql = "create table posts(forum_id integer NOT NULL, board_id integer NOT NULL, topic_id integer NOT NULL, " \
"user_id integer NOT NULL, content_post character varying(100000) null, feedback_post integer null, " \
"dateadded_post timestamp(6) with time zone NOT NULL, dateinserted_post timestamp(6) with time zone NOT " \
"NULL, constraint posts_pk primary key (forum_id, board_id, topic_id, user_id, dateadded_post), " \
"constraint posts_author_id_fkey foreign key (user_id) references users (user_id), constraint " \
"posts_board_id_fkey foreign key (board_id) references boards (board_id), constraint " \
"posts_forum_id_fkey foreign key (forum_id) references forums (forum_id), constraint " \
"posts_topic_id_fkey foreign key (topic_id) references topics (topic_id))"
cur.execute(sql) cur.execute(sql)
con.commit() con.commit()


+ 14
- 14
Forums/Initialization/prepare_parser.py View File

@ -38,15 +38,15 @@ def mergePages(rmm, rec):
# key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip() # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
# key = rec[16] # key = rec[16]
print ("----------------- Matched: " + rec[1] + "--------------------")
rec[8] = rmm[1]
rec[9] = rmm[2]
rec[10] = rmm[3]
rec[11] = rmm[4]
rec[12] = rmm[5]
rec[13] = rmm[6]
rec[14] = rmm[7]
rec[15] = rmm[8]
print ("----------------- Matched: " + rec[3] + "--------------------")
rec[9] = rmm[1]
rec[10] = rmm[2]
rec[11] = rmm[3]
rec[12] = rmm[4]
rec[13] = rmm[5]
rec[14] = rmm[6]
rec[15] = rmm[7]
rec[16] = rmm[8]
return rec return rec
@ -64,13 +64,13 @@ def getPosts(posts):
#@param: row is the list of entries for this instance, cur is the db connection object #@param: row is the list of entries for this instance, cur is the db connection object
def persist_data(row, cur): def persist_data(row, cur):
user = create_user(cur, row[5])
forum = create_forum(cur, row) forum = create_forum(cur, row)
board = create_board(cur, row, forum) board = create_board(cur, row, forum)
topic = create_topic(cur, row, forum, board, user)
author = create_user(cur, row, forum, 0)
topic = create_topic(cur, row, forum, board, author)
create_posts(cur, row, forum, board, topic) create_posts(cur, row, forum, board, topic)
@ -219,7 +219,7 @@ def new_parse(forum, createLog):
# key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip() # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
# key = rec[16] # key = rec[16]
url = ''.join(e for e in rec[16] if e.isalnum())
url = ''.join(e for e in rec[6] if e.isalnum())
key = u"Url:" + url key = u"Url:" + url
if key in detPage: if key in detPage:
@ -233,7 +233,7 @@ def new_parse(forum, createLog):
# rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_russian'))) # rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_russian')))
# else: # else:
# rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english'))) # rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english')))
rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english')))
rec.append(str(predict(rec[3], getPosts(rec[14]), language='sup_english')))
# Persisting the information in the database # Persisting the information in the database
try: try:


+ 36
- 8
Forums/Utilities/utilities.py View File

@ -160,19 +160,47 @@ def cleanLink(originalLink):
return originalLink return originalLink
def organizeTopics(forum, nm, topic, board, view, post, user, addDate, href):
def organizeTopics(forum, nm, topic, board, author, views, posts, href, addDate):
day = time.strftime("%m/%d/%Y")
ahora = time.strftime("%I:%M:%S")
rw = [] rw = []
for n in range(nm): for n in range(nm):
lne = forum + "," + topic[n] + "," + board + "," # 0, 1, 2
lne += "-1" if len(view) == 0 else view[n] # 3
lne = forum # 0
lne += ","
lne += board # 1
lne += ","
lne += author[n] # 2
lne += ","
lne += topic[n] # 3
lne += ","
lne += "-1" if len(views) == 0 else views[n] # 4
lne += ","
lne += "-1" if len(posts) == 0 else posts[n] # 5
lne += ","
lne += "-1" if len(href) == 0 else href[n] # 6
lne += ","
lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7
lne += ","
lne += day + " " + ahora # 8
lne += ","
lne += "-1" # 9 name_user
lne += ","
lne += "-1" # 10 status_user
lne += ","
lne += "-1" # 11 reputation_user
lne += ","
lne += "-1" # 12 interest_user
lne += ","
lne += "-1" # 13 signature_user
lne += ","
lne += "-1" # 14 content_post
lne += ","
lne += "-1" # 15 feedback_post
lne += "," lne += ","
lne += "-1" if len(post) == 0 else post[n] # 4
lne += "," + user[n] + "," + str(addDate[n]) + "," + time.asctime() # 5, 6, 7
lne += ",-1,-1,-1,-1,-1,-1,-1,-1," # 8, 9, 10, 11, 12, 13, 14, 15
lne += href[n] # 16
lne += "-1" # 16 dateadded_post
rw.append(lne) rw.append(lne)


Loading…
Cancel
Save