# This file is based on the calsyslab project.
__author__ = 'DarkWeb'
import codecs
import glob
import os
import shutil
from Forums.DB_Connection.db_connection import *
from Forums.BestCardingWorld.parser import *
from Forums.CryptBB.parser import *
import re
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
def isRussianForum(forum):
    """Tell whether *forum* is listed in ``russian_forums.txt``.

    The file is opened relative to the current working directory and is
    expected to hold one forum name per line. The active code in this file
    does not call this helper (only commented-out code refers to it).

    @param: forum, the forum name to look up
    return: True if some line of the file, with its newline characters
            removed, equals forum; False otherwise
    """
    with open('russian_forums.txt') as handle:
        listed = handle.readlines()
    # Compare against each line after dropping its newline characters.
    return any(entry.replace('\n', '') == forum for entry in listed)
def mergePages(rmm, rec):
    """Merge the fields of a matched description page into a listing row.

    Prints a "Matched" banner built from rec[3], then overwrites rec[9]
    through rec[16] in place with rmm[1] through rmm[8].

    @param: rmm is the indexable description-page result (items 1..8 are read),
            rec is the mutable listing row (rec[3] is printed, items 9..16
            are overwritten)
    return: rec, the same row object, now carrying the description data
    """
    print ("----------------- Matched: " + rec[3] + "--------------------")
    # Destination slots 9..16 mirror source slots 1..8: a constant offset of 8.
    for src in range(1, 9):
        rec[src + 8] = rmm[src]
    return rec
def getPosts(posts):
    """Collapse the posts of a thread into one string for the database.

    @param: posts, an iterable of strings (the posts of a thread)
    return: the posts joined with single spaces, with leading and trailing
            whitespace of the joined text removed
    """
    separator = ' '
    return separator.join(posts).strip()
def persist_data(url, row, cur):
    """Store one merged record through the db_connection helpers.

    The helpers are called in dependency order: forum, board, user, topic,
    posts. Each later call receives the values returned by the earlier ones.

    @param: url is passed to create_forum together with the row,
            row is the sequence of entries for this instance,
            cur is the database cursor handed to every helper
    return: nothing
    """
    forum_ref = create_forum(cur, row, url)
    board_ref = create_board(cur, row, forum_ref)
    # The trailing 0 is forwarded as-is; its meaning is defined by create_user.
    author_ref = create_user(cur, row, forum_ref, 0)
    topic_ref = create_topic(cur, row, forum_ref, board_ref, author_ref)
    create_posts(cur, row, forum_ref, board_ref, topic_ref)
#main method for this program: gets the parsed info from the forum parsers and persists it into the db
#calls the different parser methods here depending on the type of html page (listing or description)
#@param: forum is the forum name; it is used to build the log/HTML folder paths and to choose the parser
#        (only "BestCardingWorld" and "CryptBB" are handled below),
#        url is passed through unchanged to persist_data,
#        createLog, when truthy, enables the per-forum log file and the final ENTER prompt
#return: nothing; SystemExit is raised from the log-file check below
#NOTE(review): this function reads as if statements were lost (no try/except/else lines are visible,
#              some loops have no body, some calls are truncated). Each spot is marked below;
#              confirm against the original file before relying on the control flow.
def new_parse(forum, url, createLog):
# CURRENT_DATE is imported at call time instead of at module level
# NOTE(review): presumably to avoid a circular import with forums_mining — confirm
from Forums.Initialization.forums_mining import CURRENT_DATE
print("Parsing The " + forum + " Forum and conduct data classification to store the information in the database.")
# ini = time.time()
# Connecting to the database
con = connectDataBase()
cur = con.cursor()
# Creating the tables (The database should be created manually)
create_database(cur, con)
# running count of read/parse/persist/move problems; also used to number the log entries
nError = 0
lines = [] # listing pages
lns = [] # description pages
detPage = {} # first pages
other = {} # other pages
# Creating the log file for each Forum
# the log is opened for writing only when no log for this forum and date exists yet
if createLog:
if not os.path.exists("./" + forum + "/Logs/" + forum + "_" + CURRENT_DATE + ".log"):
logFile = open("./" + forum + "/Logs/" + forum + "_" + CURRENT_DATE + ".log", "w")
# NOTE(review): no `else:` is visible here; the message and SystemExit below presumably belong to the
#               "log file already exists" branch — confirm
print("Files of the date " + CURRENT_DATE + " from the Forum " + forum +
" were already read. Delete the referent information in the Data Base and also delete the log file"
" in the _Logs folder to read files from this Forum of this date again.")
raise SystemExit
# Reading the Listing Html Pages -> to memory
# NOTE(review): no loop body is visible; presumably each path is appended to `lines` — confirm
for fileListing in glob.glob(os.path.join("..\\" + forum + "\\HTML_Pages\\" + CURRENT_DATE + "\\Listing", '*.html')):
# Reading the Description Html Pages -> to memory
# NOTE(review): no loop body is visible; presumably each path is appended to `lns` — confirm
for fileDescription in glob.glob(os.path.join("..\\" + forum + "\\HTML_Pages\\" + CURRENT_DATE + "\\Description" ,'*.html')):
# Parsing the Description Pages and put the tag's content into a dictionary (Hash table)
for index, line2 in enumerate(lns):
print("Reading description folder of '" + forum + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns)))
# NOTE(review): the next statement is truncated; presumably it opened line2.strip('\n') with
#               encoding='utf8' (codecs is imported at the top of the file) — confirm
html ='\n'), encoding='utf8')#trying to open them in utf8 format
soup = BeautifulSoup(html, "html.parser")#throw into beautiful soup
# second attempt: open the same file with the platform default encoding
# NOTE(review): presumably the fallback of a try/except that is not visible here; no close() of
#               `html` is visible either — confirm
html = open(line2.strip('\n'))
soup = BeautifulSoup(html, "html.parser")
# read failure: count it, report it, and log it when logging is enabled
nError += 1
print("There was a problem to read the file " + line2 + " in the Description section!")
if createLog:
logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section!\n")
#Where actual parsing occurs
# any other forum name leaves rmm unassigned by this chain
if forum == "BestCardingWorld":
rmm = bestcardingworld_description_parser(soup)
elif forum == "CryptBB":
rmm = cryptBB_description_parser(soup)
# key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
#essentially filename and url
key = u"Url:" + os.path.basename(line2).replace(".html", "")#should end with either no(page+num) or no page+num
# check if page or page exists at the end of a string followed by a series of numbers
#if yes add to other if no add to first page dictionary
# save descriptions into record in memory
# look-behind pattern: matches right after "Page" or "page", followed by zero or more digits
# NOTE(review): no use of `check` is visible; the two assignments below presumably sit in the
#               if/else of a check.search(key) test — confirm
check = re.compile(r'(?<=Page|page)[0-9]*')
# print(key, 'is an other page\n')
other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
# print(key, 'is a first page\n')
detPage[key] = {'rmm': rmm, 'files': [os.path.basename(line2)]}
# parse failure: count it, report it, and log it when logging is enabled
nError += 1
print("There was a problem to parse the file " + line2 + " in the Description section!")
if createLog:
logFile.write(str(nError) + ". There was a problem to parse the file " + line2 + " in the Description section.\n")
# goes through keys from detPage and other, checks if the keys match.
# if yes adds other[key] values to detPage w/o overwriting
# the first 4 characters of each key (the "Url:" prefix) are dropped before the substring test
# NOTE(review): the body of the `if` below is not visible — confirm what is merged into detPage[key]
for key in detPage.keys():
for k in list(other.keys()):
checkkey = str(key[4:])
checkk = str(k[4:])
if checkkey in checkk:
# Parsing the Listing Pages and put the tag's content into a list
for index, line1 in enumerate(lines):
print("Reading listing folder of '" + forum + "', file '" + os.path.basename(line1) + "', index= " + str(index + 1) + " ... " + str(len(lines)))
readError = False
# NOTE(review): truncated statement, same shape as in the description loop above; presumably a
#               utf8 open of line1.strip('\n') — confirm
html ='\n'), encoding='utf8')
soup = BeautifulSoup(html, "html.parser")
# second attempt with the platform default encoding (presumably an except-branch fallback — confirm)
html = open(line1.strip('\n'))
soup = BeautifulSoup(html, "html.parser")
# read failure: count, report, log, and flag so the parsing step below is skipped
nError += 1
print("There was a problem to read the file " + line1 + " in the Listing section!")
if createLog:
logFile.write(str(nError) + ". There was a problem to read the file " + line1 + " in the Listing section.\n")
readError = True
if not readError:
parseError = False
# rw is iterated below and each element is split on ','
if forum == "BestCardingWorld":
rw = bestcardingworld_listing_parser(soup)
elif forum == "CryptBB":
rw = cryptBB_listing_parser(soup)
# parse failure: count, report, log, and flag so the persist step below is skipped
nError += 1
print("There was a problem to read the file " + line1 + " in the listing section!")
if createLog:
# NOTE(review): truncated statement; presumably wrapped in logFile.write( like the other log calls — confirm
str(nError) + ". There was a problem to read the file " + line1 + " in the Listing section.\n")
parseError = True
if not parseError:
persistError = False
moveError = False
# num_in_db is never incremented in the visible code (only in the commented-out outline further down)
num_in_db = 0
num_persisted_moved = 0
for rec in rw:
rec = rec.split(',')
# key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
# cleanLink is not defined in the visible part of this file; presumably it comes from a star import — confirm
key = u"Url:" + cleanLink(rec[6])
if key in detPage:
# Combining the information from Listing and Description Pages
rmm = detPage[key]['rmm']
rec = mergePages(rmm, rec)
# Append to the list the classification of the topic
# if isRussianForum(forum):
# rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_russian')))
# else:
# rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english')))
# rec[14] was filled from rmm[6] by mergePages and is joined into one text by getPosts
rec.append(str(predict(rec[3], getPosts(rec[14]), language='sup_english')))
# Persisting the information in the database
persist_data(url, tuple(rec), cur)
# a failure whose traceback text contains "already exists" is not counted as an error
# NOTE(review): presumably inside an except-branch that is not visible; `traceback` is not imported
#               in the visible part of this file — confirm
trace = traceback.format_exc()
if trace.find("already exists") == -1:
nError += 1
# NOTE(review): detPage entries are created above with the keys 'rmm' and 'files'; 'filename' is only
#               set on `other` entries, so this lookup looks like it would raise KeyError — verify
print("There was a problem to persist the file " + detPage[key]['filename'] + " in the database!")
if createLog:
# NOTE(review): truncated statement; presumably wrapped in logFile.write( — confirm
str(nError) + ". There was a problem to persist the file " + detPage[key]['filename'] + " in the database.\n")
persistError = True
if not persistError:
# move description files of completed folder
# the folder is derived from line2, the variable left over from the description loop above
for filename in detPage[key]['files']:
source = line2.replace(os.path.basename(line2), "") + filename
destination = line2.replace(os.path.basename(line2), "") + r'Read/'
shutil.move(source, destination)
num_persisted_moved += 1
# move failure: report, count, log, and flag so the listing file is not moved
print("There was a problem to move the file " + filename + " in the Description section!")
nError += 1
if createLog:
# NOTE(review): truncated statement; presumably wrapped in logFile.write( — confirm
str(nError) + ". There was a problem to move the file " + filename + " in the Description section!.\n")
moveError = True
# if the associated description page is not read or not parsed
# query database
# if the post already exists:
# num_in_db += 1
# if number of topics on listing page is equal to
# the number of merged, persisted, and moved topics plus
# the number of topics already in the database
if not persistError and not moveError and len(rw) == (num_persisted_moved + num_in_db):
# move listing file to completed folder
source = line1
destination = line1.replace(os.path.basename(line1), "") + r'Read/'
shutil.move(source, destination)
# move failure for the listing file: count, report, log
nError += 1
print("There was a problem to move the file " + line1 + " in the Listing section!")
if createLog:
logFile.write(str(nError) + ". There was a problem to move the file " + line1 + " in the Listing section!.\n")
# NOTE(review): no logFile.close() is visible before the prompt — confirm the log is flushed/closed
if createLog:
#end = time.time()
#finalTime = float(end-ini)
#print (forum + " Parsing Perfomed Succesfully in %.2f" %finalTime + "!")
# blocks until the user presses ENTER
input("Parsing the " + forum + " forum and data classification done successfully. Press ENTER to continue\n")