# This module is based on the CalSysLab project.
__author__ = 'DarkWeb'
import codecs
import glob
import os, re
import shutil
import time
import traceback

from bs4 import BeautifulSoup
from psycopg2.extras import RealDictCursor

from Forums.DB_Connection.db_connection import *
from Forums.BestCardingWorld.parser import *
from Forums.CryptBB.parser import *
from Forums.Incogsnoo.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi

# error counter; numbers the entries written to the log file
nError = 0

# Determines whether a forum is Russian. Not really used now, but may be later.
def isRussianForum(forum):

    with open('russian_forums.txt') as f:
        forums = f.readlines()

    result = False
    for iforum in forums:
        iforum = iforum.replace('\n', '')
        if iforum == forum:
            result = True
            break

    return result
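
# Usage sketch (an assumption: russian_forums.txt lists one forum name per line,
# matched exactly against the name passed in):
#   isRussianForum('BestCardingWorld')  # True only if that exact name appears in the file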

# Tries to match description pages to listing pages using a key made for every
# description page and every link in a listing page. Once verified and matched,
# the info is merged into 'rec', which is returned.
# @param: rmm is the row of data from a description page; rec is the row of data of an instance from a listing page
# @return: rec, the row of data, possibly with additional data added on after matching description to listing page
def mergePages(rmm, rec):

    # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
    # key = rec[16]

    print("----------------- Matched: " + rec[3] + "--------------------")

    if rmm[9] != "-1":  # image_user
        rec[9] = rmm[9]

    rec[10] = rmm[1]
    rec[11] = rmm[2]
    rec[12] = rmm[3]
    rec[13] = rmm[4]
    rec[14] = rmm[5]
    rec[15] = rmm[6]
    rec[16] = rmm[7]
    rec[17] = rmm[8]
    rec[18] = rmm[10]

    return rec
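
# A merge sketch. The positional columns are defined by the forum parsers, so
# the labels here are assumptions rather than a spec:
#   rec = mergePages(rmm, rec)
#   # rec[10:18] <- rmm[1:9], rec[18] <- rmm[10], and rec[9] (the user image)
#   # is taken only when the description page actually scraped one (rmm[9] != "-1")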

# Gets a list of posts and joins them together into one string to be put in the
# database as one string of text.
# @param: posts, a list of strings (the posts of a thread)
# @return: a string containing the concatenation of all the posts
def getPosts(posts):

    strPosts = ' '.join(posts)
    return strPosts.strip()
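
# Example:
#   getPosts(['first post', 'second post'])  ->  'first post second post'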

# Uses the db connection module's methods to persist values to the correct tables.
# @param: url is the forum url; row is the list of entries for this instance; cur is the db cursor
def persist_data(url, row, cur):

    forum = create_forum(cur, row, url)

    author = create_author(cur, row, forum)

    topic = create_topic(cur, forum, row, author)

    create_posts(cur, row, forum, topic)

def incrementError():
    global nError
    nError += 1

def read_file(filePath, createLog, logFile):

    try:
        html = codecs.open(filePath.strip('\n'), encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
        html.close()
        time.sleep(0.01)  # making sure the file is closed before returning the soup object
        return soup
    except Exception:
        try:
            html = open(filePath.strip('\n'))
            soup = BeautifulSoup(html, "html.parser")
            html.close()
            time.sleep(0.01)  # making sure the file is closed before returning the soup object
            return soup
        except Exception:
            incrementError()
            print("There was a problem reading the file " + filePath)
            if createLog:
                logFile.write(
                    str(nError) + ". There was a problem reading the file " + filePath + "\n" + traceback.format_exc() + "\n")
            return None
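
# Note: read_file first tries the page as UTF-8 (codecs.open), then falls back
# to the platform default encoding, and returns None when both attempts fail, e.g.:
#   soup = read_file("...\\Listing\\page1.html", createLog=False, logFile=None)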

def parse_listing(forum, listingFile, soup, createLog, logFile):

    try:

        if forum == "BestCardingWorld":
            rw = bestcardingworld_listing_parser(soup)
        elif forum == "CryptBB":
            rw = cryptBB_listing_parser(soup)
        elif forum == "Incogsnoo":
            rw = incogsnoo_listing_parser(soup)
        else:
            print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
            raise Exception

        return rw

    except Exception:
        incrementError()
        print("There was a problem parsing the file " + listingFile + " in the Listing section!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem parsing the file " + listingFile + " in the Listing section.\n"
                + traceback.format_exc() + "\n")
        return None

def parse_description(forum, descriptionFile, soup, createLog, logFile):

    try:

        if forum == "BestCardingWorld":
            rmm = bestcardingworld_description_parser(soup)
        elif forum == "CryptBB":
            rmm = cryptBB_description_parser(soup)
        elif forum == "Incogsnoo":
            rmm = incogsnoo_description_parser(soup)
        else:
            print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
            raise Exception

        return rmm

    except Exception:
        incrementError()
        print("There was a problem parsing the file " + descriptionFile + " in the Description section!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem parsing the file " + descriptionFile + " in the Description section.\n"
                + traceback.format_exc() + "\n")
        return None
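
# Both dispatchers return None on failure so the caller can set its per-file
# error flags. Supporting a new forum means adding one branch to each, e.g.
# (hypothetical names):
#   elif forum == "NewForum":
#       rw = newforum_listing_parser(soup)   # and likewise in parse_description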

def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):

    try:
        persist_data(url, tuple(rec), cur)
        con.commit()
        return True
    except Exception:
        con.rollback()
        incrementError()
        print(f"There was a problem persisting the files ({listingFile} + {descriptionFile}) in the database!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + f". There was a problem persisting the files ({listingFile} + {descriptionFile}) in the database!\n"
                + traceback.format_exc() + "\n")
        return False

def move_file(filePath, createLog, logFile):

    source = filePath
    destination = filePath.replace(os.path.basename(filePath), "") + 'Read\\' + os.path.basename(filePath)

    try:
        # the third argument is the copy_function shutil.move uses for cross-device moves
        shutil.move(source, destination, shutil.copy2)
        return True
    except Exception:
        try:
            shutil.move(source, destination, shutil.copytree)
            return True
        except Exception:
            incrementError()
            print("There was a problem moving the file " + filePath)
            traceback.print_exc()
            if createLog:
                logFile.write(
                    str(nError) + ". There was a problem moving the file " + filePath + "\n" + traceback.format_exc() + "\n")
            return False
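
# Example (Windows-style paths, as used throughout this module):
#   move_file("<mainDir>\\Listing\\page1.html", False, None)
#   moves the page into a "Read" subfolder next to it,
#   i.e. "<mainDir>\\Listing\\Read\\page1.html"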

# Main method for this module: gets the parsed info from the parsers and persists it into the db.
# Calls the different parser methods here depending on the type of html page.
def new_parse(forum, url, createLog):

    from Forums.Initialization.forums_mining import config, CURRENT_DATE

    global nError
    nError = 0

    print("Parsing the " + forum + " forum and conducting data classification to store the information in the database.")

    # Connecting to the database
    con = connectDataBase()
    cur = con.cursor(cursor_factory=RealDictCursor)

    # Creating the tables (the database itself should be created manually)
    create_database(cur, con)

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums\\" + forum + "\\HTML_Pages")

    # Creating the log file for each forum
    if createLog:
        try:
            logFile = open(mainDir + f"/{CURRENT_DATE}/" + forum + "_" + CURRENT_DATE + ".log", "w")
        except Exception:
            print("Could not open log file!")
            createLog = False
            logFile = None
            # raise SystemExit
    else:
        logFile = None

    # Reading the Listing Html Pages
    listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
    listings.sort(key=os.path.getmtime)

    for listingIndex, listingFile in enumerate(listings):

        print("Reading listing folder of '" + forum + "', file '" + os.path.basename(listingFile) + "', index= " + str(
            listingIndex + 1) + " ... " + str(len(listings)))

        listingSoup = read_file(listingFile, createLog, logFile)

        # listing flags
        doParseListing = listingSoup is not None
        doDescription = False

        readDescriptionError = False
        parseDescriptionError = False
        persistDescriptionError = False
        moveDescriptionError = False
        findDescriptionError = False

        rw = []

        if doParseListing:

            rw = parse_listing(forum, listingFile, listingSoup, createLog, logFile)

            doDescription = rw is not None
        if doDescription:

            nFound = 0

            for rec in rw:

                rec = rec.split(',')

                descriptionPattern = cleanLink(rec[6]) + "page[0-9]*.html"

                # Reading the associated description Html Pages
                descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))
                descriptions.sort(key=os.path.getmtime)

                nFound += len(descriptions)

                # Aggregate of posts from multiple description (topic) pages
                posts = []

                for descriptionIndex, descriptionFile in enumerate(descriptions):

                    print("Reading description folder of '" + forum + "', file '" + os.path.basename(
                        descriptionFile) + "', index= " + str(descriptionIndex + 1) + " ... " + str(len(descriptions)))

                    descriptionSoup = read_file(descriptionFile, createLog, logFile)

                    # description flags
                    doParseDescription = descriptionSoup is not None
                    doPersistRecord = False
                    doMoveDescription = False

                    rmm = []

                    if doParseDescription:

                        rmm = parse_description(forum, descriptionFile, descriptionSoup, createLog, logFile)

                        doPersistRecord = rmm is not None

                    else:
                        readDescriptionError = True
                        parseDescriptionError = True

                    if doPersistRecord:

                        # Combining the information from Listing and Description Pages
                        rec = mergePages(rmm, rec)

                        # Add the page's posts to the aggregate
                        posts += rec[15]

                        # Classify on the final description page
                        if descriptionIndex == len(descriptions) - 1:

                            # classification for the topic based on all posts from all pages
                            rec[19] = str(predict(rec[3], getPosts(posts), language='sup_english'))

                            # Persisting the information in the database
                            persistSuccess = persist_record(url, rec, cur, con, createLog, logFile,
                                                            listingFile, descriptionFile)

                            doMoveDescription = persistSuccess

                    else:
                        parseDescriptionError = True

                    if doMoveDescription:

                        # move description files to the completed (Read) folder
                        moveSuccess = move_file(descriptionFile, createLog, logFile)

                        if not moveSuccess:
                            moveDescriptionError = True

                    else:
                        moveDescriptionError = True

            if not (nFound > 0):

                findDescriptionError = True

                incrementError()
                print(f"There was a problem locating the file(s) for {listingFile} in the Description section!")
                if createLog:
                    logFile.write(
                        str(nError) + f". There was a problem locating the file(s) for {listingFile}"
                                      f" in the Description section!\n\n")

        if not (readDescriptionError or parseDescriptionError or persistDescriptionError
                or moveDescriptionError or findDescriptionError):

            # move listing files to the completed (Read) folder
            move_file(listingFile, createLog, logFile)
    # registering the current forum status (up/down) and the number of scraped pages in the database
    forumId = verifyForum(cur, forum)
    if forumId > 0:

        readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html'))
        readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html'))

        create_status(cur, forumId, CURRENT_DATE, len(readListings), len(readDescriptions),
                      '1' if len(listings) > 0 else '0')
        con.commit()

    if createLog:
        logFile.close()

    cur.close()
    con.close()

    print("Parsing the " + forum + " forum and data classification done.")