@@ -16,6 +16,8 @@ from Forums.Libre.parser import *

from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi


# controls the log id
nError = 0


# determines if forum is russian, not really used now but maybe later
def isRussianForum(forum):

@@ -80,6 +82,141 @@ def persist_data(url, row, cur):

    create_posts(cur, row, forum, board, topic)


def incrementError():
    global nError
    nError += 1


def read_file(filePath, createLog, logFile):

    try:
        html = codecs.open(filePath.strip('\n'), encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
        html.close()
        return soup

    except:

        try:
            html = open(filePath.strip('\n'))
            soup = BeautifulSoup(html, "html.parser")
            html.close()
            return soup

        except:

            incrementError()
            print("There was a problem reading the file " + filePath)
            if createLog:
                logFile.write(
                    str(nError) + ". There was a problem reading the file " + filePath + "\n")
            return None

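# read_file tries UTF-8 first, then falls back to the platform's default encoding
# before giving up and returning None. A minimal usage sketch (hypothetical path)
# of the None-gating contract callers rely on:
#
#   soup = read_file("CryptBB/HTML_Pages/<date>/Listing/page1.html", createLog, logFile)
#   if soup is not None:
#       ...  # safe to hand the soup to a parser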

def parse_listing(forum, listingFile, soup, createLog, logFile):

    try:

        rw = []

        if forum == "BestCardingWorld":
            rw = bestcardingworld_listing_parser(soup)
        elif forum == "Cardingleaks":
            rw = cardingleaks_listing_parser(soup)
        elif forum == "CryptBB":
            rw = cryptBB_listing_parser(soup)
        elif forum == "OnniForums":
            rw = onniForums_listing_parser(soup)
        elif forum == "Altenens":
            rw = altenens_listing_parser(soup)
        elif forum == "Procrax":
            rw = procrax_listing_parser(soup)
        elif forum == "Libre":
            rw = libre_listing_parser(soup)

        return rw

    except:

        incrementError()
        print("There was a problem parsing the file " + listingFile + " in the Listing section!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem parsing the file " + listingFile + " in the Listing section.\n")
        return None

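# The if/elif chain above keeps each forum's parser explicit. For comparison only,
# an equivalent table-driven sketch using the same parser functions:
#
#   LISTING_PARSERS = {
#       "BestCardingWorld": bestcardingworld_listing_parser,
#       "Cardingleaks": cardingleaks_listing_parser,
#       "CryptBB": cryptBB_listing_parser,
#       "OnniForums": onniForums_listing_parser,
#       "Altenens": altenens_listing_parser,
#       "Procrax": procrax_listing_parser,
#       "Libre": libre_listing_parser,
#   }
#   rw = LISTING_PARSERS[forum](soup)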

def parse_description(forum, descriptionFile, soup, createLog, logFile):

    try:

        rmm = []

        if forum == "BestCardingWorld":
            rmm = bestcardingworld_description_parser(soup)
        elif forum == "Cardingleaks":
            rmm = cardingleaks_description_parser(soup)
        elif forum == "CryptBB":
            rmm = cryptBB_description_parser(soup)
        elif forum == "OnniForums":
            rmm = onniForums_description_parser(soup)
        elif forum == "Altenens":
            rmm = altenens_description_parser(soup)
        elif forum == "Procrax":
            rmm = procrax_description_parser(soup)
        elif forum == "Libre":
            rmm = libre_description_parser(soup)

        return rmm

    except:

        incrementError()
        print("There was a problem parsing the file " + descriptionFile + " in the Description section!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem parsing the file " + descriptionFile + " in the Description section.\n")
        return None


def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):

    try:
        persist_data(url, tuple(rec), cur)
        con.commit()
        return True
    except:

        con.rollback()

        trace = traceback.format_exc()

        if trace.find("already exists") == -1:
            incrementError()
            print(f"There was a problem persisting the files ({listingFile} + {descriptionFile}) in the database!")
            if createLog:
                logFile.write(
                    str(nError) + f". There was a problem persisting the files ({listingFile} + {descriptionFile}) in the database!\n")
            return False
        else:
            # the record is already stored, which is not an error
            return True

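# persist_record deliberately treats duplicate-key failures as success: when the
# rollback trace contains "already exists", the record was stored by an earlier run,
# so the caller still gets True and the source files can be moved out of the queue.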

def move_file(filePath, createLog, logFile):

    source = filePath
    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'

    try:
        shutil.move(source, destination)
        return True
    except:

        print("There was a problem moving the file " + filePath)
        incrementError()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem moving the file " + filePath + "\n")
        return False

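# Note: when the destination is an existing directory, shutil.move places the source
# inside it, so a file like ".../Description/topic1page1.html" (hypothetical name)
# ends up at ".../Description/Read/topic1page1.html"; the Read/ folder is expected
# to already exist.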

# main method for this program: gets the parsed info from the parsers and persists it into the db
# calls the different parser methods here depending on the type of html page
def new_parse(forum, url, createLog):

@@ -88,8 +225,6 @@ def new_parse(forum, url, createLog):

    print("Parsing the " + forum + " forum and conducting data classification to store the information in the database.")

    # Connecting to the database
    con = connectDataBase()
    cur = con.cursor()

@@ -97,268 +232,113 @@ def new_parse(forum, url, createLog):

    # Creating the tables (The database should be created manually)
    create_database(cur, con)

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + forum + "/HTML_Pages")

    # Creating the log file for each Forum
    if createLog:
        try:
            logFile = open(mainDir + f"/{CURRENT_DATE}/" + forum + "_" + CURRENT_DATE + ".log", "w")
        except:
            print("Could not open log file!")
            raise SystemExit
    else:
        logFile = None
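    # Expected layout under the shared folder (the date-stamped subfolders are
    # presumably produced by the crawling step):
    #
    #   Forums/<forum>/HTML_Pages/<CURRENT_DATE>/Listing/*.html
    #   Forums/<forum>/HTML_Pages/<CURRENT_DATE>/Description/*.html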

    # Reading the Listing Html Pages
    listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
    for listingIndex, listingFile in enumerate(listings):

        print("Reading listing folder of '" + forum + "', file '" + os.path.basename(listingFile) + "', index= " + str(
            listingIndex + 1) + " ... " + str(len(listings)))

        listingSoup = read_file(listingFile, createLog, logFile)

        # listing flags
        doParseListing = listingSoup is not None
        doDescription = False

        # per-listing error flags, aggregated over all of its description pages
        readDescriptionError = False
        parseDescriptionError = False
        persistDescriptionError = False
        moveDescriptionError = False

        rw = []

        if doParseListing:

            rw = parse_listing(forum, listingFile, listingSoup, createLog, logFile)

            doDescription = rw is not None

        if doDescription:

            for rec in rw:

                rec = rec.split(',')

                # all paginated description pages of a topic share the topic's cleaned link as a filename prefix
                descriptionPattern = cleanLink(rec[6]) + "page[0-9]*.html"

                # Reading the associated description Html Pages
                descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))
                for descriptionIndex, descriptionFile in enumerate(descriptions):

                    print("Reading description folder of '" + forum + "', file '" + os.path.basename(
                        descriptionFile) + "', index= " + str(descriptionIndex + 1) + " ... " + str(len(descriptions)))

                    descriptionSoup = read_file(descriptionFile, createLog, logFile)

                    # description flags
                    doParseDescription = descriptionSoup is not None
                    doPersistRecord = False
                    doMoveDescription = False

                    rmm = []

                    if doParseDescription:

                        rmm = parse_description(forum, descriptionFile, descriptionSoup, createLog, logFile)

                        doPersistRecord = rmm is not None

                    else:
                        readDescriptionError = True
                        parseDescriptionError = True

                    if doPersistRecord:

                        # Combining the information from Listing and Description Pages
                        rec = mergePages(rmm, rec)

                        # Append to the list the classification of the topic
                        # if isRussianForum(forum):
                        #     rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_russian')))
                        # else:
                        #     rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english')))
                        rec.append(str(predict(rec[3], getPosts(rec[14]), language='sup_english')))

                        # Persisting the information in the database
                        persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile)

                        doMoveDescription = persistSuccess

                    else:
                        parseDescriptionError = True

                    if doMoveDescription:

                        # move description files to the completed (Read) folder
                        moveSuccess = move_file(descriptionFile, createLog, logFile)

                        if not moveSuccess:
                            moveDescriptionError = True

                    else:
                        moveDescriptionError = True

        # move the listing file to the completed (Read) folder only if all of its
        # description pages were read, parsed, persisted, and moved without errors
        if not (readDescriptionError or parseDescriptionError or persistDescriptionError or moveDescriptionError):

            move_file(listingFile, createLog, logFile)

    if createLog:
        logFile.close()

    print("Parsing the " + forum + " forum and data classification done.")
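
# A typical call (hypothetical arguments): new_parse("BestCardingWorld", url, createLog=True)
# walks today's Listing pages, pairs each topic with its paginated Description pages,
# classifies and persists each merged record, and moves fully processed files into Read/.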