
new prepare_parser.py logic

main
westernmeadow 1 year ago
commit 99ad53022c
4 changed files with 215 additions and 237 deletions
  1. Forums/Altenens/crawler_selenium.py (+1, -0)
  2. Forums/Initialization/forums_mining.py (+2, -5)
  3. Forums/Initialization/prepare_parser.py (+211, -231)
  4. MarketPlaces/Initialization/markets_mining.py (+1, -1)

Forums/Altenens/crawler_selenium.py (+1, -0)

@@ -138,6 +138,7 @@ def createFFDriver():
    return driver


def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
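The single added line of this hunk is not captured above; only the surrounding context of getAccess() is visible. For orientation, a minimal sketch of how an access helper built from these two calls typically looks — the driver.get() call, the exception handling, and the 'down' sentinel are assumptions, not taken from this diff:

def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        # assumption: navigate to the forum start page and hand the live driver back
        driver.get(url)
        return driver
    except:
        # assumption: signal failure to the caller so this crawl can be skipped or retried
        driver.close()
        return 'down'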


Forums/Initialization/forums_mining.py (+2, -5)

@@ -1,7 +1,7 @@
__author__ = 'DarkWeb'
'''
Starting point of the Darkweb Forums Mining
'''
import os
@@ -101,8 +101,7 @@ if __name__ == '__main__':
print("Creating listing and description directories ... for " + forum)
createDirectory(forum)
time.sleep(5) # wait for directories to be created
input("Directories created successfully. Press ENTER to continue\n")
print("Directories created successfully.")
if forum == "BestCardingWorld":
crawlerBestCardingWorld()
@@ -123,8 +122,6 @@ if __name__ == '__main__':
elif forum == 'Libre':
crawlerLibre()
print("Scraping process completed successfully!")


Forums/Initialization/prepare_parser.py (+211, -231)

@@ -16,6 +16,8 @@ from Forums.Libre.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
# controls the log id
nError = 0
# determines if forum is russian, not really used now but maybe later
def isRussianForum(forum):
@@ -80,6 +82,141 @@ def persist_data(url, row, cur):
    create_posts(cur, row, forum, board, topic)


def incrementError():
    global nError
    nError += 1


def read_file(filePath, createLog, logFile):
    try:
        html = codecs.open(filePath.strip('\n'), encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
        html.close()
        return soup
    except:
        try:
            html = open(filePath.strip('\n'))
            soup = BeautifulSoup(html, "html.parser")
            html.close()
            return soup
        except:
            incrementError()
            print("There was a problem to read the file " + filePath)
            if createLog:
                logFile.write(
                    str(nError) + ". There was a problem to read the file " + filePath + "\n")
            return None


def parse_listing(forum, listingFile, soup, createLog, logFile):
    try:
        rw = []

        if forum == "BestCardingWorld":
            rw = bestcardingworld_listing_parser(soup)
        elif forum == "Cardingleaks":
            rw = cardingleaks_listing_parser(soup)
        elif forum == "CryptBB":
            rw = cryptBB_listing_parser(soup)
        elif forum == "OnniForums":
            rw = onniForums_listing_parser(soup)
        elif forum == "Altenens":
            rw = altenens_listing_parser(soup)
        elif forum == "Procrax":
            rw = procrax_listing_parser(soup)
        elif forum == "Libre":
            rw = libre_listing_parser(soup)

        return rw
    except:
        incrementError()
        print("There was a problem to read the file " + listingFile + " in the listing section!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem to read the file " + listingFile + " in the Listing section.\n")
        return None


def parse_description(forum, descriptionFile, soup, createLog, logFile):
    try:
        rmm = []

        if forum == "BestCardingWorld":
            rmm = bestcardingworld_description_parser(soup)
        elif forum == "Cardingleaks":
            rmm = cardingleaks_description_parser(soup)
        elif forum == "CryptBB":
            rmm = cryptBB_description_parser(soup)
        elif forum == "OnniForums":
            rmm = onniForums_description_parser(soup)
        elif forum == "Altenens":
            rmm = altenens_description_parser(soup)
        elif forum == "Procrax":
            rmm = procrax_description_parser(soup)
        elif forum == "Libre":
            rmm = libre_description_parser(soup)

        return rmm
    except:
        incrementError()
        print("There was a problem to parse the file " + descriptionFile + " in the Description section!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
        return None


def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):
    try:
        persist_data(url, tuple(rec), cur)
        con.commit()
        return True
    except:
        con.rollback()

        trace = traceback.format_exc()

        if trace.find("already exists") == -1:
            incrementError()
            print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
            if createLog:
                logFile.write(str(nError) + f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
            return False
        else:
            return True


def move_file(filePath, createLog, logFile):
    # source = line2.replace(os.path.basename(line2), "") + filename
    source = filePath
    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'

    try:
        shutil.move(source, destination)
        return True
    except:
        print("There was a problem to move the file " + filePath)
        incrementError()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem to move the file " + filePath + "\n")
        return False


#main method for this program, what actually gets the parsed info from the parser, and persists them into the db
#calls the different parser methods here depending on the type of html page
def new_parse(forum, url, createLog):
@@ -88,8 +225,6 @@ def new_parse(forum, url, createLog):
print("Parsing The " + forum + " Forum and conduct data classification to store the information in the database.")
# ini = time.time()
# Connecting to the database
con = connectDataBase()
cur = con.cursor()
@@ -97,268 +232,113 @@ def new_parse(forum, url, createLog):
# Creating the tables (The database should be created manually)
create_database(cur, con)
nError = 0
lines = [] # listing pages
lns = [] # description pages
detPage = {} # first pages
other = {} # other pages
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + forum + "/HTML_Pages")
# Creating the log file for each Forum
if createLog:
if not os.path.exists("./" + forum + "/Logs/" + forum + "_" + CURRENT_DATE + ".log"):
logFile = open("./" + forum + "/Logs/" + forum + "_" + CURRENT_DATE + ".log", "w")
else:
print("Files of the date " + CURRENT_DATE + " from the Forum " + forum +
" were already read. Delete the referent information in the Data Base and also delete the log file"
" in the _Logs folder to read files from this Forum of this date again.")
try:
logFile = open(mainDir + f"/{CURRENT_DATE}/" + forum + "_" + CURRENT_DATE + ".log", "w")
except:
print("Could not open log file!")
raise SystemExit
mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + forum + "/HTML_Pages")
else:
logFile = None
# Reading the Listing Html Pages
for fileListing in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')):
lines.append(fileListing)
listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
for listingIndex, listingFile in enumerate(listings):
# Reading the Description Html Pages
for fileDescription in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", '*.html')):
lns.append(fileDescription)
print("Reading listing folder of '" + forum + "', file '" + os.path.basename(listingFile) + "', index= " + str(
listingIndex + 1) + " ... " + str(len(listings)))
# Parsing the Description Pages and put the tag's content into a dictionary (Hash table)
for index, line2 in enumerate(lns):
listingSoup = read_file(listingFile, createLog, logFile)
print("Reading description folder of '" + forum + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns)))
# listing flags
doParseListing = listingSoup is not None
doDescription = False
try:
html = codecs.open(line2.strip('\n'), encoding='utf8')
soup = BeautifulSoup(html, "html.parser")
html.close()
except:
readDescriptionError = False
parseDescriptionError = False
persistDescriptionError = False
moveDescriptionError = False
try:
html = open(line2.strip('\n'))
soup = BeautifulSoup(html, "html.parser")
html.close()
except:
rw = []
nError += 1
print("There was a problem to read the file " + line2 + " in the Description section!")
if createLog:
logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section!\n")
continue
if doParseListing:
try:
rw = parse_listing(forum, listingFile, listingSoup, createLog, logFile)
if forum == "BestCardingWorld":
rmm = bestcardingworld_description_parser(soup)
elif forum == "Cardingleaks":
rmm = cardingleaks_description_parser(soup)
elif forum == "CryptBB":
rmm = cryptBB_description_parser(soup)
elif forum == "OnniForums":
rmm = onniForums_description_parser(soup)
elif forum == "Altenens":
rmm = altenens_description_parser(soup)
elif forum == "Procrax":
rmm = procrax_description_parser(soup)
elif forum == "Libre":
rmm = libre_description_parser(soup)
# key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
key = u"Url:" + os.path.basename(line2).replace(".html", "")
# check if "page1" exists at the end of a string
# if yes add to first page directory if no add to other
check = re.compile(r'page1$')
if check.search(key):
# print(key, 'is a first page\n')
detPage[key] = {'rmm': rmm, 'files': [os.path.basename(line2)]}
else:
# print(key, 'is an other page\n')
other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
doDescription = rw is not None
except:
if doDescription:
nError += 1
print("There was a problem to parse the file " + line2 + " in the Description section!")
traceback.print_exc()
if createLog:
logFile.write(str(nError) + ". There was a problem to parse the file " + line2 + " in the Description section.\n")
# goes through keys from detPage and other, checks if the keys match.
# if yes adds other[key] values to detPage w/o overwritting
for key in detPage.keys():
for k in list(other.keys()):
checkkey = str(key[4:])
checkk = str(k[4:])
if checkkey in checkk:
detPage[key]['rmm'][1].extend(other[k]['rmm'][1])
detPage[key]['rmm'][2].extend(other[k]['rmm'][2])
detPage[key]['rmm'][3].extend(other[k]['rmm'][3])
detPage[key]['rmm'][4].extend(other[k]['rmm'][4])
detPage[key]['rmm'][5].extend(other[k]['rmm'][5])
detPage[key]['rmm'][6].extend(other[k]['rmm'][6])
detPage[key]['rmm'][7].extend(other[k]['rmm'][7])
detPage[key]['rmm'][8].extend(other[k]['rmm'][8])
detPage[key]['files'].append(other[k]['filename'])
other.pop(k)
# Parsing the Listing Pages and put the tag's content into a list
for index, line1 in enumerate(lines):
print("Reading listing folder of '" + forum + "', file '" + os.path.basename(line1) + "', index= " + str(index + 1) + " ... " + str(len(lines)))
readError = False
try:
html = codecs.open(line1.strip('\n'), encoding='utf8')
soup = BeautifulSoup(html, "html.parser")
html.close()
except:
for rec in rw:
rec = rec.split(',')
descriptionPattern = cleanLink(rec[6]) + "page[0-9]*.html"
# Reading the associated description Html Pages
descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))
for descriptionIndex, descriptionFile in enumerate(descriptions):
print("Reading description folder of '" + forum + "', file '" + os.path.basename(
descriptionFile) + "', index= " + str(descriptionIndex + 1) + " ... " + str(len(descriptions)))
descriptionSoup = read_file(descriptionFile, createLog, logFile)
# description flags
doParseDescription = descriptionSoup is not None
doPersistRecord = False
doMoveDescription = False
try:
html = open(line1.strip('\n'))
soup = BeautifulSoup(html, "html.parser")
html.close()
except:
nError += 1
print("There was a problem to read the file " + line1 + " in the Listing section!")
if createLog:
logFile.write(str(nError) + ". There was a problem to read the file " + line1 + " in the Listing section.\n")
readError = True
if not readError:
parseError = False
try:
if forum == "BestCardingWorld":
rw = bestcardingworld_listing_parser(soup)
elif forum == "Cardingleaks":
rw = cardingleaks_listing_parser(soup)
elif forum == "CryptBB":
rw = cryptBB_listing_parser(soup)
elif forum == "OnniForums":
rw = onniForums_listing_parser(soup)
elif forum == "Altenens":
rw = altenens_listing_parser(soup)
elif forum == "Procrax":
rw = procrax_listing_parser(soup)
elif forum == "Libre":
rw = libre_listing_parser(soup)
except:
nError += 1
print("There was a problem to read the file " + line1 + " in the listing section!")
traceback.print_exc()
if createLog:
logFile.write(
str(nError) + ". There was a problem to read the file " + line1 + " in the Listing section.\n")
parseError = True
if not parseError:
persistError = False
moveError = False
num_in_db = 0
num_persisted_moved = 0
for rec in rw:
rec = rec.split(',')
# print(rec)
# key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
key = u"Url:" + cleanLink(rec[6]) + "page1"
# print(key)
if key in detPage:
rmm = []
if doParseDescription:
rmm = parse_description(forum, descriptionFile, descriptionSoup, createLog, logFile)
doPersistRecord = rmm is not None
else:
readDescriptionError = True
parseDescriptionError = True
if doPersistRecord:
# Combining the information from Listing and Description Pages
rmm = detPage[key]['rmm']
rec = mergePages(rmm, rec)
# Append to the list the classification of the topic
# if isRussianForum(forum):
# rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_russian')))
# else:
# rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english')))
rec.append(str(predict(rec[3], getPosts(rec[14]), language='sup_english')))
# Persisting the information in the database
try:
persist_data(url, tuple(rec), cur)
con.commit()
except:
trace = traceback.format_exc()
if trace.find("already exists") == -1:
nError += 1
print("There was a problem to persist the file " + detPage[key]['filename'] + " in the database!")
if createLog:
logFile.write(
str(nError) + ". There was a problem to persist the file " + detPage[key]['filename'] + " in the database.\n")
persistError = True
con.rollback()
if not persistError:
# move description files of completed folder
for filename in detPage[key]['files']:
source = line2.replace(os.path.basename(line2), "") + filename
destination = line2.replace(os.path.basename(line2), "") + r'Read/'
try:
shutil.move(source, destination)
num_persisted_moved += 1
except:
print("There was a problem to move the file " + filename + " in the Description section!")
nError += 1
if createLog:
logFile.write(
str(nError) + ". There was a problem to move the file " + filename + " in the Description section!.\n")
moveError = True
# if the associated description page is not read or not parsed
else:
# query database
# if the post already exists:
# num_in_db += 1
pass
persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile)
# if number of topics on listing page is equal to
# the number of merged, persisted, and moved topics plus
# the number of topics already in the database
if not persistError and not moveError and len(rw) == (num_persisted_moved + num_in_db):
doMoveDescription = persistSuccess
# move listing file to completed folder
source = line1
destination = line1.replace(os.path.basename(line1), "") + r'Read/'
else:
parseDescriptionError = True
try:
shutil.move(source, destination)
except:
if doMoveDescription:
nError += 1
print("There was a problem to move the file " + line1 + " in the Listing section!")
if createLog:
logFile.write(str(nError) + ". There was a problem to move the file " + line1 + " in the Listing section!.\n")
# move description files of completed folder
moveSuccess = move_file(descriptionFile, createLog, logFile)
if createLog:
logFile.close()
if not moveSuccess:
moveDescriptionError = True
#end = time.time()
else:
moveDescriptionError = True
#finalTime = float(end-ini)
if not (readDescriptionError or parseDescriptionError or persistDescriptionError or moveDescriptionError):
#print (forum + " Parsing Perfomed Succesfully in %.2f" %finalTime + "!")
# move listing files of completed folder
move_file(listingFile, createLog, logFile)
if createLog:
logFile.close()
input("Parsing the " + forum + " forum and data classification done successfully. Press ENTER to continue\n")
print("Parsing the " + forum + " forum and data classification done.")

MarketPlaces/Initialization/markets_mining.py (+1, -1)

@@ -1,7 +1,7 @@
__author__ = 'DarkWeb'
'''
- Starting point of the Darkweb Mining Platform
+ Starting point of the Darkweb Markets Mining
'''
import os

