__author__ = 'DarkWeb'

import codecs
import glob
import os
import re
import shutil
import traceback

from bs4 import BeautifulSoup

from Forums.DB_Connection.db_connection import *
from Forums.BestCardingWorld.parser import *
from Forums.Cardingleaks.parser import *
from Forums.CryptBB.parser import *
from Forums.OnniForums.parser import *
from Forums.Altenens.parser import *
from Forums.Procrax.parser import *
from Forums.Libre.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi

# controls the log id
nError = 0


# determines if a forum is Russian; not really used now, but maybe later
def isRussianForum(forum):

    with open('russian_forums.txt') as f:
        forums = f.readlines()

    result = False
    for iforum in forums:

        iforum = iforum.replace('\n', '')
        if iforum == forum:
            result = True
            break

    return result


# matches a description page to its listing page and merges the description
# fields into the listing row
# @param: rmm is the row of data parsed from a description page,
#         rec is the row of data of an instance from a listing page
# @return: rec, the row of data with the description fields merged in
def mergePages(rmm, rec):

    # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
    # key = rec[16]

    print("----------------- Matched: " + rec[3] + "--------------------")

    rec[9] = rmm[1]
    rec[10] = rmm[2]
    rec[11] = rmm[3]
    rec[12] = rmm[4]
    rec[13] = rmm[5]
    rec[14] = rmm[6]
    rec[15] = rmm[7]
    rec[16] = rmm[8]

    return rec


# joins a list of posts into one string so the whole thread can be stored
# in the database as a single block of text
# @param: posts, a list of strings (the posts of a thread)
# @return: string containing the concatenation of all the posts
def getPosts(posts):
    strPosts = ' '.join(posts)
    return strPosts.strip()


# uses the db_connection module's methods to persist values to the correct tables
# @param: row is the list of entries for this instance, cur is the db cursor object
def persist_data(url, row, cur):

    forum = create_forum(cur, row, url)

    board = create_board(cur, row, forum)

    author = create_user(cur, row, forum, 0)

    topic = create_topic(cur, row, forum, board, author)

    create_posts(cur, row, forum, board, topic)


def incrementError():
    global nError
    nError += 1


def read_file(filePath, createLog, logFile):

    try:
        html = codecs.open(filePath.strip('\n'), encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
        html.close()
        return soup
    except Exception:
        try:
            html = open(filePath.strip('\n'))
            soup = BeautifulSoup(html, "html.parser")
            html.close()
            return soup
        except Exception:
            incrementError()
            print("There was a problem reading the file " + filePath)
            if createLog:
                logFile.write(
                    str(nError) + ". There was a problem reading the file " + filePath + "\n")
            return None
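
# Illustrative sketch (hypothetical values): how a listing row ('rec') and a
# description row ('rmm') fit together. The indices mirror mergePages above;
# the field meanings are assumptions inferred from how they are used below.
#
#   rec = [''] * 17                # row parsed from a listing page; rec[3] appears to hold the topic title
#   rmm = [''] * 9                 # row parsed from a description page
#   rec = mergePages(rmm, rec)     # copies rmm[1:9] into rec[9:17]
#   text = getPosts(rec[14])       # joins the thread's posts into one block of text
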
def parse_listing(forum, listingFile, soup, createLog, logFile):

    try:

        rw = []

        if forum == "BestCardingWorld":
            rw = bestcardingworld_listing_parser(soup)
        elif forum == "Cardingleaks":
            rw = cardingleaks_listing_parser(soup)
        elif forum == "CryptBB":
            rw = cryptBB_listing_parser(soup)
        elif forum == "OnniForums":
            rw = onniForums_listing_parser(soup)
        elif forum == "Altenens":
            rw = altenens_listing_parser(soup)
        elif forum == "Procrax":
            rw = procrax_listing_parser(soup)
        elif forum == "Libre":
            rw = libre_listing_parser(soup)

        return rw

    except Exception:
        incrementError()
        print("There was a problem parsing the file " + listingFile + " in the Listing section!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem parsing the file " + listingFile + " in the Listing section.\n")
        return None


def parse_description(forum, descriptionFile, soup, createLog, logFile):

    try:

        rmm = []

        if forum == "BestCardingWorld":
            rmm = bestcardingworld_description_parser(soup)
        elif forum == "Cardingleaks":
            rmm = cardingleaks_description_parser(soup)
        elif forum == "CryptBB":
            rmm = cryptBB_description_parser(soup)
        elif forum == "OnniForums":
            rmm = onniForums_description_parser(soup)
        elif forum == "Altenens":
            rmm = altenens_description_parser(soup)
        elif forum == "Procrax":
            rmm = procrax_description_parser(soup)
        elif forum == "Libre":
            rmm = libre_description_parser(soup)

        return rmm

    except Exception:
        incrementError()
        print("There was a problem parsing the file " + descriptionFile + " in the Description section!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem parsing the file " + descriptionFile + " in the Description section.\n")
        return None


def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):

    try:
        persist_data(url, tuple(rec), cur)
        con.commit()
        return True
    except Exception:
        con.rollback()

        trace = traceback.format_exc()

        # duplicate-key failures are expected on re-runs and are not logged as errors
        if trace.find("already exists") == -1:
            incrementError()
            print(f"There was a problem persisting the files ({listingFile}, {descriptionFile}) in the database!")
            if createLog:
                logFile.write(
                    str(nError) + f". There was a problem persisting the files ({listingFile}, {descriptionFile}) in the database!\n")
            return False
        else:
            return True


def move_file(filePath, createLog, logFile):

    # source = line2.replace(os.path.basename(line2), "") + filename
    source = filePath
    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'

    try:
        shutil.move(source, destination)
        return True
    except Exception:
        print("There was a problem moving the file " + filePath)
        incrementError()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem moving the file " + filePath + "\n")
        return False
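
# Design note: the per-forum if/elif chains in parse_listing and parse_description
# could equally be lookup tables, which keeps the two functions in sync when a new
# forum is added. A minimal sketch using the parser functions already imported above:
#
#   LISTING_PARSERS = {
#       "BestCardingWorld": bestcardingworld_listing_parser,
#       "Cardingleaks": cardingleaks_listing_parser,
#       "CryptBB": cryptBB_listing_parser,
#       "OnniForums": onniForums_listing_parser,
#       "Altenens": altenens_listing_parser,
#       "Procrax": procrax_listing_parser,
#       "Libre": libre_listing_parser,
#   }
#   rw = LISTING_PARSERS[forum](soup)
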
# main method for this program: gets the parsed info from the parsers and persists it
# into the db, calling the different parser methods depending on the type of html page
def new_parse(forum, url, createLog):

    from Forums.Initialization.forums_mining import config, CURRENT_DATE

    print("Parsing the " + forum + " forum and conducting data classification to store the information in the database.")

    # Connecting to the database
    con = connectDataBase()
    cur = con.cursor()

    # Creating the tables (the database should be created manually)
    create_database(cur, con)

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + forum + "/HTML_Pages")

    # Creating the log file for each forum
    if createLog:
        try:
            logFile = open(os.path.join(mainDir, CURRENT_DATE, forum + "_" + CURRENT_DATE + ".log"), "w")
        except Exception:
            print("Could not open log file!")
            raise SystemExit
    else:
        logFile = None

    # Reading the listing html pages
    listings = glob.glob(os.path.join(mainDir, CURRENT_DATE, "Listing", '*.html'))

    for listingIndex, listingFile in enumerate(listings):

        print("Reading listing folder of '" + forum + "', file '" + os.path.basename(listingFile) +
              "', index= " + str(listingIndex + 1) + " of " + str(len(listings)))

        listingSoup = read_file(listingFile, createLog, logFile)

        # listing flags
        doParseListing = listingSoup is not None
        doDescription = False

        readDescriptionError = False
        parseDescriptionError = False
        persistDescriptionError = False
        moveDescriptionError = False

        rw = []

        if doParseListing:
            rw = parse_listing(forum, listingFile, listingSoup, createLog, logFile)
            doDescription = rw is not None

        if doDescription:

            for rec in rw:

                rec = rec.split(',')

                descriptionPattern = cleanLink(rec[6]) + "page[0-9]*.html"

                # Reading the associated description html pages
                descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE, "Description", descriptionPattern))

                for descriptionIndex, descriptionFile in enumerate(descriptions):

                    print("Reading description folder of '" + forum + "', file '" + os.path.basename(descriptionFile) +
                          "', index= " + str(descriptionIndex + 1) + " of " + str(len(descriptions)))

                    descriptionSoup = read_file(descriptionFile, createLog, logFile)

                    # description flags
                    doParseDescription = descriptionSoup is not None
                    doPersistRecord = False
                    doMoveDescription = False

                    rmm = []

                    if doParseDescription:
                        rmm = parse_description(forum, descriptionFile, descriptionSoup, createLog, logFile)
                        doPersistRecord = rmm is not None
                    else:
                        readDescriptionError = True
                        parseDescriptionError = True

                    if doPersistRecord:

                        # Combining the information from listing and description pages
                        rec = mergePages(rmm, rec)

                        # Appending the classification of the topic to the record
                        rec.append(str(predict(rec[3], getPosts(rec[14]), language='sup_english')))

                        # Persisting the information in the database
                        persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile,
                                                        descriptionFile)

                        doMoveDescription = persistSuccess

                    else:
                        parseDescriptionError = True

                    if doMoveDescription:

                        # move the description file to the 'Read' folder
                        moveSuccess = move_file(descriptionFile, createLog, logFile)

                        if not moveSuccess:
                            moveDescriptionError = True

                    else:
                        moveDescriptionError = True

        if not (readDescriptionError or parseDescriptionError or persistDescriptionError or moveDescriptionError):

            # move the listing file to the 'Read' folder
            move_file(listingFile, createLog, logFile)

    if createLog:
        logFile.close()

    print("Parsing the " + forum + " forum and data classification done.")
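
# Example invocation (hypothetical values; the real call site is expected to live in
# Forums/Initialization/, which also supplies the 'config' and CURRENT_DATE objects
# imported inside new_parse):
#
#   new_parse(forum="CryptBB", url="http://example.onion", createLog=True)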