__author__ = 'DarkWeb' import codecs import glob import os, re import shutil from psycopg2.extras import RealDictCursor from Forums.DB_Connection.db_connection import * from Forums.BestCardingWorld.parser import * from Forums.CryptBB.parser import * from Forums.Incogsnoo.parser import * from Forums.Classifier.classify_product import predict # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi from Translator.translate import translate # controls the log id nError = 0 # determines if forum is russian, not really used now but maybe later def isRussianForum(forum): with open('russian_forums.txt') as f: forums = f.readlines() result = False for iforum in forums: iforum = iforum.replace('\n','') if iforum == forum: result = True break return result #tries to match description pages to listing pages by using a key made for every description page and every link in listing page #once verified and matched, the info is merged into a 'rec', which is returned #@param: detPage is a list of keys of valid pages, rec is the row of data of an instance #return: rec, row of data, that may have additional data added on after matching description to listing page def mergePages(rmm, rec): # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip() # key = rec[16] print ("----------------- Matched: " + rec[3] + "--------------------") if rmm[9] != "-1": # image_user rec[9] = rmm[9] rec[10] = rmm[1] rec[11] = rmm[2] rec[12] = rmm[3] rec[13] = rmm[4] rec[14] = rmm[5] rec[15] = rmm[6] rec[16] = rmm[7] rec[17] = rmm[8] rec[18] = rmm[10] return rec #gets a string of posts and joins them together into one string to be put in the database as one string of text #@param: list of strings (the posts of a thread) #return: string containing the concatenation of all the strings def getPosts(posts): strPosts = ' '.join(posts) return strPosts.strip() #uses db connection , another program, methods to persists values to the correct categories #@param: row is the list of entries for this instance, cur is the db connection object def persist_data(url, row, cur): forum = create_forum(cur, row, url) author = create_author(cur, row, forum) topic = create_topic(cur, forum, row, author) create_posts(cur, row, forum, topic) def incrementError(): global nError nError += 1 def read_file(filePath, createLog, logFile): try: html = codecs.open(filePath.strip('\n'), encoding='utf8') soup = BeautifulSoup(html, "html.parser") html.close() time.sleep(0.01) # making sure the file is closed before returning soup object return soup except: try: html = open(filePath.strip('\n')) soup = BeautifulSoup(html, "html.parser") html.close() time.sleep(0.01) # making sure the file is closed before returning soup object return soup except: incrementError() print("There was a problem to read the file " + filePath) if createLog: logFile.write( str(nError) + ". There was a problem to read the file " + filePath + "\n" + traceback.format_exc() + "\n") return None def parse_listing(forum, listingFile, soup, createLog, logFile): try: if forum == "BestCardingWorld": rw = bestcardingworld_listing_parser(soup) elif forum == "CryptBB": rw = cryptBB_listing_parser(soup) elif forum == "Incogsnoo": rw = incogsnoo_listing_parser(soup) else: print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!") raise Exception return rw except: incrementError() print("There was a problem to parse the file " + listingFile + " in the listing section!") traceback.print_exc() if createLog: logFile.write( str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n" + traceback.format_exc() + "\n") return None def parse_description(forum, descriptionFile, soup, createLog, logFile): try: if forum == "BestCardingWorld": rmm = bestcardingworld_description_parser(soup) elif forum == "CryptBB": rmm = cryptBB_description_parser(soup) elif forum == "Incogsnoo": rmm = incogsnoo_description_parser(soup) else: print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!") raise Exception return rmm except: incrementError() print("There was a problem to parse the file " + descriptionFile + " in the Description section!") traceback.print_exc() if createLog: logFile.write( str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n" + traceback.format_exc() + "\n") return None def get_source_language(forum): if forum == "BestCardingWorld": lang = 'english' elif forum == "CryptBB": lang = 'english' elif forum == "Incogsnoo": lang = 'english' else: print("MISSING CALL TO GET LANGUAGE IN PREPARE_PARSER.PY!") lang = 'auto' return lang def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile): try: persist_data(url, tuple(rec), cur) con.commit() return True except: con.rollback() incrementError() print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!") traceback.print_exc() if createLog: logFile.write( str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n" + traceback.format_exc() + "\n") return False def move_file(filePath, createLog, logFile): source = filePath destination = filePath.replace(os.path.basename(filePath), "") + 'Read\\' + os.path.basename(filePath) try: shutil.move(source, destination, shutil.copy2) return True except: try: shutil.move(source, destination, shutil.copytree) return True except: incrementError() print("There was a problem to move the file " + filePath) traceback.print_exc() if createLog: logFile.write( str(nError) + ". There was a problem to move the file " + filePath + "\n" + traceback.format_exc() + "\n") return False #main method for this program, what actually gets the parsed info from the parser, and persists them into the db #calls the different parser methods here depending on the type of html page def new_parse(forum, url, createLog): from Forums.Initialization.forums_mining import config, CURRENT_DATE global nError nError = 0 print("Parsing the " + forum + " forum and conduct data classification to store the information in the database.") # Connecting to the database con = connectDataBase() cur = con.cursor(cursor_factory=RealDictCursor) # Creating the tables (The database should be created manually) create_database(cur, con) mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums\\" + forum + "\\HTML_Pages") # Creating the log file for each Forum if createLog: try: logFile = open(mainDir + f"/{CURRENT_DATE}/" + forum + "_" + CURRENT_DATE + ".log", "w") except: print("Could not open log file!") createLog = False logFile = None # raise SystemExit else: logFile = None source_lang = get_source_language(forum) # Reading the Listing Html Pages listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')) listings.sort(key=os.path.getmtime) for listingIndex, listingFile in enumerate(listings): print("Reading listing folder of '" + forum + "', file '" + os.path.basename(listingFile) + "', index= " + str( listingIndex + 1) + " ... " + str(len(listings))) listingSoup = read_file(listingFile, createLog, logFile) # listing flags doParseListing = listingSoup is not None doDescription = False readDescriptionError = False parseDescriptionError = False persistDescriptionError = False moveDescriptionError = False findDescriptionError = False rw = [] if doParseListing: rw = parse_listing(forum, listingFile, listingSoup, createLog, logFile) doDescription = rw is not None if doDescription: nFound = 0 for rec in rw: rec = rec.split(',') descriptionPattern = cleanLink(rec[6]) + "page[0-9]*.html" # Reading the associated description Html Pages descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern)) descriptions.sort(key=os.path.getmtime) nFound += len(descriptions) # Aggregate of posts from multiple description (topic) pages posts = [] for descriptionIndex, descriptionFile in enumerate(descriptions): print("Reading description folder of '" + forum + "', file '" + os.path.basename( descriptionFile) + "', index= " + str(descriptionIndex + 1) + " ... " + str(len(descriptions))) descriptionSoup = read_file(descriptionFile, createLog, logFile) # description flags doParseDescription = descriptionSoup is not None doPersistRecord = False doMoveDescription = False rmm = [] if doParseDescription: rmm = parse_description(forum, descriptionFile, descriptionSoup, createLog, logFile) doPersistRecord = rmm is not None else: readDescriptionError = True parseDescriptionError = True if doPersistRecord: # Combining the information from Listing and Description Pages rec = mergePages(rmm, rec) # Add the page's posts to aggregate posts += rec[15] # Classify on final description page if descriptionIndex == len(descriptions) - 1: title = translate(rec[3], source_lang) content = translate(getPosts(posts), source_lang) # classification for topic based on all posts from all pages rec[19] = str(predict(title, content, language='sup_english')) # Persisting the information in the database persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile) doMoveDescription = persistSuccess else: parseDescriptionError = True if doMoveDescription: # move description files of completed folder moveSuccess = move_file(descriptionFile, createLog, logFile) if not moveSuccess: moveDescriptionError = True else: moveDescriptionError = True if not (nFound > 0): findDescriptionError = True incrementError() print(f"There was a problem to locate the file(s) for {listingFile} in the Description section!") if createLog: logFile.write( str(nError) + f". There was a problem to locate the file(s) for {listingFile}" f" in the Description section!\n\n") if not (readDescriptionError or parseDescriptionError or persistDescriptionError or moveDescriptionError or findDescriptionError): # move listing files of completed folder move_file(listingFile, createLog, logFile) # registering the current forum status (up/down) and the number of scraped pages in the database forumId = verifyForum(cur, forum) if (forumId > 0): readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html')) readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html')) create_status(cur, forumId, CURRENT_DATE, len(readListings), len(readDescriptions), '1' if len(listings) > 0 else '0') con.commit() if createLog: logFile.close() cur.close() con.close() print("Parsing the " + forum + " forum and data classification done.")