__author__ = 'DarkWeb' import codecs import glob import os, re import shutil from Forums.DB_Connection.db_connection import * from Forums.BestCardingWorld.parser import * from Forums.CryptBB.parser import * from Forums.OnniForums.parser import * from Forums.Altenens.parser import * from Forums.Procrax.parser import * from Forums.Classifier.classify_product import predict # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi # determines if forum is russian, not really used now but maybe later def isRussianForum(forum): with open('russian_forums.txt') as f: forums = f.readlines() result = False for iforum in forums: iforum = iforum.replace('\n','') if iforum == forum: result = True break return result #tries to match description pages to listing pages by using a key made for every description page and every link in listing page #once verified and matched, the info is merged into a 'rec', which is returned #@param: detPage is a list of keys of valid pages, rec is the row of data of an instance #return: rec, row of data, that may have additional data added on after matching description to listing page def mergePages(rmm, rec): # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip() # key = rec[16] print ("----------------- Matched: " + rec[3] + "--------------------") rec[9] = rmm[1] rec[10] = rmm[2] rec[11] = rmm[3] rec[12] = rmm[4] rec[13] = rmm[5] rec[14] = rmm[6] rec[15] = rmm[7] rec[16] = rmm[8] return rec #gets a string of posts and joins them together into one string to be put in the database as one string of text #@param: list of strings (the posts of a thread) #return: string containing the concatenation of all the strings def getPosts(posts): strPosts = ' '.join(posts) return strPosts.strip() #uses db connection , another program, methods to persists values to the correct categories #@param: row is the list of entries for this instance, cur is the db connection object def persist_data(url, row, cur): forum = create_forum(cur, row, url) board = create_board(cur, row, forum) author = create_user(cur, row, forum, 0) topic = create_topic(cur, row, forum, board, author) create_posts(cur, row, forum, board, topic) #main method for this program, what actually gets the parsed info from the parser, and persists them into the db #calls the different parser methods here depending on the type of html page def new_parse(forum, url, createLog): from Forums.Initialization.forums_mining import config, CURRENT_DATE print("Parsing The " + forum + " Forum and conduct data classification to store the information in the database.") # ini = time.time() # Connecting to the database con = connectDataBase() cur = con.cursor() # Creating the tables (The database should be created manually) create_database(cur, con) nError = 0 lines = [] # listing pages lns = [] # description pages detPage = {} # first pages other = {} # other pages # Creating the log file for each Forum if createLog: if not os.path.exists("./" + forum + "/Logs/" + forum + "_" + CURRENT_DATE + ".log"): logFile = open("./" + forum + "/Logs/" + forum + "_" + CURRENT_DATE + ".log", "w") else: print("Files of the date " + CURRENT_DATE + " from the Forum " + forum + " were already read. Delete the referent information in the Data Base and also delete the log file" " in the _Logs folder to read files from this Forum of this date again.") raise SystemExit mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + forum + "/HTML_Pages") # Reading the Listing Html Pages for fileListing in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')): lines.append(fileListing) # Reading the Description Html Pages for fileDescription in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", '*.html')): lns.append(fileDescription) # Parsing the Description Pages and put the tag's content into a dictionary (Hash table) for index, line2 in enumerate(lns): print("Reading description folder of '" + forum + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns))) try: html = codecs.open(line2.strip('\n'), encoding='utf8') soup = BeautifulSoup(html, "html.parser") html.close() except: try: html = open(line2.strip('\n')) soup = BeautifulSoup(html, "html.parser") html.close() except: nError += 1 print("There was a problem to read the file " + line2 + " in the Description section!") if createLog: logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section!\n") continue try: if forum == "BestCardingWorld": rmm = bestcardingworld_description_parser(soup) elif forum == "CryptBB": rmm = cryptBB_description_parser(soup) elif forum == "OnniForums": rmm = onniForums_description_parser(soup) elif forum == "Altenens": rmm = altenens_description_parser(soup) elif forum == "Procrax": rmm = procrax_description_parser(soup) # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip() key = u"Url:" + os.path.basename(line2).replace(".html", "") # check if "page1" exists at the end of a string # if yes add to first page directory if no add to other check = re.compile(r'page1$') if check.search(key): # print(key, 'is a first page\n') detPage[key] = {'rmm': rmm, 'files': [os.path.basename(line2)]} else: # print(key, 'is an other page\n') other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)} except: nError += 1 print("There was a problem to parse the file " + line2 + " in the Description section!") traceback.print_exc() if createLog: logFile.write(str(nError) + ". There was a problem to parse the file " + line2 + " in the Description section.\n") # goes through keys from detPage and other, checks if the keys match. # if yes adds other[key] values to detPage w/o overwritting for key in detPage.keys(): for k in list(other.keys()): checkkey = str(key[4:]) checkk = str(k[4:]) if checkkey in checkk: detPage[key]['rmm'][1].extend(other[k]['rmm'][1]) detPage[key]['rmm'][2].extend(other[k]['rmm'][2]) detPage[key]['rmm'][3].extend(other[k]['rmm'][3]) detPage[key]['rmm'][4].extend(other[k]['rmm'][4]) detPage[key]['rmm'][5].extend(other[k]['rmm'][5]) detPage[key]['rmm'][6].extend(other[k]['rmm'][6]) detPage[key]['rmm'][7].extend(other[k]['rmm'][7]) detPage[key]['rmm'][8].extend(other[k]['rmm'][8]) detPage[key]['files'].append(other[k]['filename']) other.pop(k) # Parsing the Listing Pages and put the tag's content into a list for index, line1 in enumerate(lines): print("Reading listing folder of '" + forum + "', file '" + os.path.basename(line1) + "', index= " + str(index + 1) + " ... " + str(len(lines))) readError = False try: html = codecs.open(line1.strip('\n'), encoding='utf8') soup = BeautifulSoup(html, "html.parser") html.close() except: try: html = open(line1.strip('\n')) soup = BeautifulSoup(html, "html.parser") html.close() except: nError += 1 print("There was a problem to read the file " + line1 + " in the Listing section!") if createLog: logFile.write(str(nError) + ". There was a problem to read the file " + line1 + " in the Listing section.\n") readError = True if not readError: parseError = False try: if forum == "BestCardingWorld": rw = bestcardingworld_listing_parser(soup) elif forum == "CryptBB": rw = cryptBB_listing_parser(soup) elif forum == "OnniForums": rw = onniForums_listing_parser(soup) elif forum == "Altenens": rw = altenens_listing_parser(soup) elif forum == "Procrax": rw = procrax_listing_parser(soup) except: nError += 1 print("There was a problem to read the file " + line1 + " in the listing section!") traceback.print_exc() if createLog: logFile.write( str(nError) + ". There was a problem to read the file " + line1 + " in the Listing section.\n") parseError = True if not parseError: persistError = False moveError = False num_in_db = 0 num_persisted_moved = 0 for rec in rw: rec = rec.split(',') # print(rec) # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip() key = u"Url:" + cleanLink(rec[6]) + "page1" # print(key) if key in detPage: # Combining the information from Listing and Description Pages rmm = detPage[key]['rmm'] rec = mergePages(rmm, rec) # Append to the list the classification of the topic # if isRussianForum(forum): # rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_russian'))) # else: # rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english'))) rec.append(str(predict(rec[3], getPosts(rec[14]), language='sup_english'))) # Persisting the information in the database try: persist_data(url, tuple(rec), cur) con.commit() except: trace = traceback.format_exc() if trace.find("already exists") == -1: nError += 1 print("There was a problem to persist the file " + detPage[key]['filename'] + " in the database!") if createLog: logFile.write( str(nError) + ". There was a problem to persist the file " + detPage[key]['filename'] + " in the database.\n") persistError = True con.rollback() if not persistError: # move description files of completed folder for filename in detPage[key]['files']: source = line2.replace(os.path.basename(line2), "") + filename destination = line2.replace(os.path.basename(line2), "") + r'Read/' try: shutil.move(source, destination) num_persisted_moved += 1 except: print("There was a problem to move the file " + filename + " in the Description section!") nError += 1 if createLog: logFile.write( str(nError) + ". There was a problem to move the file " + filename + " in the Description section!.\n") moveError = True # if the associated description page is not read or not parsed else: # query database # if the post already exists: # num_in_db += 1 pass # if number of topics on listing page is equal to # the number of merged, persisted, and moved topics plus # the number of topics already in the database if not persistError and not moveError and len(rw) == (num_persisted_moved + num_in_db): # move listing file to completed folder source = line1 destination = line1.replace(os.path.basename(line1), "") + r'Read/' try: shutil.move(source, destination) except: nError += 1 print("There was a problem to move the file " + line1 + " in the Listing section!") if createLog: logFile.write(str(nError) + ". There was a problem to move the file " + line1 + " in the Listing section!.\n") if createLog: logFile.close() #end = time.time() #finalTime = float(end-ini) #print (forum + " Parsing Perfomed Succesfully in %.2f" %finalTime + "!") input("Parsing the " + forum + " forum and data classification done successfully. Press ENTER to continue\n")