diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py
index 2d9e3cd..1ae6d8a 100644
--- a/Forums/Altenens/crawler_selenium.py
+++ b/Forums/Altenens/crawler_selenium.py
@@ -138,6 +138,7 @@ def createFFDriver():
 
     return driver
 
+
 def getAccess():
     url = getFixedURL()
     driver = createFFDriver()
diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py
index 5fcf17e..883ac34 100644
--- a/Forums/Initialization/forums_mining.py
+++ b/Forums/Initialization/forums_mining.py
@@ -1,7 +1,7 @@
 __author__ = 'DarkWeb'
 
 '''
-
+Starting point of the Darkweb Forums Mining
 '''
 
 import os
@@ -101,8 +101,7 @@ if __name__ == '__main__':
         print("Creating listing and description directories ... for " + forum)
         createDirectory(forum)
         time.sleep(5) # wait for directories to be created
-        input("Directories created successfully. Press ENTER to continue\n")
-
+        print("Directories created successfully.")
 
         if forum == "BestCardingWorld":
             crawlerBestCardingWorld()
@@ -123,8 +122,6 @@ if __name__ == '__main__':
 
         elif forum == 'Libre':
             crawlerLibre()
 
-
-
     print("Scraping process completed successfully!")
 
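Note: replacing `input()` with `print()` in forums_mining.py removes the blocking prompt so the crawl can run unattended. If a manual pause is still useful for debugging runs, a minimal sketch (the `DW_INTERACTIVE` variable and `checkpoint` helper are hypothetical, not part of this patch):

    import os

    def checkpoint(message):
        # Pause only when DW_INTERACTIVE is set; unattended runs never block.
        if os.environ.get("DW_INTERACTIVE"):
            input(message + " Press ENTER to continue\n")
        else:
            print(message)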
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index a1ef429..c9a50ae 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -16,6 +16,8 @@ from Forums.Libre.parser import *
 from Forums.Classifier.classify_product import predict
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
 
+# controls the log id
+nError = 0
 
 # determines if forum is russian, not really used now but maybe later
 def isRussianForum(forum):
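Note: `nError` numbers the entries written to the log file; each helper added below bumps it through `incrementError()`, which is why the module-level global exists. An equivalent without `global` statements, sketched with an assumed `next_error_id` helper:

    import itertools

    _error_ids = itertools.count(1)  # yields 1, 2, 3, ...

    def next_error_id():
        # Replaces the global counter: each call returns the next log id.
        return next(_error_ids)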
@@ -80,6 +82,141 @@ def persist_data(url, row, cur):
 
         create_posts(cur, row, forum, board, topic)
 
+def incrementError():
+    global nError
+    nError += 1
+
+
+def read_file(filePath, createLog, logFile):
+
+    try:
+        html = codecs.open(filePath.strip('\n'), encoding='utf8')
+        soup = BeautifulSoup(html, "html.parser")
+        html.close()
+        return soup
+    except:
+
+        try:
+            html = open(filePath.strip('\n'))
+            soup = BeautifulSoup(html, "html.parser")
+            html.close()
+            return soup
+        except:
+
+            incrementError()
+            print("There was a problem reading the file " + filePath)
+            if createLog:
+                logFile.write(
+                    str(nError) + ". There was a problem reading the file " + filePath + "\n")
+            return None
+
+
+def parse_listing(forum, listingFile, soup, createLog, logFile):
+
+    try:
+
+        rw = []
+
+        if forum == "BestCardingWorld":
+            rw = bestcardingworld_listing_parser(soup)
+        elif forum == "Cardingleaks":
+            rw = cardingleaks_listing_parser(soup)
+        elif forum == "CryptBB":
+            rw = cryptBB_listing_parser(soup)
+        elif forum == "OnniForums":
+            rw = onniForums_listing_parser(soup)
+        elif forum == "Altenens":
+            rw = altenens_listing_parser(soup)
+        elif forum == "Procrax":
+            rw = procrax_listing_parser(soup)
+        elif forum == "Libre":
+            rw = libre_listing_parser(soup)
+        return rw
+
+    except:
+
+        incrementError()
+        print("There was a problem parsing the file " + listingFile + " in the Listing section!")
+        traceback.print_exc()
+        if createLog:
+            logFile.write(
+                str(nError) + ". There was a problem parsing the file " + listingFile + " in the Listing section.\n")
+        return None
+
+
+def parse_description(forum, descriptionFile, soup, createLog, logFile):
+
+    try:
+
+        rmm = []
+
+        if forum == "BestCardingWorld":
+            rmm = bestcardingworld_description_parser(soup)
+        elif forum == "Cardingleaks":
+            rmm = cardingleaks_description_parser(soup)
+        elif forum == "CryptBB":
+            rmm = cryptBB_description_parser(soup)
+        elif forum == "OnniForums":
+            rmm = onniForums_description_parser(soup)
+        elif forum == "Altenens":
+            rmm = altenens_description_parser(soup)
+        elif forum == "Procrax":
+            rmm = procrax_description_parser(soup)
+        elif forum == "Libre":
+            rmm = libre_description_parser(soup)
+        return rmm
+
+    except:
+
+        incrementError()
+        print("There was a problem parsing the file " + descriptionFile + " in the Description section!")
+        traceback.print_exc()
+        if createLog:
+            logFile.write(
+                str(nError) + ". There was a problem parsing the file " + descriptionFile + " in the Description section.\n")
+        return None
+
+
+def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):
+
+    try:
+        persist_data(url, tuple(rec), cur)
+        con.commit()
+        return True
+    except:
+
+        con.rollback()
+
+        trace = traceback.format_exc()
+
+        if trace.find("already exists") == -1:
+            incrementError()
+            print(f"There was a problem persisting the files ({listingFile} + {descriptionFile}) in the database!")
+            if createLog:
+                logFile.write(
+                    str(nError) + f". There was a problem persisting the files ({listingFile} + {descriptionFile}) in the database!\n")
+            return False
+        else:
+            return True
+
+
+def move_file(filePath, createLog, logFile):
+
+    # destination is a Read/ subfolder next to the source file
+    source = filePath
+    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
+
+    try:
+        shutil.move(source, destination)
+        return True
+    except:
+
+        print("There was a problem moving the file " + filePath)
+        incrementError()
+        if createLog:
+            logFile.write(
+                str(nError) + ". There was a problem moving the file " + filePath + "\n")
+        return False
+
+
 #main method for this program, what actually gets the parsed info from the parser, and persists them into the db
 #calls the different parser methods here depending on the type of html page
 def new_parse(forum, url, createLog):
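Note: `read_file` tries UTF-8 first and falls back to the platform default encoding before giving up. The same fallback can be written as a loop, so further encodings only need to be added in one place; a sketch (`read_file_any` is an illustrative name, not part of this patch):

    import codecs
    from bs4 import BeautifulSoup

    def read_file_any(filePath):
        # Try UTF-8 first, then the platform default, mirroring read_file.
        for opener in (lambda p: codecs.open(p, encoding='utf8'), open):
            try:
                html = opener(filePath.strip('\n'))
                soup = BeautifulSoup(html, "html.parser")
                html.close()
                return soup
            except Exception:
                continue
        return None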
@@ -88,8 +225,6 @@ def new_parse(forum, url, createLog):
 
     print("Parsing The " + forum + " Forum and conduct data classification to store the information in the database.")
 
-    # ini = time.time()
-
     # Connecting to the database
     con = connectDataBase()
     cur = con.cursor()
@@ -97,268 +232,113 @@
     # Creating the tables (The database should be created manually)
     create_database(cur, con)
 
-    nError = 0
-
-    lines = []  # listing pages
-    lns = []  # description pages
-    detPage = {}  # first pages
-    other = {}  # other pages
+    mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + forum + "/HTML_Pages")
 
     # Creating the log file for each Forum
     if createLog:
-        if not os.path.exists("./" + forum + "/Logs/" + forum + "_" + CURRENT_DATE + ".log"):
-            logFile = open("./" + forum + "/Logs/" + forum + "_" + CURRENT_DATE + ".log", "w")
-        else:
-            print("Files of the date " + CURRENT_DATE + " from the Forum " + forum +
-                  " were already read. Delete the referent information in the Data Base and also delete the log file"
-                  " in the _Logs folder to read files from this Forum of this date again.")
+        try:
+            logFile = open(mainDir + f"/{CURRENT_DATE}/" + forum + "_" + CURRENT_DATE + ".log", "w")
+        except:
+            print("Could not open log file!")
             raise SystemExit
-
-    mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + forum + "/HTML_Pages")
+    else:
+        logFile = None
 
     # Reading the Listing Html Pages
-    for fileListing in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')):
-        lines.append(fileListing)
+    listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
+    for listingIndex, listingFile in enumerate(listings):
 
-    # Reading the Description Html Pages
-    for fileDescription in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", '*.html')):
-        lns.append(fileDescription)
+        print("Reading listing folder of '" + forum + "', file '" + os.path.basename(listingFile) + "', index= " + str(
+            listingIndex + 1) + " ... " + str(len(listings)))
 
-    # Parsing the Description Pages and put the tag's content into a dictionary (Hash table)
-    for index, line2 in enumerate(lns):
+        listingSoup = read_file(listingFile, createLog, logFile)
 
-        print("Reading description folder of '" + forum + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns)))
+        # listing flags
+        doParseListing = listingSoup is not None
+        doDescription = False
 
-        try:
-            html = codecs.open(line2.strip('\n'), encoding='utf8')
-            soup = BeautifulSoup(html, "html.parser")
-            html.close()
-        except:
+        readDescriptionError = False
+        parseDescriptionError = False
+        persistDescriptionError = False
+        moveDescriptionError = False
 
-            try:
-                html = open(line2.strip('\n'))
-                soup = BeautifulSoup(html, "html.parser")
-                html.close()
-            except:
+        rw = []
 
-                nError += 1
-                print("There was a problem to read the file " + line2 + " in the Description section!")
-                if createLog:
-                    logFile.write(str(nError) + ". There was a problem to read the file " + line2 + " in the Description section!\n")
-                continue
+        if doParseListing:
 
-        try:
+            rw = parse_listing(forum, listingFile, listingSoup, createLog, logFile)
 
-            if forum == "BestCardingWorld":
-                rmm = bestcardingworld_description_parser(soup)
-            elif forum == "Cardingleaks":
-                rmm = cardingleaks_description_parser(soup)
-            elif forum == "CryptBB":
-                rmm = cryptBB_description_parser(soup)
-            elif forum == "OnniForums":
-                rmm = onniForums_description_parser(soup)
-            elif forum == "Altenens":
-                rmm = altenens_description_parser(soup)
-            elif forum == "Procrax":
-                rmm = procrax_description_parser(soup)
-            elif forum == "Libre":
-                rmm = libre_description_parser(soup)
-
-            # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
-            key = u"Url:" + os.path.basename(line2).replace(".html", "")
-
-            # check if "page1" exists at the end of a string
-            # if yes add to first page directory if no add to other
-            check = re.compile(r'page1$')
-            if check.search(key):
-                # print(key, 'is a first page\n')
-                detPage[key] = {'rmm': rmm, 'files': [os.path.basename(line2)]}
-            else:
-                # print(key, 'is an other page\n')
-                other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
+            doDescription = rw is not None
 
-        except:
+        if doDescription:
 
-            nError += 1
-            print("There was a problem to parse the file " + line2 + " in the Description section!")
-            traceback.print_exc()
-            if createLog:
-                logFile.write(str(nError) + ". There was a problem to parse the file " + line2 + " in the Description section.\n")
-
-    # goes through keys from detPage and other, checks if the keys match.
-    # if yes adds other[key] values to detPage w/o overwritting
-    for key in detPage.keys():
-        for k in list(other.keys()):
-            checkkey = str(key[4:])
-            checkk = str(k[4:])
-
-            if checkkey in checkk:
-                detPage[key]['rmm'][1].extend(other[k]['rmm'][1])
-                detPage[key]['rmm'][2].extend(other[k]['rmm'][2])
-                detPage[key]['rmm'][3].extend(other[k]['rmm'][3])
-                detPage[key]['rmm'][4].extend(other[k]['rmm'][4])
-                detPage[key]['rmm'][5].extend(other[k]['rmm'][5])
-                detPage[key]['rmm'][6].extend(other[k]['rmm'][6])
-                detPage[key]['rmm'][7].extend(other[k]['rmm'][7])
-                detPage[key]['rmm'][8].extend(other[k]['rmm'][8])
-                detPage[key]['files'].append(other[k]['filename'])
-
-                other.pop(k)
-
-    # Parsing the Listing Pages and put the tag's content into a list
-    for index, line1 in enumerate(lines):
-
-        print("Reading listing folder of '" + forum + "', file '" + os.path.basename(line1) + "', index= " + str(index + 1) + " ... " + str(len(lines)))
-
-        readError = False
-        try:
-            html = codecs.open(line1.strip('\n'), encoding='utf8')
-            soup = BeautifulSoup(html, "html.parser")
-            html.close()
-        except:
+            for rec in rw:
+
+                rec = rec.split(',')
+
+                descriptionPattern = cleanLink(rec[6]) + "page[0-9]*.html"
+
+                # Reading the associated description Html Pages
+                descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))
+                for descriptionIndex, descriptionFile in enumerate(descriptions):
+
+                    print("Reading description folder of '" + forum + "', file '" + os.path.basename(
+                        descriptionFile) + "', index= " + str(descriptionIndex + 1) + " ... " + str(len(descriptions)))
+
+                    descriptionSoup = read_file(descriptionFile, createLog, logFile)
+
+                    # description flags
+                    doParseDescription = descriptionSoup is not None
+                    doPersistRecord = False
+                    doMoveDescription = False
 
-            try:
-                html = open(line1.strip('\n'))
-                soup = BeautifulSoup(html, "html.parser")
-                html.close()
-            except:
-
-                nError += 1
-                print("There was a problem to read the file " + line1 + " in the Listing section!")
-                if createLog:
-                    logFile.write(str(nError) + ". There was a problem to read the file " + line1 + " in the Listing section.\n")
-                readError = True
-
-        if not readError:
-
-            parseError = False
-            try:
-
-                if forum == "BestCardingWorld":
-                    rw = bestcardingworld_listing_parser(soup)
-                elif forum == "Cardingleaks":
-                    rw = cardingleaks_listing_parser(soup)
-                elif forum == "CryptBB":
-                    rw = cryptBB_listing_parser(soup)
-                elif forum == "OnniForums":
-                    rw = onniForums_listing_parser(soup)
-                elif forum == "Altenens":
-                    rw = altenens_listing_parser(soup)
-                elif forum == "Procrax":
-                    rw = procrax_listing_parser(soup)
-                elif forum == "Libre":
-                    rw = libre_listing_parser(soup)
-
-            except:
-
-                nError += 1
-                print("There was a problem to read the file " + line1 + " in the listing section!")
-                traceback.print_exc()
-                if createLog:
-                    logFile.write(
-                        str(nError) + ". There was a problem to read the file " + line1 + " in the Listing section.\n")
-                parseError = True
-
-            if not parseError:
-
-                persistError = False
-                moveError = False
-                num_in_db = 0
-                num_persisted_moved = 0
-
-                for rec in rw:
-
-                    rec = rec.split(',')
-                    # print(rec)
-
-                    # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
-                    key = u"Url:" + cleanLink(rec[6]) + "page1"
-                    # print(key)
-
-                    if key in detPage:
+                    rmm = []
+
+                    if doParseDescription:
+
+                        rmm = parse_description(forum, descriptionFile, descriptionSoup, createLog, logFile)
+
+                        doPersistRecord = rmm is not None
+
+                    else:
+                        readDescriptionError = True
+                        parseDescriptionError = True
+
+                    if doPersistRecord:
 
                         # Combining the information from Listing and Description Pages
-                        rmm = detPage[key]['rmm']
                         rec = mergePages(rmm, rec)
 
                         # Append to the list the classification of the topic
-                        # if isRussianForum(forum):
-                        #     rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_russian')))
-                        # else:
-                        #     rec.append(str(predict(rec[1], getPosts(rec[8]), language='sup_english')))
                         rec.append(str(predict(rec[3], getPosts(rec[14]), language='sup_english')))
 
                         # Persisting the information in the database
-                        try:
-                            persist_data(url, tuple(rec), cur)
-                            con.commit()
-                        except:
-
-                            trace = traceback.format_exc()
-
-                            if trace.find("already exists") == -1:
-                                nError += 1
-                                print("There was a problem to persist the file " + detPage[key]['filename'] + " in the database!")
-                                if createLog:
-                                    logFile.write(
-                                        str(nError) + ". There was a problem to persist the file " + detPage[key]['filename'] + " in the database.\n")
-                                persistError = True
-
-                            con.rollback()
-
-                        if not persistError:
-
-                            # move description files of completed folder
-                            for filename in detPage[key]['files']:
-                                source = line2.replace(os.path.basename(line2), "") + filename
-                                destination = line2.replace(os.path.basename(line2), "") + r'Read/'
-
-                                try:
-                                    shutil.move(source, destination)
-                                    num_persisted_moved += 1
-                                except:
-
-                                    print("There was a problem to move the file " + filename + " in the Description section!")
-                                    nError += 1
-                                    if createLog:
-                                        logFile.write(
-                                            str(nError) + ". There was a problem to move the file " + filename + " in the Description section!.\n")
-                                    moveError = True
-
-                    # if the associated description page is not read or not parsed
-                    else:
-                        # query database
-                        # if the post already exists:
-                        #     num_in_db += 1
-                        pass
+                        persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile)
 
-                # if number of topics on listing page is equal to
-                # the number of merged, persisted, and moved topics plus
-                # the number of topics already in the database
-                if not persistError and not moveError and len(rw) == (num_persisted_moved + num_in_db):
+                        doMoveDescription = persistSuccess
 
-                    # move listing file to completed folder
-                    source = line1
-                    destination = line1.replace(os.path.basename(line1), "") + r'Read/'
+                    else:
+                        parseDescriptionError = True
 
-                    try:
-                        shutil.move(source, destination)
-                    except:
+                    if doMoveDescription:
 
-                        nError += 1
-                        print("There was a problem to move the file " + line1 + " in the Listing section!")
-                        if createLog:
-                            logFile.write(str(nError) + ". There was a problem to move the file " + line1 + " in the Listing section!.\n")
+                        # move description file to the completed folder
+                        moveSuccess = move_file(descriptionFile, createLog, logFile)
 
-    if createLog:
-        logFile.close()
+                        if not moveSuccess:
+                            moveDescriptionError = True
 
-    #end = time.time()
+                    else:
+                        moveDescriptionError = True
 
-    #finalTime = float(end-ini)
+        if not (readDescriptionError or parseDescriptionError or persistDescriptionError or moveDescriptionError):
 
-    #print (forum + " Parsing Perfomed Succesfully in %.2f" %finalTime + "!")
+            # move listing file to the completed folder
+            move_file(listingFile, createLog, logFile)
+
+    if createLog:
+        logFile.close()
 
-    input("Parsing the " + forum + " forum and data classification done successfully. Press ENTER to continue\n")
+    print("Parsing of the " + forum + " forum and data classification completed.")
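Note: `persist_record` decides that a row is already stored by searching the traceback text for "already exists", which is fragile across driver versions. If the database driver is psycopg2 (an assumption, not confirmed by this patch), the duplicate-key case can be caught as a typed exception; `persist_record_strict` is an illustrative name:

    import psycopg2.errors

    def persist_record_strict(url, rec, cur, con):
        # Same contract as persist_record: True on success or duplicate.
        try:
            persist_data(url, tuple(rec), cur)
            con.commit()
            return True
        except psycopg2.errors.UniqueViolation:
            con.rollback()  # row already in the database: treat as success
            return True
        except Exception:
            con.rollback()  # genuine failure
            return False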
diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py
index 7740eda..d3538b2 100644
--- a/MarketPlaces/Initialization/markets_mining.py
+++ b/MarketPlaces/Initialization/markets_mining.py
@@ -1,7 +1,7 @@
 __author__ = 'DarkWeb'
 
 '''
-Starting point of the Darkweb Mining Platform
+Starting point of the Darkweb Markets Mining
 '''
 
 import os
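Note: the globs in the new_parse hunk above join paths with the literal separators "\\Listing" and "\\Description", so they only match on Windows. Building the whole path with os.path.join keeps the lookup portable; a sketch with an assumed `find_pages` helper:

    import glob
    import os

    def find_pages(mainDir, date, section):
        # section is "Listing" or "Description"; os.path.join picks the
        # correct separator on every platform.
        return glob.glob(os.path.join(mainDir, date, section, '*.html'))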