__author__ = 'DarkWeb'

import glob
import os
import codecs
import shutil
import time
import traceback

from bs4 import BeautifulSoup
from psycopg2.extras import RealDictCursor

from MarketPlaces.DB_Connection.db_connection import *
from MarketPlaces.DarkFox.parser import *
from MarketPlaces.AnonymousMarketplace.parser import *
from MarketPlaces.TheDarkMarket.parser import *
from MarketPlaces.ViceCity.parser import *
from MarketPlaces.M00nkeyMarket.parser import *
from MarketPlaces.MikesGrandStore.parser import *
from MarketPlaces.PabloEscobarMarket.parser import *
from MarketPlaces.CityMarket.parser import *
from MarketPlaces.DarkBazar.parser import *
from MarketPlaces.Sonanza.parser import *
from MarketPlaces.Kingdom.parser import *
from MarketPlaces.BlackPyramid.parser import *
from MarketPlaces.Quest.parser import *
from MarketPlaces.Ares.parser import *
from MarketPlaces.CypherMarketplace.parser import *
from MarketPlaces.WeTheNorth.parser import *
from MarketPlaces.Torzon.parser import *
from MarketPlaces.GoFish.parser import *
from MarketPlaces.ZeroDay.parser import *
from MarketPlaces.DarkMarket.parser import *
from MarketPlaces.DarkDock.parser import *
from MarketPlaces.SilkRoad4.parser import *
from MarketPlaces.DarkRoad.parser import *
from MarketPlaces.Classifier.classify_product import predict
from Translator.translate import translate

nError = 0


def mergePages(rmm, rec):
    # fills the "-1" placeholders of a listing record (rec) with the values
    # parsed from the matching description page (rmm)

    # key = u"Pr:" + rec[1].upper() + u" Vendor:" + rec[18].upper()
    # key = rec[23]

    print("----------------- Matched: " + rec[4] + "--------------------")

    if rec[1] == "-1":  # name_vendor
        rec[1] = rmm[0]
    if rec[2] == "-1":  # rating_vendor
        rec[2] = rmm[1]
    if rec[3] == "-1":  # success_vendor
        rec[3] = rmm[2]
    if rec[4] == "-1":  # name_item
        rec[4] = rmm[3]
    if rec[5] == "-1":  # description_item
        rec[5] = rmm[4]
    if rec[6] == "-1":  # cve_item
        rec[6] = rmm[5]
    if rec[7] == "-1":  # ms_item
        rec[7] = rmm[6]
    if rec[8] == "-1":  # category_item
        rec[8] = rmm[7]
    if rec[9] == "-1":  # views_item
        rec[9] = rmm[8]
    if rec[10] == "-1":  # reviews_item
        rec[10] = rmm[9]
    if rec[11] == "-1":  # rating_item
        rec[11] = rmm[10]
    if rec[12] == "-1":  # adddate_item
        rec[12] = rmm[11]
    if rec[13] == "-1":  # btc_item
        rec[13] = rmm[12]
    if rec[14] == "-1":  # usd_item
        rec[14] = rmm[13]
    if rec[15] == "-1":  # euro_item
        rec[15] = rmm[14]
    if rec[16] == "-1":  # quantitysold_item
        rec[16] = rmm[15]
    if rec[17] == "-1":  # quantityleft_item
        rec[17] = rmm[16]
    if rec[18] == "-1":  # shippedfrom_item
        rec[18] = rmm[17]
    if rec[19] == "-1":  # shippedto_item
        rec[19] = rmm[18]
    if rmm[19] != "-1":  # image
        rec[20] = rmm[19]
    if rmm[20] != "-1":  # image_vendor
        rec[21] = rmm[20]

    return rec
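# For reference, the record columns touched by mergePages, as inferred from
# the inline comments above (the authoritative column order is produced by
# the individual market listing parsers; rec[22] holds the item link later
# used to locate the matching description page):
#
#   rec[1]  name_vendor        rec[8]  category_item   rec[15] euro_item
#   rec[2]  rating_vendor      rec[9]  views_item      rec[16] quantitysold_item
#   rec[3]  success_vendor     rec[10] reviews_item    rec[17] quantityleft_item
#   rec[4]  name_item          rec[11] rating_item     rec[18] shippedfrom_item
#   rec[5]  description_item   rec[12] adddate_item    rec[19] shippedto_item
#   rec[6]  cve_item           rec[13] btc_item        rec[20] image
#   rec[7]  ms_item            rec[14] usd_item        rec[21] image_vendor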
def persist_data(url, row, cur):

    marketPlace = create_marketPlace(cur, row, url)

    vendor = create_vendor(cur, row, marketPlace)

    create_items(cur, row, marketPlace, vendor)


def incrementError():
    global nError
    nError += 1


def read_file(filePath, createLog, logFile):

    try:
        html = codecs.open(filePath.strip('\n'), encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
        html.close()
        time.sleep(0.01)  # making sure the file is closed before returning the soup object
        return soup
    except:
        # fall back to the platform default encoding if UTF-8 decoding fails
        try:
            html = open(filePath.strip('\n'))
            soup = BeautifulSoup(html, "html.parser")
            html.close()
            time.sleep(0.01)  # making sure the file is closed before returning the soup object
            return soup
        except:
            incrementError()
            print("There was a problem reading the file " + filePath)
            if createLog:
                logFile.write(
                    str(nError) + ". There was a problem reading the file " + filePath + "\n"
                    + traceback.format_exc() + "\n")
            return None


def parse_listing(marketPlace, listingFile, soup, createLog, logFile):

    try:
        if marketPlace == "DarkFox":
            rw = darkfox_listing_parser(soup)
        elif marketPlace == "AnonymousMarketplace":
            rw = anonymousMarketplace_listing_parser(soup)
        elif marketPlace == "ViceCity":
            rw = vicecity_listing_parser(soup)
        elif marketPlace == "M00nkeyMarket":
            rw = m00nkey_listing_parser(soup)
        elif marketPlace == "MikesGrandStore":
            rw = MikesGrandStore_listing_parser(soup)
        elif marketPlace == "PabloEscobarMarket":
            rw = pabloescobarmarket_listing_parser(soup)
        elif marketPlace == "CityMarket":
            rw = city_listing_parser(soup)
        elif marketPlace == "Ares":
            rw = ares_listing_parser(soup)
        elif marketPlace == "DarkBazar":
            rw = darkbazar_listing_parser(soup)
        elif marketPlace == "Sonanza":
            rw = sonanza_listing_parser(soup)
        elif marketPlace == "Kingdom":
            rw = kingdom_listing_parser(soup)
        elif marketPlace == "BlackPyramid":
            rw = blackpyramid_listing_parser(soup)
        elif marketPlace == "Quest":
            rw = quest_listing_parser(soup)
        elif marketPlace == "CypherMarketplace":
            rw = cyphermarketplace_listing_parser(soup)
        elif marketPlace == "TheDarkMarket":
            rw = thedarkmarket_listing_parser(soup)
        elif marketPlace == "WeTheNorth":
            rw = wethenorth_listing_parser(soup)
        elif marketPlace == "GoFish":
            rw = gofish_listing_parser(soup)
        elif marketPlace == "ZeroDay":
            rw = zeroday_listing_parser(soup)
        elif marketPlace == "Torzon":
            rw = torzon_listing_parser(soup)
        elif marketPlace == "DarkMarket":
            rw = darkmarket_listing_parser(soup)
        elif marketPlace == "DarkDock":
            rw = darkdock_listing_parser(soup)
        elif marketPlace == "SilkRoad4":
            rw = silkroad4_listing_parser(soup)
        elif marketPlace == "DarkRoad":
            rw = darkroad_listing_parser(soup)
        else:
            print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
            raise Exception
        return rw
    except:
        incrementError()
        print("There was a problem parsing the file " + listingFile + " in the Listing section!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem parsing the file " + listingFile + " in the Listing section.\n"
                + traceback.format_exc() + "\n")
        return None
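# The if/elif chains in parse_listing above (and parse_description below) can
# also be written as dispatch tables. A minimal illustrative sketch, not wired
# into the pipeline; only a subset of markets is shown, and every function
# referenced is already imported at the top of this module:
_EXAMPLE_LISTING_PARSERS = {
    "DarkFox": darkfox_listing_parser,
    "CityMarket": city_listing_parser,
    "DarkMarket": darkmarket_listing_parser,
    "SilkRoad4": silkroad4_listing_parser,
}

# Usage sketch:
#   parser = _EXAMPLE_LISTING_PARSERS.get(marketPlace)
#   rw = parser(soup) if parser is not None else None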
def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):

    try:
        if marketPlace == "DarkFox":
            rmm = darkfox_description_parser(soup)
        elif marketPlace == "AnonymousMarketplace":
            rmm = anonymousMarketplace_description_parser(soup)
        elif marketPlace == "ViceCity":
            rmm = vicecity_description_parser(soup)
        elif marketPlace == "M00nkeyMarket":
            rmm = m00nkey_description_parser(soup)
        elif marketPlace == "MikesGrandStore":
            rmm = MikesGrandStore_description_parser(soup)
        elif marketPlace == "PabloEscobarMarket":
            rmm = pabloescobarmarket_description_parser(soup)
        elif marketPlace == "CityMarket":
            rmm = city_description_parser(soup)
        elif marketPlace == "Ares":
            rmm = ares_description_parser(soup)
        elif marketPlace == "DarkBazar":
            rmm = darkbazar_description_parser(soup)
        elif marketPlace == "Sonanza":
            rmm = sonanza_description_parser(soup)
        elif marketPlace == "Kingdom":
            rmm = kingdom_description_parser(soup)
        elif marketPlace == "BlackPyramid":
            rmm = blackpyramid_description_parser(soup)
        elif marketPlace == "Quest":
            rmm = quest_description_parser(soup)
        elif marketPlace == "CypherMarketplace":
            rmm = cyphermarketplace_description_parser(soup)
        elif marketPlace == "TheDarkMarket":
            rmm = thedarkmarket_description_parser(soup)
        elif marketPlace == "WeTheNorth":
            rmm = wethenorth_description_parser(soup)
        elif marketPlace == "GoFish":
            rmm = gofish_description_parser(soup)
        elif marketPlace == "ZeroDay":
            rmm = zeroday_description_parser(soup)
        elif marketPlace == "Torzon":
            rmm = torzon_description_parser(soup)
        elif marketPlace == "DarkMarket":
            rmm = darkmarket_description_parser(soup)
        elif marketPlace == "DarkDock":
            rmm = darkdock_description_parser(soup)
        elif marketPlace == "SilkRoad4":
            rmm = silkroad4_description_parser(soup)
        elif marketPlace == "DarkRoad":
            rmm = darkroad_description_parser(soup)
        else:
            print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
            raise Exception
        return rmm
    except:
        incrementError()
        print("There was a problem parsing the file " + descriptionFile + " in the Description section!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem parsing the file " + descriptionFile
                + " in the Description section.\n" + traceback.format_exc() + "\n")
        return None


def get_source_language(marketPlace):
    # markets known to publish in English; anything else falls back to
    # automatic language detection
    if marketPlace in ("BestCardingWorld", "CryptBB", "Incogsnoo", "CityMarket", "DarkMarket"):
        lang = 'english'
    else:
        print("MISSING CALL TO GET LANGUAGE IN PREPARE_PARSER.PY!")
        lang = 'auto'
    return lang


def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):
    try:
        persist_data(url, tuple(rec), cur)
        con.commit()
        return True
    except:
        con.rollback()
        incrementError()
        print(f"There was a problem persisting the files ({listingFile} + {descriptionFile}) in the database!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + f". There was a problem persisting the files ({listingFile} + {descriptionFile})"
                              f" in the database!\n" + traceback.format_exc() + "\n")
        return False
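# move_file below builds its destination with a hard-coded 'Read\\' segment,
# which assumes Windows path separators (consistent with the rest of this
# module). A portable sketch of the same layout, as a hypothetical helper
# that is not wired into the pipeline:
def _read_destination(filePath):
    # place the file in a 'Read' sibling folder next to its current location
    return os.path.join(os.path.dirname(filePath), "Read", os.path.basename(filePath))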
def move_file(filePath, createLog, logFile):

    source = filePath
    destination = filePath.replace(os.path.basename(filePath), "") + 'Read\\' + os.path.basename(filePath)

    try:
        # the third argument is the copy_function used when the move falls back
        # to copy-and-delete (e.g., across devices); copytree handles directories
        shutil.move(source, destination, shutil.copy2)
        return True
    except:
        try:
            shutil.move(source, destination, shutil.copytree)
            return True
        except:
            incrementError()
            print("There was a problem moving the file " + filePath)
            traceback.print_exc()
            if createLog:
                logFile.write(
                    str(nError) + ". There was a problem moving the file " + filePath + "\n"
                    + traceback.format_exc() + "\n")
            return False
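# How new_parse (below) pairs each listing record with its description page:
# the listing parsers put the item link in rec[22], and cleanLink is assumed
# to normalize that link into the filesystem-safe name the crawler used when
# saving the page. Sketch with an illustrative link (the .onion URL is a
# placeholder):
#
#   descriptionPattern = cleanLink("http://market.onion/item/123") + ".html"
#   matches = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))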
def new_parse(marketPlace, url, createLog):

    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    global nError
    nError = 0

    print("Parsing the " + marketPlace + " market and conducting data classification to store the information in the database.")

    # Connecting to the database
    con = connectDataBase()
    cur = con.cursor(cursor_factory=RealDictCursor)

    # Creating the tables (the database itself should be created manually)
    create_database(cur, con)

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces\\" + marketPlace + "\\HTML_Pages")

    # Creating the log file for each market
    if createLog:
        try:
            logFile = open(mainDir + f"/{CURRENT_DATE}/" + marketPlace + "_" + CURRENT_DATE + ".log", "w")
        except:
            print("Could not open log file!")
            createLog = False
            logFile = None
            # raise SystemExit
    else:
        logFile = None

    source_lang = get_source_language(marketPlace)

    # Reading the Listing HTML pages
    listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
    listings.sort(key=os.path.getmtime)

    for listingIndex, listingFile in enumerate(listings):

        print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(listingFile)
              + "', index= " + str(listingIndex + 1) + " ... " + str(len(listings)))

        listingSoup = read_file(listingFile, createLog, logFile)

        # listing flags
        doParseListing = listingSoup is not None
        doDescription = False

        readDescriptionError = False
        parseDescriptionError = False
        persistDescriptionError = False
        moveDescriptionError = False
        findDescriptionError = False

        rw = []

        if doParseListing:
            rw = parse_listing(marketPlace, listingFile, listingSoup, createLog, logFile)
            doDescription = rw is not None

        if doDescription:

            nFound = 0

            for rec in rw:

                rec = rec.split(',')

                descriptionPattern = cleanLink(rec[22]) + ".html"

                # Reading the associated description HTML pages
                descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))
                descriptions.sort(key=os.path.getmtime)

                nFound += len(descriptions)

                for descriptionIndex, descriptionFile in enumerate(descriptions):

                    print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(
                        descriptionFile) + "', index= " + str(descriptionIndex + 1) + " ... " + str(len(descriptions)))

                    descriptionSoup = read_file(descriptionFile, createLog, logFile)

                    # description flags
                    doParseDescription = descriptionSoup is not None
                    doPersistRecord = False
                    doMoveDescription = False

                    rmm = []

                    if doParseDescription:
                        rmm = parse_description(marketPlace, descriptionFile, descriptionSoup, createLog, logFile)
                        doPersistRecord = rmm is not None
                    else:
                        readDescriptionError = True
                        parseDescriptionError = True

                    if doPersistRecord:

                        # Combining the information from the Listing and Description pages
                        rec = mergePages(rmm, rec)

                        title = translate(rec[4], source_lang)
                        content = translate(rec[5], source_lang)

                        # Appending the product classification to the record
                        rec.append(str(predict(title, content, language='sup_english')))

                        # Persisting the information in the database
                        persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile,
                                                        descriptionFile)

                        doMoveDescription = persistSuccess

                    else:
                        parseDescriptionError = True

                    if doMoveDescription:

                        # moving description files to the completed (Read) folder
                        moveSuccess = move_file(descriptionFile, createLog, logFile)

                        if not moveSuccess:
                            moveDescriptionError = True

                    else:
                        moveDescriptionError = True

            if nFound == 0:
                findDescriptionError = True
                incrementError()
                print(f"There was a problem locating the file(s) for {listingFile} in the Description section!")
                if createLog:
                    logFile.write(
                        str(nError) + f". There was a problem locating the file(s) for {listingFile}"
                                      f" in the Description section!\n\n")

        if not (readDescriptionError or parseDescriptionError or persistDescriptionError
                or moveDescriptionError or findDescriptionError):

            # moving listing files to the completed (Read) folder
            move_file(listingFile, createLog, logFile)

    # registering the current market status (up/down) and the number of scraped pages in the database
    marketId = verifyMarketPlace(cur, marketPlace)
    if marketId > 0:

        readListings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing\\read", '*.html'))
        readDescriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description\\read", '*.html'))

        create_status(cur, marketId, CURRENT_DATE, len(readListings), len(readDescriptions),
                      '1' if len(listings) > 0 else '0')
        con.commit()

    if createLog:
        logFile.close()

    cur.close()
    con.close()

    print("Parsing the " + marketPlace + " market and data classification done.")
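# Minimal manual-run sketch; the market name and URL are placeholders only
# (in production, new_parse is invoked by the Initialization scripts, and a
# reachable database is required).
if __name__ == "__main__":
    new_parse(marketPlace="DarkMarket", url="http://example.onion", createLog=False)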