|
|
@ -20,9 +20,12 @@ from MarketPlaces.TorMarket.parser import * |
|
|
|
from MarketPlaces.HiddenMarket.parser import * |
|
|
|
from MarketPlaces.RobinhoodMarket.parser import * |
|
|
|
from MarketPlaces.Nexus.parser import * |
|
|
|
from MarketPlaces.MikesGrandStore.parser import * |
|
|
|
|
|
|
|
from MarketPlaces.Classifier.classify_product import predict |
|
|
|
|
|
|
|
nError = 0  # module-wide error counter; incremented via incrementError() and used to number log entries
|
|
|
|
|
|
|
|
|
|
|
def mergePages(rmm, rec): |
|
|
|
|
|
|
@ -82,13 +85,182 @@ def persist_data(url, row, cur): |
|
|
|
create_items(cur, row, marketPlace, vendor) |
|
|
|
|
|
|
|
|
|
|
|
def incrementError():
    """Bump the module-wide error counter used to number log-file entries."""
    global nError
    nError = nError + 1
|
|
|
|
|
|
|
|
|
|
|
def read_file(filePath, createLog, logFile):
    """Read one crawled HTML page from disk and parse it with BeautifulSoup.

    Tries UTF-8 first, then falls back to the platform default encoding.

    :param filePath: path of the HTML file (may carry a trailing newline)
    :param createLog: when True, failures are appended to `logFile`
    :param logFile: open log file handle (or None when createLog is False)
    :return: the parsed BeautifulSoup document, or None when the file could
             not be read with either encoding
    """
    path = filePath.strip('\n')

    # Attempt UTF-8 first, then the platform default encoding, mirroring the
    # original try/fallback behavior.  The `with` blocks guarantee the file
    # handle is closed even if BeautifulSoup raises mid-parse (the original
    # leaked the handle in that case).  `except Exception` (not bare
    # `except:`) keeps KeyboardInterrupt/SystemExit propagating.
    try:
        with codecs.open(path, encoding='utf8') as html:
            return BeautifulSoup(html, "html.parser")
    except Exception:
        pass

    try:
        with open(path) as html:
            return BeautifulSoup(html, "html.parser")
    except Exception:
        pass

    incrementError()
    print("There was a problem to read the file " + filePath)
    if createLog:
        logFile.write(
            str(nError) + ". There was a problem to read the file " + filePath + "\n")
    return None
|
|
|
|
|
|
|
|
|
|
|
def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
    """Parse a listing page with the market-specific listing parser.

    :param marketPlace: marketplace name; must have an entry in the dispatch table
    :param listingFile: path of the HTML file (used only in error messages)
    :param soup: BeautifulSoup document of the listing page
    :param createLog: when True, failures are appended to `logFile`
    :param logFile: open log file handle (or None when createLog is False)
    :return: the parser's list of rows, or None when parsing failed
    """
    try:
        # Dispatch table replaces the original 16-branch if/elif chain:
        # one listing parser per supported marketplace.
        listing_parsers = {
            "DarkFox": darkfox_listing_parser,
            "Tor2door": tor2door_listing_parser,
            "Apocalypse": apocalypse_listing_parser,
            "ThiefWorld": thiefWorld_listing_parser,
            "AnonymousMarketplace": anonymousMarketplace_listing_parser,
            "ViceCity": vicecity_listing_parser,
            "TorBay": torbay_listing_parser,
            "M00nkeyMarket": m00nkey_listing_parser,
            "HiddenMarket": hiddenmarket_listing_parser,
            "DarkMatter": darkmatter_listing_parser,
            "DigitalThriftShop": digitalThriftShop_listing_parser,
            "LionMarketplace": lionmarketplace_listing_parser,
            "TorMarket": tormarket_listing_parser,
            "RobinhoodMarket": Robinhood_listing_parser,
            "Nexus": nexus_listing_parser,
            "MikesGrandStore": mikesGrandStore_listing_parser,
        }

        parser = listing_parsers.get(marketPlace)
        if parser is None:
            # Unknown market: keep the original loud failure so it is
            # counted and logged by the handler below.
            print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
            raise Exception

        return parser(soup)

    except Exception:
        # `except Exception` instead of bare `except:` so KeyboardInterrupt
        # and SystemExit still propagate to the caller.
        incrementError()
        print("There was a problem to parse the file " + listingFile + " in the listing section!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
        return None
|
|
|
|
|
|
|
|
|
|
|
def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
    """Parse a description page with the market-specific description parser.

    :param marketPlace: marketplace name; must have an entry in the dispatch table
    :param descriptionFile: path of the HTML file (used only in error messages)
    :param soup: BeautifulSoup document of the description page
    :param createLog: when True, failures are appended to `logFile`
    :param logFile: open log file handle (or None when createLog is False)
    :return: the parser's description record, or None when parsing failed
    """
    try:
        # Dispatch table replaces the original 16-branch if/elif chain:
        # one description parser per supported marketplace.
        description_parsers = {
            "DarkFox": darkfox_description_parser,
            "Tor2door": tor2door_description_parser,
            "Apocalypse": apocalypse_description_parser,
            "ThiefWorld": thiefWorld_description_parser,
            "AnonymousMarketplace": anonymousMarketplace_description_parser,
            "ViceCity": vicecity_description_parser,
            "TorBay": torbay_description_parser,
            "M00nkeyMarket": m00nkey_description_parser,
            "HiddenMarket": hiddenmarket_description_parser,
            "DarkMatter": darkmatter_description_parser,
            "DigitalThriftShop": digitalThriftShop_description_parser,
            "LionMarketplace": lionmarketplace_description_parser,
            "TorMarket": tormarket_description_parser,
            "RobinhoodMarket": Robinhood_description_parser,
            "Nexus": nexus_description_parser,
            "MikesGrandStore": mikesGrandStore_description_parser,
        }

        parser = description_parsers.get(marketPlace)
        if parser is None:
            # Unknown market: keep the original loud failure so it is
            # counted and logged by the handler below.
            print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
            raise Exception

        return parser(soup)

    except Exception:
        # `except Exception` instead of bare `except:` so KeyboardInterrupt
        # and SystemExit still propagate to the caller.
        incrementError()
        print("There was a problem to parse the file " + descriptionFile + " in the Description section!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
        return None
|
|
|
|
|
|
|
|
|
|
|
def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):
    """Persist one merged listing+description record and commit.

    :param url: marketplace base URL, forwarded to persist_data
    :param rec: merged record (list); converted to a tuple before persisting
    :param cur: open database cursor
    :param con: open database connection (committed on success, rolled back on failure)
    :param createLog: when True, failures are appended to `logFile`
    :param logFile: open log file handle (or None when createLog is False)
    :param listingFile: listing file path (error messages only)
    :param descriptionFile: description file path (error messages only)
    :return: True when the row was stored (or already existed), False otherwise
    """
    try:
        persist_data(url, tuple(rec), cur)
        con.commit()
        return True
    except Exception:
        # `except Exception` instead of bare `except:` so KeyboardInterrupt
        # and SystemExit still propagate.
        con.rollback()

        trace = traceback.format_exc()

        # Duplicate rows are expected on re-runs; the driver reports them with
        # an "already exists" message, which we treat as success.
        if trace.find("already exists") != -1:
            return True

        incrementError()
        print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
        traceback.print_exc()
        if createLog:
            logFile.write(
                str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
        return False
|
|
|
|
|
|
|
|
|
|
|
def move_file(filePath, createLog, logFile):
    """Move a processed HTML file into the sibling 'Read/' folder.

    :param filePath: path of the file to move
    :param createLog: when True, failures are appended to `logFile`
    :param logFile: open log file handle (or None when createLog is False)
    :return: True when the move succeeded, False otherwise
    """
    source = filePath
    # os.path.dirname instead of the original
    # filePath.replace(os.path.basename(filePath), "") — str.replace swaps the
    # FIRST occurrence, which corrupts the path when the directory part also
    # contains the basename substring.
    destination = os.path.join(os.path.dirname(filePath), r'Read/')

    try:
        shutil.move(source, destination)
        return True
    except Exception:
        # `except Exception` instead of bare `except:` so KeyboardInterrupt
        # and SystemExit still propagate.
        print("There was a problem to move the file " + filePath)
        incrementError()
        if createLog:
            logFile.write(
                str(nError) + ". There was a problem to move the file " + filePath + "\n")
        return False
|
|
|
|
|
|
|
|
|
|
|
def new_parse(marketPlace, url, createLog):
    """Parse every crawled listing/description page of `marketPlace`, classify
    each product record, and persist it in the database.

    Reconstructed from interleaved diff residue: this is the post-diff version
    that delegates to the read_file / parse_listing / parse_description /
    persist_record / move_file helpers defined above.

    :param marketPlace: marketplace name; must be handled by parse_listing and
                        parse_description
    :param url: marketplace base URL, forwarded to persist_record
    :param createLog: when True, write a per-run log file under the HTML_Pages
                      folder of the current date
    """
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    print("Parsing the " + marketPlace + " market and conduct data classification to store the information in the database.")

    # Connecting to the database
    con = connectDataBase()
    cur = con.cursor()  # NOTE(review): cursor creation falls in a diff-hunk gap — confirm against the original file

    # Creating the tables (The database should be created manually)
    create_database(cur, con)

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + marketPlace + "/HTML_Pages")

    # Creating the log file for each Market Place
    if createLog:
        try:
            logFile = open(mainDir + f"/{CURRENT_DATE}/" + marketPlace + "_" + CURRENT_DATE + ".log", "w")
        except Exception:
            print("Could not open log file!")
            createLog = False
            logFile = None
            # raise SystemExit
    else:
        logFile = None

    # Reading the Listing Html Pages
    listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
    for listingIndex, listingFile in enumerate(listings):

        print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(listingFile) + "', index= " + str(
            listingIndex + 1) + " ... " + str(len(listings)))

        listingSoup = read_file(listingFile, createLog, logFile)

        # listing flags
        doParseListing = listingSoup is not None
        doDescription = False

        # description flags, aggregated over every description page of this listing
        readDescriptionError = False
        parseDescriptionError = False
        persistDescriptionError = False
        moveDescriptionError = False
        findDescriptionError = False

        rw = []

        if doParseListing:

            rw = parse_listing(marketPlace, listingFile, listingSoup, createLog, logFile)

            doDescription = rw is not None

        if doDescription:

            nFound = 0  # number of description pages located for this listing

            for rec in rw:

                rec = rec.split(',')

                # the description file is named after the cleaned product link (column 20)
                descriptionPattern = cleanLink(rec[20]) + ".html"

                # Reading the associated description Html Pages
                descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))

                nFound += len(descriptions)

                for descriptionIndex, descriptionFile in enumerate(descriptions):

                    print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(
                        descriptionFile) + "', index= " + str(descriptionIndex + 1) + " ... " + str(len(descriptions)))

                    descriptionSoup = read_file(descriptionFile, createLog, logFile)

                    # description flags
                    doParseDescription = descriptionSoup is not None
                    doPersistRecord = False
                    doMoveDescription = False

                    rmm = []

                    if doParseDescription:

                        rmm = parse_description(marketPlace, descriptionFile, descriptionSoup, createLog, logFile)

                        doPersistRecord = rmm is not None

                    else:
                        readDescriptionError = True
                        parseDescriptionError = True

                    if doPersistRecord:

                        # Combining the information from Listing and Description Pages
                        rec = mergePages(rmm, rec)

                        # Append to the list the classification of the product
                        # rec.append(str(predict(rec[1], rec[5], language='markets')))
                        rec.append(str(predict(rec[4], rec[5], language='sup_english')))

                        # Persisting the information in the database
                        persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile,
                                                        descriptionFile)

                        doMoveDescription = persistSuccess
                    else:
                        parseDescriptionError = True

                    if doMoveDescription:
                        # move description files of completed folder
                        moveSuccess = move_file(descriptionFile, createLog, logFile)
                        if not moveSuccess:
                            moveDescriptionError = True
                    else:
                        moveDescriptionError = True

            # no description page matched any record of this listing page
            if not (nFound > 0):

                findDescriptionError = True

                incrementError()
                print(f"There was a problem to locate the file(s) for {listingFile} in the Description section!")
                if createLog:
                    logFile.write(
                        str(nError) + f". There was a problem to locate the file(s) for {listingFile}"
                                      f" in the Description section!\n")

        # only move the listing page away once every associated description page
        # was read, parsed, persisted and moved successfully
        if not (readDescriptionError or parseDescriptionError or persistDescriptionError
                or moveDescriptionError or findDescriptionError):
            # move listing files of completed folder
            move_file(listingFile, createLog, logFile)

    if createLog:
        logFile.close()

    print("Parsing the " + marketPlace + " market and data classification done.")