|
@ -29,6 +29,7 @@ from MarketPlaces.GoFish.parser import * |
|
|
from MarketPlaces.ZeroDay.parser import * |
|
|
from MarketPlaces.ZeroDay.parser import * |
|
|
|
|
|
|
|
|
from MarketPlaces.Classifier.classify_product import predict |
|
|
from MarketPlaces.Classifier.classify_product import predict |
|
|
|
|
|
from Translator.translate import translate |
|
|
|
|
|
|
|
|
nError = 0 |
|
|
nError = 0 |
|
|
|
|
|
|
|
@ -241,6 +242,20 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): |
|
|
return None |
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_source_language(marketPlace):
    """Return the source-language tag associated with a marketplace name.

    Args:
        marketPlace: Name of the marketplace being parsed.

    Returns:
        'english' when the name is one of the recognised sources listed
        below; otherwise 'auto', after printing a reminder that the name
        has no explicit language entry yet.
    """
    # Every source registered so far maps to the same language, so a
    # single membership test covers them all.
    english_sources = ("BestCardingWorld", "CryptBB", "Incogsnoo")
    if marketPlace in english_sources:
        return 'english'

    # Unregistered name: emit the reminder and fall back to 'auto'.
    # NOTE(review): presumably 'auto' asks the translator to detect the
    # language itself -- confirm against the translate() implementation.
    print("MISSING CALL TO GET LANGUAGE IN PREPARE_PARSER.PY!")
    return 'auto'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile): |
|
|
def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile): |
|
|
try: |
|
|
try: |
|
|
persist_data(url, tuple(rec), cur) |
|
|
persist_data(url, tuple(rec), cur) |
|
@ -313,6 +328,8 @@ def new_parse(marketPlace, url, createLog): |
|
|
else: |
|
|
else: |
|
|
logFile = None |
|
|
logFile = None |
|
|
|
|
|
|
|
|
|
|
|
source_lang = get_source_language(marketPlace) |
|
|
|
|
|
|
|
|
# Reading the Listing Html Pages |
|
|
# Reading the Listing Html Pages |
|
|
listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')) |
|
|
listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')) |
|
|
|
|
|
|
|
@ -389,8 +406,11 @@ def new_parse(marketPlace, url, createLog): |
|
|
# Combining the information from Listing and Description Pages |
|
|
# Combining the information from Listing and Description Pages |
|
|
rec = mergePages(rmm, rec) |
|
|
rec = mergePages(rmm, rec) |
|
|
|
|
|
|
|
|
|
|
|
title = translate(rec[4], source_lang) |
|
|
|
|
|
content = translate(rec[5], source_lang) |
|
|
|
|
|
|
|
|
# Append to the list the classification of the topic |
|
|
# Append to the list the classification of the topic |
|
|
rec.append(str(predict(rec[4], rec[5], language='sup_english'))) |
|
|
|
|
|
|
|
|
rec.append(str(predict(title, content, language='sup_english'))) |
|
|
|
|
|
|
|
|
# Persisting the information in the database |
|
|
# Persisting the information in the database |
|
|
persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, |
|
|
persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, |
|
|