__author__ = 'DarkWeb'
|
|
|
|
import string
|
|
import time
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
|
|
|
|
def _build_month_table(names):
    """Return ``(name, number)`` pairs sorted longest name first.

    Month names are substituted one after another with ``str.replace``; putting
    longer spellings first guarantees an abbreviation (e.g. ``"фев"``) can never
    clobber part of a longer form (e.g. ``"февраля"``) before that form is seen.
    """
    return tuple(sorted(names.items(), key=lambda pair: len(pair[0]), reverse=True))


# Per-language month spellings mapped to their two-digit month number.
_MONTH_TABLES = {
    "english": _build_month_table({
        "January": "01", "February": "02", "March": "03", "April": "04",
        "May": "05", "June": "06", "July": "07", "August": "08",
        "September": "09", "October": "10", "November": "11", "December": "12",
        "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "Jun": "06",
        "Jul": "07", "Aug": "08", "Sept": "09", "Sep": "09", "Oct": "10",
        "Nov": "11", "Dec": "12",
    }),
    "french": _build_month_table({
        "janvier": "01", "jan": "01", "février": "02", "mars": "03",
        "avril": "04", "avr": "04", "mai": "05", "juin": "06",
        "juillet": "07", "juil": "07", "août": "08", "septembre": "09",
        "sept": "09", "octobre": "10", "oct": "10", "novembre": "11",
        "nov": "11", "décembre": "12", "déc": "12",
    }),
    "swedish": _build_month_table({
        "januari": "01", "februari": "02", "mars": "03", "april": "04",
        "maj": "05", "juni": "06", "juli": "07", "augusti": "08",
        "september": "09", "oktober": "10", "november": "11", "december": "12",
        "jan": "01", "feb": "02", "mar": "03", "apr": "04", "jun": "06",
        "jul": "07", "aug": "08", "sep": "09", "okt": "10", "nov": "11",
        "dec": "12",
    }),
    "russian": _build_month_table({
        "Январь": "01", "января": "01", "янв": "01", "January": "01", "Jan": "01",
        "Февраль": "02", "февраля": "02", "фев": "02", "February": "02", "Feb": "02",
        "Март": "03", "марта": "03", "мар": "03", "March": "03", "Mar": "03",
        "Апрель": "04", "апреля": "04", "апр": "04", "April": "04", "Apr": "04",
        "Май": "05", "май": "05", "мая": "05", "May": "05",
        "Июнь": "06", "июня": "06", "июн": "06", "June": "06", "Jun": "06",
        "Июль": "07", "июля": "07", "июл": "07", "July": "07", "Jul": "07",
        "Август": "08", "августа": "08", "авг": "08", "August": "08", "Aug": "08",
        "Сентябрь": "09", "сентября": "09", "сен": "09", "September": "09", "Sep": "09",
        "Октябрь": "10", "октября": "10", "окт": "10", "October": "10", "Oct": "10",
        "Ноябрь": "11", "ноября": "11", "ноя": "11", "November": "11", "Nov": "11",
        "Декабрь": "12", "декабря": "12", "дек": "12", "December": "12", "Dec": "12",
    }),
}


def _normalize_months(text, language):
    """Replace month names of *language* in *text* with numbers and drop dots."""
    for name, number in _MONTH_TABLES[language]:
        text = text.replace(name, number)
    return text.replace(".", "")


def convertDate(sdate, language, crawlerDate):
    """Convert a scraped date string to ``MM/DD/YYYY``.

    Parameters:
        sdate: the scraped date text. After month names are replaced by numbers
            and dots are removed it must read ``month day year`` for
            ``"english"`` and ``day month year`` for every other supported
            language, separated by spaces.
        language: one of ``"english"``, ``"british"``, ``"french"``,
            ``"swedish"`` or ``"russian"``. Any other value returns ``sdate``
            unchanged.
        crawlerDate: the crawl day, used to resolve relative dates ("Today",
            "Yesterday", "Idag", ...). Only ``strftime`` and subtraction of a
            ``timedelta`` are used, so both ``date`` and ``datetime`` work.

    Returns:
        The date as a ``MM/DD/YYYY`` string, or ``""`` for the Russian
        "created" marker that carries no usable date.

    Raises:
        ValueError: if the normalized text does not match the expected layout.
    """
    # Relative dates are formatted straight from crawlerDate instead of being
    # round-tripped through str(), which broke for datetime instances.
    if language == "english":
        sdate = _normalize_months(sdate, language)
        if "Today" in sdate:
            sdate = crawlerDate.strftime('%m %d %Y')
        elif "Yesterday" in sdate:
            sdate = (crawlerDate - timedelta(1)).strftime('%m %d %Y')
        return datetime.strptime(str(sdate), '%m %d %Y').strftime('%m/%d/%Y')

    if language == "british":
        return datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    if language == "french":
        # March, April and May used to abort the whole process with
        # SystemExit; they are now ordinary entries of the month table.
        sdate = _normalize_months(sdate, language)
        if sdate == u"Aujourd'hui" or "Today" in sdate:
            sdate = crawlerDate.strftime('%d %m %Y')
        return datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    if language == "swedish":
        sdate = _normalize_months(sdate, language)
        if sdate == u"Ig\xe5r":
            # "Igår" means yesterday, so step back one day from the crawl date.
            sdate = (crawlerDate - timedelta(1)).strftime('%d %m %Y')
        elif sdate == u"Idag" or "minuter sedan" in sdate:
            # "today" / "<n> minutes ago" both resolve to the crawl date.
            sdate = crawlerDate.strftime('%d %m %Y')
        return datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    if language == "russian":
        if u"Вчера" in sdate:  # "yesterday"
            sdate = (crawlerDate - timedelta(1)).strftime('%d %m %Y')
        elif sdate == u"Сегодня":  # "today"
            sdate = crawlerDate.strftime('%d %m %Y')
        elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
            # NOTE(review): looks like a mis-decoded cp1251 "Создано"
            # ("created") label; such values carry no date, so give up early.
            return ""
        sdate = _normalize_months(sdate, language)
        return datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')

    return sdate
|
|
|
|
|
|
def cleanText(originalText):
    """Strip every non-whitelisted character from each entry of a sequence.

    ``originalText`` is modified in place (each element is replaced by its
    filtered version) and the same object is returned. The whitelist is ASCII
    letters, digits, the space and a fixed set of punctuation marks.
    """
    # Same whitelist as before, spelled as one literal instead of a chain of
    # single-character concatenations.
    allowed = string.ascii_letters + string.digits + " _/&$#@+-*=:;.,?!{}[]()%`~^|<>"

    for position, entry in enumerate(originalText):
        kept = (symbol for symbol in entry if symbol in allowed)
        originalText[position] = ''.join(kept)

    return originalText
|
|
|
|
|
|
def cleanLink(originalLink):
    """Return ``originalLink`` reduced to its ASCII letters and digits.

    Every other character (punctuation, whitespace, non-ASCII) is dropped.
    """
    allowed = string.ascii_letters + string.digits
    kept = [symbol for symbol in originalLink if symbol in allowed]
    return ''.join(kept)
|
|
|
|
|
|
def organizeProducts(marketplace, nm, nombre, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                     BTC, USD, EURO, qLeft, shipFrom, shipTo, user, rating, success, sold, href):
    """Build one comma-separated record per product.

    Parameters:
        marketplace: string written as the first field of every record.
        nm: number of records to build; indices ``0 .. nm-1`` are read from
            every non-empty column.
        nombre ... href: per-product columns, indexable sequences of strings.
            An empty column is written as ``"-1"`` in every record.

    Returns:
        A list of ``nm`` strings with 24 comma-separated fields each, in this
        order: marketplace, nombre, CVE, MS, category, describe, escrow,
        views, reviews, addDate, lastSeen, BTC, USD, EURO, sold, qLeft,
        shipFrom, shipTo, user, rating, success, a constant ``"-1"``, the
        current local time as ``MM/DD/YYYY HH:MM:SS``, href.

    Raises:
        IndexError: if a non-empty column has fewer than ``nm`` entries.
        TypeError: if a value is not a string.

    Values are joined verbatim, so a value containing a comma shifts the
    fields that follow it.
    """
    # One clock read for the whole batch, so the date and time parts cannot
    # straddle midnight.
    # NOTE(review): %I is the 12-hour clock and no AM/PM marker is written;
    # kept as-is because consumers of this format are not visible here.
    timestamp = time.strftime("%m/%d/%Y %I:%M:%S")

    # Columns in output order (fields 1-20). ``user`` now falls back to "-1"
    # when empty like every other column, instead of raising IndexError.
    leading_columns = (nombre, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                       BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, user, rating, success)

    rw = []
    for n in range(nm):
        fields = [marketplace]                                                          # 0
        fields.extend("-1" if len(column) == 0 else column[n] for column in leading_columns)  # 1-20
        fields.append("-1")                                                             # 21
        fields.append(timestamp)                                                        # 22
        fields.append("-1" if len(href) == 0 else href[n])                              # 23
        rw.append(",".join(fields))

    return rw
|
|
|
|
|
|
def cleanString(originalString):
    """Return ``originalString`` with field-breaking characters removed.

    Commas, newlines, tabs, carriage returns, "»" arrows, exclamation points
    and semicolons are deleted; each apostrophe is replaced by "^".
    """
    # One translation pass: None deletes the character, a string replaces it.
    substitutions = str.maketrans({
        ",": None,    # commas
        "\n": None,   # newlines
        "\t": None,   # tabs
        "\r": None,   # carriage returns
        "'": "^",     # apostrophes become carets
        u"»": None,   # arrows
        "!": None,    # exclamation points
        ";": None,    # semicolons
    })
    return originalString.translate(substitutions)
|
|
|
|
|
|
def checkDateFormat(myString):
    """Check whether ``myString`` starts with a ``MM/DD/YYYY``-shaped date.

    Returns the ``re.Match`` object (truthy) when the string begins with two
    digits ``00-19``, a slash, two digits ``00-39``, a slash and a four-digit
    number starting with 1 or 2; otherwise ``None``. Only the shape is
    checked: calendar validity is not, and trailing text after the date is
    allowed. Non-string input yields ``None`` instead of raising.
    """
    if not isinstance(myString, str):
        return None
    # Raw string, and "/" needs no escaping: the former '\/' in a plain
    # literal was an invalid escape sequence.
    isDate = re.match(r'[0-1][0-9]/[0-3][0-9]/[1-2][0-9]{3}', myString)
    return isDate
|
|
|
|
|
|
def cleanNumbers(inputString):
    """Return only the digits and dots of ``inputString``, in original order."""
    # Collect every run of digits/dots and glue the runs back together.
    numeric_runs = re.findall(r'[\d.]+', inputString)
    return ''.join(numeric_runs)
|
|
|
|
|
|
# File extensions that mark an <object> tag as an embedded image.
_IMAGE_FORMATS = (
    "jpg", "jpeg", "jfif", "pjpeg", "pjp",
    "png", "apng", "svg", "bmp", "gif",
    "avif", "webp", "ico", "cur", "tiff",
)

# Opening tags removed by cleanHTML, in order. "[^>]*" keeps each match inside
# a single tag (the former greedy ".*" could swallow everything up to a later
# tag on the same line) and also spans line breaks inside a tag.
_HTML_STRIP_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # images
        r"<img\b[^>]*>",
        r"<picture\b[^>]*>",
        r"<svg\b[^>]*>",
        r"<object\b[^>]*(?:" + "|".join(_IMAGE_FORMATS) + r")[^>]*>",
        r"<canvas\b[^>]*>",
        # JavaScript
        r"<script\b[^>]*>",
        r"<object\b[^>]*javascript[^>]*>",
        r"<applet\b[^>]*mayscript[^>]*>",
        r"<embed\b[^>]*scriptable[^>]*>",
        # image and JavaScript
        r"<div\b[^>]*background-image[^>]*>",
    )
)


def cleanHTML(html):
    """Remove image- and script-related opening tags from an HTML string.

    The following tags are deleted, matched case-insensitively:
    ``<img>``, ``<picture>``, ``<svg>``, ``<canvas>`` and ``<script>``;
    ``<object>`` tags mentioning an image extension or ``javascript``;
    ``<applet>`` tags mentioning ``mayscript``; ``<embed>`` tags mentioning
    ``scriptable``; and ``<div>`` tags mentioning ``background-image``.

    Only the opening tag itself is removed. Closing tags and any content
    between the tags are left in place.

    Returns the cleaned HTML string.
    """
    clean_html = html
    for pattern in _HTML_STRIP_PATTERNS:
        clean_html = pattern.sub("", clean_html)
    return clean_html
|
|
|
|
|
|
|