__author__ = 'DarkWeb'

import string
import time
import re
from datetime import datetime, timedelta


# Month-name -> two-digit month tables used by convertDate.
#
# Order matters: the replacements are applied sequentially with str.replace,
# so every full month name must come BEFORE any abbreviation that is a prefix
# of it (e.g. "January" before "Jan"); otherwise the abbreviation would
# consume part of the full name and leave a stray suffix behind.
_ENGLISH_MONTHS = (
    ("January", "01"), ("February", "02"), ("March", "03"),
    ("April", "04"), ("May", "05"), ("June", "06"),
    ("July", "07"), ("August", "08"), ("September", "09"),
    ("October", "10"), ("November", "11"), ("December", "12"),
    ("Jan", "01"), ("Feb", "02"), ("Mar", "03"), ("Apr", "04"),
    ("Jun", "06"), ("Jul", "07"), ("Aug", "08"), ("Sep", "09"),
    ("Oct", "10"), ("Nov", "11"), ("Dec", "12"),
)

_FRENCH_MONTHS = (
    ("janvier", "01"), ("jan", "01"),
    ("février", "02"),
    ("mars", "03"), ("mar", "03"),
    ("avril", "04"), ("avr", "04"),
    ("mai", "05"),
    ("juin", "06"),
    ("juillet", "07"), ("juil", "07"),
    ("août", "08"),
    ("septembre", "09"), ("sept", "09"),
    ("octobre", "10"), ("oct", "10"),
    ("novembre", "11"), ("nov", "11"),
    ("décembre", "12"), ("déc", "12"),
)

_SWEDISH_MONTHS = (
    ("jan", "01"), ("feb", "02"), ("mar", "03"), ("apr", "04"),
    ("maj", "05"), ("jun", "06"), ("jul", "07"), ("aug", "08"),
    ("sep", "09"), ("okt", "10"), ("nov", "11"), ("dec", "12"),
)

_RUSSIAN_MONTHS = (
    ("Январь", "01"), ("января", "01"), ("янв", "01"),
    ("January", "01"), ("Jan", "01"),
    ("Февраль", "02"), ("февраля", "02"), ("фев", "02"),
    ("February", "02"), ("Feb", "02"),
    ("Март", "03"), ("марта", "03"), ("мар", "03"),
    ("March", "03"), ("Mar", "03"),
    ("Апрель", "04"), ("апреля", "04"), ("апр", "04"),
    ("April", "04"), ("Apr", "04"),
    ("Май", "05"), ("май", "05"), ("мая", "05"), ("May", "05"),
    ("Июнь", "06"), ("июня", "06"), ("июн", "06"),
    ("June", "06"), ("Jun", "06"),
    ("Июль", "07"), ("июля", "07"), ("июл", "07"),
    ("July", "07"), ("Jul", "07"),
    ("Август", "08"), ("августа", "08"), ("авг", "08"),
    ("August", "08"), ("Aug", "08"),
    ("Сентябрь", "09"), ("сентября", "09"), ("сен", "09"),
    ("September", "09"), ("Sep", "09"),
    ("Октябрь", "10"), ("октября", "10"), ("окт", "10"),
    ("October", "10"), ("Oct", "10"),
    ("Ноябрь", "11"), ("ноября", "11"), ("ноя", "11"),
    ("November", "11"), ("Nov", "11"),
    ("Декабрь", "12"), ("декабря", "12"), ("дек", "12"),
    ("December", "12"), ("Dec", "12"),
)

# Characters that cleanText keeps; everything else is dropped.
_SAFE_TEXT_CHARS = frozenset(
    string.ascii_letters + string.digits + " _/&$#@+-*=:;.,?!{}[]()%`~^|<>"
)

# Characters that cleanLink keeps; everything else is dropped.
_SAFE_LINK_CHARS = frozenset(string.ascii_letters + string.digits)

# Single-pass character mapping used by cleanString.
_CLEAN_STRING_TABLE = str.maketrans({
    ",": None,    # commas would break the comma-joined rows
    "\n": None,   # newlines
    "\t": None,   # tabs
    "\r": None,   # carriage returns
    "'": "^",     # apostrophes are replaced by a caret, not removed
    "»": None,    # arrows
    "!": None,    # exclamation points
    ";": None,    # semicolons
})

# MM/DD/YYYY prefix matcher used by checkDateFormat.
_DATE_PATTERN = re.compile(r'[0-1][0-9]/[0-3][0-9]/[1-2][0-9]{3}')

# Matches every run of characters that is neither a digit nor a dot.
_NON_NUMERIC_PATTERN = re.compile(r'[^\d.]+')


def _replace_months(text, table):
    """Apply every (month name, month number) substitution of *table* in order."""
    for name, number in table:
        text = text.replace(name, number)
    return text


def convertDate(sdate, language, crawlerDate):
    """Normalise a scraped, language-specific date string to ``MM/DD/YYYY``.

    Args:
        sdate: The date text as scraped. After month names are replaced by
            their numbers and dots are removed it must read ``MM DD YYYY``
            for "english" and ``DD MM YYYY`` for every other language,
            or be one of the relative words handled below.
        language: One of "english", "british", "french", "swedish" or
            "russian". Any other value returns *sdate* unchanged.
        crawlerDate: ``date`` or ``datetime`` of the crawl; it resolves
            relative dates such as "Today" / "Yesterday".

    Returns:
        The date formatted as ``MM/DD/YYYY``; an empty string for the one
        Russian marker text that carries no date; or *sdate* itself when the
        language is not recognised.

    Raises:
        ValueError: if the normalised text does not match the expected
            day/month/year layout.
    """
    if language == "english":
        sdate = _replace_months(sdate, _ENGLISH_MONTHS).replace(".", "")
        if "Today" in sdate:
            sdate = crawlerDate.strftime("%m %d %Y")
        elif "Yesterday" in sdate:
            sdate = (crawlerDate - timedelta(1)).strftime("%m %d %Y")
        return datetime.strptime(sdate, "%m %d %Y").strftime("%m/%d/%Y")

    if language == "british":
        return datetime.strptime(sdate, "%d %m %Y").strftime("%m/%d/%Y")

    if language == "french":
        # March, April and May used to abort the whole process with
        # SystemExit; they are now part of the month table.
        sdate = _replace_months(sdate, _FRENCH_MONTHS).replace(".", "")
        if sdate == "Aujourd'hui" or "Today" in sdate:
            sdate = crawlerDate.strftime("%d %m %Y")
        return datetime.strptime(sdate, "%d %m %Y").strftime("%m/%d/%Y")

    if language == "swedish":
        sdate = _replace_months(sdate, _SWEDISH_MONTHS).replace(".", "")
        if sdate == "Ig\xe5r":
            # "Igår" means yesterday, so step back one day from the crawl.
            sdate = (crawlerDate - timedelta(1)).strftime("%d %m %Y")
        elif sdate == "Idag" or "minuter sedan" in sdate:
            # "Idag" (today) / "... minuter sedan" (... minutes ago).
            sdate = crawlerDate.strftime("%d %m %Y")
        return datetime.strptime(sdate, "%d %m %Y").strftime("%m/%d/%Y")

    if language == "russian":
        if "Вчера" in sdate:
            # "Вчера" = yesterday.
            sdate = (crawlerDate - timedelta(1)).strftime("%d %m %Y")
        elif "Сегодня" in sdate:
            # "Сегодня" = today.
            sdate = crawlerDate.strftime("%d %m %Y")
        elif '\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
            # NOTE(review): this looks like a mis-decoded (cp1251 read as
            # latin-1) label rather than a date - confirm. No date to report.
            return ""
        sdate = _replace_months(sdate, _RUSSIAN_MONTHS).replace(".", "")
        return datetime.strptime(sdate, "%d %m %Y").strftime("%m/%d/%Y")

    return sdate


def cleanText(originalText):
    """Strip every character outside the safe whitelist from each string.

    The sequence is modified in place and also returned. Only ASCII letters,
    digits, the space and a fixed set of punctuation survive; quotes,
    backslashes and all non-ASCII characters are removed.
    """
    for index, text in enumerate(originalText):
        originalText[index] = ''.join(
            char for char in text if char in _SAFE_TEXT_CHARS
        )
    return originalText


def cleanLink(originalLink):
    """Return *originalLink* reduced to its ASCII letters and digits."""
    return ''.join(char for char in originalLink if char in _SAFE_LINK_CHARS)


def _field_or_default(values, n):
    """Return ``values[n]``, or the placeholder "-1" when *values* is empty."""
    return "-1" if len(values) == 0 else values[n]


def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nombre, CVE, MS, category, describe,
                     views, reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href):
    """Build one comma-joined row per product.

    Args:
        marketplace: Marketplace name, written as the first column.
        nm: Number of products (rows) to emit.
        vendor, nombre: Per-product string lists; both must hold at least
            *nm* entries.
        All remaining arguments: per-product string lists that may be empty,
            in which case the column is filled with "-1".

    Returns:
        A list of *nm* strings with 22 comma-separated columns each, in this
        order: marketplace, vendor, rating_vendor, success_vendor, nombre,
        describe, CVE, MS, category, views, reviews, rating_item, addDate,
        BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, timestamp.
        Note that ``describe`` comes before ``CVE`` in the row even though it
        comes later in the parameter list.
    """
    # One strftime call so the date and time halves always belong to the
    # same instant. The clock part is 12-hour ("%I") without an AM/PM marker.
    timestamp = time.strftime("%m/%d/%Y %I:%M:%S")

    rows = []
    for n in range(nm):
        columns = [
            marketplace,                              # 0
            vendor[n],                                # 1
            _field_or_default(rating_vendor, n),      # 2
            _field_or_default(success_vendor, n),     # 3
            nombre[n],                                # 4
            _field_or_default(describe, n),           # 5
            _field_or_default(CVE, n),                # 6
            _field_or_default(MS, n),                 # 7
            _field_or_default(category, n),           # 8
            _field_or_default(views, n),              # 9
            _field_or_default(reviews, n),            # 10
            _field_or_default(rating_item, n),        # 11
            _field_or_default(addDate, n),            # 12
            _field_or_default(BTC, n),                # 13
            _field_or_default(USD, n),                # 14
            _field_or_default(EURO, n),               # 15
            _field_or_default(sold, n),               # 16
            _field_or_default(qLeft, n),              # 17
            _field_or_default(shipFrom, n),           # 18
            _field_or_default(shipTo, n),             # 19
            _field_or_default(href, n),               # 20
            timestamp,                                # 21
        ]
        rows.append(",".join(columns))
    return rows


def cleanString(originalString):
    """Remove characters that would corrupt a comma-joined row.

    Commas, newlines, tabs, carriage returns, "»", "!" and ";" are deleted;
    every apostrophe is replaced by "^".
    """
    return originalString.translate(_CLEAN_STRING_TABLE)


def checkDateFormat(myString):
    """Check whether *myString* starts with an ``MM/DD/YYYY``-shaped date.

    Returns:
        The ``re.Match`` object when the beginning of the string matches,
        otherwise ``None``. Only the digit ranges are checked, not calendar
        validity, and trailing text after the date is allowed.
    """
    return _DATE_PATTERN.match(myString)


def cleanNumbers(inputString):
    """Return *inputString* with everything except digits and dots removed."""
    return _NON_NUMERIC_PATTERN.sub('', inputString)


def cleanHTML(html):
    """Return *html* after running the image / JavaScript removal passes.

    NOTE(review): every pattern below is the empty string, and substituting
    the empty pattern with an empty replacement leaves the text untouched,
    so this function currently returns *html* unchanged. The original
    patterns were presumably lost (they look like they were stripped as
    markup) - restore them from version control. The extent of the ``for``
    loop body is likewise unknown; it has no effect while the patterns are
    empty.
    """
    formats = [
        "jpg", "jpeg", "jfif", "pjpeg", "pjp",
        "png", "apng", "svg", "bmp", "gif",
        "avif", "webp", "ico", "cur", "tiff"
    ]

    # remove images
    clean_html = re.sub(r"", "", html)
    clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)
    for fmat in formats:
        clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)

    # remove JavaScript
    clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)
    clean_html = re.sub(r"", "", clean_html)

    # image and JavaScript
    clean_html = re.sub(r"", "", clean_html)

    return clean_html