This is based on the calsyslab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

321 lines
12 KiB

1 year ago
  1. __author__ = 'DarkWeb'
  2. import string
  3. import time
  4. import re
  5. from datetime import datetime, timedelta
  6. def convertDate(sdate, language, crawlerDate):
  7. if language == "english":
  8. today = crawlerDate.strftime("%m/%d/%Y")
  9. yesterday = (crawlerDate - timedelta(1)).strftime("%m/%d/%Y")
  10. sdate = sdate.replace(u"January","01")
  11. sdate = sdate.replace(u"February","02")
  12. sdate = sdate.replace(u"March","03")
  13. sdate = sdate.replace(u"April","04")
  14. sdate = sdate.replace(u"May","05")
  15. sdate = sdate.replace(u"June","06")
  16. sdate = sdate.replace(u"July","07")
  17. sdate = sdate.replace(u"August","08")
  18. sdate = sdate.replace(u"September","09")
  19. sdate = sdate.replace(u"October","10")
  20. sdate = sdate.replace(u"November","11")
  21. sdate = sdate.replace(u"December","12")
  22. sdate = sdate.replace(u"Jan","01")
  23. sdate = sdate.replace(u"Feb","02")
  24. sdate = sdate.replace(u"Mar","03")
  25. sdate = sdate.replace(u"Apr","04")
  26. sdate = sdate.replace(u"May","05")
  27. sdate = sdate.replace(u"Jun","06")
  28. sdate = sdate.replace(u"Jul","07")
  29. sdate = sdate.replace(u"Aug","08")
  30. sdate = sdate.replace(u"Sep","09")
  31. sdate = sdate.replace(u"Oct","10")
  32. sdate = sdate.replace(u"Nov","11")
  33. sdate = sdate.replace(u"Dec","12")
  34. sdate = sdate.replace(u".","")
  35. if "Today" in sdate:
  36. sdate = datetime.strptime(str(today), '%m/%d/%Y').strftime('%m %d %Y')
  37. elif "Yesterday" in sdate:
  38. sdate = datetime.strptime(str(yesterday), '%m/%d/%Y').strftime('%m %d %Y')
  39. sdate = datetime.strptime(str(sdate), '%m %d %Y').strftime('%m/%d/%Y')
  40. elif language == "british":
  41. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  42. elif language == "french":
  43. todaysday = crawlerDate.strftime("%m/%d/%Y")
  44. sdate = sdate.replace(u"janvier","01")
  45. sdate = sdate.replace(u"jan","01")
  46. sdate = sdate.replace(u"février","02")
  47. sdate = sdate.replace(u"juin","06")
  48. sdate = sdate.replace(u"juillet","07")
  49. sdate = sdate.replace(u"juil","07")
  50. sdate = sdate.replace(u"août","08")
  51. sdate = sdate.replace(u"septembre","09")
  52. sdate = sdate.replace(u"sept","09")
  53. sdate = sdate.replace(u"octobre","10")
  54. sdate = sdate.replace(u"oct","10")
  55. sdate = sdate.replace(u"novembre","11")
  56. sdate = sdate.replace(u"nov","11")
  57. sdate = sdate.replace(u"décembre","12")
  58. sdate = sdate.replace(u"déc","12")
  59. sdate = sdate.replace(u".","")
  60. if sdate == u"Aujourd'hui" or "Today" in sdate:
  61. sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%d %m %Y')
  62. if "mar" in sdate:
  63. print ("Add March to the IBM Black Market")
  64. raise SystemExit
  65. elif "avr" in sdate:
  66. print ("Add April to the IBM Black Market")
  67. raise SystemExit
  68. elif "mai" in sdate:
  69. print ("Add May to the IBM Black Market")
  70. raise SystemExit
  71. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  72. elif language == "swedish":
  73. sdate = sdate.replace(u"jan","01")
  74. sdate = sdate.replace(u"feb","02")
  75. sdate = sdate.replace(u"mar","03")
  76. sdate = sdate.replace(u"apr","04")
  77. sdate = sdate.replace(u"maj","05")
  78. sdate = sdate.replace(u"jun","06")
  79. sdate = sdate.replace(u"jul","07")
  80. sdate = sdate.replace(u"aug","08")
  81. sdate = sdate.replace(u"sep","09")
  82. sdate = sdate.replace(u"okt","10")
  83. sdate = sdate.replace(u"nov","11")
  84. sdate = sdate.replace(u"dec","12")
  85. sdate = sdate.replace(u".","")
  86. if sdate == u"Ig\xe5r" or sdate == u"Idag" or "minuter sedan" in sdate:
  87. sdate = crawlerDate
  88. sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
  89. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  90. elif language == "russian":
  91. if sdate == u'\u0412\u0447\u0435\u0440\u0430' or u"Вчера" in sdate:
  92. sdate = crawlerDate - timedelta(1)
  93. sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
  94. elif sdate == u'\u0421\u0435\u0433\u043e\u0434\u043d\u044f':
  95. sdate = crawlerDate
  96. sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
  97. elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
  98. return ""
  99. sdate = sdate.replace(u"Январь","01")
  100. sdate = sdate.replace(u"января","01")
  101. sdate = sdate.replace(u"янв","01")
  102. sdate = sdate.replace(u"January","01")
  103. sdate = sdate.replace(u"Jan","01")
  104. sdate = sdate.replace(u"фев","02")
  105. sdate = sdate.replace(u"февраля","02")
  106. sdate = sdate.replace(u"Февраль", "02")
  107. sdate = sdate.replace(u"February", "02")
  108. sdate = sdate.replace(u"Feb", "02")
  109. sdate = sdate.replace(u"Март","03")
  110. sdate = sdate.replace(u"марта","03")
  111. sdate = sdate.replace(u"March","03")
  112. sdate = sdate.replace(u"Mar","03")
  113. sdate = sdate.replace(u"Апрель","04")
  114. sdate = sdate.replace(u"апреля","04")
  115. sdate = sdate.replace(u"апр","04")
  116. sdate = sdate.replace(u"April","04")
  117. sdate = sdate.replace(u"Apr","04")
  118. sdate = sdate.replace(u"май","05")
  119. sdate = sdate.replace(u"Май","05")
  120. sdate = sdate.replace(u"мар","05")
  121. sdate = sdate.replace(u"май","05")
  122. sdate = sdate.replace(u"мая","05")
  123. sdate = sdate.replace(u"May","05")
  124. sdate = sdate.replace(u"Июнь","06")
  125. sdate = sdate.replace(u"июня","06")
  126. sdate = sdate.replace(u"июн","06")
  127. sdate = sdate.replace(u"June","06")
  128. sdate = sdate.replace(u"Jun","06")
  129. sdate = sdate.replace(u"Июль","07")
  130. sdate = sdate.replace(u"июля","07")
  131. sdate = sdate.replace(u"июл","07")
  132. sdate = sdate.replace(u"July","07")
  133. sdate = sdate.replace(u"Jul","07")
  134. sdate = sdate.replace(u"августа","08")
  135. sdate = sdate.replace(u"Август","08")
  136. sdate = sdate.replace(u"авг","08")
  137. sdate = sdate.replace(u"August","08")
  138. sdate = sdate.replace(u"Aug","08")
  139. sdate = sdate.replace(u"Сентябрь","09")
  140. sdate = sdate.replace(u"сентября","09")
  141. sdate = sdate.replace(u"сен","09")
  142. sdate = sdate.replace(u"September","09")
  143. sdate = sdate.replace(u"Sep","09")
  144. sdate = sdate.replace(u"октября","10")
  145. sdate = sdate.replace(u"Октябрь","10")
  146. sdate = sdate.replace(u"October","10")
  147. sdate = sdate.replace(u"Oct","10")
  148. sdate = sdate.replace(u"окт","10")
  149. sdate = sdate.replace(u"Ноябрь","11")
  150. sdate = sdate.replace(u"ноября","11")
  151. sdate = sdate.replace(u"ноя","11")
  152. sdate = sdate.replace(u"November","11")
  153. sdate = sdate.replace(u"Nov","11")
  154. sdate = sdate.replace(u"Декабрь","12")
  155. sdate = sdate.replace(u"декабря","12")
  156. sdate = sdate.replace(u"дек","12")
  157. sdate = sdate.replace(u"December","12")
  158. sdate = sdate.replace(u"Dec","12")
  159. sdate = sdate.replace(u".","")
  160. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  161. return sdate
  162. def cleanText(originalText):
  163. safe_chars = string.ascii_letters + string.digits + " " + "_" + "/" + "&" + "$" + "#" "@" + "+" + "-" + "*" + "=" \
  164. ":" + ";" + "." "," + "?" + "!" + "{" + "}" + "[" + "]" + "(" + ")" + "%" + "`" + "~" + "^" + "|" + "<" + ">"
  165. for index, text in enumerate(originalText):
  166. originalText[index] = ''.join([char if char in safe_chars else '' for char in text])
  167. return originalText
  168. def cleanLink(originalLink):
  169. safe_chars = string.ascii_letters + string.digits
  170. originalLink = ''.join([char if char in safe_chars else '' for char in originalLink])
  171. return originalLink
  172. def organizeProducts(marketplace, nm, nombre, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
  173. BTC, USD, EURO, qLeft, shipFrom, shipTo, user, rating, success, sold, href):
  174. rw = []
  175. day = time.strftime("%m/%d/%Y")
  176. #day = time.strftime("%d/%m/%Y")
  177. ahora = time.strftime("%I:%M:%S")
  178. for n in range(nm):
  179. lne = marketplace + "," #0
  180. lne += "-1" if len(nombre) == 0 else nombre[n] #1
  181. lne += ','
  182. lne += "-1" if len(CVE) == 0 else CVE[n] #2
  183. lne += ","
  184. lne += "-1" if len(MS) == 0 else MS[n] #3
  185. lne += ","
  186. lne += "-1" if len(category) == 0 else category[n] #4
  187. lne += ","
  188. lne += "-1" if len(describe) == 0 else describe[n] #5
  189. lne += ","
  190. lne += "-1" if len(escrow) == 0 else escrow[n] #6
  191. lne += ","
  192. lne += "-1" if len(views) == 0 else views[n] #7
  193. lne += ","
  194. lne += "-1" if len(reviews) == 0 else reviews[n] #8
  195. lne += ","
  196. lne += "-1" if len(addDate) == 0 else addDate[n] #9
  197. lne += ","
  198. lne += "-1" if len(lastSeen) == 0 else lastSeen[n] #10
  199. lne += ","
  200. lne += "-1" if len(BTC) == 0 else BTC[n] #11
  201. lne += ","
  202. lne += "-1" if len(USD) == 0 else USD[n] #12
  203. lne += ","
  204. lne += "-1" if len(EURO) == 0 else EURO[n] #13
  205. lne += ","
  206. lne += "-1" if len(sold) == 0 else sold[n] #14
  207. lne += ","
  208. lne += "-1" if len(qLeft) == 0 else qLeft[n] #15
  209. lne += ","
  210. lne += "-1" if len(shipFrom) == 0 else shipFrom[n] #16
  211. lne += ","
  212. lne += "-1" if len(shipTo) == 0 else shipTo[n] #17
  213. lne += "," + user[n] + "," #18
  214. lne += "-1" if len(rating) == 0 else rating[n] #19
  215. lne += ","
  216. lne += "-1" if len(success) == 0 else success[n] #20
  217. lne += "," + "-1" + "," + day + " " + ahora + "," #21, 22
  218. lne += "-1" if len(href) == 0 else href[n] #23
  219. rw.append(lne)
  220. return rw
  221. def cleanString(originalString):
  222. updated_string = originalString.replace(",", "") #replace all commas
  223. updated_string = updated_string.replace("\n", "") #replace all newlines
  224. updated_string = updated_string.replace("\t", "") #replace all tabs
  225. updated_string = updated_string.replace("\r", "") #replace all carriage returns
  226. updated_string = updated_string.replace("'", "^") #replace all semicolons
  227. updated_string = updated_string.replace(u"»", '') #replace all arrows
  228. updated_string = updated_string.replace("!", "") #replace all exclamation points
  229. updated_string = updated_string.replace(";", "") #replace all exclamations
  230. return updated_string
  231. def checkDateFormat(myString):
  232. isDate = re.match('[0-1][0-9]\/[0-3][0-9]\/[1-2][0-9]{3}', myString)
  233. return isDate
  234. def cleanNumbers(inputString):
  235. reg_ex = re.compile(r'[^\d.]+')
  236. updated_string = reg_ex.sub('', inputString)
  237. return updated_string
  238. def cleanHTML(html):
  239. formats = [
  240. "jpg", "jpeg", "jfif", "pjpeg", "pjp",
  241. "png", "apng", "svg", "bmp", "gif",
  242. "avif", "webp", "ico", "cur", "tiff"
  243. ]
  244. # remove images
  245. clean_html = re.sub(r"<img.*?>", "", html)
  246. clean_html = re.sub(r"<picture.*?>", "", clean_html)
  247. clean_html = re.sub(r"<svg.*?>", "", clean_html)
  248. for fmat in formats:
  249. clean_html = re.sub(r"<object.*" + fmat + ".*?>", "", clean_html)
  250. clean_html = re.sub(r"<canvas.*?>", "", clean_html)
  251. # remove JavaScript
  252. clean_html = re.sub(r"<script.*?>", "", clean_html)
  253. clean_html = re.sub(r"<object.*javascript.*?>", "", clean_html)
  254. clean_html = re.sub(r"<aplet.*mayscript?>", "", clean_html)
  255. clean_html = re.sub(r"<embed.*scriptable?>", "", clean_html)
  256. # image and JavaScript
  257. clean_html = re.sub(r"<div.*background-image.*?>", "", clean_html)
  258. return clean_html