This is based on the calsyslab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

279 lines
9.3 KiB

1 year ago
  1. __author__ = 'DarkWeb'
  2. import string
  3. import time
  4. import re
  5. from datetime import datetime, timedelta
  6. import datetime as fulldatetime
  7. def cleanText(originalText):
  8. safe_chars = string.ascii_letters + string.digits + " " + "_" + "/" + "&" + "$" + "#" "@" + "+" + "-" + "*" + "=" \
  9. ":" + ";" + "." "," + "?" + "!" + "{" + "}" + "[" + "]" + "(" + ")" + "%" + "`" + "~" + "^" + "|" + "<" + ">"
  10. for index, text in enumerate(originalText):
  11. originalText[index] = ''.join([char if char in safe_chars else '' for char in text])
  12. return originalText
  13. def convertDate(sdate, language, crawlerDate):
  14. if language == "english":
  15. todaysday = crawlerDate.strftime("%m/%d/%Y")
  16. sdate = sdate.replace(u"January","01")
  17. sdate = sdate.replace(u"February","02")
  18. sdate = sdate.replace(u"March","03")
  19. sdate = sdate.replace(u"April","04")
  20. sdate = sdate.replace(u"May","05")
  21. sdate = sdate.replace(u"June","06")
  22. sdate = sdate.replace(u"July","07")
  23. sdate = sdate.replace(u"August","08")
  24. sdate = sdate.replace(u"September","09")
  25. sdate = sdate.replace(u"October","10")
  26. sdate = sdate.replace(u"November","11")
  27. sdate = sdate.replace(u"December","12")
  28. sdate = sdate.replace(u"Jan","01")
  29. sdate = sdate.replace(u"Feb","02")
  30. sdate = sdate.replace(u"Mar","03")
  31. sdate = sdate.replace(u"Apr","04")
  32. sdate = sdate.replace(u"May","05")
  33. sdate = sdate.replace(u"Jun","06")
  34. sdate = sdate.replace(u"Jul","07")
  35. sdate = sdate.replace(u"Aug","08")
  36. sdate = sdate.replace(u"Sep","09")
  37. sdate = sdate.replace(u"Oct","10")
  38. sdate = sdate.replace(u"Nov","11")
  39. sdate = sdate.replace(u"Dec","12")
  40. sdate = sdate.replace(u".","")
  41. if sdate == "Today at":
  42. sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%m %d %Y')
  43. sdate = datetime.strptime(str(sdate), '%m %d %Y').strftime('%m/%d/%Y')
  44. elif language == "french":
  45. todaysday = crawlerDate.strftime("%m/%d/%Y")
  46. sdate = sdate.replace(u"janvier","01")
  47. sdate = sdate.replace(u"jan","01")
  48. sdate = sdate.replace(u"février","02")
  49. sdate = sdate.replace(u"juin","06")
  50. sdate = sdate.replace(u"juillet","07")
  51. sdate = sdate.replace(u"juil","07")
  52. sdate = sdate.replace(u"août","08")
  53. sdate = sdate.replace(u"septembre","09")
  54. sdate = sdate.replace(u"sept","09")
  55. sdate = sdate.replace(u"octobre","10")
  56. sdate = sdate.replace(u"oct","10")
  57. sdate = sdate.replace(u"novembre","11")
  58. sdate = sdate.replace(u"nov","11")
  59. sdate = sdate.replace(u"décembre","12")
  60. sdate = sdate.replace(u"déc","12")
  61. sdate = sdate.replace(u".","")
  62. if sdate == u"Aujourd'hui":
  63. sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%d %m %Y')
  64. if "mar" in sdate:
  65. print ("Add March to the IBM Black Market")
  66. raise SystemExit
  67. elif "avr" in sdate:
  68. print ("Add April to the IBM Black Market")
  69. raise SystemExit
  70. elif "mai" in sdate:
  71. print ("Add May to the IBM Black Market")
  72. raise SystemExit
  73. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  74. elif language == "swedish":
  75. sdate = sdate.replace(u"jan","01")
  76. sdate = sdate.replace(u"feb","02")
  77. sdate = sdate.replace(u"mar","03")
  78. sdate = sdate.replace(u"apr","04")
  79. sdate = sdate.replace(u"maj","05")
  80. sdate = sdate.replace(u"jun","06")
  81. sdate = sdate.replace(u"jul","07")
  82. sdate = sdate.replace(u"aug","08")
  83. sdate = sdate.replace(u"sep","09")
  84. sdate = sdate.replace(u"okt","10")
  85. sdate = sdate.replace(u"nov","11")
  86. sdate = sdate.replace(u"dec","12")
  87. sdate = sdate.replace(u".","")
  88. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  89. elif language == "russian":
  90. if sdate == u'\u0412\u0447\u0435\u0440\u0430':
  91. sdate = crawlerDate.today() - timedelta(1)
  92. sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
  93. elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
  94. return ""
  95. sdate = sdate.replace(u"января","01")
  96. sdate = sdate.replace(u"янв","01")
  97. sdate = sdate.replace(u"февраля","02")
  98. sdate = sdate.replace(u"Февраль", "02")
  99. sdate = sdate.replace(u"фев","02")
  100. sdate = sdate.replace(u"марта","03")
  101. sdate = sdate.replace(u"апреля","04")
  102. sdate = sdate.replace(u"апр","04")
  103. sdate = sdate.replace(u"мар","05")
  104. sdate = sdate.replace(u"май","05")
  105. sdate = sdate.replace(u"мая","05")
  106. sdate = sdate.replace(u"июня","06")
  107. sdate = sdate.replace(u"июн","06")
  108. sdate = sdate.replace(u"июля","07")
  109. sdate = sdate.replace(u"июл","07")
  110. sdate = sdate.replace(u"августа","08")
  111. sdate = sdate.replace(u"авг","08")
  112. sdate = sdate.replace(u"сентября","09")
  113. sdate = sdate.replace(u"сен","09")
  114. sdate = sdate.replace(u"октября","10")
  115. sdate = sdate.replace(u"Октябрь","10")
  116. sdate = sdate.replace(u"окт","10")
  117. sdate = sdate.replace(u"ноября","11")
  118. sdate = sdate.replace(u"ноя","11")
  119. sdate = sdate.replace(u"декабря","12")
  120. sdate = sdate.replace(u"дек","12")
  121. sdate = sdate.replace(u".","")
  122. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  123. return sdate
  124. def cleanLink(originalLink):
  125. safe_chars = string.ascii_letters + string.digits
  126. originalLink = ''.join([char if char in safe_chars else '' for char in originalLink])
  127. return originalLink
  128. def organizeTopics(forum, nm, topic, board, view, post, user, addDate, href):
  129. rw = []
  130. for n in range(nm):
  131. lne = forum + "," + topic[n] + "," + board + "," # 0, 1, 2
  132. lne += "-1" if len(view) == 0 else view[n] # 3
  133. lne += ","
  134. lne += "-1" if len(post) == 0 else post[n] # 4
  135. lne += "," + user[n] + "," + str(addDate[n]) + "," + time.asctime() # 5, 6, 7
  136. lne += ",-1,-1,-1,-1,-1,-1,-1,-1," # 8, 9, 10, 11, 12, 13, 14, 15
  137. lne += href[n] # 16
  138. rw.append(lne)
  139. return rw
  140. def cleanString(originalString):
  141. updated_string = originalString.replace(",", "") #replace all commas
  142. updated_string = updated_string.replace("\n", "") #replace all newlines
  143. updated_string = updated_string.replace("\t", "") #replace all tabs
  144. updated_string = updated_string.replace("\r", "") #replace all carriage returns
  145. updated_string = updated_string.replace("'", "^") #replace all semicolons
  146. updated_string = updated_string.replace(u"»", '') #replace all arrows
  147. updated_string = updated_string.replace("!", "")
  148. updated_string = updated_string.replace(";", "") #replace all exclamations
  149. return updated_string
  150. #function to convert long informal date string to formal date
  151. def convertFromLongDate(longDate, crawlerdate):
  152. list_of_words = []
  153. list_of_words = longDate.split()
  154. day = 0
  155. week = 0
  156. hour = 0
  157. second = 0
  158. minute = 0
  159. year = 0
  160. total_days = 0
  161. if 'days' in list_of_words:
  162. index = list_of_words.index('days')
  163. day = float(list_of_words[index - 1])
  164. if 'weeks' in list_of_words:
  165. index = list_of_words.index('weeks')
  166. week = float(list_of_words[index - 1])
  167. if 'hours' in list_of_words:
  168. index = list_of_words.index('hours')
  169. hour = float(list_of_words[index - 1])
  170. if 'seconds' in list_of_words:
  171. index = list_of_words.index('seconds')
  172. second = float(list_of_words[index - 1])
  173. if 'minutes' in list_of_words:
  174. index = list_of_words.index('minutes')
  175. minute = float(list_of_words[index - 1])
  176. if 'years' in list_of_words:
  177. index = list_of_words.index('years')
  178. year = float(list_of_words[index - 1])
  179. if year != 0:
  180. total_days = day + 365 * year
  181. #today = datetime.date.today()
  182. timeDelta = fulldatetime.timedelta(days=total_days, weeks=week, hours=hour, seconds=second, minutes=minute)
  183. date = crawlerdate - timeDelta
  184. correct_date = str(date.strftime('%m/%d/%Y'))
  185. return correct_date
  186. def cleanHTML(html):
  187. formats = [
  188. "jpg", "jpeg", "jfif", "pjpeg", "pjp",
  189. "png", "apng", "svg", "bmp", "gif",
  190. "avif", "webp", "ico", "cur", "tiff"
  191. ]
  192. # remove images
  193. clean_html = re.sub(r"<img.*?>", "", html)
  194. clean_html = re.sub(r"<picture.*?>", "", clean_html)
  195. clean_html = re.sub(r"<svg.*?>", "", clean_html)
  196. for fmat in formats:
  197. clean_html = re.sub(r"<object.*" + fmat + ".*?>", "", clean_html)
  198. clean_html = re.sub(r"<canvas.*?>", "", clean_html)
  199. # remove JavaScript
  200. clean_html = re.sub(r"<script.*?>", "", clean_html)
  201. clean_html = re.sub(r"<object.*javascript.*?>", "", clean_html)
  202. clean_html = re.sub(r"<aplet.*mayscript?>", "", clean_html)
  203. clean_html = re.sub(r"<embed.*scriptable?>", "", clean_html)
  204. # image and JavaScript
  205. clean_html = re.sub(r"<div.*background-image.*?>", "", clean_html)
  206. return clean_html