This is based on the calsyslab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

307 lines
10 KiB

1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
  1. __author__ = 'DarkWeb'
  2. import string
  3. import time
  4. import re
  5. from datetime import datetime, timedelta
  6. import datetime as fulldatetime
  7. def cleanText(originalText):
  8. safe_chars = string.ascii_letters + string.digits + " " + "_" + "/" + "&" + "$" + "#" "@" + "+" + "-" + "*" + "=" \
  9. ":" + ";" + "." "," + "?" + "!" + "{" + "}" + "[" + "]" + "(" + ")" + "%" + "`" + "~" + "^" + "|" + "<" + ">"
  10. for index, text in enumerate(originalText):
  11. originalText[index] = ''.join([char if char in safe_chars else '' for char in text])
  12. return originalText
  13. def convertDate(sdate, language, crawlerDate):
  14. if language == "english":
  15. todaysday = crawlerDate.strftime("%m/%d/%Y")
  16. sdate = sdate.replace(u"January","01")
  17. sdate = sdate.replace(u"February","02")
  18. sdate = sdate.replace(u"March","03")
  19. sdate = sdate.replace(u"April","04")
  20. sdate = sdate.replace(u"May","05")
  21. sdate = sdate.replace(u"June","06")
  22. sdate = sdate.replace(u"July","07")
  23. sdate = sdate.replace(u"August","08")
  24. sdate = sdate.replace(u"September","09")
  25. sdate = sdate.replace(u"October","10")
  26. sdate = sdate.replace(u"November","11")
  27. sdate = sdate.replace(u"December","12")
  28. sdate = sdate.replace(u"Jan","01")
  29. sdate = sdate.replace(u"Feb","02")
  30. sdate = sdate.replace(u"Mar","03")
  31. sdate = sdate.replace(u"Apr","04")
  32. sdate = sdate.replace(u"May","05")
  33. sdate = sdate.replace(u"Jun","06")
  34. sdate = sdate.replace(u"Jul","07")
  35. sdate = sdate.replace(u"Aug","08")
  36. sdate = sdate.replace(u"Sep","09")
  37. sdate = sdate.replace(u"Oct","10")
  38. sdate = sdate.replace(u"Nov","11")
  39. sdate = sdate.replace(u"Dec","12")
  40. sdate = sdate.replace(u".","")
  41. if sdate == "Today at":
  42. sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%m %d %Y')
  43. sdate = datetime.strptime(str(sdate), '%m %d %Y').strftime('%m/%d/%Y')
  44. elif language == "french":
  45. todaysday = crawlerDate.strftime("%m/%d/%Y")
  46. sdate = sdate.replace(u"janvier","01")
  47. sdate = sdate.replace(u"jan","01")
  48. sdate = sdate.replace(u"février","02")
  49. sdate = sdate.replace(u"juin","06")
  50. sdate = sdate.replace(u"juillet","07")
  51. sdate = sdate.replace(u"juil","07")
  52. sdate = sdate.replace(u"août","08")
  53. sdate = sdate.replace(u"septembre","09")
  54. sdate = sdate.replace(u"sept","09")
  55. sdate = sdate.replace(u"octobre","10")
  56. sdate = sdate.replace(u"oct","10")
  57. sdate = sdate.replace(u"novembre","11")
  58. sdate = sdate.replace(u"nov","11")
  59. sdate = sdate.replace(u"décembre","12")
  60. sdate = sdate.replace(u"déc","12")
  61. sdate = sdate.replace(u".","")
  62. if sdate == u"Aujourd'hui":
  63. sdate = datetime.strptime(str(todaysday), '%m/%d/%Y').strftime('%d %m %Y')
  64. if "mar" in sdate:
  65. print ("Add March to the IBM Black Market")
  66. raise SystemExit
  67. elif "avr" in sdate:
  68. print ("Add April to the IBM Black Market")
  69. raise SystemExit
  70. elif "mai" in sdate:
  71. print ("Add May to the IBM Black Market")
  72. raise SystemExit
  73. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  74. elif language == "swedish":
  75. sdate = sdate.replace(u"jan","01")
  76. sdate = sdate.replace(u"feb","02")
  77. sdate = sdate.replace(u"mar","03")
  78. sdate = sdate.replace(u"apr","04")
  79. sdate = sdate.replace(u"maj","05")
  80. sdate = sdate.replace(u"jun","06")
  81. sdate = sdate.replace(u"jul","07")
  82. sdate = sdate.replace(u"aug","08")
  83. sdate = sdate.replace(u"sep","09")
  84. sdate = sdate.replace(u"okt","10")
  85. sdate = sdate.replace(u"nov","11")
  86. sdate = sdate.replace(u"dec","12")
  87. sdate = sdate.replace(u".","")
  88. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  89. elif language == "russian":
  90. if sdate == u'\u0412\u0447\u0435\u0440\u0430':
  91. sdate = crawlerDate.today() - timedelta(1)
  92. sdate = datetime.strptime(str(sdate), '%Y-%m-%d').strftime('%d %m %Y')
  93. elif u'\xd1\xee\xe7\xe4\xe0\xed\xee' in sdate:
  94. return ""
  95. sdate = sdate.replace(u"января","01")
  96. sdate = sdate.replace(u"янв","01")
  97. sdate = sdate.replace(u"февраля","02")
  98. sdate = sdate.replace(u"Февраль", "02")
  99. sdate = sdate.replace(u"фев","02")
  100. sdate = sdate.replace(u"марта","03")
  101. sdate = sdate.replace(u"апреля","04")
  102. sdate = sdate.replace(u"апр","04")
  103. sdate = sdate.replace(u"мар","05")
  104. sdate = sdate.replace(u"май","05")
  105. sdate = sdate.replace(u"мая","05")
  106. sdate = sdate.replace(u"июня","06")
  107. sdate = sdate.replace(u"июн","06")
  108. sdate = sdate.replace(u"июля","07")
  109. sdate = sdate.replace(u"июл","07")
  110. sdate = sdate.replace(u"августа","08")
  111. sdate = sdate.replace(u"авг","08")
  112. sdate = sdate.replace(u"сентября","09")
  113. sdate = sdate.replace(u"сен","09")
  114. sdate = sdate.replace(u"октября","10")
  115. sdate = sdate.replace(u"Октябрь","10")
  116. sdate = sdate.replace(u"окт","10")
  117. sdate = sdate.replace(u"ноября","11")
  118. sdate = sdate.replace(u"ноя","11")
  119. sdate = sdate.replace(u"декабря","12")
  120. sdate = sdate.replace(u"дек","12")
  121. sdate = sdate.replace(u".","")
  122. sdate = datetime.strptime(str(sdate), '%d %m %Y').strftime('%m/%d/%Y')
  123. return sdate
  124. def cleanLink(originalLink):
  125. safe_chars = string.ascii_letters + string.digits
  126. originalLink = ''.join([char if char in safe_chars else '' for char in originalLink])
  127. return originalLink
  128. def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate):
  129. day = time.strftime("%m/%d/%Y")
  130. ahora = time.strftime("%I:%M:%S")
  131. rw = []
  132. for n in range(nm):
  133. lne = forum # 0
  134. lne += ","
  135. lne += board # 1
  136. lne += ","
  137. lne += author[n] # 2
  138. lne += ","
  139. lne += topic[n] # 3
  140. lne += ","
  141. lne += "-1" if len(views) == 0 else views[n] # 4
  142. lne += ","
  143. lne += "-1" if len(posts) == 0 else posts[n] # 5
  144. lne += ","
  145. lne += "-1" if len(href) == 0 else href[n] # 6
  146. lne += ","
  147. lne += "-1" if len(addDate) == 0 else str(addDate[n]) # 7
  148. lne += ","
  149. lne += day + " " + ahora # 8
  150. lne += ","
  151. lne += "-1" # 9 name_user
  152. lne += ","
  153. lne += "-1" # 10 status_user
  154. lne += ","
  155. lne += "-1" # 11 reputation_user
  156. lne += ","
  157. lne += "-1" # 12 interest_user
  158. lne += ","
  159. lne += "-1" # 13 signature_user
  160. lne += ","
  161. lne += "-1" # 14 content_post
  162. lne += ","
  163. lne += "-1" # 15 feedback_post
  164. lne += ","
  165. lne += "-1" # 16 dateadded_post
  166. rw.append(lne)
  167. return rw
  168. def cleanString(originalString):
  169. updated_string = originalString.replace(",", "") #replace all commas
  170. updated_string = updated_string.replace("\n", "") #replace all newlines
  171. updated_string = updated_string.replace("\t", "") #replace all tabs
  172. updated_string = updated_string.replace("\r", "") #replace all carriage returns
  173. updated_string = updated_string.replace("'", "^") #replace all semicolons
  174. updated_string = updated_string.replace(u"»", '') #replace all arrows
  175. updated_string = updated_string.replace("!", "")
  176. updated_string = updated_string.replace(";", "") #replace all exclamations
  177. return updated_string
  178. #function to convert long informal date string to formal date
  179. def convertFromLongDate(longDate, crawlerdate):
  180. list_of_words = []
  181. list_of_words = longDate.split()
  182. day = 0
  183. week = 0
  184. hour = 0
  185. second = 0
  186. minute = 0
  187. year = 0
  188. total_days = 0
  189. if 'days' in list_of_words:
  190. index = list_of_words.index('days')
  191. day = float(list_of_words[index - 1])
  192. if 'weeks' in list_of_words:
  193. index = list_of_words.index('weeks')
  194. week = float(list_of_words[index - 1])
  195. if 'hours' in list_of_words:
  196. index = list_of_words.index('hours')
  197. hour = float(list_of_words[index - 1])
  198. if 'seconds' in list_of_words:
  199. index = list_of_words.index('seconds')
  200. second = float(list_of_words[index - 1])
  201. if 'minutes' in list_of_words:
  202. index = list_of_words.index('minutes')
  203. minute = float(list_of_words[index - 1])
  204. if 'years' in list_of_words:
  205. index = list_of_words.index('years')
  206. year = float(list_of_words[index - 1])
  207. if year != 0:
  208. total_days = day + 365 * year
  209. #today = datetime.date.today()
  210. timeDelta = fulldatetime.timedelta(days=total_days, weeks=week, hours=hour, seconds=second, minutes=minute)
  211. date = crawlerdate - timeDelta
  212. correct_date = str(date.strftime('%m/%d/%Y'))
  213. return correct_date
  214. def cleanHTML(html):
  215. formats = [
  216. "jpg", "jpeg", "jfif", "pjpeg", "pjp",
  217. "png", "apng", "svg", "bmp", "gif",
  218. "avif", "webp", "ico", "cur", "tiff"
  219. ]
  220. # remove images
  221. clean_html = re.sub(r"<img.*?>", "", html)
  222. clean_html = re.sub(r"<picture.*?>", "", clean_html)
  223. clean_html = re.sub(r"<svg.*?>", "", clean_html)
  224. for fmat in formats:
  225. clean_html = re.sub(r"<object.*" + fmat + ".*?>", "", clean_html)
  226. clean_html = re.sub(r"<canvas.*?>", "", clean_html)
  227. # remove JavaScript
  228. clean_html = re.sub(r"<script.*?>", "", clean_html)
  229. clean_html = re.sub(r"<object.*javascript.*?>", "", clean_html)
  230. clean_html = re.sub(r"<aplet.*mayscript?>", "", clean_html)
  231. clean_html = re.sub(r"<embed.*scriptable?>", "", clean_html)
  232. # image and JavaScript
  233. clean_html = re.sub(r"<div[^>]*style=\"[^\"]*background-image.*?>", "", clean_html)
  234. return clean_html