This is based on the calsyslab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

66 lines
2.0 KiB

1 year ago
  1. import Forums.Classifier.transformer
  2. import pickle, re
  3. class Transformer:
  4. def __init__(self):
  5. self.ngram_list = ['res', 'vir', 'att', 'tta', 'tac', 'web', 'cat', 'rse', 'cra', 'rac', 'nso', 'omw', 'mwa', 'tec', 'boo', 'adv', 'abl', 'can', 'mmi', 'cke', 'bot', 'oks', 'ick', 'eak', 'whe', 'val', 'acc', 'mon', 'dvi', 'nto', 'phi', 'deo', 'hao', 'aos', 'pst', 'ddo', 'dos', 'iru', 'kit', 'jac']
  6. self.ngram_index_dict = self.get_ngram_index()
  7. def get_ngram_index(self):
  8. ngram_index = {}
  9. index = 0
  10. for ngram in self.ngram_list:
  11. if ngram not in ngram_index:
  12. ngram_index[ngram] = index
  13. index += 1
  14. return ngram_index
  15. def binary_vector(self, text):
  16. vec = [0] * len(self.ngram_index_dict)
  17. for ngram, index in self.ngram_index_dict.items():
  18. if ngram in text:
  19. vec[index] = 1
  20. return vec
  21. def frequency_vector(self, text):
  22. vec = [0] * len(self.ngram_index_dict)
  23. for ngram, index in self.ngram_index_dict.items():
  24. vec[index] = text.count(ngram)
  25. return vec
  26. def relative_frequency_vector(self, freq_vec):
  27. total = 0
  28. for count in freq_vec:
  29. total += count
  30. vec = [0] * len(freq_vec)
  31. if total > 0:
  32. for i, count in enumerate(freq_vec):
  33. vec[i] = count / total
  34. return vec
  35. def transform(self, text):
  36. clean_sent = re.sub(r'[^a-zA-Z ]', '', text).lower()
  37. bin_vec = self.binary_vector(clean_sent)
  38. return bin_vec
  39. def load(file_path):
  40. with open(file_path, 'rb') as f:
  41. return pickle.load(f)
  42. def fix(fname):
  43. t = load(fname)
  44. new_t = Forums.Classifier.transformer.Transformer()
  45. new_t.ngram_list = t.ngram_list
  46. new_t.ngram_index_dict = t.ngram_index_dict
  47. Forums.Classifier.transformer.save(new_t, fname + ".new")
# Script entry: convert the topic-title transformer pickle. fix() hands the
# rebuilt object to Forums.Classifier.transformer.save with ".new" appended
# to this file name.
fix("topic_title_transformer.pickle")
# Second pickle left disabled; uncomment to convert it as well.
# fix("title_transformer.pickle")