This is based on the calsyslab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

80 lines
2.3 KiB

1 year ago
  1. import re
  2. import pickle
  3. from scipy.sparse import coo_matrix, hstack, csr_matrix
  class Transformer:
      """Map text onto a fixed vocabulary of three-character n-grams.

      The vocabulary is the hard-coded ``ngram_list``. Every vector produced
      by this class has one slot per distinct n-gram, and the slot position
      of each n-gram is given by ``ngram_index_dict``.
      """

      def __init__(self):
          # Vocabulary order matters: it fixes the position of every feature.
          self.ngram_list = [
              'ake', 'eta', 'ail', 'ing', 'sta', 'xpl', 'plo', 'loi',
              'oit', 'tat', 'use', 'mal', 'alw', 'lwa', 'vel', 'ell',
              'goo', 'ood', 'sec', 'ecu', 'cur', 'uri', 'rit', 'ity',
              'tre', 'beg', 'gin', 'han', 'kin', 'ind', 'hac', 'ack',
              'cki', 'ewb', 'res', 'vir', 'att', 'tta', 'tac', 'web',
              'cat', 'rse', 'cra', 'rac', 'nso', 'omw', 'mwa', 'tec',
              'boo', 'adv', 'abl', 'can', 'mmi', 'cke', 'bot', 'oks',
              'ick', 'eak', 'whe', 'val', 'acc', 'mon', 'dvi', 'nto',
              'phi', 'deo', 'hao', 'aos', 'pst', 'ddo', 'dos', 'iru',
              'kit', 'jac',
          ]
          self.ngram_index_dict = self.get_ngram_index()

      def get_ngram_index(self) -> dict:
          """Return a dict mapping each distinct n-gram to its vector position.

          Positions are handed out in first-occurrence order, so a repeated
          entry in ``ngram_list`` keeps its first position and does not
          create an extra slot.
          """
          # dict.fromkeys drops repeats while preserving first-seen order.
          distinct = dict.fromkeys(self.ngram_list)
          return {ngram: position for position, ngram in enumerate(distinct)}

      def binary_vector(self, text: str) -> list:
          """Return a 0/1 list marking which vocabulary n-grams occur in *text*."""
          vec = [0] * len(self.ngram_index_dict)
          for ngram, index in self.ngram_index_dict.items():
              if ngram in text:
                  vec[index] = 1
          return vec

      def frequency_vector(self, text: str) -> list:
          """Return per-n-gram occurrence counts for *text*.

          Counts come from ``str.count``, i.e. non-overlapping occurrences.
          """
          vec = [0] * len(self.ngram_index_dict)
          for ngram, index in self.ngram_index_dict.items():
              vec[index] = text.count(ngram)
          return vec

      def relative_frequency_vector(self, freq_vec) -> list:
          """Scale *freq_vec* so that its entries sum to 1.

          When the total is not positive (for example an all-zero or empty
          input) a list of zeros of the same length is returned instead, which
          avoids dividing by zero.
          """
          total = sum(freq_vec)
          if total > 0:
              return [count / total for count in freq_vec]
          return [0] * len(freq_vec)

      def transform(self, text: str) -> list:
          """Return the binary n-gram vector for *text*.

          Before matching, every character other than an ASCII letter or a
          space is removed and the remainder is lower-cased.
          """
          clean_sent = re.sub(r'[^a-zA-Z ]', '', text).lower()
          return self.binary_vector(clean_sent)
  def save(tf, fname):
      """Pickle *tf* to the file at *fname*.

      The file is opened in binary write mode, so an existing file at that
      path is overwritten.
      """
      with open(fname, 'wb') as handle:
          pickle.dump(tf, handle)
  def load(fname):
      """Return the object unpickled from the file at *fname*.

      SECURITY: unpickling can execute arbitrary code embedded in the file.
      Only call this on files that come from a trusted source.
      """
      with open(fname, 'rb') as handle:
          return pickle.load(handle)
  46. # file = open('transformer.pickle', 'rb')
  47. # tf = pickle.load(file)
  48. # sentence = "The court has larger bounds when playing doubles"
  49. #
  50. # vec = tf.transform(sentence)
  51. # print(vec)
  52. # mat = coo_matrix(vec)
  53. #
  54. # temp = hstack([mat, mat])
  55. # print(temp)
  56. # file = open('mlp.pickle', 'rb')
  57. # mlp = pickle.load(file)
  58. #
  59. # result = mlp.predict_proba([vec])[0]
  60. # print(result)