"""Re-pickle a saved n-gram transformer as a ``Forums.Classifier.transformer`` object.

Running this file as a script loads an existing pickle, copies its n-gram
data onto a fresh ``Forums.Classifier.transformer.Transformer`` and saves the
result next to the original with a ``.new`` suffix.

NOTE(review): the local ``Transformer`` class below is presumably kept so
that pickles referencing a ``Transformer`` defined in the executing script
can still be unpickled -- confirm against how the pickle was produced.
"""

import pickle
import re

import Forums.Classifier.transformer


class Transformer:
    """Map text onto fixed-length vectors indexed by a list of n-grams."""

    def __init__(self):
        # Substrings looked for in the cleaned text; the position of each
        # entry in this list determines its slot in the output vectors.
        self.ngram_list = [
            'res', 'vir', 'att', 'tta', 'tac', 'web', 'cat', 'rse',
            'cra', 'rac', 'nso', 'omw', 'mwa', 'tec', 'boo', 'adv',
            'abl', 'can', 'mmi', 'cke', 'bot', 'oks', 'ick', 'eak',
            'whe', 'val', 'acc', 'mon', 'dvi', 'nto', 'phi', 'deo',
            'hao', 'aos', 'pst', 'ddo', 'dos', 'iru', 'kit', 'jac',
        ]
        self.ngram_index_dict = self.get_ngram_index()

    def get_ngram_index(self):
        """Return a dict mapping each distinct n-gram to its vector index.

        Indices are assigned in order of first appearance in
        ``self.ngram_list``; repeated n-grams keep their first index.
        """
        ngram_index = {}
        for ngram in self.ngram_list:
            # The next free index is the number of n-grams seen so far;
            # setdefault leaves an already-registered n-gram untouched.
            ngram_index.setdefault(ngram, len(ngram_index))
        return ngram_index

    def binary_vector(self, text):
        """Return a 0/1 list marking which indexed n-grams occur in ``text``."""
        vec = [0] * len(self.ngram_index_dict)
        # Assign by stored index rather than by iteration order, so a
        # dict restored from a pickle works whatever its ordering is.
        for ngram, index in self.ngram_index_dict.items():
            if ngram in text:
                vec[index] = 1
        return vec

    def frequency_vector(self, text):
        """Return a list of ``text.count(ngram)`` for every indexed n-gram."""
        vec = [0] * len(self.ngram_index_dict)
        for ngram, index in self.ngram_index_dict.items():
            vec[index] = text.count(ngram)
        return vec

    def relative_frequency_vector(self, freq_vec):
        """Return ``freq_vec`` with every count divided by the sum of counts.

        When the sum is not positive, a list of zeros of the same length is
        returned instead, which avoids dividing by zero.
        """
        total = sum(freq_vec)
        if total > 0:
            return [count / total for count in freq_vec]
        return [0] * len(freq_vec)

    def transform(self, text):
        """Clean ``text`` and return its binary n-gram vector.

        Cleaning removes every character that is not an ASCII letter or a
        space and lowercases what remains.
        """
        clean_sent = re.sub(r'[^a-zA-Z ]', '', text).lower()
        return self.binary_vector(clean_sent)


def load(file_path):
    """Unpickle and return the object stored at ``file_path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while loading;
    only call this on pickle files from a trusted source.
    """
    with open(file_path, 'rb') as f:
        return pickle.load(f)


def fix(fname):
    """Copy the n-gram data of the pickle ``fname`` into ``fname + ".new"``.

    The object loaded from ``fname`` must expose ``ngram_list`` and
    ``ngram_index_dict``; both are copied onto a new
    ``Forums.Classifier.transformer.Transformer``, which is then written
    with ``Forums.Classifier.transformer.save``.
    """
    t = load(fname)
    new_t = Forums.Classifier.transformer.Transformer()
    new_t.ngram_list = t.ngram_list
    new_t.ngram_index_dict = t.ngram_index_dict
    Forums.Classifier.transformer.save(new_t, fname + ".new")


if __name__ == "__main__":
    # Guarded so the conversion only runs when this file is executed as a
    # script, not as a side effect of importing it.
    fix("topic_title_transformer.pickle")
    # fix("title_transformer.pickle")