import Forums.Classifier.transformer
|
|
import pickle, re
|
|
|
|
|
|
class Transformer:
    """Convert text into fixed-length vectors over a list of three-letter ngrams.

    Attributes:
        ngram_list: The ngrams whose presence or count is measured.
        ngram_index_dict: Maps each distinct ngram to its position in the
            vectors produced by the methods below.
    """

    def __init__(self):
        """Install the built-in ngram list and derive its position lookup."""
        self.ngram_list = ['res', 'vir', 'att', 'tta', 'tac', 'web', 'cat', 'rse', 'cra', 'rac', 'nso', 'omw', 'mwa', 'tec', 'boo', 'adv', 'abl', 'can', 'mmi', 'cke', 'bot', 'oks', 'ick', 'eak', 'whe', 'val', 'acc', 'mon', 'dvi', 'nto', 'phi', 'deo', 'hao', 'aos', 'pst', 'ddo', 'dos', 'iru', 'kit', 'jac']
        self.ngram_index_dict = self.get_ngram_index()

    def get_ngram_index(self):
        """Return a dict mapping each distinct ngram in ``ngram_list`` to a position.

        Positions start at 0 and follow first-seen order; a repeated ngram
        keeps the position of its first occurrence.
        """
        positions = {}
        for ngram in self.ngram_list:
            # setdefault leaves an existing entry untouched, so
            # len(positions) is always the next unused position.
            positions.setdefault(ngram, len(positions))
        return positions

    def binary_vector(self, text):
        """Return a 0/1 list marking which known ngrams occur in ``text``.

        Args:
            text: The string searched with the ``in`` operator.

        Returns:
            A list with one entry per ngram in ``ngram_index_dict``; the
            entry is 1 when that ngram is a substring of ``text``, else 0.
        """
        flags = [0] * len(self.ngram_index_dict)
        for ngram, slot in self.ngram_index_dict.items():
            if ngram not in text:
                continue
            flags[slot] = 1
        return flags

    def frequency_vector(self, text):
        """Return per-ngram occurrence counts for ``text``.

        Args:
            text: The string to count in; ``text.count`` is used, so
                occurrences are counted without overlap.

        Returns:
            A list with one count per ngram in ``ngram_index_dict``.
        """
        counts = [0] * len(self.ngram_index_dict)
        for ngram, slot in self.ngram_index_dict.items():
            counts[slot] = text.count(ngram)
        return counts

    def relative_frequency_vector(self, freq_vec):
        """Scale a count vector so that its entries are shares of the total.

        Args:
            freq_vec: A sequence of counts, e.g. from ``frequency_vector``.

        Returns:
            A list of the same length holding ``count / total`` for each
            entry, or all zeros when the total is not greater than zero.
        """
        total = sum(freq_vec)
        if not total > 0:
            # Nothing to normalise by; report an all-zero vector.
            return [0] * len(freq_vec)
        return [count / total for count in freq_vec]

    def transform(self, text):
        """Return the binary ngram vector for a cleaned copy of ``text``.

        Every character other than an ASCII letter or a space is removed
        and the remainder is lower-cased before the ngrams are looked up.
        """
        letters_only = re.sub(r'[^a-zA-Z ]', '', text)
        return self.binary_vector(letters_only.lower())
|
|
|
|
|
|
def load(file_path):
    """Read and return the object pickled in the file at ``file_path``.

    The file is opened in binary mode and closed before returning.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files from a trusted source.
    """
    with open(file_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored
|
|
|
|
|
|
def fix(fname):
    """Re-save a pickled transformer as a ``Forums.Classifier.transformer.Transformer``.

    The object stored in ``fname`` is loaded, its ``ngram_list`` and
    ``ngram_index_dict`` are copied onto a freshly constructed
    ``Forums.Classifier.transformer.Transformer``, and that new object is
    written with the package's ``save`` to ``fname + ".new"``. The input
    file itself is not modified here.

    NOTE(review): presumably the old pickle refers to the Transformer class
    defined in this script and this rebinds it to the packaged class --
    confirm against how the pickles were produced.
    """
    packaged = Forums.Classifier.transformer
    legacy = load(fname)
    rebuilt = packaged.Transformer()
    rebuilt.ngram_list = legacy.ngram_list
    rebuilt.ngram_index_dict = legacy.ngram_index_dict
    packaged.save(rebuilt, fname + ".new")
|
|
|
|
|
|
# Runs whenever this module is executed or imported: converts the topic-title
# transformer pickle; fix() writes the result next to it with a ".new" suffix.
fix("topic_title_transformer.pickle")
|
|
# fix("title_transformer.pickle")
|