import re
|
|
import pickle
|
|
from scipy.sparse import coo_matrix, hstack, csr_matrix
|
|
|
|
|
|
class Transformer:
    """Turn raw text into fixed-length character n-gram feature vectors.

    Each distinct n-gram of the vocabulary owns one position in the output
    vectors.  Positions are assigned in order of first appearance in
    ``ngram_list``; repeated entries share the position of their first
    occurrence.

    Attributes:
        ngram_list: The vocabulary exactly as supplied (may contain
            duplicates).
        ngram_index_dict: Mapping from each distinct n-gram to its vector
            position.
    """

    # Built-in vocabulary.  The order (and the repeated entries) are kept
    # exactly as they were, because vector positions are derived from it.
    DEFAULT_NGRAMS = (
        'ead', 'ads', 'log', 'his', 'ste', 'pro', 'act', 'ans', 'che', 'hec',
        'eck', 'als', 'ven', 'oli', 'war', 'are', 'cha', 'emp', 'pla', 'lat',
        'ver', 'ifi', 'tio', 'ope', 'pen', 'enb', 'nbu', 'bul', 'ull', 'lle',
        'cra', 'ack', 'cki', 'pre', 'wif', 'hac', 'oot', 'onf', 'nfi', 'fig',
        'lol', 'cap', 'apt', 'ptu', 'ure', 'cti', 'nne', 'oin', 'spo', 'ser',
        'ill', 'ous', 'log', 'pas', 'her', 'ale', 'non', 'mat', 'der', 'lea',
        'bot', 'ols', 'ese', 'ord', 'ick', 'edi', 'ong', 'acc', 'ssi', 'det',
        'tai', 'atm', 'mac', 'saf', 'tco', 'coi', 'tly', 'plo', 'loi', 'oit',
        'mal', 'alw', 'lwa', 'tiv', 'ult', 'lti', 'tim', 'ima', 'len', 'ntl',
        'rof', 'ofe', 'too', 'ool', 'pho', 'hon', 'ass', 'fac', 'ssw', 'swo',
        'nti', 'ecu', 'ani', 'por', 'ort', 'pri', 'day', 'ano', 'ony', 'nym',
        'tch', 'tid', 'pot', 'ema', 'how', 'otn', 'tne', 'tea', 'eal', 'ler',
        'urc', 'rce', 'hot', 'era', 'tub', 'lde', 'ead', 'ndr', 'dro', 'roi',
        'oid', 'nki', 'cry', 'ryp', 'ypt', 'ren', 'rus', 'som', 'ads', 'nso',
        'omw', 'mwa', 'ita', 'mou', 'pay', 'lic', 'dar', 'rkn', 'nvi', 'ink',
        'ayp', 'ypa', 'pal', 'tut', 'ria', 'vpn', 'nto', 'rin', 'utu', 'ond',
        'his', 'tip', 'bin', 'iru', 'ier', 'byp', 'spy', 'eno', 'nom', 'ymo',
        'vie', 'iew', 'tay',
    )

    # Matches every character that is neither an ASCII letter nor a space.
    _NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z ]')

    def __init__(self, ngram_list=None):
        """Build the n-gram vocabulary and its position index.

        Args:
            ngram_list: Optional iterable of n-gram strings to use instead of
                the built-in vocabulary.  When omitted (or ``None``) the
                built-in ``DEFAULT_NGRAMS`` are used, so ``Transformer()``
                behaves as it always has.
        """
        if ngram_list is None:
            ngram_list = self.DEFAULT_NGRAMS
        # Copy into a list so the instance owns its vocabulary.
        self.ngram_list = list(ngram_list)
        self.ngram_index_dict = self.get_ngram_index()

    def get_ngram_index(self):
        """Return a dict mapping each distinct n-gram to its vector position.

        Positions are consecutive integers starting at 0, handed out in order
        of first appearance in ``self.ngram_list``.
        """
        positions = {}
        for ngram in self.ngram_list:
            # len(positions) is the next free slot; setdefault leaves an
            # n-gram that was already seen at its first position.
            positions.setdefault(ngram, len(positions))
        return positions

    def binary_vector(self, text):
        """Return a 0/1 list flagging which n-grams occur in ``text``.

        Args:
            text: String searched for each n-gram as a substring.

        Returns:
            A list with one entry per distinct n-gram: 1 when the n-gram is a
            substring of ``text``, otherwise 0.
        """
        flags = [0] * len(self.ngram_index_dict)
        for ngram, position in self.ngram_index_dict.items():
            if ngram in text:
                flags[position] = 1
        return flags

    def frequency_vector(self, text):
        """Return a list of occurrence counts of each n-gram in ``text``.

        Counts come from ``str.count``, i.e. non-overlapping occurrences.

        Args:
            text: String in which the n-grams are counted.

        Returns:
            A list with one count per distinct n-gram.
        """
        counts = [0] * len(self.ngram_index_dict)
        for ngram, position in self.ngram_index_dict.items():
            counts[position] = text.count(ngram)
        return counts

    def relative_frequency_vector(self, freq_vec):
        """Scale a count vector so that its entries sum to one.

        Args:
            freq_vec: Sequence of numeric counts.

        Returns:
            A list of ``count / total`` values, or a list of zeros of the
            same length when the total is not positive.
        """
        total = sum(freq_vec)
        if total > 0:
            return [count / total for count in freq_vec]
        # Nothing to normalise by: report all zeros instead of dividing.
        return [0] * len(freq_vec)

    def transform(self, text):
        """Convert ``text`` into its binary n-gram presence vector.

        Every character other than an ASCII letter or a space is removed and
        the remainder is lower-cased before the n-grams are looked up.

        Args:
            text: Raw input string.

        Returns:
            The list produced by ``binary_vector`` for the cleaned text.
        """
        clean_sent = self._NON_LETTER_PATTERN.sub('', text).lower()
        return self.binary_vector(clean_sent)
|
|
|
|
|
|
def save(tf, fname):
    """Pickle ``tf`` into the file at ``fname``.

    The file is opened in binary write mode, so existing content is
    replaced.

    Args:
        tf: Object to serialise.
        fname: Path of the file to write.
    """
    with open(fname, 'wb') as sink:
        pickle.Pickler(sink).dump(tf)
|
|
|
|
|
|
def load(fname):
    """Read the file at ``fname`` and return the object pickled inside it.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    that come from a trusted source.

    Args:
        fname: Path of the pickle file to read.

    Returns:
        The object reconstructed by ``pickle.load``.
    """
    with open(fname, 'rb') as source:
        restored = pickle.load(source)
    return restored
|
|
|
|
|
|
# file = open('transformer.pickle', 'rb')
|
|
# tf = pickle.load(file)
|
|
#
|
|
# sentence = "The court has larger bounds when playing doubles"
|
|
#
|
|
# vec = tf.transform(sentence)
|
|
# print(vec)
|
|
|
|
# mat = coo_matrix(vec)
|
|
#
|
|
# temp = hstack([mat, mat])
|
|
# print(temp)
|
|
|
|
# file = open('mlp.pickle', 'rb')
|
|
# mlp = pickle.load(file)
|
|
#
|
|
# result = mlp.predict_proba([vec])[0]
|
|
# print(result)
|