# Based on the calsyslab project.
import codecs, re, time

def is_unicode(s):
    """Return True if s contains non-ASCII characters (Python 2: str() fails).

    NOTE: shadowed by the regex-based is_unicode defined further below.
    """
    try:
        str(s)
        return False
    except UnicodeEncodeError:
        return True
def contains_num(s):
    """Return True if s contains any ASCII digit 0-9."""
    str_nums = [str(num) for num in range(10)]
    char_set = set(s)
    return any(num in char_set for num in str_nums)
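
# Quick illustrative check for contains_num (the sample strings are arbitrary):
def _demo_contains_num():
    assert contains_num(u'room 101')
    assert not contains_num(u'no digits here')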
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import string  # re is already imported above

stemmer = SnowballStemmer('english')

# Matches a run of letters: no digits, underscores, or punctuation.
word_matcher = re.compile(u'[^\W\d_]+', re.UNICODE)

def is_unicode(s):
    """Return True if s starts with a letter (shadows the version above)."""
    return bool(word_matcher.match(s))
def stem_preprocessor(s):
    return my_preprocessor(s, stem=True)

stop_words = set(stopwords.words('english'))

def my_preprocessor(s, stem=False):
    """Tokenize s, drop non-word tokens and stopwords, lowercase, optionally stem."""
    # Letter runs, punctuation runs, and digit runs become separate tokens.
    pattern = u'[^\W\d_]+|[^\w\s]+|\d+'
    tokens = regexp_tokenize(s, pattern)
    cleaned_tokens = []
    for token in tokens:
        # Compare lowercased so capitalized stopwords ('The') are caught too.
        if token and is_unicode(token) and token.lower() not in stop_words:
            cleaned_token = stemmer.stem(token.lower()) if stem else token.lower()
            cleaned_tokens.append(cleaned_token)
    return ' '.join(cleaned_tokens)
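
# Illustrative usage (assumes NLTK's stopword corpus is installed, e.g. via
# nltk.download('stopwords'); the sample sentence is arbitrary):
def _demo_preprocessor():
    print(my_preprocessor(u'The cats were running fast!'))    # cats running fast
    print(stem_preprocessor(u'The cats were running fast!'))  # cat run fast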
# Strips every punctuation character; see the string module docs:
# http://docs.python.org/2/library/string.html
regex = re.compile(u'[%s]' % re.escape(string.punctuation))

def my_preprocessor2(s):
    """Tokenize s and delete all punctuation characters."""
    tokens = wordpunct_tokenize(s)
    cleaned_tokens = [regex.sub(u'', token) for token in tokens]
    # Drop tokens that were pure punctuation and are now empty strings.
    return u' '.join(t for t in cleaned_tokens if t)
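
# Illustrative usage (arbitrary sample string):
def _demo_preprocessor2():
    print(my_preprocessor2(u'Hello, world!'))  # Hello world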
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split, KFold  # sklearn.model_selection in newer releases
from sklearn.metrics import classification_report
from sklearn.neighbors import NearestCentroid

def KFold_classification_report(clf, docs, labels, K=10):
    """Cross-validate clf with K shuffled folds and report precision/recall/F1.

    Each document is predicted exactly once, by the model trained on the
    other K-1 folds, so the report covers the full dataset.
    """
    y_pred = [-1] * len(docs)
    cv = KFold(len(docs), K, shuffle=True)
    for traincv, testcv in cv:
        train_docs = [docs[i] for i in traincv]
        train_labels = [labels[i] for i in traincv]
        clf.fit(train_docs, train_labels)
        for i in testcv:
            y_pred[i] = clf.predict([docs[i]])[0]
    return classification_report(labels, y_pred)
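
# Illustrative sketch: wire the stemming preprocessor into a bag-of-words
# pipeline and cross-validate it. 'docs' and 'labels' are hypothetical
# placeholders for a real corpus; NearestCentroid() could be swapped in for
# SGDClassifier() as a Rocchio-style baseline.
def demo_pipeline(docs, labels):
    clf = make_pipeline(
        CountVectorizer(preprocessor=stem_preprocessor),
        TfidfTransformer(),
        SGDClassifier(),
    )
    print(KFold_classification_report(clf, docs, labels, K=10))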