def is_unicode(s):
    # Python 2 idiom: str() raises UnicodeEncodeError when a unicode
    # string contains non-ASCII characters, so this really detects
    # non-ASCII content rather than the unicode type itself. Catch the
    # specific exception instead of using a bare except.
    try:
        str(s)
        return False
    except UnicodeEncodeError:
        return True

def contains_num(s):
    # True if the string contains at least one ASCII digit.
    return any(ch in '0123456789' for ch in s)
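# Quick sanity check with invented inputs. The behaviour shown assumes
# Python 2, where str() fails on non-ASCII unicode text; under Python 3
# str(s) never raises, so is_unicode always returns False there.
#
# >>> is_unicode(u'caf\xe9')
# True
# >>> is_unicode(u'cafe')
# False
# >>> contains_num(u'room 101')
# True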
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import string, re

stemmer = SnowballStemmer('english')

# Load the stopword list once into a set; the original called
# stopwords.words('english') inside the token loop, re-reading the whole
# list for every token.
stop_words = set(stopwords.words('english'))

# Matches a run of letters (no digits, no underscore). Note this shadows
# the try/except is_unicode above: here a token counts as "unicode" if it
# starts with a letter, i.e. it is a word rather than a number or
# punctuation.
word_matcher = re.compile(r'[^\W\d_]+', re.UNICODE)
def is_unicode(s):
    return word_matcher.match(s) is not None

def stem_preprocessor(s):
    return my_preprocessor(s, stem=True)

def my_preprocessor(s, stem=False):
    # Tokenize into letter runs, punctuation runs, and digit runs.
    pattern = r'[^\W\d_]+|[^\w\s]+|\d+'
    tokens = regexp_tokenize(s, pattern)
    cleaned_tokens = []
    for token in tokens:
        # Lowercase before the stopword test; the original compared the
        # raw token, so capitalized stopwords like 'The' slipped through.
        token = token.lower()
        if token and is_unicode(token) and token not in stop_words:
            cleaned_tokens.append(stemmer.stem(token) if stem else token)
    return ' '.join(cleaned_tokens)
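# Example, assuming the stopword corpus has been fetched once with
# nltk.download('stopwords'); the sample sentence is invented:
#
# >>> print(my_preprocessor(u'The 3 dogs are running!'))
# dogs running
# >>> print(stem_preprocessor(u'The 3 dogs are running!'))
# dog run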
# Strip every ASCII punctuation character; see the string module docs:
# http://docs.python.org/2/library/string.html
regex = re.compile(u'[%s]' % re.escape(string.punctuation))

def my_preprocessor2(s):
    tokens = wordpunct_tokenize(s)
    # Tokens that were pure punctuation become empty strings; drop them
    # so the result has no doubled spaces.
    cleaned_tokens = [regex.sub(u'', token) for token in tokens]
    return u' '.join(t for t in cleaned_tokens if t)
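# For instance (invented example). Unlike my_preprocessor, digits and
# stopwords survive here; only the punctuation characters are removed:
#
# >>> print(my_preprocessor2(u"Hello, world! It's 2013."))
# Hello world It s 2013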
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
# The cross_validation module was removed from scikit-learn; these now
# live in model_selection.
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report
from sklearn.neighbors import NearestCentroid

def KFold_classification_report(clf, docs, labels, K=10):
    # Cross-validated evaluation: fit on each training fold, predict its
    # held-out fold, and score all out-of-fold predictions in one report.
    y_pred = [None] * len(docs)
    cv = KFold(n_splits=K, shuffle=True)
    for train_idx, test_idx in cv.split(docs):
        train_docs = [docs[i] for i in train_idx]
        train_labels = [labels[i] for i in train_idx]
        clf.fit(train_docs, train_labels)

        # Predict the whole fold in one call instead of one doc at a time.
        test_docs = [docs[i] for i in test_idx]
        for i, pred in zip(test_idx, clf.predict(test_docs)):
            y_pred[i] = pred

    return classification_report(labels, y_pred)
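# A minimal end-to-end sketch of how the pieces above fit together. The
# toy documents and labels are invented for illustration; CountVectorizer
# accepts any callable as its preprocessor, so stem_preprocessor plugs
# straight in.
if __name__ == '__main__':
    docs = [u'The dog barked loudly', u'Cats purr softly',
            u'Dogs love running outside', u'My cat sleeps all day',
            u'A barking dog woke me', u'Kittens and cats nap often']
    labels = ['dog', 'cat', 'dog', 'cat', 'dog', 'cat']

    clf = make_pipeline(
        CountVectorizer(preprocessor=stem_preprocessor),
        TfidfTransformer(),
        SGDClassifier(),
    )
    # K must not exceed the number of documents; three folds suffice here.
    print(KFold_classification_report(clf, docs, labels, K=3))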