def is_unicode(s):
    # Python 2 idiom: str() raises UnicodeEncodeError when a unicode
    # string contains non-ASCII characters, so this really detects
    # non-ASCII content rather than the unicode type itself. Catch the
    # specific exception instead of using a bare except.
    try:
        str(s)
        return False
    except UnicodeEncodeError:
        return True

def contains_num(s):
    # True if the string contains at least one ASCII digit.
    return any(ch in '0123456789' for ch in s)
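# Quick sanity check with invented inputs. The behaviour shown assumes
# Python 2, where str() fails on non-ASCII unicode text; under Python 3
# str(s) never raises, so is_unicode always returns False there.
#
# >>> is_unicode(u'caf\xe9')
# True
# >>> is_unicode(u'cafe')
# False
# >>> contains_num(u'room 101')
# True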
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import string, re

stemmer = SnowballStemmer('english')

# Load the stopword list once into a set; the original called
# stopwords.words('english') inside the token loop, re-reading the whole
# list for every token.
stop_words = set(stopwords.words('english'))

# Matches a run of letters (no digits, no underscore). Note this shadows
# the try/except is_unicode above: here a token counts as "unicode" if it
# starts with a letter, i.e. it is a word rather than a number or
# punctuation.
word_matcher = re.compile(r'[^\W\d_]+', re.UNICODE)
def is_unicode(s):
    return word_matcher.match(s) is not None

def stem_preprocessor(s):
    return my_preprocessor(s, stem=True)

def my_preprocessor(s, stem=False):
    # Tokenize into letter runs, punctuation runs, and digit runs.
    pattern = r'[^\W\d_]+|[^\w\s]+|\d+'
    tokens = regexp_tokenize(s, pattern)
    cleaned_tokens = []
    for token in tokens:
        # Lowercase before the stopword test; the original compared the
        # raw token, so capitalized stopwords like 'The' slipped through.
        token = token.lower()
        if token and is_unicode(token) and token not in stop_words:
            cleaned_tokens.append(stemmer.stem(token) if stem else token)
    return ' '.join(cleaned_tokens)
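# Example, assuming the stopword corpus has been fetched once with
# nltk.download('stopwords'); the sample sentence is invented:
#
# >>> print(my_preprocessor(u'The 3 dogs are running!'))
# dogs running
# >>> print(stem_preprocessor(u'The 3 dogs are running!'))
# dog run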
# Strip every ASCII punctuation character; see the string module docs:
# http://docs.python.org/2/library/string.html
regex = re.compile(u'[%s]' % re.escape(string.punctuation))

def my_preprocessor2(s):
    tokens = wordpunct_tokenize(s)
    # Tokens that were pure punctuation become empty strings; drop them
    # so the result has no doubled spaces.
    cleaned_tokens = [regex.sub(u'', token) for token in tokens]
    return u' '.join(t for t in cleaned_tokens if t)
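# For instance (invented example). Unlike my_preprocessor, digits and
# stopwords survive here; only the punctuation characters are removed:
#
# >>> print(my_preprocessor2(u"Hello, world! It's 2013."))
# Hello world It s 2013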
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
# The cross_validation module was removed from scikit-learn; these now
# live in model_selection.
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report
from sklearn.neighbors import NearestCentroid

def KFold_classification_report(clf, docs, labels, K=10):
    # Cross-validated evaluation: fit on each training fold, predict its
    # held-out fold, and score all out-of-fold predictions in one report.
    y_pred = [None] * len(docs)
    cv = KFold(n_splits=K, shuffle=True)
    for train_idx, test_idx in cv.split(docs):
        train_docs = [docs[i] for i in train_idx]
        train_labels = [labels[i] for i in train_idx]
        clf.fit(train_docs, train_labels)

        # Predict the whole fold in one call instead of one doc at a time.
        test_docs = [docs[i] for i in test_idx]
        for i, pred in zip(test_idx, clf.predict(test_docs)):
            y_pred[i] = pred

    return classification_report(labels, y_pred)
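# A minimal end-to-end sketch of how the pieces above fit together. The
# toy documents and labels are invented for illustration; CountVectorizer
# accepts any callable as its preprocessor, so stem_preprocessor plugs
# straight in.
if __name__ == '__main__':
    docs = [u'The dog barked loudly', u'Cats purr softly',
            u'Dogs love running outside', u'My cat sleeps all day',
            u'A barking dog woke me', u'Kittens and cats nap often']
    labels = ['dog', 'cat', 'dog', 'cat', 'dog', 'cat']

    clf = make_pipeline(
        CountVectorizer(preprocessor=stem_preprocessor),
        TfidfTransformer(),
        SGDClassifier(),
    )
    # K must not exceed the number of documents; three folds suffice here.
    print(KFold_classification_report(clf, docs, labels, K=3))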