import codecs, re, time


def is_unicode(s):
    # Python 2 check: str() on a unicode string raises UnicodeEncodeError
    # when it contains non-ASCII characters.
    try:
        str(s)
        return False
    except UnicodeEncodeError:
        return True


def contains_num(s):
    # True if the string contains any ASCII digit 0-9.
    return any(digit in s for digit in '0123456789')
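
# Illustrative behavior (example inputs assumed, not from the original
# file); under Python 2:
#   is_unicode(u'caf\xe9')          -> True  (str() raises UnicodeEncodeError)
#   is_unicode(u'cafe')             -> False
#   contains_num(u'abc123')         -> True
#   contains_num(u'no digits here') -> False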


from nltk.tokenize import regexp_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import string

stemmer = SnowballStemmer('english')
# Build the stopword set once; querying stopwords.words() per token is slow.
STOPWORDS = set(stopwords.words('english'))

# One or more letters, excluding digits and underscores.
word_matcher = re.compile(r'[^\W\d_]+', re.UNICODE)


def is_word(s):
    # True if the token starts with at least one letter.
    return bool(word_matcher.match(s))
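
# Example behavior (inputs assumed):
#   is_word(u'hello') -> True
#   is_word(u'3rd')   -> False  (leading digit)
#   is_word(u'!')     -> False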


def stem_preprocessor(s):
    return my_preprocessor(s, stem=True)


def my_preprocessor(s, stem=False):
    # Tokens are runs of letters, runs of punctuation, or runs of digits.
    pattern = r'[^\W\d_]+|[^\w\s]+|\d+'
    tokens = regexp_tokenize(s, pattern)
    cleaned_tokens = []
    for token in tokens:
        # Lowercase before the stopword test so that e.g. 'The' is filtered.
        token = token.lower()
        if token and is_word(token) and token not in STOPWORDS:
            cleaned_tokens.append(stemmer.stem(token) if stem else token)
    return ' '.join(cleaned_tokens)
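
# Example (input assumed), with and without stemming:
#   my_preprocessor(u'The 3 cats were running!')   -> u'cats running'
#   stem_preprocessor(u'The 3 cats were running!') -> u'cat run'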


# Matches any single ASCII punctuation character; see
# http://docs.python.org/2/library/string.html for string.punctuation.
punct_regex = re.compile(u'[%s]' % re.escape(string.punctuation))


def my_preprocessor2(s):
    tokens = wordpunct_tokenize(s)
    cleaned_tokens = [punct_regex.sub(u'', token) for token in tokens]
    # Drop tokens that were pure punctuation and are now empty.
    return u' '.join(t for t in cleaned_tokens if t)
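
# Example (input assumed); punctuation-only tokens disappear entirely:
#   my_preprocessor2(u'Hello, world!') -> u'Hello world'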


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
# Note: sklearn.cross_validation is the pre-0.18 API; in current
# scikit-learn these utilities live in sklearn.model_selection.
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import classification_report
from sklearn.neighbors import NearestCentroid


def KFold_classification_report(clf, docs, labels, K=10):
    # Out-of-fold evaluation: every document is predicted exactly once,
    # by a model that never saw it during training.
    y_pred = [-1] * len(docs)
    cv = KFold(len(docs), K, shuffle=True)
    for traincv, testcv in cv:
        train_docs = [docs[i] for i in traincv]
        train_labels = [labels[i] for i in traincv]
        clf.fit(train_docs, train_labels)
        # Predict the whole held-out fold in one call instead of one
        # document at a time.
        test_docs = [docs[i] for i in testcv]
        for i, pred in zip(testcv, clf.predict(test_docs)):
            y_pred[i] = pred
    return classification_report(labels, y_pred)
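

# A minimal end-to-end sketch; `sample_docs` and `sample_labels` are
# illustrative stand-ins, not data from the original pipeline.
if __name__ == '__main__':
    sample_docs = [u'The cats were running in the garden!',
                   u'My dog barked at 3 squirrels.',
                   u'Cats purr when they are happy.',
                   u'Dogs love chasing balls in the park.'] * 5
    sample_labels = ['cat', 'dog', 'cat', 'dog'] * 5
    clf = make_pipeline(CountVectorizer(preprocessor=stem_preprocessor),
                        TfidfTransformer(),
                        SGDClassifier())
    print(KFold_classification_report(clf, sample_docs, sample_labels, K=5))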