This is based on the calsyslab project.

import codecs, re, time

def is_unicode(s):
    # Python 2-era check: str() raised UnicodeEncodeError on non-ASCII input,
    # so failure meant the string held unicode text. (Redefined below.)
    try:
        str(s)
        return False
    except UnicodeEncodeError:
        return True

def contains_num(s):
    # True if the string contains any of the digits 0-9.
    str_nums = [str(num) for num in range(10)]
    char_set = set(s)
    for num in str_nums:
        if num in char_set:
            return True
    return False
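
# Quick illustration of the helpers above on made-up inputs:
#
#   contains_num('abc123')   # -> True ('1', '2', '3' are in the char set)
#   contains_num('abcdef')   # -> False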

from nltk.tokenize import regexp_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords   # requires nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
import string

stemmer = SnowballStemmer('english')
word_matcher = re.compile(r'[^\W\d_]+', re.UNICODE)

def is_unicode(s):
    # Redefinition that shadows the version above: here it means
    # "starts with a letter" (a word character that is not a digit or underscore).
    return bool(word_matcher.match(s))

def stem_preprocessor(s):
    return my_preprocessor(s, stem=True)

def my_preprocessor(s, stem=False):
    # Tokenize into runs of letters, runs of punctuation, or runs of digits,
    # then keep only lowercased (optionally stemmed) letter tokens that are
    # not stopwords. Note the stopword check runs on the original casing.
    pattern = r'[^\W\d_]+|[^\w\s]+|\d+'
    tokens = regexp_tokenize(s, pattern)
    stop_set = set(stopwords.words('english'))  # build once, not once per token
    cleaned_tokens = []
    for token in tokens:
        if token and is_unicode(token) and token not in stop_set:
            cleaned_tokens.append(stemmer.stem(token.lower()) if stem else token.lower())
    return ' '.join(cleaned_tokens)

# String module docs: https://docs.python.org/2/library/string.html
punct_regex = re.compile('[%s]' % re.escape(string.punctuation))

def my_preprocessor2(s):
    # Simpler variant: wordpunct-tokenize, then strip punctuation characters.
    tokens = wordpunct_tokenize(s)
    cleaned_tokens = [punct_regex.sub('', token) for token in tokens]
    return ' '.join(cleaned_tokens)
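
# A quick sketch of the preprocessors on made-up sentences. Because the
# stopword check runs before lowercasing, a capitalized stopword like 'The'
# slips through; and my_preprocessor2 leaves extra spaces where punctuation
# tokens collapse to empty strings:
#
#   my_preprocessor('The 3 quick foxes JUMPED!')    # -> 'the quick foxes jumped'
#   stem_preprocessor('The 3 quick foxes JUMPED!')  # -> 'the quick fox jump'
#   my_preprocessor2('Hello, world!')               # -> 'Hello  world '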

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.neighbors import NearestCentroid
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# and KFold now live in sklearn.model_selection.
from sklearn.model_selection import train_test_split, KFold

def KFold_classification_report(clf, docs, labels, K=10):
    # Collect one out-of-fold prediction per document: for each of the K folds,
    # fit on the training split and predict the held-out documents, so every
    # prediction comes from a model that never saw that document.
    y_pred = [-1] * len(docs)
    cv = KFold(n_splits=K, shuffle=True)
    for traincv, testcv in cv.split(docs):
        train_docs = [docs[i] for i in traincv]
        train_labels = [labels[i] for i in traincv]
        clf.fit(train_docs, train_labels)
        for i in testcv:
            y_pred[i] = clf.predict([docs[i]])[0]
    return classification_report(labels, y_pred)
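
# Minimal usage sketch (illustrative only, not from the original project):
# the toy docs/labels below show how a preprocessor plugs into a scikit-learn
# pipeline and how the K-fold helper is called. Assumes the NLTK stopwords
# corpus has been downloaded via nltk.download('stopwords').
if __name__ == '__main__':
    toy_docs = [
        'Win a FREE prize, reply now!!!',
        'Lunch meeting moved to noon tomorrow',
        'Claim your FREE offer today',
        'Quarterly report attached for review',
        'You have WON a cash prize, act fast',
        'Dinner with the team on Friday',
    ]
    toy_labels = [1, 0, 1, 0, 1, 0]  # 1 = spam, 0 = ham
    clf = make_pipeline(
        CountVectorizer(preprocessor=stem_preprocessor),
        TfidfTransformer(),
        SGDClassifier(),
    )
    # K=3 keeps both classes in every training split of this tiny toy corpus.
    print(KFold_classification_report(clf, toy_docs, toy_labels, K=3))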