Classification of text documents using sparse features

Translator: @Loopy
Proofreader: @barrycg

This example shows how to use scikit-learn's bag-of-words approach to classify documents by topic. It uses a scipy.sparse matrix to store the features, and demonstrates various classifiers that can efficiently handle sparse matrices.
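
To make the sparse-storage point concrete, here is a minimal sketch (not part of the original example; the toy `docs` corpus is invented for illustration) showing that scikit-learn's bag-of-words vectorizers return a scipy.sparse matrix rather than a dense array:

```python
# A minimal sketch, assuming only scikit-learn and scipy are installed.
# The `docs` list is a made-up toy corpus for illustration.
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as sp

docs = ["the quick brown fox", "the lazy dog", "the quick dog"]
X = CountVectorizer().fit_transform(docs)

print(sp.issparse(X))  # True: a scipy.sparse matrix (CSR), not a dense array
print(X.shape)         # (3, 6): 3 documents, 6 vocabulary terms
print(X.toarray())     # dense view, only sensible for tiny corpora
```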

The dataset used in this example is the 20 newsgroups dataset, which scikit-learn downloads automatically and then caches.
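
As a hedged aside (not part of the original example): the first call to `fetch_20newsgroups` downloads the archive and caches it on disk (by default under `~/scikit_learn_data`; the `data_home` parameter overrides this), so later calls read from the cache:

```python
# Sketch of the caching behaviour; the print statements are illustrative only.
from sklearn.datasets import fetch_20newsgroups

bunch = fetch_20newsgroups(subset='train')  # downloads once, then reads cache
print(len(bunch.data), "training documents")
print(bunch.target_names[:3])               # first few category names
```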

The bar plot below compares the classifiers, showing each one's accuracy, training time (normalized), and test time (normalized).

```python
import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time

import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
```
```python
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
```
```python
# Parse command-line arguments
op = OptionParser()
op.add_option("--report",
              action="store_true", dest="print_report",
              help="Print a detailed classification report.")
op.add_option("--chi2_select",
              action="store", type="int", dest="select_chi2",
              help="Select some number of features using a chi-squared test")
op.add_option("--confusion_matrix",
              action="store_true", dest="print_cm",
              help="Print the confusion matrix.")
op.add_option("--top10",
              action="store_true", dest="print_top10",
              help="Print ten most discriminative terms per class"
                   " for every classifier.")
op.add_option("--all_categories",
              action="store_true", dest="all_categories",
              help="Whether to use all categories or not.")
op.add_option("--use_hashing",
              action="store_true",
              help="Use a hashing vectorizer.")
op.add_option("--n_features",
              action="store", type=int, default=2 ** 16,
              help="n_features when using the hashing vectorizer.")
op.add_option("--filtered",
              action="store_true",
              help="Remove newsgroup information that is easily overfit: "
                   "headers, signatures, and quoting.")
```
```python
def is_interactive():
    return not hasattr(sys.modules['__main__'], '__file__')
```
```python
# Work-around for running inside Jupyter notebooks and IPython consoles
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

print(__doc__)
op.print_help()
print()
```
```
Automatically created module for IPython interactive environment
Usage: ipykernel_launcher.py [options]

Options:
  -h, --help            show this help message and exit
  --report              Print a detailed classification report.
  --chi2_select=SELECT_CHI2
                        Select some number of features using a chi-squared
                        test
  --confusion_matrix    Print the confusion matrix.
  --top10               Print ten most discriminative terms per class for
                        every classifier.
  --all_categories      Whether to use all categories or not.
  --use_hashing         Use a hashing vectorizer.
  --n_features=N_FEATURES
                        n_features when using the hashing vectorizer.
  --filtered            Remove newsgroup information that is easily overfit:
                        headers, signatures, and quoting.
```
```python
# Load some categories from the training set
if opts.all_categories:
    categories = None
else:
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
```
```python
if opts.filtered:
    remove = ('headers', 'footers', 'quotes')
else:
    remove = ()

print("Loading 20 newsgroups dataset for categories:")
print(categories if categories else "all")
```
```
Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
```
```python
# Download the dataset
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)
```
```python
# The order of labels in `target_names` can be different from `categories`
target_names = data_train.target_names


def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6


data_train_size_mb = size_mb(data_train.data)
data_test_size_mb = size_mb(data_test.data)

print("%d documents - %0.3fMB (training set)" % (
    len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(data_test.data), data_test_size_mb))
print("%d categories" % len(target_names))
```
```
2034 documents - 3.980MB (training set)
1353 documents - 2.867MB (test set)
4 categories
```
```python
# Split the target values into a training set and a test set
y_train, y_test = data_train.target, data_test.target
```
  1. print("使用稀疏向量机从训练数据中提取特征")
  2. t0 = time()
  3. if opts.use_hashing:
  4. vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
  5. n_features=opts.n_features)
  6. X_train = vectorizer.transform(data_train.data)
  7. else:
  8. vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
  9. stop_words='english')
  10. X_train = vectorizer.fit_transform(data_train.data)
  11. duration = time() - t0
  12. print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
  13. print("n_samples: %d, n_features: %d" % X_train.shape)
```
Extracting features from the training data using a sparse vectorizer
done in 0.476004s at 8.360MB/s
n_samples: 2034, n_features: 33809
```
  1. print("使用相同的矢量化器从测试数据中提取特征")
  2. t0 = time()
  3. X_test = vectorizer.transform(data_test.data)
  4. duration = time() - t0
  5. print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
  6. print("n_samples: %d, n_features: %d" % X_test.shape)
```
Extracting features from the test data using the same vectorizer
done in 0.311447s at 9.207MB/s
n_samples: 1353, n_features: 33809
```
```python
# Mapping from integer feature index to original token string
if opts.use_hashing:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()
```
```python
if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
    print("done in %fs" % (time() - t0))
```
```python
if feature_names:
    feature_names = np.asarray(feature_names)
```
```python
# Trim string to fit on terminal (assuming an 80-column display)
def trim(s):
    return s if len(s) <= 80 else s[:77] + "..."
```

Benchmark classifiers

```python
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(target_names):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=target_names))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
```
```python
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=50, tol=1e-3), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50, tol=1e-3),
         "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))
```
```
================================================================================
Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None, solver='sag',
                tol=0.01)
train time: 0.202s
test time:  0.002s
accuracy:   0.897
dimensionality: 33809
density: 1.000000

================================================================================
Perceptron
________________________________________________________________________________
Training: 
Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=50, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)
train time: 0.030s
test time:  0.003s
accuracy:   0.888
dimensionality: 33809
density: 0.255302

================================================================================
Passive-Aggressive
________________________________________________________________________________
Training: 
PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=50, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)
train time: 0.063s
test time:  0.003s
accuracy:   0.902
dimensionality: 33809
density: 0.700487

================================================================================
kNN
________________________________________________________________________________
Training: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')
train time: 0.002s
test time:  0.235s
accuracy:   0.858

================================================================================
Random forest
________________________________________________________________________________
Training: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
train time: 1.752s
test time:  0.084s
accuracy:   0.822
```
```python
for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())

    # Train a Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,
                                       tol=1e-3)))

    # Train an SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                           penalty=penalty)))
```
```
================================================================================
L2 penalty
________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
          verbose=0)
train time: 0.274s
test time:  0.003s
accuracy:   0.900
dimensionality: 33809
density: 1.000000

________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
train time: 0.050s
test time:  0.003s
accuracy:   0.899
dimensionality: 33809
density: 0.573353

================================================================================
L1 penalty
________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l1', random_state=None, tol=0.001,
          verbose=0)
train time: 0.257s
test time:  0.002s
accuracy:   0.873
dimensionality: 33809
density: 0.005568

________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,
              n_iter_no_change=5, n_jobs=None, penalty='l1', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
train time: 0.187s
test time:  0.003s
accuracy:   0.882
dimensionality: 33809
density: 0.023049
```
```python
# Train an SGD model with the Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                       penalty="elasticnet")))
```
```
================================================================================
Elastic-Net penalty
________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,
              n_iter_no_change=5, n_jobs=None, penalty='elasticnet',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
train time: 0.295s
test time:  0.003s
accuracy:   0.897
dimensionality: 33809
density: 0.185956
```
```python
# Train a Rocchio classifier without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))
```
```
================================================================================
NearestCentroid (aka Rocchio classifier)
________________________________________________________________________________
Training: 
NearestCentroid(metric='euclidean', shrink_threshold=None)
train time: 0.007s
test time:  0.002s
accuracy:   0.855
```
```python
# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
results.append(benchmark(ComplementNB(alpha=.1)))
```
```
================================================================================
Naive Bayes
________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.007s
test time:  0.003s
accuracy:   0.899
dimensionality: 33809
density: 1.000000

________________________________________________________________________________
Training: 
BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
train time: 0.010s
test time:  0.008s
accuracy:   0.884
dimensionality: 33809
density: 1.000000

________________________________________________________________________________
Training: 
ComplementNB(alpha=0.1, class_prior=None, fit_prior=True, norm=False)
train time: 0.007s
test time:  0.002s
accuracy:   0.911
dimensionality: 33809
density: 1.000000
```
```python
print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
                                                    tol=1e-3))),
    ('classification', LinearSVC(penalty="l2"))])))
```
```
================================================================================
LinearSVC with L1-based feature selection
________________________________________________________________________________
Training: 
Pipeline(memory=None,
         steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None,
                                                     dual=False,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     loss='squared_hinge',
                                                     max_iter=1000,
                                                     multi_class='ovr',
                                                     penalty='l1',
                                                     random_state=None,
                                                     tol=0.001, verbose=0),
                                 max_features=None, norm_order=1, prefit=False,
                                 threshold=None)),
                ('classification',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=None,
                           tol=0.0001, verbose=0))],
         verbose=False)
train time: 0.277s
test time:  0.002s
accuracy:   0.880
```
```python
# Display names for the plot (one entry per benchmarked estimator)
classifier_dic = {
    'RidgeClassifier': 'Ridge Classifier',
    'Perceptron': 'Perceptron',
    'PassiveAggressiveClassifier': 'Passive-Aggressive',
    'KNeighborsClassifier': 'kNN',
    'RandomForestClassifier': 'Random forest',
    'LinearSVC': 'Linear SVC',
    'SGDClassifier': 'SGD Classifier',
    'NearestCentroid': 'NearestCentroid (Rocchio)',
    'MultinomialNB': 'Naive Bayes (multinomial)',
    'BernoulliNB': 'Naive Bayes (Bernoulli)',
    'ComplementNB': 'Naive Bayes (complement)',
    'Pipeline': 'LinearSVC with L1-based feature selection',
}
```
```python
# Make the plot
indices = np.arange(len(results))

results = [[x[i] for x in results] for i in range(4)]

clf_names, score, training_time, test_time = results
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("Model comparison")
plt.barh(indices, score, .2, label="score", color='navy')
plt.barh(indices + .3, training_time, .2, label="training time",
         color='c')
plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)

for i, c in zip(indices, clf_names):
    plt.text(-.3, i, classifier_dic[c])

plt.show()
```

[Figure: horizontal bar chart comparing each classifier's accuracy, normalized training time, and normalized test time]