新闻组文档的文本分类。数据集共约20000篇文档,分为20个类别(每类约1000篇)。要求:给出五折交叉验证的结果。
    数据集:http://www.cs.cmu.edu/afs/cs/project/theo-11/www/naive-bayes.html

    1. from sklearn.naive_bayes import MultinomialNB
    2. from sklearn.feature_extraction.text import TfidfVectorizer
    3. from sklearn.datasets import fetch_20newsgroups
    4. from sklearn.model_selection import StratifiedKFold
    5. import numpy as np
    def fold_5(news_data):
        """Run stratified 5-fold cross-validation over a labelled text corpus.

        Each fold's train/test partition is passed to ``naive_bayes``, which
        fits a classifier on the training part and reports on the test part.

        Args:
            news_data: Object exposing ``data`` (a sequence of document
                strings) and ``target`` (one class label per document),
                such as the value returned by ``fetch_20newsgroups``.
        """
        # dtype=object stores references to the original strings. Without it
        # NumPy allocates a fixed-width unicode array in which every entry is
        # padded to the length of the longest document, which can exhaust
        # memory on a corpus with a few very long documents.
        x = np.array(news_data.data, dtype=object)
        # asarray is a no-op for ndarray targets and makes plain lists
        # sliceable with the index arrays produced by the splitter.
        y = np.asarray(news_data.target)

        skf = StratifiedKFold(n_splits=5)
        for train_index, test_index in skf.split(x, y):
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            naive_bayes(x_train, x_test, y_train, y_test)
    def naive_bayes(x_train, x_test, y_train, y_test):
        """Fit a TF-IDF + multinomial naive Bayes classifier and evaluate it.

        Prints the predicted labels for the test documents and the accuracy
        on the test split.

        Args:
            x_train: Training documents (iterable of strings).
            x_test: Test documents (iterable of strings).
            y_train: Class labels aligned with ``x_train``.
            y_test: Class labels aligned with ``x_test``.

        Returns:
            The fraction of test documents whose predicted label equals the
            true label, as a float.
        """
        # Feature extraction: the vectorizer is fitted on the training
        # documents only and then applied unchanged to the test documents.
        tf = TfidfVectorizer()
        train_features = tf.fit_transform(x_train)
        test_features = tf.transform(x_test)

        # Naive Bayes classification.
        bayes = MultinomialNB(alpha=1.0)
        bayes.fit(train_features, y_train)
        y_predict = bayes.predict(test_features)
        print("测试集的预测结果为:", y_predict)

        # Accuracy is the mean of element-wise matches; reusing y_predict
        # avoids the second prediction pass that bayes.score() would run.
        accuracy = float(np.mean(y_predict == np.asarray(y_test)))
        print("模型的预测准确率为:", accuracy)
        return accuracy
    if __name__ == "__main__":
        # Load the data set.
        news_data = fetch_20newsgroups(subset="all")
        # Evaluate it with 5-fold cross-validation.
        fold_5(news_data)