# 新闻文本分类:数据集共 20 个新闻组类别,每个类别约 1000 篇文档(共约 20000 篇)。要求:给出五折交叉验证的结果。
# 数据集:http://www.cs.cmu.edu/afs/cs/project/theo-11/www/naive-bayes.html
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import StratifiedKFold
import numpy as np


def fold_5(news_data, n_splits=5):
    """Evaluate the TF-IDF + naive Bayes pipeline with stratified k-fold CV.

    Args:
        news_data: Object exposing ``data`` (a sequence of raw document
            strings) and ``target`` (the matching class labels), such as the
            value returned by ``fetch_20newsgroups``.
        n_splits: Number of cross-validation folds. Defaults to 5.

    Returns:
        A list with the test accuracy of each fold, in fold order.
    """
    # dtype=object stores references to the original strings. Without it
    # NumPy builds a fixed-width unicode array padded to the longest document,
    # which wastes a large amount of memory on variable-length texts.
    x = np.array(news_data.data, dtype=object)
    y = np.asarray(news_data.target)

    skf = StratifiedKFold(n_splits=n_splits)
    scores = []
    for train_index, test_index in skf.split(x, y):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        scores.append(naive_bayes(x_train, x_test, y_train, y_test))

    # Report the aggregate result across all folds.
    print(f"{n_splits}折交叉验证的平均准确率为:", np.mean(scores))
    return scores


def naive_bayes(x_train, x_test, y_train, y_test):
    """Train a multinomial naive Bayes classifier on one fold and score it.

    Args:
        x_train: Raw training documents.
        x_test: Raw test documents.
        y_train: Labels for ``x_train``.
        y_test: Labels for ``x_test``.

    Returns:
        The accuracy reported by ``MultinomialNB.score`` on the test fold.
    """
    # Feature extraction: the vocabulary and IDF weights are fitted on the
    # training fold only, then reused to transform the test fold.
    tf = TfidfVectorizer()
    x_train = tf.fit_transform(x_train)
    x_test = tf.transform(x_test)

    # Classification with multinomial naive Bayes (alpha=1.0 smoothing).
    bayes = MultinomialNB(alpha=1.0)
    bayes.fit(x_train, y_train)
    y_predict = bayes.predict(x_test)
    accuracy = bayes.score(x_test, y_test)

    print("测试集的预测结果为:", y_predict)
    print("模型的预测准确率为:", accuracy)
    return accuracy


if __name__ == '__main__':
    news_data = fetch_20newsgroups(subset="all")  # load the data set
    fold_5(news_data)  # five-fold cross-validation
