# 新闻报告的文本分类。数据集将约20000个文档(每类约1000个)分成20个类。要求:给出五折交叉验证结果。
# 数据集:http://www.cs.cmu.edu/afs/cs/project/theo-11/www/naive-bayes.html
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import StratifiedKFold
import numpy as np
def fold_5(news_data):
    """Run 5-fold stratified cross-validation of the Naive Bayes classifier.

    Each train/test partition produced by ``StratifiedKFold`` is passed to
    ``naive_bayes``, which fits the model on the training part and reports
    the results for the held-out part.

    Args:
        news_data: Object exposing ``data`` (a sequence of raw document
            strings) and ``target`` (the class label of each document), such
            as the value returned by ``fetch_20newsgroups``.
    """
    # dtype=object stores references to the existing strings. Without it
    # NumPy builds a fixed-width unicode array in which every element is
    # padded to the length of the longest document, which can exhaust memory
    # when the corpus contains a few very long posts.
    x = np.array(news_data.data, dtype=object)
    # Ensure the labels support indexing with the integer index arrays below.
    y = np.asarray(news_data.target)

    skf = StratifiedKFold(n_splits=5)
    for train_index, test_index in skf.split(x, y):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        naive_bayes(x_train, x_test, y_train, y_test)
def naive_bayes(x_train, x_test, y_train, y_test):
    """Fit TF-IDF + multinomial Naive Bayes on one split and report results.

    Prints the predicted labels for the test documents and the accuracy on
    the test split.

    Args:
        x_train: Raw training documents.
        x_test: Raw test documents.
        y_train: Class labels for ``x_train``.
        y_test: Class labels for ``x_test``.

    Returns:
        The accuracy on the test split, as computed by
        ``MultinomialNB.score``, so that callers can aggregate it across
        several splits.
    """
    # Feature extraction: the vectorizer is fitted on the training documents
    # only and then reused unchanged for the test documents.
    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(x_train)
    test_features = vectorizer.transform(x_test)

    # Classification with multinomial Naive Bayes.
    classifier = MultinomialNB(alpha=1.0)
    classifier.fit(train_features, y_train)
    y_predict = classifier.predict(test_features)
    print("测试集的预测结果为:", y_predict)

    accuracy = classifier.score(test_features, y_test)
    print("模型的预测准确率为:", accuracy)
    return accuracy
if __name__ == "__main__":
    # Load the 20 Newsgroups data with subset="all" and run the five-fold
    # cross-validation on it.
    fold_5(fetch_20newsgroups(subset="all"))