让我们来看一个经典的二分类问题:垃圾邮件过滤
问题描述:因为我们用到的数据集是来自UCI机器学习仓库中的垃圾短信数据集,所以我们要分类的是垃圾短信和非垃圾短信😝。数据集可以从 https://archive.ics.uci.edu/ml/datasets/sms+spam+collection 下载。下面让我们来看一下代码。
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split, cross_val_score


def _encode_labels(labels, label_to_int):
    """Return an integer array with each entry of *labels* mapped through *label_to_int*."""
    return np.array([label_to_int[label] for label in labels])


def main(data_path='./smsspamcollection/SMSSpamCollection'):
    """Train and evaluate a TF-IDF + logistic-regression SMS spam classifier.

    Reads the tab-separated SMS Spam Collection file, holds out 25% of the
    messages as a test set, fits the classifier on the remaining 75%, and
    prints:

    * the number of spam and ham messages,
    * the prediction for the first test message,
    * per-fold and mean 5-fold cross-validation scores on the training split
      (accuracy, F1, recall, precision),
    * a classification report on the held-out test split.

    Args:
        data_path: Path to the tab-separated dataset file with two columns
            (label, message text) and no header row.
    """
    df = pd.read_csv(data_path, sep='\t', names=['label', 'content'])
    print('垃圾短信数据量:{}条'.format(len(df[df['label'] == 'spam'])))
    print('非垃圾短信数据集:{}条'.format(len(df[df['label'] == 'ham'])))
    # On the UCI dataset this reports 747 spam and 4825 ham messages.

    X_train_text, X_test_text, y_train, y_test = train_test_split(
        df['content'].values,
        df['label'].values,
        test_size=0.25,
        shuffle=True,
        random_state=2020,
    )

    # Vectorize: fit the vocabulary on the training text only, then reuse it
    # for the test text.
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(X_train_text)
    X_test = tfidf.transform(X_test_text)

    logist_model = LogisticRegression()
    logist_model.fit(X_train, y_train)
    predictions = logist_model.predict(X_test)
    for i, prediction in enumerate(predictions[:1]):
        print('预测结果为:{},短信内容为:{}'.format(prediction, X_test_text[i]))
    # Example: the first test message, "Hmm...my uncle just informed me that
    # he's paying the school directly. So pls buy food.", is predicted as ham.

    # Encode the string labels as integers. The mapping is derived from the
    # classes the model was trained on (not from whichever classes happen to
    # appear in the test split), so every label the model can emit has a code.
    # The 'f1' / 'recall' / 'precision' scorers treat label 1 as the positive
    # class, which is why the string labels cannot be passed to them directly.
    y2int = {label: index for index, label in enumerate(logist_model.classes_)}
    y_test_encoded = _encode_labels(y_test, y2int)
    y_train_encoded = _encode_labels(y_train, y2int)

    # 5-fold cross-validation on the training split. Accuracy is scored on
    # the original string labels; the other metrics need the integer labels.
    accuracy = cross_val_score(logist_model, X_train, y_train, cv=5, scoring='accuracy')
    f1 = cross_val_score(logist_model, X_train, y_train_encoded, cv=5, scoring='f1')
    recall = cross_val_score(logist_model, X_train, y_train_encoded, cv=5, scoring='recall')
    precision = cross_val_score(logist_model, X_train, y_train_encoded, cv=5, scoring='precision')

    print(f'accuary得分:{accuracy}\n'
          f'f1_score得分:{f1}\n'
          f'recall_score得分:{recall}\n'
          f'precision_score得分:{precision}\n')
    print(f'accuary5折交叉验证平均得分:{np.mean(accuracy)}\n'
          f'f1_score5折交叉验证平均得分:{np.mean(f1)}\n'
          f'recall_score5折交叉验证平均得分:{np.mean(recall)}\n'
          f'precision_score5折交叉验证平均得分:{np.mean(precision)}\n')

    # Report on the held-out test split, reusing the predictions made above.
    print(classification_report(y_test_encoded, _encode_labels(predictions, y2int)))

    # Reference results recorded by the original author:
    #
    #   per-fold accuracy : [0.965311   0.95095694 0.95574163 0.94497608 0.95688623]
    #   per-fold f1       : [0.85128205 0.77837838 0.8042328  0.74725275 0.80851064]
    #   per-fold recall   : [0.74107143 0.64285714 0.67857143 0.60714286 0.67857143]
    #   per-fold precision: [1.         0.98630137 0.98701299 0.97142857 1.        ]
    #
    #   mean accuracy  : 0.9547743746955849
    #   mean f1        : 0.7979313238887707
    #   mean recall    : 0.6696428571428571
    #   mean precision : 0.9889485856609145
    #
    #                 precision    recall  f1-score   support
    #
    #              0       0.97      1.00      0.98      1206
    #              1       0.99      0.79      0.88       187
    #
    #       accuracy                           0.97      1393
    #      macro avg       0.98      0.90      0.93      1393
    #   weighted avg       0.97      0.97      0.97      1393


if __name__ == '__main__':
    main()
