第六章从线性回归到逻辑回归 - 二、垃圾邮件过滤 - 《sklearn_机器学习》

让我们来看一个经典的二分类问题：垃圾邮件过滤。
问题描述：因为我们用到的数据集是来自UCI机器学习仓库中的垃圾信息数据集，所以我们要分类的是垃圾短信和非垃圾短信😝，数据集我们可以从https://archive.ics.uci.edu/ml/datasets/sms+spam+collection下载。下面让我们来看一下代码。


import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
def _encode_labels(labels, label_to_int):
    """Return a 1-D integer array with each label replaced via *label_to_int*."""
    return np.array([label_to_int[label] for label in labels])


def main():
    """Train and evaluate a TF-IDF + logistic-regression SMS spam classifier.

    Steps:
      1. Load the tab-separated ``SMSSpamCollection`` file (columns: label,
         content) and print how many ``spam`` / ``ham`` rows it contains.
      2. Split 75/25 into train/test with a fixed random seed.
      3. Vectorize the text with TF-IDF (fit on the training split only).
      4. Fit a logistic-regression model and print the first test prediction.
      5. Run 5-fold cross-validation on the training split for accuracy, F1,
         recall and precision, then print per-fold and mean scores.
      6. Print a classification report for the held-out test split.

    Returns:
        None. All results are written to stdout.
    """
    df = pd.read_csv('./smsspamcollection/SMSSpamCollection', sep='\t', names=['label', 'content'])
    print('垃圾短信数据量：{}条'.format(len(df[df['label'] == 'spam'])))
    print('非垃圾短信数据集：{}条'.format(len(df[df['label'] == 'ham'])))
    # Sample output:
    #   垃圾短信数据量：747条
    #   非垃圾短信数据集：4825条

    X_train_text, X_test_text, y_train, y_test = train_test_split(
        df['content'].values, df['label'].values,
        test_size=0.25, shuffle=True, random_state=2020)

    # Vectorize: learn vocabulary/IDF on the training text only, then apply
    # the same transform to the test text.
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(X_train_text)
    X_test = tfidf.transform(X_test_text)

    # The model is fit on the original string labels so that predictions
    # print as 'ham' / 'spam'.
    logist_model = LogisticRegression()
    logist_model.fit(X_train, y_train)
    predictions = logist_model.predict(X_test)
    for prediction, text in zip(predictions[:1], X_test_text):
        print('预测结果为：{}，短信内容为：{}'.format(prediction, text))
    # Sample output:
    #   预测结果为：ham，短信内容为：Hmm...my uncle just informed me that he's
    #   paying the school directly. So pls buy food.

    # Encode labels as integers in the order of the fitted model's classes_.
    # The 'f1' / 'recall' / 'precision' scorers need a numeric positive class.
    # Building the map from classes_ itself (rather than from the number of
    # distinct test labels) keeps it complete even if the test split happens
    # to miss a class.
    label_to_int = {label: index for index, label in enumerate(logist_model.classes_)}
    y_train_int = _encode_labels(y_train, label_to_int)
    y_test_int = _encode_labels(y_test, label_to_int)

    # NOTE(review): the TF-IDF vectorizer above was fit on the whole training
    # split, so every CV fold's validation rows already influenced the
    # vocabulary/IDF weights. Wrapping vectorizer + model in a Pipeline would
    # give leakage-free CV scores; left as-is so the sample numbers below
    # remain reproducible.
    accuracy, f1, recall, precision = (
        cross_val_score(logist_model, X_train, y_train_int, cv=5, scoring=metric)
        for metric in ('accuracy', 'f1', 'recall', 'precision')
    )
    print(f'accuary得分：{accuracy}\n'
          f'f1_score得分：{f1}\n'
          f'recall_score得分：{recall}\n'
          f'precision_score得分：{precision}\n')
    print(f'accuary5折交叉验证平均得分：{np.mean(accuracy)}\n'
          f'f1_score5折交叉验证平均得分：{np.mean(f1)}\n'
          f'recall_score5折交叉验证平均得分：{np.mean(recall)}\n'
          f'precision_score5折交叉验证平均得分：{np.mean(precision)}\n')

    # cross_val_score works on clones, so logist_model is still the estimator
    # fit on string labels; reuse its earlier test predictions instead of
    # predicting a second time.
    print(classification_report(y_test_int, _encode_labels(predictions, label_to_int)))
    # Sample output:
    #     accuary得分：[0.965311   0.95095694 0.95574163 0.94497608 0.95688623]
    #     f1_score得分：[0.85128205 0.77837838 0.8042328  0.74725275 0.80851064]
    #     recall_score得分：[0.74107143 0.64285714 0.67857143 0.60714286 0.67857143]
    #     precision_score得分：[1.         0.98630137 0.98701299 0.97142857 1.        ]
    #     accuary5折交叉验证平均得分：0.9547743746955849
    #     f1_score5折交叉验证平均得分：0.7979313238887707
    #     recall_score5折交叉验证平均得分：0.6696428571428571
    #     precision_score5折交叉验证平均得分：0.9889485856609145
    #               precision    recall  f1-score   support
    #            0       0.97      1.00      0.98      1206
    #            1       0.99      0.79      0.88       187
    #     accuracy                           0.97      1393
    #    macro avg       0.98      0.90      0.93      1393
    # weighted avg       0.97      0.97      0.97      1393
# Run the demo only when this file is executed as a script, not on import.
if "__main__" == __name__:
    main()

二、 垃圾邮件过滤

二、垃圾邮件过滤