让我们来看一个经典的二分类问题:垃圾邮件过滤。
    问题描述:因为我们用到的数据集是来自UCI机器学习仓库的垃圾短信数据集(SMS Spam Collection),所以我们要分类的是垃圾短信和非垃圾短信😝。数据集可以从 https://archive.ics.uci.edu/ml/datasets/sms+spam+collection 下载。下面让我们来看一下代码。

    import numpy as np
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report, f1_score
    from sklearn.model_selection import train_test_split, cross_val_score
    def main():
        """Train and evaluate a TF-IDF + logistic-regression SMS spam classifier.

        Steps:
          1. Load the tab-separated SMS Spam Collection file into a DataFrame
             with columns ``label`` ('ham' / 'spam') and ``content``.
          2. Print how many spam and ham messages were loaded.
          3. Split 75/25 into train/test (shuffled, ``random_state=2020``).
          4. Fit ``TfidfVectorizer`` and ``LogisticRegression`` on the training
             split and print the prediction for the first test message.
          5. Print 5-fold cross-validated accuracy, F1, recall and precision
             computed on the training split (per fold and averaged).
          6. Print a classification report for the held-out test split.

        Returns:
            None. All results are written to stdout.
        """
        df = pd.read_csv('./smsspamcollection/SMSSpamCollection', sep='\t', names=['label', 'content'])
        # NOTE(review): the recorded run below loads 747 + 4825 = 5572 rows. The
        # dataset is usually described as having 5574 messages; presumably
        # unbalanced quote characters merge rows under pandas' default quoting.
        # Consider quoting=csv.QUOTE_NONE -- confirm before changing, since it
        # would alter every recorded number below.
        print('垃圾短信数据量:{}条'.format(int((df['label'] == 'spam').sum())))
        print('非垃圾短信数据集:{}条'.format(int((df['label'] == 'ham').sum())))
        # Recorded run: 747 spam messages, 4825 ham messages.

        X_train_text, X_test_text, y_train, y_test = train_test_split(
            df['content'].values, df['label'].values,
            test_size=0.25, shuffle=True, random_state=2020)

        # Vectorize: learn the TF-IDF vocabulary and weights from the training
        # texts only, then apply the same transform to the test texts.
        tfidf = TfidfVectorizer()
        X_train = tfidf.fit_transform(X_train_text)
        X_test = tfidf.transform(X_test_text)

        logist_model = LogisticRegression()
        logist_model.fit(X_train, y_train)
        predictions = logist_model.predict(X_test)
        for i, prediction in enumerate(predictions[:1]):
            print('预测结果为:{},短信内容为:{}'.format(prediction, X_test_text[i]))
        # Recorded run: predicted "ham" for the message
        # "Hmm...my uncle just informed me that he's paying the school directly. So pls buy food."

        # NOTE(review): X_train was vectorized before cross-validation, so the
        # TF-IDF vocabulary/IDF weights have already seen every validation
        # fold. Cross-validating a Pipeline(vectorizer, model) on the raw texts
        # would avoid this; left unchanged so the recorded scores stay valid.

        # 5-fold cross-validated accuracy; works directly on the string labels.
        # cross_val_score fits clones, so `logist_model` itself stays fitted on
        # the full training split.
        accuracy = cross_val_score(logist_model, X_train, y_train, cv=5, scoring='accuracy')

        # Encode labels as integers in the model's own class order. In the
        # recorded run this is ham -> 0, spam -> 1, so the binary 'f1' /
        # 'recall' / 'precision' scorers treat spam as the positive class.
        # Building the map from `classes_` alone keeps it complete even if the
        # test split happened to miss a class.
        y2int = {label: idx for idx, label in enumerate(logist_model.classes_)}
        y_test = np.array([y2int[label] for label in y_test])
        y_train = np.array([y2int[label] for label in y_train])

        f1 = cross_val_score(logist_model, X_train, y_train, cv=5, scoring='f1')
        recall = cross_val_score(logist_model, X_train, y_train, cv=5, scoring='recall')
        precision = cross_val_score(logist_model, X_train, y_train, cv=5, scoring='precision')

        # The printed labels (including the "accuary" spelling) are kept exactly
        # as in the original so the output still matches the recorded run.
        print(f'accuary得分:{accuracy}\n'
              f'f1_score得分:{f1}\n'
              f'recall_score得分:{recall}\n'
              f'precision_score得分:{precision}\n')
        print(f'accuary5折交叉验证平均得分:{np.mean(accuracy)}\n'
              f'f1_score5折交叉验证平均得分:{np.mean(f1)}\n'
              f'recall_score5折交叉验证平均得分:{np.mean(recall)}\n'
              f'precision_score5折交叉验证平均得分:{np.mean(precision)}\n')

        # Reuse the test predictions computed above instead of predicting again;
        # they are string labels, so encode them the same way as y_test.
        encoded_predictions = np.array([y2int[label] for label in predictions])
        print(classification_report(y_test, encoded_predictions))

        # Recorded output of one run:
        #
        #   accuracy per fold:  [0.965311   0.95095694 0.95574163 0.94497608 0.95688623]
        #   f1 per fold:        [0.85128205 0.77837838 0.8042328  0.74725275 0.80851064]
        #   recall per fold:    [0.74107143 0.64285714 0.67857143 0.60714286 0.67857143]
        #   precision per fold: [1.         0.98630137 0.98701299 0.97142857 1.        ]
        #
        #   mean accuracy:  0.9547743746955849
        #   mean f1:        0.7979313238887707
        #   mean recall:    0.6696428571428571
        #   mean precision: 0.9889485856609145
        #
        #                 precision    recall  f1-score   support
        #
        #              0       0.97      1.00      0.98      1206
        #              1       0.99      0.79      0.88       187
        #
        #       accuracy                           0.97      1393
        #      macro avg       0.98      0.90      0.93      1393
        #   weighted avg       0.97      0.97      0.97      1393


    if __name__ == '__main__':
        main()