准确率

带来的问题:在类别极度偏斜的数据中,比如某事件不发生的概率是99.9999%,那么一个永远预测“不发生”的平凡模型准确率就能达到99.9999%;此时一个预测准确率为99%的模型,并不能说明它是好模型。

精确度:

是指:预测正确为1的个数 / 所有被预测为1的个数(预测正确为1的 + 预测错误为1的)——> TP / (TP + FP)

召回率:

是指:预测正确为1的个数 / 所有真实为1的个数(预测正确为1的 + 被漏判为0的)——> TP / (TP + FN)

  1. import numpy as np
  2. from sklearn import datasets
  3. digits = datasets.load_digits()
  4. X = digits.data
  5. y = digits.target.copy()
  6. y[digits.target == 9] = 1
  7. y[digits.target != 9] = 0
  8. from sklearn.linear_model import LogisticRegression
  9. from sklearn.model_selection import train_test_split
  10. X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
  11. log_reg = LogisticRegression()
  12. log_reg.fit(X_train, y_train)
  13. log_reg.score(X_test, y_test)
  14. log_reg_prdeict = log_reg.predict(X_test)
  15. def TN(y_true, y_predict):
  16. return np.sum((y_true == 0) & (y_predict == 0))
  17. def FP(y_true, y_predict):
  18. return np.sum((y_true == 0) & (y_predict == 1))
  19. def FN(y_true, y_predict):
  20. return np.sum((y_true == 1) & (y_predict == 0))
  21. def TP(y_true, y_predict):
  22. return np.sum((y_true == 1) & (y_predict == 1))
  23. # 混淆矩阵
  24. def confusion_matrix(y_true, y_predict):
  25. return np.array([
  26. [TN(y_true, y_predict), FP(y_true, y_predict)],
  27. [FN(y_true, y_predict), TP(y_true, y_predict)],
  28. ])
  29. confusion_matrix(y_test, log_reg_prdeict)
  30. # 精确度
  31. def precision_score(y_true, y_predict):
  32. confusion_matrix_ = confusion_matrix(y_true, y_predict)
  33. return confusion_matrix_[1, 1] / (confusion_matrix_[0, 1] + confusion_matrix_[1, 1])
  34. # 召回率
  35. def recall_score(y_true, y_predict):
  36. confusion_matrix_ = confusion_matrix(y_true, y_predict)
  37. return confusion_matrix_[1, 1] / (confusion_matrix_[1, 1] + confusion_matrix_[1, 0])
  38. # sklearn库中的方法
  39. from sklearn.metrics import confusion_matrix, precision_score, recall_score
  40. confusion_matrix(y_test, log_reg_prdeict)
  41. ps = precision_score(y_test, log_reg_prdeict)
  42. rs = recall_score(y_test, log_reg_prdeict)

F1 score:

兼顾精确度和召回率

  1. def f1_score(precision_score, recall_score):
  2. try:
  3. return 2 * precision_score * recall_score / (precision_score + recall_score)
  4. except:
  5. return 0.0

精准率和召回率的平衡

  1. log_reg.decision_function(X_test)[:10]
  2. log_reg.predict(X_test)[:10]
  3. # 决策函数
  4. dec_scor = log_reg.decision_function(X_test)
  5. y_predict_2 = np.array(dec_scor >= 5, dtype='int')
  6. confusion_matrix(y_test, y_predict_2)
  7. recall_score(y_test, y_predict_2)
  8. y_predict_3 = np.array(dec_scor >= -5, dtype='int')
  9. confusion_matrix(y_test, y_predict_3)
  10. precision_score(y_test, y_predict_3)
  11. recall_score(y_test, y_predict_3)
  12. thr = np.arange(np.min(dec_scor), np. max(dec_scor), 0.1)
  13. for th in thr:
  14. y_predict = np.array(dec_scor >= th, dtype='int')
  15. precisions.append(precision_score(y_test, y_predict))
  16. recalls.append(recall_score(y_test, y_predict))
  17. from matplotlib import pyplot as plt
  18. plt.plot(thr, precisions)
  19. plt.plot(thr, recalls)

image.png

  1. plt.plot(precisions, recalls)

image.png

Precision-Recall曲线

  1. from sklearn.metrics import precision_recall_curve
  2. precision, recall, thresholds = precision_recall_curve(y_test, dec_scor)
  3. plt.plot(thresholds, precision[:-1])
  4. plt.plot(thresholds, recall[:-1])

image.png

ROC

  1. from sklearn.metrics import roc_curve
  2. fprs, tprs, thresholds = roc_curve(y_test, dec_scor)
  3. plt.plot(fprs, tprs)

image.png