鸢尾花种类预测
image.png
image.png

1.获取数据集

  1. from sklearn.datasets import load_iris
  2. iris = load_iris()
  3. print("鸢尾花数据集的返回值:\n", iris)
  4. # 返回值是一个继承自字典的Bench
  5. print("鸢尾花的特征值:\n", iris["data"])
  6. print("鸢尾花的目标值:\n", iris.target)
  7. print("鸢尾花特征的名字:\n", iris.feature_names)
  8. print("鸢尾花目标值的名字:\n", iris.target_names)
  9. print("鸢尾花的描述:\n", iris.DESCR)

2.基本数据处理

  1. import seaborn as sns
  2. import matplotlib.pyplot as plt
  3. import pandas as pd
  4. iris_d = pd.DataFrame(iris['data'], columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'])
  5. iris_d['Species'] = iris.target
  6. def plot_iris(iris, col1, col2):
  7. sns.lmplot(x = col1, y = col2, data = iris, hue = "Species", fit_reg = False)
  8. plt.xlabel(col1)
  9. plt.ylabel(col2)
  10. plt.title('鸢尾花种类分布图')
  11. plt.show()
  12. plot_iris(iris_d, 'Petal_Width', 'Sepal_Length')

image.png

  1. from sklearn.model_selection import train_test_split
  2. x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = 0.2and, random_state=22)

3.特征工程

  1. from sklearn.preprocessing import StandardScaler
  2. from sklearn.neighbors import KNeighborsClassifier
  3. transfer = StandardScaler()
  4. transfer.fit_transform(x_train)
  5. transfer.transform(x_test)

4.机器学习

  1. estimator = KNeighborsClassifier(n_neighbors=9)
  2. estimator.fit(x_train, y_train)

5.模型评估

  1. # 方法1:比对真实值和预测值
  2. y_predict = estimator.predict(x_test)
  3. print("预测结果为:\n", y_predict)
  4. print("比对真实值和预测值:\n", y_predict == y_test)
  5. # 方法2:直接计算准确率
  6. score = estimator.score(x_test, y_test)
  7. print("准确率为:\n", score)
  8. # Output:
  9. 预测结果为:
  10. [0 2 1 2 1 1 1 2 1 0 2 1 2 2 0 2 1 1 1 1 0 2 0 1 2 0 2 2 2 2]
  11. 比对真实值和预测值:
  12. [ True True True True True True True True True True True True
  13. True True True True True True False True True True True True
  14. True True True True True True]
  15. 准确率为:
  16. 0.9666666666666667

6.鸢尾花案例增加K值调优

  1. # 4.2 模型选择与调优——网格搜索和交叉验证
  2. # 准备要调的超参数
  3. from sklearn.model_selection import GridSearchCV
  4. estimator = KNeighborsClassifier()
  5. param_dict = {"n_neighbors": [1, 3, 5]}
  6. estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
  7. estimator.fit(x_train, y_train)
  8. # 方法a:比对预测结果和真实值
  9. y_predict = estimator.predict(x_test)
  10. print("比对预测结果和真实值:\n", y_predict == y_test)
  11. # 方法b:直接计算准确率
  12. score = estimator.score(x_test, y_test)
  13. print("直接计算准确率:\n", score)
  14. print("在交叉验证中验证的最好结果:\n", estimator.best_score_)
  15. print("最好的参数模型:\n", estimator.best_estimator_)
  16. print("每次交叉验证后的准确率结果:\n", estimator.cv_results_)
  17. # Output:
  18. 比对预测结果和真实值:
  19. [ True True True True True True True True True True True True
  20. True True True True True True False True True True True True
  21. True True True True True True]
  22. 直接计算准确率:
  23. 0.9666666666666667
  24. 在交叉验证中验证的最好结果:
  25. 0.9666666666666667
  26. 最好的参数模型:
  27. KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
  28. metric_params=None, n_jobs=1, n_neighbors=3, p=2,
  29. weights='uniform')
  30. 每次交叉验证后的准确率结果:
  31. {'mean_fit_time': array([0.00049504, 0.00028332, 0.00028261]), 'std_fit_time': array([1.04830516e-04, 4.61627189e-06, 2.72766786e-06]), 'mean_score_time': array([0.00073298, 0.00054653, 0.00069141]), 'std_score_time': array([5.69916273e-05, 9.40939864e-06, 9.60944202e-05]), 'param_n_neighbors': masked_array(data=[1, 3, 5],
  32. mask=[False, False, False],
  33. fill_value='?',
  34. dtype=object), 'params': [{'n_neighbors': 1}, {'n_neighbors': 3}, {'n_neighbors': 5}], 'split0_test_score': array([1. , 0.97560976, 0.97560976]), 'split1_test_score': array([0.9 , 0.975, 0.975]), 'split2_test_score': array([0.94871795, 0.94871795, 0.94871795]), 'mean_test_score': array([0.95 , 0.96666667, 0.96666667]), 'std_test_score': array([0.04108569, 0.01245693, 0.01245693]), 'rank_test_score': array([3, 1, 1], dtype=int32), 'split0_train_score': array([1. , 0.93670886, 0.97468354]), 'split1_train_score': array([1. , 0.9875, 0.9625]), 'split2_train_score': array([1. , 0.96296296, 0.97530864]), 'mean_train_score': array([1. , 0.96239061, 0.97083073]), 'std_train_score': array([0. , 0.02073935, 0.00589624])}