导包和测试数据

  1. import numpy as np
  2. from sklearn.neighbors import KNeighborsClassifier
  3. from matplotlib import pyplot as plt
  4. # 测试数据
  5. raw_data_X = [[3.393533211, 2.331273381],
  6. [3.110073483, 1.781539638],
  7. [1.343808831, 3.368360954],
  8. [3.582294042, 4.679179110],
  9. [2.280362439, 2.866990263],
  10. [7.423436942, 4.696522875],
  11. [5.745051997, 3.533989803],
  12. [9.172168622, 2.511101045],
  13. [7.792783481, 3.424088941],
  14. [7.939820817, 0.791637231]
  15. ]
  16. raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
  17. # 转换成数组
  18. X_train = np.array(raw_data_X)
  19. y_train = np.array(raw_data_y)

画图

  1. # 画散点图
  2. plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1])
  3. plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1])

image.png

预测一个数据

  1. # 预测随机点x
  2. X=np.array([8.093607318, 3.365731514])
  3. # 画图
  4. plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1])
  5. plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1])
  6. plt.scatter(x[0], x[1])

画图

image.png

算法过程

  1. # 计算点与所有样本的距离---欧式距离
  2. distance = np.sqrt(np.sum((X_train - X) ** 2, axis=1))
  3. # 取K值等于6
  4. k = 6
  5. # 取前6个类别
  6. top_k = y_train[distancde_index[:k]]
  7. # 统计
  8. from collections import Counter
  9. count = Counter(top_k)
  10. # 预测结果
  11. y_prect = count.most_common(1)[0][0]

scikit-learn中的KNN

  1. knn = KNeighborsClassifier(n_neighbors=6)
  2. knn.fit(X_train, y_train)
  3. knn.predict([X])

实现KNN算法

  1. # 实现KNN算法
  2. class KNNClassifier:
  3. def __init__(self, k):
  4. self.__k = k
  5. self.__X_train = None
  6. self.__y_train = None
  7. def fit(self, X_train, y_train):
  8. assert 1<= self.__k <= X_train.shape[0]
  9. assert X_train.shape[0] == y_train.shape[0]
  10. self.__X_train = X_train
  11. self.__y_train = y_train
  12. return self
  13. def predict(self, X_predict):
  14. assert self.__X_train.shape[1] == X_predict.shape[1]
  15. predicr_res = []
  16. for X_predict in X_predict:
  17. distance = np.sqrt(np.sum((self.__X_train - X_predict) ** 2, axis=1))
  18. distance_index = np.argsort(distance)
  19. top_k = Counter(self.__y_train[:self.__k])
  20. predict_one = count.most_common(1)[0][0]
  21. predicr_res.append(predict_one)
  22. return predicr_res

测试

  1. my_knn = KNNClassifier(k=6)
  2. my_knn.fit(X_train, y_train)
  3. my_knn.predict(np.array([X]))

为什么要有训练集和测试集

答:训练集的数据通过算法训练出模型,而模型的好坏需要用没有参与训练的数据(即测试集)来验证。训练集的数据是"特征+结果"的形式;如果只拿只有特征、没有结果的数据去预测,就无法判断模型是好还是坏。

手动实现测试集和训练集

  1. from sklearn import datasets
  2. iris = datasets.load_iris()
  3. y = iris.target
  4. X = iris.data
  5. # 为了方便画图取鸢尾花的两个维度
  6. new_X = X[:, 2:]
  7. new_X.shape
  8. # 将原始数据打乱
  9. shuffie_index = np.random.permutation(new_X.shape[0])
  10. test_ratio = 0.2
  11. test_size = int(new_X.shape[0] * test_ratio)
  12. test_index = shuffie_index[:test_size]
  13. train_index = shuffie_index[test_size:]
  14. X_train = new_X[train_index]
  15. X_test = new_X[test_index]
  16. y_train = y[train_index]
  17. y_test = y[test_index]