准备数据

  import numpy as np
  import matplotlib.pyplot as plt
  # Toy training set: each row is one sample; column 0 holds feature 1 and
  # column 1 holds feature 2.
  raw_data_X = [
      [3.393533211, 2.331273381],
      [3.110073483, 1.781539638],
      [1.343808831, 3.368360954],
      [3.582294042, 4.679179110],
      [2.280362439, 2.866990263],
      [7.423436942, 4.696522875],
      [5.745051997, 3.533989803],
      [9.172168622, 2.511101045],
      [7.792783481, 3.424088941],
      [7.939820817, 0.791637231],
  ]
  # Class label of each row of raw_data_X, in the same order.
  raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
  # Convert the plain lists to ndarrays so that boolean-mask and column
  # indexing (e.g. X_train[y_train == 0, 0]) work.
  X_train = np.array(raw_data_X)
  y_train = np.array(raw_data_y)

可视化探索

image.png

  # Scatter the training samples with feature 1 on the x-axis and feature 2
  # on the y-axis; the boolean mask y_train == c selects the rows of class c.
  # Label 0 is drawn in green.
  plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], color='g')
  # Label 1 is drawn in red.
  plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], color='r')
  plt.show()

image.png

预测

  # The point whose class is to be predicted.
  x = np.array([8.093607318, 3.365731514])
  # Redraw the training samples (label 0 in green, label 1 in red) ...
  plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], color='g')
  plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], color='r')
  # ... and overlay the query point in blue to see which cluster it lies in.
  plt.scatter(x[0], x[1], color='b')
  plt.show()

image.png

建模

image.png
image.png

模型封装

简单封装

  import numpy as np
  from math import sqrt
  from collections import Counter


  def kNN_classify(k, X_train, y_train, x):
      """Classify a single sample by majority vote of its k nearest neighbours.

      Args:
          k: number of neighbours that vote; must satisfy
              1 <= k <= X_train.shape[0].
          X_train: 2-D array of training samples, one row per sample.
          y_train: 1-D array of labels, one per row of X_train.
          x: 1-D array holding the features of the sample to classify.

      Returns:
          The most frequent label among the k training samples closest to x
          in Euclidean distance.

      Raises:
          AssertionError: if k is out of range or the array shapes disagree.
      """
      assert 1 <= k <= X_train.shape[0], "k must be valid"
      assert X_train.shape[0] == y_train.shape[0], \
          "the size of X_train must equal to the size of y_train"
      assert X_train.shape[1] == x.shape[0], \
          "the feature number of x must be equal to X_train"

      # Euclidean distance from x to every training sample.
      distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]
      # A stable sort keeps equidistant samples in their original order, so
      # the chosen neighbours do not depend on the sort implementation.
      nearest = np.argsort(distances, kind="stable")
      topK_y = [y_train[i] for i in nearest[:k]]
      # On a tied vote, most_common returns the label encountered first,
      # i.e. the label of the closest neighbour among the tied classes.
      votes = Counter(topK_y)
      return votes.most_common(1)[0][0]

image.png

进一步封装

为了让接口与 scikit-learn 的用法保持一致,下面把 kNN 封装成一个带有 fit / predict 方法的类:

  # kNN.py
  import numpy as np
  from math import sqrt
  from collections import Counter


  class KNNClassifier:
      """k-nearest-neighbours classifier with a fit/predict interface."""

      def __init__(self, k):
          """Initialise the kNN classifier.

          Args:
              k: number of neighbours that vote on each prediction (>= 1).

          Raises:
              AssertionError: if k is smaller than 1.
          """
          assert k >= 1, "k must be valid"
          self.k = k
          # Training data is stored by fit(); None means "not fitted yet".
          self._X_train = None
          self._y_train = None

      def fit(self, X_train, y_train):
          """Store the training set X_train / y_train for later predictions.

          Args:
              X_train: 2-D array of training samples, one row per sample.
              y_train: 1-D array of labels, one per row of X_train.

          Returns:
              self, so that calls can be chained.

          Raises:
              AssertionError: if the sizes disagree or there are fewer than
                  k training samples.
          """
          assert X_train.shape[0] == y_train.shape[0], \
              "the size of X_train must be equal to the size of y_train"
          assert self.k <= X_train.shape[0], \
              "the size of X_train must be at least k."
          self._X_train = X_train
          self._y_train = y_train
          return self

      def predict(self, X_predict):
          """Predict a label for every row of X_predict.

          Args:
              X_predict: 2-D array of samples with the same number of
                  features as the training data.

          Returns:
              An ndarray holding one predicted label per row of X_predict.

          Raises:
              AssertionError: if fit() has not been called or the feature
                  counts differ.
          """
          assert self._X_train is not None and self._y_train is not None, \
              "must fit before predict!"
          assert X_predict.shape[1] == self._X_train.shape[1], \
              "the feature number of X_predict must be equal to X_train"
          y_predict = [self._predict(x) for x in X_predict]
          return np.array(y_predict)

      def _predict(self, x):
          """Return the predicted label for a single sample x (1-D array)."""
          assert x.shape[0] == self._X_train.shape[1], \
              "the feature number of x must be equal to X_train"
          # Euclidean distance from x to every training sample.
          distances = [sqrt(np.sum((x_train - x) ** 2))
                       for x_train in self._X_train]
          # A stable sort keeps equidistant samples in their original order,
          # so the chosen neighbours do not depend on the sort implementation.
          nearest = np.argsort(distances, kind="stable")
          topK_y = [self._y_train[i] for i in nearest[:self.k]]
          # On a tied vote, most_common returns the label encountered first.
          votes = Counter(topK_y)
          return votes.most_common(1)[0][0]

      def __repr__(self):
          return "KNN(k=%d)" % self.k

image.png