K近邻法


image.png

距离度量


设特征空间 $\mathcal{X}$ 是 $n$ 维实数向量空间 $\mathbf{R}^n$,$x_i, x_j \in \mathcal{X}$,$x_i = (x_i^{(1)}, x_i^{(2)}, \cdots, x_i^{(n)})^T$,$x_j = (x_j^{(1)}, x_j^{(2)}, \cdots, x_j^{(n)})^T$,则 $x_i$、$x_j$ 的 $L_p$ 距离定义为:

$$L_p(x_i, x_j) = \left( \sum_{l=1}^{n} \left| x_i^{(l)} - x_j^{(l)} \right|^p \right)^{\frac{1}{p}}, \quad p \ge 1$$

  • $p = 1$:曼哈顿距离 $L_1(x_i, x_j) = \sum_{l=1}^{n} \left| x_i^{(l)} - x_j^{(l)} \right|$
  • $p = 2$:欧氏距离 $L_2(x_i, x_j) = \left( \sum_{l=1}^{n} \left| x_i^{(l)} - x_j^{(l)} \right|^2 \right)^{\frac{1}{2}}$
  • $p = \infty$:切比雪夫距离 $L_\infty(x_i, x_j) = \max_l \left| x_i^{(l)} - x_j^{(l)} \right|$

```python
import math
from itertools import combinations

def L(x, y, p=2):
    """Minkowski (L_p) distance between two points.

    parameter: x, y  equal-length sequences of numbers (at least 2 dims)
    parameter: p     order of the norm (1 = Manhattan, 2 = Euclidean)

    Returns 0 when the inputs are not comparable (different lengths or
    fewer than 2 dimensions), matching the original notebook behavior.
    """
    # Example inputs: x1 = [1, 1], x2 = [5, 1]
    if len(x) == len(y) and len(x) > 1:
        # Avoid shadowing the builtin `sum`; pair coordinates with zip.
        total = sum(math.pow(abs(a - b), p) for a, b in zip(x, y))
        return math.pow(total, 1 / p)
    else:
        return 0
```

<a name="DGkLq"></a>
## 例3.1
![image.png](https://cdn.nlark.com/yuque/0/2020/png/1061864/1597065549016-7abba4bd-31a1-4562-81b9-8942eed92858.png#align=left&display=inline&height=48&margin=%5Bobject%20Object%5D&name=image.png&originHeight=74&originWidth=934&size=28307&status=done&style=none&width=605)

---

```python
  7. x1 = [1, 1]
  8. x2 = [5, 1]
  9. x3 = [4, 4]
  1. # x1, x2
  2. for i in range(1, 5):
  3. r = {'1-{}'.format(c): L(x1, c, p=i) for c in [x2, x3]}
  4. print(min(zip(r.values(), r.keys()))) # 输出二者中较小的那个
(4.0, '1-[5, 1]')
(4.0, '1-[5, 1]')
(3.7797631496846193, '1-[4, 4]')
(3.5676213450081633, '1-[4, 4]')

image.png

K近邻算法实现


  • 数据集 | | sepal length | sepal width | petal length | petal width | label | | :—- | :—- | :—- | :—- | :—- | :—- | | 0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 | | 1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 | | 2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 | | 3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 | | 4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 | | 5 | 5.4 | 3.9 | 1.7 | 0.4 | 0 | | 6 | 4.6 | 3.4 | 1.4 | 0.3 | 0 | | 7 | 5.0 | 3.4 | 1.5 | 0.2 | 0 | | 8 | 4.4 | 2.9 | 1.4 | 0.2 | 0 | | 9 | 4.9 | 3.1 | 1.5 | 0.1 | 0 | | 10 | 5.4 | 3.7 | 1.5 | 0.2 | 0 | | 11 | 4.8 | 3.4 | 1.6 | 0.2 | 0 | | 12 | 4.8 | 3.0 | 1.4 | 0.1 | 0 | | 13 | 4.3 | 3.0 | 1.1 | 0.1 | 0 | | 14 | 5.8 | 4.0 | 1.2 | 0.2 | 0 | | 15 | 5.7 | 4.4 | 1.5 | 0.4 | 0 | | 16 | 5.4 | 3.9 | 1.3 | 0.4 | 0 | | 17 | 5.1 | 3.5 | 1.4 | 0.3 | 0 | | 18 | 5.7 | 3.8 | 1.7 | 0.3 | 0 | | 19 | 5.1 | 3.8 | 1.5 | 0.3 | 0 | | 20 | 5.4 | 3.4 | 1.7 | 0.2 | 0 | | 21 | 5.1 | 3.7 | 1.5 | 0.4 | 0 | | 22 | 4.6 | 3.6 | 1.0 | 0.2 | 0 | | 23 | 5.1 | 3.3 | 1.7 | 0.5 | 0 | | 24 | 4.8 | 3.4 | 1.9 | 0.2 | 0 | | 25 | 5.0 | 3.0 | 1.6 | 0.2 | 0 | | 26 | 5.0 | 3.4 | 1.6 | 0.4 | 0 | | 27 | 5.2 | 3.5 | 1.5 | 0.2 | 0 | | 28 | 5.2 | 3.4 | 1.4 | 0.2 | 0 | | 29 | 4.7 | 3.2 | 1.6 | 0.2 | 0 | | … | … | … | … | … | … | | 120 | 6.9 | 3.2 | 5.7 | 2.3 | 2 | | 121 | 5.6 | 2.8 | 4.9 | 2.0 | 2 | | 122 | 7.7 | 2.8 | 6.7 | 2.0 | 2 | | 123 | 6.3 | 2.7 | 4.9 | 1.8 | 2 | | 124 | 6.7 | 3.3 | 5.7 | 2.1 | 2 | | 125 | 7.2 | 3.2 | 6.0 | 1.8 | 2 | | 126 | 6.2 | 2.8 | 4.8 | 1.8 | 2 | | 127 | 6.1 | 3.0 | 4.9 | 1.8 | 2 | | 128 | 6.4 | 2.8 | 5.6 | 2.1 | 2 | | 129 | 7.2 | 3.0 | 5.8 | 1.6 | 2 | | 130 | 7.4 | 2.8 | 6.1 | 1.9 | 2 | | 131 | 7.9 | 3.8 | 6.4 | 2.0 | 2 | | 132 | 6.4 | 2.8 | 5.6 | 2.2 | 2 | | 133 | 6.3 | 2.8 | 5.1 | 1.5 | 2 | | 134 | 6.1 | 2.6 | 5.6 | 1.4 | 2 | | 135 | 7.7 | 3.0 | 6.1 | 2.3 | 2 | | 136 | 6.3 | 3.4 | 5.6 | 2.4 | 2 | | 137 | 6.4 | 3.1 | 5.5 | 1.8 | 2 | | 138 | 6.0 | 3.0 | 4.8 | 1.8 | 2 | | 139 | 6.9 | 3.1 | 5.4 | 2.1 | 2 | | 140 | 6.7 | 3.1 | 5.6 | 2.4 | 2 | | 141 | 6.9 | 3.1 | 5.1 | 2.3 | 2 | | 142 | 5.8 | 2.7 | 5.1 | 
1.9 | 2 | | 143 | 6.8 | 3.2 | 5.9 | 2.3 | 2 | | 144 | 6.7 | 3.3 | 5.7 | 2.5 | 2 | | 145 | 6.7 | 3.0 | 5.2 | 2.3 | 2 | | 146 | 6.3 | 2.5 | 5.0 | 1.9 | 2 | | 147 | 6.5 | 3.0 | 5.2 | 2.0 | 2 | | 148 | 6.2 | 3.4 | 5.4 | 2.3 | 2 | | 149 | 5.9 | 3.0 | 5.1 | 1.8 | 2 |

150 rows × 5 columns

  1. import numpy as np
  2. import pandas as pd
  3. import matplotlib.pyplot as plt
  4. from sklearn.datasets import load_iris
  5. from sklearn.model_selection import train_test_split
  6. from collections import Counter
  7. # data
  8. iris = load_iris()
  9. df = pd.DataFrame(iris.data, columns=iris.feature_names)
  10. df['label'] = iris.target
  11. df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
  12. # k近邻算法
  13. class KNN:
  14. def __init__(self, X_train, y_train, n_neighbors=3, p=2):
  15. """
  16. parameter: n_neighbors 临近点个数
  17. parameter: p 距离度量
  18. """
  19. self.n = n_neighbors
  20. self.p = p
  21. self.X_train = X_train
  22. self.y_train = y_train
  23. def predict(self, X):
  24. # 取出n个点
  25. knn_list = []
  26. for i in range(self.n):
  27. dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
  28. knn_list.append((dist, self.y_train[i]))
  29. for i in range(self.n, len(self.X_train)):
  30. max_index = knn_list.index(max(knn_list, key=lambda x: x[0]))
  31. dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
  32. if knn_list[max_index][0] > dist:
  33. knn_list[max_index] = (dist, self.y_train[i])
  34. # 统计
  35. knn = [k[-1] for k in knn_list]
  36. count_pairs = Counter(knn)
  37. # max_count = sorted(count_pairs, key=lambda x: x)[-1]
  38. max_count = sorted(count_pairs.items(), key=lambda x: x[1])[-1][0]
  39. return max_count
  40. def score(self, X_test, y_test):
  41. right_count = 0
  42. n = 10
  43. for X, y in zip(X_test, y_test):
  44. label = self.predict(X)
  45. if label == y:
  46. right_count += 1
  47. return right_count / len(X_test)
  48. data = np.array(df.iloc[:100, [0, 1, -1]]) # 提取数据
  49. X, y = data[:,:-1], data[:,-1]
  50. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 测试集占比20%
  51. clf = KNN(X_train, y_train)
  52. print('正确率为:{}'.format(clf.score(X_test, y_test)))
  53. test_point = [6.0, 3.0]
  54. print('Test Point: {}'.format(clf.predict(test_point)))
  55. # 画图表示
  56. plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0')
  57. plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1')
  58. plt.plot(test_point[0], test_point[1], 'bo', label='test_point')
  59. plt.xlabel('sepal length')
  60. plt.ylabel('sepal width')
  61. plt.legend()
  62. plt.show()
正确率为:1.0
Test Point: 1.0

Figure_1.png

scikit-learn的KNeighborsClassifier用法


sklearn.neighbors.KNeighborsClassifier

  • n_neighbors: 临近点个数
  • p: 距离度量
  • algorithm: 近邻算法,可选{‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}
  • weights: 确定近邻的权重

```python
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
```

```python
# data
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target
df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']

data = np.array(df.iloc[:100, [0, 1, -1]])  # 提取数据
X, y = data[:, :-1], data[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
test_point = [6.0, 3.0]

clf_sk = KNeighborsClassifier()
clf_sk.fit(X_train, y_train)
print('正确率为:{}'.format(clf_sk.score(X_test, y_test)))
print('Test Point: {}'.format(clf_sk.predict([test_point])))
```

```
正确率为:1.0
Test Point: [1.]
```