k-NN回顾

k-NN基本实现

  1. import numpy as np
  2. import pandas as pd
  ## Load the CIFAR-10 training and test sets
  root = '../cifar-10-batches-py/'


  def _load_batch(name):
      """Read one CIFAR-10 batch file once; return (data, labels column)."""
      # NOTE(security): unpickling can execute arbitrary code, so only load
      # batch files obtained from a trusted source.
      # The path is passed positionally because the keyword was renamed from
      # `path` to `filepath_or_buffer` in pandas 1.0; the positional form
      # works with both old and new releases.
      batch = pd.read_pickle(root + name)
      return batch['data'], np.reshape(batch['labels'], (-1, 1))


  # Five training batches of 10000 images each; every image is a flattened
  # 32*32*3 vector. Pixels are kept as floats so later subtractions cannot
  # wrap around; labels are integer class ids so they can be counted with
  # np.bincount.
  Xtr = np.zeros((50000, 32*32*3))
  ytr = np.zeros((50000, 1), dtype=np.int64)
  for i in range(1, 6):
      rows = slice((i - 1) * 10000, i * 10000)
      Xtr[rows, :], ytr[rows] = _load_batch('data_batch_' + str(i))

  # Test set: inputs keep the dtype stored in the file, labels are a column.
  Xte, yte = _load_batch('test_batch')
  ## Define the k-NN classifier
  class kNN(object):
      """Brute-force k-nearest-neighbour classifier using the L1 distance.

      The constructor only stores the training set; all of the work happens
      in ``forward``.
      """

      def __init__(self, X, y):
          """Store the training data.

          X: training inputs, one flattened sample per row.
          y: training labels, one non-negative integer class id per sample,
             given either as a flat array or as a column of shape (n, 1).
          """
          self.Xtr = X
          self.ytr = y

      def forward(self, Xte, K):
          """Predict a label for every row of Xte by majority vote.

          Xte: test inputs, one flattened sample per row.
          K: number of nearest neighbours that take part in the vote.

          Returns an array of shape (n_test, 1) holding the predicted class
          ids, using the dtype of the stored training labels.
          Raises ValueError if K is smaller than 1.
          """
          if K < 1:
              raise ValueError('K must be at least 1, got %r' % (K,))

          # Work in float64 so that unsigned image data (e.g. uint8 pixels)
          # cannot wrap around when two samples are subtracted.
          train = np.asarray(self.Xtr, dtype=np.float64)
          queries = np.asarray(Xte, dtype=np.float64)

          labels = np.asarray(self.ytr)
          label_dtype = labels.dtype
          if labels.ndim > 1:
              # Labels stored as a column: take the single value per sample.
              labels = labels[:, 0]
          # np.bincount only accepts integers, but the labels may have been
          # stored in a float array.
          labels = labels.astype(np.int64)

          # One prediction per test sample (not one per neighbour).
          Ypred = np.zeros((queries.shape[0], 1), dtype=label_dtype)
          for row, query in enumerate(queries):
              # L1 distance between this test sample and every training sample.
              distances = np.sum(np.abs(train - query), axis=1)
              nearest = np.argsort(distances)[:K]
              # The most frequent class among the K nearest neighbours wins;
              # argmax resolves ties in favour of the smallest class id.
              Ypred[row, 0] = np.argmax(np.bincount(labels[nearest]))
          return Ypred
  # Evaluate the classifier on the first N test images, voting over the
  # 5 nearest training images, and report the fraction predicted correctly.
  N = 10
  nn = kNN(Xtr, ytr)
  Yte_predict = nn.forward(Xte[:N, :], 5)
  accuracy = np.mean(Yte_predict == yte[:N])
  print('ACC:%f' % accuracy)
  1. ACC:0.500000
  1. !jupyter nbconvert --to markdown k-NN.ipynb
  1. [NbConvertApp] Converting notebook k-NN.ipynb to markdown
  2. [NbConvertApp] Support files will be in k-NN_files\
  3. [NbConvertApp] Making directory k-NN_files
  4. [NbConvertApp] Writing 4017 bytes to k-NN.md