准确度:计算 y_true 与 y_predict 之间的准确度。
import numpy as np
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the handwritten-digits dataset (1797 samples, 64 features each).
digits = datasets.load_digits()
X = digits.data
# BUG FIX: `y` was used below but never defined in the original,
# which raised NameError at train_test_split.
y = digits.target

# Hold out 20% of the data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit a 3-nearest-neighbors classifier and predict on the test set.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_predict = knn.predict(X_test)

# Accuracy computed by hand: fraction of matching labels.
sum(y_test == y_predict) / y_test.shape[0]
# Accuracy via the estimator's own score method.
knn.score(X_test, y_test)
# Accuracy via sklearn's metrics helper.
accuracy_score(y_test, y_predict)
超参数
什么是超参数:在运行机器学习算法之前需要设置的参数。
寻找最好的K
# Grid-search by hand: try every k in [1, 10] combined with both
# weighting schemes, keeping the combination with the best test score.
best_k, best_score, weights = 1, 0.0, None
for k in range(1, 11):
    for weight in ('uniform', 'distance'):
        candidate = KNeighborsClassifier(n_neighbors=k, weights=weight)
        candidate.fit(X_train, y_train)
        current = candidate.score(X_test, y_test)
        if current > best_score:
            best_k, best_score, weights = k, current, weight
print('best_k:%s' % best_k)
print('best_score:%s' % best_score)
print('best_weights:%s' % weights)
K近邻算法考虑距离的权重——取距离的倒数。
网格搜索
# Hyperparameter grid: uniform weighting only varies k; distance
# weighting additionally varies the Minkowski exponent p.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 5)),
    },
]

from sklearn.model_selection import GridSearchCV

# Exhaustively evaluate every combination on 4 parallel workers.
grid = GridSearchCV(KNeighborsClassifier(), param_grid, n_jobs=4)
grid.fit(X_train, y_train)
# The refit estimator with the best cross-validated parameters.
grid.best_estimator_
最值归一化
什么是归一化:将数据集映射到 0-1 之间。 为什么:量纲不同导致距离的计算会偏向某一个特征。 适用于有明显边界的数据,例如分数;受 outlier 影响较大。
# Min-max normalization demo: map 100 random integers in [0, 100)
# onto the [0, 1] interval.
x = np.random.randint(0, 100, size=100)
# FIX: the original computed this expression but discarded the result;
# bind it so the normalized array can actually be used.
x_minmax = (x - np.min(x)) / (np.max(x) - np.min(x))
均值方差归一化
把所有的数据归一到均值是0方差为1的分布中
# Standardization (z-score): shift to zero mean, scale to unit variance.
std_data = (x - x.mean()) / x.std()
机器学习库的均值方差归一化和最值归一化
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# sklearn's standardizer; reshape because scalers expect a 2-D
# (n_samples, n_features) array rather than a flat vector.
scaler = StandardScaler()
scaler.fit_transform(x.reshape(-1, 1))