K-means聚类

实现K-means

寻找最近中心

  1. def find_closest_centroids(x, centroids):
  2. # Set k,初始化中心的行数就是聚类个数
  3. k = centroids.shape[0]
  4. m = x.shape[0]
  5. # You need to return the following variables correctly.
  6. idx = np.zeros((m, 1), dtype=np.int32)
  7. for i in range(m):
  8. idx[i] = 1
  9. # 计算当前第i个样本到第一个聚类中心的距离,作为初始化值
  10. min_distance = np.linalg.norm(x[i, :] - centroids[1 - 1, :]) ** 2
  11. for j in range(2, k + 1):
  12. distance = np.linalg.norm(x[i, :] - centroids[j - 1, :]) ** 2
  13. if distance < min_distance:
  14. min_distance = distance
  15. idx[i] = j
  16. return idx

更新聚类中心

  1. def compute_centroids(x, idx, k):
  2. # Useful variables
  3. m, n = x.shape
  4. centroids = np.zeros((k, n))
  5. idx = np.ravel(idx)
  6. for i in range(k):
  7. centroids[i] = np.mean(x[np.where(idx == i + 1)], axis=0)
  8. return centroids

示例数据集上运行K-means

  1. def run_k_means(x, initial_centroids, max_iters, plot_progress=False):
  2. if plot_progress:
  3. plt.ion()
  4. plt.figure()
  5. centroids = initial_centroids
  6. previous_centroids = centroids
  7. k = initial_centroids.shape[0]
  8. idx = 0
  9. for i in range(max_iters):
  10. # Output progress
  11. print('K-Means iteration {}/{}...\n'.format(i + 1, max_iters), flush=True)
  12. # For each example in X, assign it to the closest centroid
  13. idx = find_closest_centroids(x, centroids)
  14. # Optionally, plot progress here
  15. if plot_progress:
  16. plot_progressk_means(x, centroids, previous_centroids, idx, k, i)
  17. previous_centroids = centroids
  18. # Given the memberships, compute new centroids
  19. centroids = compute_centroids(x, idx, k)
  20. if plot_progress:
  21. plt.close()
  22. return centroids, idx

图像压缩

  1. print('\nRunning K-Means clustering on pixels from an image.\n\n')
  2. A = imread('./data/bird_small.png')
  3. # If imread does not work for you, you can try instead
  4. # load_mat_file ('bird_small.mat');
  5. A = A / 255 # Divide by 255 so that all values are in the range 0 - 1
  6. # Size of the image
  7. img_size = A.shape
  8. # Reshape the image into an Nx3 matrix where N = number of pixels.
  9. # Each row will contain the Red, Green and Blue pixel values
  10. # This gives us our dataset matrix X that we will use K-Means on.
  11. X = np.reshape(A, (img_size[0] * img_size[1], 3), order='F')
  12. # Run your K-Means algorithm on this data
  13. # You should try different values of K and max_iters here
  14. K = 16
  15. pixels_iters = 10
  16. # When using K-Means, it is important the initialize the centroids
  17. # randomly.
  18. # You should complete the code in kMeansInitCentroids.m before proceeding
  19. initial_centroids = k_means_init_centroids(X, K)
  20. # Run K-Means
  21. centroids_img, idx_img = run_k_means(X, initial_centroids, pixels_iters)
  22. print('Program paused. Press enter to continue.\n')
  23. # pause_func()
  24. # ================= Part 5: Image Compression ======================
  25. print('\nApplying K-Means to compress an image.\n\n')
  26. # Find closest cluster members
  27. idx_img_2 = find_closest_centroids(X, centroids_img)
  28. X_recovered = np.zeros((idx_img_2.shape[0], X.shape[1]))
  29. for i in range(idx_img_2.shape[0]):
  30. X_recovered[i] = centroids_img[idx_img_2[i] - 1]
  31. X_recovered = np.reshape(X_recovered, (img_size[0], img_size[1], 3), order='F')
  32. plt.figure()
  33. plt.ion()
  34. plt.subplot(121)
  35. plt.imshow(A)
  36. plt.title('Original')
  37. plt.subplot(122)
  38. plt.imshow(X_recovered)
  39. plt.title('Compressed, with {} colors.'.format(K))
  40. plt.pause(5)

PCA(主成分分析)

示例数据集

  1. print('Visualizing example dataset for PCA.\n\n')
  2. data = load_mat_file('./data/ex7data1.mat')
  3. X = data['X']
  4. # Visualize the example dataset
  5. plt.ion()
  6. plt.figure()
  7. # X[:,0]表示取X的每一行的第0列
  8. plt.scatter(X[:, 0], X[:, 1])
  9. # plt.axis([0.5, 6.5, 2, 8])
  10. # 作图为正方形,并且x,y轴范围相同
  11. plt.axis("square")
  12. plt.pause(0.8)
  13. print('Program paused. Press enter to continue.\n')

实现PCA

特征缩放

  1. def feature_normalize(x):
  2. x_norm = np.zeros(x.shape)
  3. # axis=0,沿列方向计算每行的均值,x输入为50行2列,输出为1行两列
  4. mu = np.mean(x, axis=0)
  5. sigma = np.std(x, axis=0, ddof=1)
  6. for i in range(np.shape(x)[0]):
  7. x_norm[i] = (x[i] - mu) / sigma
  8. return x_norm, mu, sigma

PCA核心

# =============== Part 2: Principal Component Analysis ===============
    print('\nRunning PCA on example dataset.\n\n')
    # Normalize first so no single feature dominates the components.
    X_norm, mu, sigma = feature_normalize(X)
    U, S = pca(X_norm)

    # Draw the eigenvectors centered at the data mean; each solid black
    # line shows a direction of maximum variation, scaled by its
    # singular value (np.dot with a scalar is scalar multiplication).
    first_tip = mu + 1.5 * np.dot(S[0], U[:, 0].T)
    second_tip = mu + 1.5 * np.dot(S[1], U[:, 1].T)
    draw_line(mu, first_tip, "-k")
    draw_line(mu, second_tip, "-k")

    plt.pause(0.8)
    print('Top eigenvector: \n')
    print(' U(:,1) = %s \n' % U[:, 0])
    print('\n(you should expect to see -0.707107 -0.707107)\n')

    print('Program paused. Press enter to continue.\n')
    # pause_func()
def pca(x):
    """Principal component analysis via SVD of the covariance matrix.

    Args:
        x: (m, n) data matrix; assumed already feature-normalized, so
            (1/m) * x^T x is its covariance matrix.

    Returns:
        Tuple (u, s): u is the (n, n) matrix whose columns are the
        principal directions; s holds the corresponding singular values
        in descending order.
    """
    n_examples = x.shape[0]
    # Covariance matrix of the (assumed centered) data.
    covariance = (1 / n_examples) * np.dot(x.T, x)
    # SVD of a symmetric PSD matrix: u's columns are eigenvectors,
    # s the eigenvalues; the right factor is not needed.
    u, s, _ = np.linalg.svd(covariance)
    return u, s

投影与重现数据

def project_data(x, u, k):
    """Project data onto the top k principal components.

    Args:
        x: (m, n) normalized data matrix.
        u: (n, n) principal-direction matrix returned by pca().
        k: number of leading components to keep.

    Returns:
        (m, k) projected data z = x @ u[:, :k].
    """
    # A plain slice selects the first k columns (same as indexing with
    # list(range(k)) in the original).
    u_reduce = u[:, :k]
    return np.dot(x, u_reduce)
def recover_data(z, u, k):
    """Approximately reconstruct data from its k-dimensional projection.

    Args:
        z: (m, k) projected data from project_data().
        u: (n, n) principal-direction matrix returned by pca().
        k: number of components that were kept.

    Returns:
        (m, n) approximate reconstruction x_rec = z @ u[:, :k].T.
    """
    # Slice notation replaces the original list(range(k)) column pick.
    u_reduce = u[:, :k]
    return np.dot(z, u_reduce.T)

面部数据集

    face_date = load_mat_file('./data/ex7faces.mat')
    X = face_date['X']

    # Preview the first 100 faces of the raw dataset.
    plt.close()
    plt.figure()
    display_data(X[:100, :])
    print('Program paused. Press enter to continue.\n')
    # pause_func()

    # =========== Part 5: PCA on Face Data: Eigenfaces  ===================
    # Run PCA and visualize the leading eigenvectors ("eigenfaces").
    print('\nRunning PCA on face dataset.\n(this might take a minute or two ...)\n\n')

    # PCA needs mean-subtracted (normalized) features first.
    X_norm, mu, sigma = feature_normalize(X)

    U, S = pca(X_norm)

    # Show the top 36 eigenvectors rendered as images.
    plt.close()
    plt.figure()
    display_data(U[:, :36].T)
    print('Program paused. Press enter to continue.\n')
    # pause_func()

    # ============= Part 6: Dimension Reduction for Faces =================
    # Project each face onto the top K principal components.
    print('\nDimension reduction for face dataset.\n\n')

    K = 100
    Z = project_data(X_norm, U, K)

    print('The projected data Z has a size of: ')
    print(Z.shape)

    print('Program paused. Press enter to continue.\n')
    # pause_func()

    # ==== Part 7: Visualization of Faces after PCA Dimension Reduction ====
    # Reconstruct faces from only K components and compare them with the
    # normalized originals side by side.
    print('\nVisualizing the projected (reduced dimension) faces.\n\n')
    K = 100
    X_rec = recover_data(Z, U, K)

    # Left: normalized originals.
    plt.close()
    plt.figure()
    plt.subplot(1, 2, 1)
    plt.title('Original faces')
    display_data(X_norm[:100, :])

    # Right: reconstruction from only K eigenfaces.
    plt.subplot(1, 2, 2)
    plt.title('Recovered faces')
    display_data(X_rec[:100, :])
    plt.close()

    print('Program paused. Press enter to continue.\n')

![结果图示](image.png)