K-means聚类

实现K-means

寻找最近中心

  1. def find_closest_centroids(x, centroids):
  2. # Set k,初始化中心的行数就是聚类个数
  3. k = centroids.shape[0]
  4. m = x.shape[0]
  5. # You need to return the following variables correctly.
  6. idx = np.zeros((m, 1), dtype=np.int32)
  7. for i in range(m):
  8. idx[i] = 1
  9. # 计算当前第i个样本到第一个聚类中心的距离,作为初始化值
  10. min_distance = np.linalg.norm(x[i, :] - centroids[1 - 1, :]) ** 2
  11. for j in range(2, k + 1):
  12. distance = np.linalg.norm(x[i, :] - centroids[j - 1, :]) ** 2
  13. if distance < min_distance:
  14. min_distance = distance
  15. idx[i] = j
  16. return idx

更新聚类中心

  1. def compute_centroids(x, idx, k):
  2. # Useful variables
  3. m, n = x.shape
  4. centroids = np.zeros((k, n))
  5. idx = np.ravel(idx)
  6. for i in range(k):
  7. centroids[i] = np.mean(x[np.where(idx == i + 1)], axis=0)
  8. return centroids

示例数据集上运行K-means

  1. def run_k_means(x, initial_centroids, max_iters, plot_progress=False):
  2. if plot_progress:
  3. plt.ion()
  4. plt.figure()
  5. centroids = initial_centroids
  6. previous_centroids = centroids
  7. k = initial_centroids.shape[0]
  8. idx = 0
  9. for i in range(max_iters):
  10. # Output progress
  11. print('K-Means iteration {}/{}...\n'.format(i + 1, max_iters), flush=True)
  12. # For each example in X, assign it to the closest centroid
  13. idx = find_closest_centroids(x, centroids)
  14. # Optionally, plot progress here
  15. if plot_progress:
  16. plot_progressk_means(x, centroids, previous_centroids, idx, k, i)
  17. previous_centroids = centroids
  18. # Given the memberships, compute new centroids
  19. centroids = compute_centroids(x, idx, k)
  20. if plot_progress:
  21. plt.close()
  22. return centroids, idx

图像压缩

  1. print('\nRunning K-Means clustering on pixels from an image.\n\n')
  2. A = imread('./data/bird_small.png')
  3. # If imread does not work for you, you can try instead
  4. # load_mat_file ('bird_small.mat');
  5. A = A / 255 # Divide by 255 so that all values are in the range 0 - 1
  6. # Size of the image
  7. img_size = A.shape
  8. # Reshape the image into an Nx3 matrix where N = number of pixels.
  9. # Each row will contain the Red, Green and Blue pixel values
  10. # This gives us our dataset matrix X that we will use K-Means on.
  11. X = np.reshape(A, (img_size[0] * img_size[1], 3), order='F')
  12. # Run your K-Means algorithm on this data
  13. # You should try different values of K and max_iters here
  14. K = 16
  15. pixels_iters = 10
  16. # When using K-Means, it is important the initialize the centroids
  17. # randomly.
  18. # You should complete the code in kMeansInitCentroids.m before proceeding
  19. initial_centroids = k_means_init_centroids(X, K)
  20. # Run K-Means
  21. centroids_img, idx_img = run_k_means(X, initial_centroids, pixels_iters)
  22. print('Program paused. Press enter to continue.\n')
  23. # pause_func()
  24. # ================= Part 5: Image Compression ======================
  25. print('\nApplying K-Means to compress an image.\n\n')
  26. # Find closest cluster members
  27. idx_img_2 = find_closest_centroids(X, centroids_img)
  28. X_recovered = np.zeros((idx_img_2.shape[0], X.shape[1]))
  29. for i in range(idx_img_2.shape[0]):
  30. X_recovered[i] = centroids_img[idx_img_2[i] - 1]
  31. X_recovered = np.reshape(X_recovered, (img_size[0], img_size[1], 3), order='F')
  32. plt.figure()
  33. plt.ion()
  34. plt.subplot(121)
  35. plt.imshow(A)
  36. plt.title('Original')
  37. plt.subplot(122)
  38. plt.imshow(X_recovered)
  39. plt.title('Compressed, with {} colors.'.format(K))
  40. plt.pause(5)

PCA(主成分分析)

示例数据集

  1. print('Visualizing example dataset for PCA.\n\n')
  2. data = load_mat_file('./data/ex7data1.mat')
  3. X = data['X']
  4. # Visualize the example dataset
  5. plt.ion()
  6. plt.figure()
  7. # X[:,0]表示取X的每一行的第0列
  8. plt.scatter(X[:, 0], X[:, 1])
  9. # plt.axis([0.5, 6.5, 2, 8])
  10. # 作图为正方形,并且x,y轴范围相同
  11. plt.axis("square")
  12. plt.pause(0.8)
  13. print('Program paused. Press enter to continue.\n')

实现PCA

特征缩放

  1. def feature_normalize(x):
  2. x_norm = np.zeros(x.shape)
  3. # axis=0,沿列方向计算每行的均值,x输入为50行2列,输出为1行两列
  4. mu = np.mean(x, axis=0)
  5. sigma = np.std(x, axis=0, ddof=1)
  6. for i in range(np.shape(x)[0]):
  7. x_norm[i] = (x[i] - mu) / sigma
  8. return x_norm, mu, sigma

PCA核心

# =============== Part 2: Principal Component Analysis ===============
    print('\nRunning PCA on example dataset.\n\n')
    # Normalize first so no single feature dominates the components.
    X_norm, mu, sigma = feature_normalize(X)
    U, S = pca(X_norm)

    # Draw the eigenvectors centered at the data mean; each solid black
    # line shows a direction of maximum variation, scaled by its
    # singular value (np.dot with a scalar is scalar multiplication).
    first_tip = mu + 1.5 * np.dot(S[0], U[:, 0].T)
    second_tip = mu + 1.5 * np.dot(S[1], U[:, 1].T)
    draw_line(mu, first_tip, "-k")
    draw_line(mu, second_tip, "-k")

    plt.pause(0.8)
    print('Top eigenvector: \n')
    print(' U(:,1) = %s \n' % U[:, 0])
    print('\n(you should expect to see -0.707107 -0.707107)\n')

    print('Program paused. Press enter to continue.\n')
    # pause_func()
def pca(x):
    """Principal component analysis via SVD of the covariance matrix.

    Args:
        x: (m, n) data matrix; assumed already feature-normalized, so
            (1/m) * x^T x is its covariance matrix.

    Returns:
        Tuple (u, s): u is the (n, n) matrix whose columns are the
        principal directions; s holds the corresponding singular values
        in descending order.
    """
    n_examples = x.shape[0]
    # Covariance matrix of the (assumed centered) data.
    covariance = (1 / n_examples) * np.dot(x.T, x)
    # SVD of a symmetric PSD matrix: u's columns are eigenvectors,
    # s the eigenvalues; the right factor is not needed.
    u, s, _ = np.linalg.svd(covariance)
    return u, s

投影与重现数据

def project_data(x, u, k):
    """Project data onto the top k principal components.

    Args:
        x: (m, n) normalized data matrix.
        u: (n, n) principal-direction matrix returned by pca().
        k: number of leading components to keep.

    Returns:
        (m, k) projected data z = x @ u[:, :k].
    """
    # A plain slice selects the first k columns (same as indexing with
    # list(range(k)) in the original).
    u_reduce = u[:, :k]
    return np.dot(x, u_reduce)
def recover_data(z, u, k):
    """Approximately reconstruct data from its k-dimensional projection.

    Args:
        z: (m, k) projected data from project_data().
        u: (n, n) principal-direction matrix returned by pca().
        k: number of components that were kept.

    Returns:
        (m, n) approximate reconstruction x_rec = z @ u[:, :k].T.
    """
    # Slice notation replaces the original list(range(k)) column pick.
    u_reduce = u[:, :k]
    return np.dot(z, u_reduce.T)

面部数据集

    face_date = load_mat_file('./data/ex7faces.mat')
    X = face_date['X']

    # Preview the first 100 faces of the raw dataset.
    plt.close()
    plt.figure()
    display_data(X[:100, :])
    print('Program paused. Press enter to continue.\n')
    # pause_func()

    # =========== Part 5: PCA on Face Data: Eigenfaces  ===================
    # Run PCA and visualize the leading eigenvectors ("eigenfaces").
    print('\nRunning PCA on face dataset.\n(this might take a minute or two ...)\n\n')

    # PCA needs mean-subtracted (normalized) features first.
    X_norm, mu, sigma = feature_normalize(X)

    U, S = pca(X_norm)

    # Show the top 36 eigenvectors rendered as images.
    plt.close()
    plt.figure()
    display_data(U[:, :36].T)
    print('Program paused. Press enter to continue.\n')
    # pause_func()

    # ============= Part 6: Dimension Reduction for Faces =================
    # Project each face onto the top K principal components.
    print('\nDimension reduction for face dataset.\n\n')

    K = 100
    Z = project_data(X_norm, U, K)

    print('The projected data Z has a size of: ')
    print(Z.shape)

    print('Program paused. Press enter to continue.\n')
    # pause_func()

    # ==== Part 7: Visualization of Faces after PCA Dimension Reduction ====
    # Reconstruct faces from only K components and compare them with the
    # normalized originals side by side.
    print('\nVisualizing the projected (reduced dimension) faces.\n\n')
    K = 100
    X_rec = recover_data(Z, U, K)

    # Left: normalized originals.
    plt.close()
    plt.figure()
    plt.subplot(1, 2, 1)
    plt.title('Original faces')
    display_data(X_norm[:100, :])

    # Right: reconstruction from only K eigenfaces.
    plt.subplot(1, 2, 2)
    plt.title('Recovered faces')
    display_data(X_rec[:100, :])
    plt.close()

    print('Program paused. Press enter to continue.\n')

![结果图示](image.png)