import random

import numpy as np
    def euclidean_distance(point1, point2):
        """Return the Euclidean (L2) distance between two points.

        Args:
            point1: Coordinates of the first point (array-like or scalar).
            point2: Coordinates of the second point; must broadcast against
                ``point1``.

        Returns:
            The square root of the sum of squared coordinate differences.
        """
        # Converting to float arrays lets plain lists/tuples work and keeps
        # unsigned-integer inputs from wrapping around on subtraction.
        diff = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
        return np.sqrt(np.sum(diff ** 2))
    def centroids_init(k, X):
        """Pick ``k`` distinct rows of ``X`` at random as the initial centroids.

        Args:
            k: Number of centroids to select.
            X: 2-D array-like of shape ``(n_samples, n_features)``.

        Returns:
            A float array of shape ``(k, n_features)`` whose rows are copies of
            ``k`` different rows of ``X``.

        Raises:
            ValueError: If ``k`` is larger than the number of samples, since the
                centroids are drawn without replacement.
        """
        X = np.asarray(X)
        n_samples, n_features = X.shape
        if k > n_samples:
            raise ValueError(
                f"cannot pick {k} distinct centroids from {n_samples} samples"
            )
        centroids = np.zeros((k, n_features))
        # random.sample draws without replacement, so no row is selected twice
        # and the index list does not have to be reshuffled for every pick.
        chosen = random.sample(range(n_samples), k)
        centroids[...] = X[chosen]
        return centroids
    def closest_centroid(sample, centroids):
        """Return the index of the centroid nearest to ``sample``.

        Distances are Euclidean. On ties the lowest index wins, and the result
        is 0 when there are no centroids or no centroid has a comparable
        distance.

        Args:
            sample: A single sample (1-D array-like of feature values).
            centroids: All centroids, one per row.

        Returns:
            The integer index of the nearest centroid.
        """
        centroids = np.asarray(centroids, dtype=float)
        if centroids.size == 0:
            return 0
        # One row per centroid, so a single vectorized subtraction yields every
        # sample-to-centroid offset at once.
        centroids = centroids.reshape(centroids.shape[0], -1)
        offsets = centroids - np.asarray(sample, dtype=float).reshape(1, -1)
        distances = np.sqrt(np.sum(offsets ** 2, axis=1))
        # A NaN distance (e.g. the centroid of an empty cluster) must never be
        # selected; argmin would otherwise treat NaN as the minimum.
        distances[np.isnan(distances)] = np.inf
        return int(np.argmin(distances))
    def create_clusters(centroids, k, X):
        """Group sample indices by their nearest centroid.

        Args:
            centroids: All centroids, one per row.
            k: Number of clusters; determines how many groups are returned.
            X: Iterable of samples (one sample per row).

        Returns:
            A list of ``k`` lists; entry ``i`` holds the row indices of the
            samples whose nearest centroid is centroid ``i``. A cluster that
            attracts no sample is an empty list.
        """
        clusters = [[] for _ in range(k)]
        for sample_i, sample in enumerate(X):
            # Assign each sample to the region of its nearest centroid.
            clusters[closest_centroid(sample, centroids)].append(sample_i)
        return clusters
    def calculate_centroids(clusters, k, X):
        """Recompute each centroid as the mean of the samples in its cluster.

        Args:
            clusters: Sequence of clusters, each a sequence of row indices
                into ``X``.
            k: Number of centroids (rows of the result).
            X: 2-D array-like of shape ``(n_samples, n_features)``.

        Returns:
            A float array of shape ``(k, n_features)``. The row of an empty
            cluster is filled with NaN.
        """
        X = np.asarray(X)
        n_features = np.shape(X)[1]
        centroids = np.zeros((k, n_features))
        for i, cluster in enumerate(clusters):
            # len() rather than truthiness: a cluster may be a NumPy array.
            if len(cluster) == 0:
                # The mean of zero rows is NaN; assign it directly so NumPy
                # does not emit a "Mean of empty slice" RuntimeWarning.
                centroids[i] = np.nan
            else:
                centroids[i] = np.mean(X[cluster], axis=0)
        return centroids
    def get_cluster_labels(clusters, X):
        """Convert per-cluster index lists into one label per sample.

        Args:
            clusters: Sequence of clusters, each a sequence of row indices
                into ``X``.
            X: The samples; only the number of rows is used.

        Returns:
            A 1-D float array with one entry per sample holding the index of
            the cluster that contains it. Samples listed in no cluster keep
            the value 0.
        """
        y_pred = np.zeros(np.shape(X)[0])
        for cluster_i, cluster in enumerate(clusters):
            # An explicit integer index array keeps empty clusters valid as
            # a (no-op) fancy index.
            y_pred[np.asarray(cluster, dtype=np.intp)] = cluster_i
        return y_pred
    def kmeans(X, k, max_iterations):
        """Cluster the rows of ``X`` into ``k`` groups with the k-means algorithm.

        Args:
            X: 2-D array of shape ``(n_samples, n_features)``.
            k: Number of clusters.
            max_iterations: Upper bound on assignment/update rounds.

        Returns:
            A 1-D array with the cluster label of every sample.
        """
        # 1. Initialise the centroids.
        centroids = centroids_init(k, X)
        clusters = None
        for _ in range(max_iterations):
            # 2. Assign every sample to the cluster of its nearest centroid.
            clusters = create_clusters(centroids, k, X)
            prev_centroids = centroids
            # 3. Recompute the centroids from the current assignment.
            centroids = calculate_centroids(clusters, k, X)
            # 4. Converged once no centroid moved. NaN rows (empty clusters)
            #    count as unchanged; NaN != NaN would otherwise keep the loop
            #    running until max_iterations.
            unchanged = (centroids == prev_centroids) | (
                np.isnan(centroids) & np.isnan(prev_centroids)
            )
            if unchanged.all():
                break
        if clusters is None:
            # max_iterations <= 0: the loop never ran, so label the samples
            # by the initial centroids instead of failing on an unbound name.
            clusters = create_clusters(centroids, k, X)
        # Return the final cluster label of each sample.
        return get_cluster_labels(clusters, X)
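    """
    1. Compute the Euclidean distance
    2. Initialise the centroids
    3. Find the cluster a single sample belongs to
    4. Assign every sample to its centroid
    5. Recompute the centroids
    6. Compare the previous and the new centroids
    7. Get the final cluster labels
    8. Iterate
    """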
    # Sample data: five 2-D points.
    X = np.array([[0, 2], [0, 0], [1, 0], [5, 0], [5, 2]])
    # Cluster into 2 groups, running at most 10 iterations.
    labels = kmeans(X, 2, 10)
    # Print the cluster label assigned to each sample.
    print(labels)