"""Minimal NumPy implementation of the k-means clustering algorithm.

Pipeline:
1. Compute the Euclidean distance between two points.
2. Initialise the centroids.
3. Find the cluster a single sample belongs to.
4. Assign every sample to its nearest centroid.
5. Recompute the centroids.
6. Compare the centroids before and after the update.
7. Produce the final label of every sample.
8. Iterate until convergence.
"""
import numpy as np


def euclidean_distance(point1, point2):
    """Return the Euclidean distance between two points.

    Args:
        point1: 1-D array-like of coordinates.
        point2: 1-D array-like of coordinates, same length as ``point1``.

    Returns:
        The L2 distance between the two points as a NumPy float.
    """
    delta = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
    return np.sqrt(np.sum(delta ** 2))


def centroids_init(k, X):
    """Pick ``k`` distinct samples of ``X`` at random as initial centroids.

    Args:
        k: Number of clusters.
        X: 2-D array of shape (n_samples, n_features).

    Returns:
        Float array of shape (k, n_features) holding the chosen samples.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            samples (distinct centroids cannot be drawn in that case).
    """
    X = np.asarray(X)
    n_samples, n_features = X.shape
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must be between 1 and n_samples={n_samples}, got {k}"
        )
    # Sampling without replacement guarantees that no sample is selected
    # twice, so the initial centroids come from k different rows of X.
    chosen = np.random.choice(n_samples, size=k, replace=False)
    centroids = np.zeros((k, n_features))
    centroids[:] = X[chosen]
    return centroids


def closest_centroid(sample, centroids):
    """Return the index of the centroid nearest to ``sample``.

    Args:
        sample: A single sample (1-D array-like).
        centroids: Iterable of all centroids.

    Returns:
        Index of the closest centroid; ties go to the lowest index.
    """
    cls = 0
    closest_dist = float('inf')
    for i, centroid in enumerate(centroids):
        # Keep the centroid with the smallest Euclidean distance so far.
        distance = euclidean_distance(sample, centroid)
        if distance < closest_dist:
            cls = i
            closest_dist = distance
    return cls


def create_clusters(centroids, k, X):
    """Assign every sample of ``X`` to its nearest centroid.

    Args:
        centroids: Current centroids, one row per cluster.
        k: Number of clusters.
        X: 2-D array of samples.

    Returns:
        A list of ``k`` lists; entry ``j`` holds the row indices of the
        samples assigned to cluster ``j``.
    """
    clusters = [[] for _ in range(k)]
    for sample_i, sample in enumerate(X):
        clusters[closest_centroid(sample, centroids)].append(sample_i)
    return clusters


def calculate_centroids(clusters, k, X, prev_centroids=None):
    """Recompute each centroid as the mean of the samples in its cluster.

    Args:
        clusters: List of ``k`` lists of sample indices.
        k: Number of clusters.
        X: 2-D array of samples.
        prev_centroids: Optional array of the centroids used to build
            ``clusters``. When given, an empty cluster keeps its previous
            centroid instead of becoming NaN.

    Returns:
        Float array of shape (k, n_features) with the updated centroids.
        A cluster without samples yields its previous centroid when
        ``prev_centroids`` is supplied, otherwise a row of NaN.
    """
    X = np.asarray(X)
    n_features = np.shape(X)[1]
    centroids = np.zeros((k, n_features))
    for i, cluster in enumerate(clusters):
        if len(cluster) > 0:
            # The new centre is the mean of the samples currently assigned.
            centroids[i] = np.mean(X[cluster], axis=0)
        elif prev_centroids is not None:
            # The mean of an empty cluster is undefined; reuse the old
            # centre so the algorithm can still converge.
            centroids[i] = prev_centroids[i]
        else:
            centroids[i] = np.nan
    return centroids


def get_cluster_labels(clusters, X):
    """Convert a cluster assignment into one label per sample.

    Args:
        clusters: List of lists of sample indices, one list per cluster.
        X: 2-D array of samples (only its number of rows is used).

    Returns:
        1-D float array of length n_samples where entry ``i`` is the index
        of the cluster that contains sample ``i``.
    """
    y_pred = np.zeros(np.shape(X)[0])
    for cluster_i, cluster in enumerate(clusters):
        for sample_i in cluster:
            y_pred[sample_i] = cluster_i
    return y_pred


def kmeans(X, k, max_iterations):
    """Cluster ``X`` into ``k`` groups with the k-means algorithm.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        k: Number of clusters.
        max_iterations: Upper bound on the number of update rounds.

    Returns:
        1-D array with the cluster label of every sample.
    """
    X = np.asarray(X)
    # 1. Initialise the centroids.
    centroids = centroids_init(k, X)
    for _ in range(max_iterations):
        # 2. Assign each sample to the cluster of its nearest centroid.
        clusters = create_clusters(centroids, k, X)
        # Remember the current centroids for the convergence test.
        prev_centroids = centroids
        # 3. Recompute the centroids from the new assignment.
        centroids = calculate_centroids(clusters, k, X, prev_centroids)
        # 4. Converged once the centroids no longer move.
        if np.array_equal(centroids, prev_centroids):
            break
    # Label against the final centroids. At convergence this equals the last
    # assignment, and it also covers max_iterations == 0.
    return get_cluster_labels(create_clusters(centroids, k, X), X)


if __name__ == "__main__":
    # Test data.
    X = np.array([[0, 2], [0, 0], [1, 0], [5, 0], [5, 2]])
    # Cluster into 2 groups with at most 10 iterations.
    labels = kmeans(X, 2, 10)
    # Print the cluster label of every sample.
    print(labels)