import random

import numpy as np
# Euclidean distance between two points.
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        point1: First point; must support ``point1 - point2`` (e.g. a NumPy
            array).
        point2: Second point, compatible with ``point1`` for subtraction.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    delta = point1 - point2
    squared_total = np.sum(delta ** 2)
    return squared_total ** 0.5
# Centroid initialisation.
def centroids_init(k, X):
    """Pick k rows of X at random (without repetition) as initial centroids.

    Args:
        k: Number of centroids to draw.
        X: 2-D NumPy array of shape (n_samples, n_features).

    Returns:
        A float array of shape (k, n_features); row i is a copy of the row of
        X at the i-th randomly chosen sample index. No sample index is used
        twice.

    Raises:
        ValueError: If k is negative or larger than the number of samples.
    """
    n_samples, n_features = X.shape
    if not 0 <= k <= n_samples:
        raise ValueError(
            f"k must be between 0 and the number of samples ({n_samples}), got {k}"
        )
    # One draw without replacement guarantees unique sample indices, so there
    # is no need to reshuffle the whole index list for every centroid.
    chosen = random.sample(range(n_samples), k)
    # Filling a zeros array keeps the float dtype regardless of X's dtype.
    centroids = np.zeros((k, n_features))
    centroids[:] = X[chosen]
    return centroids
# Find which centroid a sample is closest to.
def closest_centroid(sample, centroids):
    """Return the index of the centroid nearest to ``sample``.

    Args:
        sample: A single sample (one point).
        centroids: Iterable of all centroids.

    Returns:
        The position in ``centroids`` of the centroid with the smallest
        Euclidean distance to ``sample``. On ties the earliest centroid wins
        because only a strictly smaller distance replaces the current best.
        Returns 0 when ``centroids`` is empty.
    """
    best_index, best_distance = 0, float('inf')
    for index, center in enumerate(centroids):
        current = euclidean_distance(sample, center)
        # Strict comparison: an equal distance never displaces an earlier hit.
        if current < best_distance:
            best_index, best_distance = index, current
    return best_index
# Work out which cluster every sample belongs to.
def create_clusters(centroids, k, X):
    """Group sample indices by their nearest centroid.

    Args:
        centroids: The current centroids.
        k: Number of clusters; determines how many groups are returned.
        X: Iterable of samples (e.g. the rows of a 2-D array).

    Returns:
        A list of k lists. The i-th list holds the indices (positions in X)
        of the samples whose nearest centroid, as reported by
        ``closest_centroid``, is centroid i. A cluster with no samples is an
        empty list.
    """
    clusters = [[] for _ in range(k)]
    for sample_index, sample in enumerate(X):
        # Each sample goes to the region of its nearest centroid.
        clusters[closest_centroid(sample, centroids)].append(sample_index)
    return clusters
# Recompute the centroids from the current clustering.
def calculate_centroids(clusters, k, X):
    """Return the mean point of every cluster.

    Args:
        clusters: Sequence of clusters, each a list of row indices into X.
        k: Number of rows in the returned centroid array.
        X: 2-D NumPy array of samples, one row per sample.

    Returns:
        A float array of shape (k, n_features) in which row i is the
        feature-wise mean of the rows of X listed in ``clusters[i]``.
        NOTE: an empty cluster produces a row of NaN (NumPy's mean of an
        empty selection).
    """
    feature_count = np.shape(X)[1]
    updated = np.zeros((k, feature_count))
    for row, members in enumerate(clusters):
        # The new centre is the average of the samples currently assigned.
        updated[row] = X[members].mean(axis=0)
    return updated
# Produce the final cluster label of every sample.
def get_cluster_labels(clusters, X):
    """Convert a list of clusters into a per-sample label array.

    Args:
        clusters: Sequence of clusters, each a list of sample indices.
        X: The samples; only its first dimension (the sample count) is used.

    Returns:
        A float array with one entry per sample, holding the position of the
        cluster that contains that sample. Samples not listed in any cluster
        keep the initial value 0.
    """
    sample_count = np.shape(X)[0]
    labels = np.zeros(sample_count)
    for label, members in enumerate(clusters):
        # Write the cluster number to all of its members in one assignment.
        labels[list(members)] = label
    return labels
# The k-means procedure, assembled from the steps defined in this file.
def kmeans(X, k, max_iterations):
    """Cluster the samples in X into k groups with k-means.

    Args:
        X: 2-D NumPy array of samples, one row per sample.
        k: Number of clusters.
        max_iterations: Upper bound on the number of assign/update rounds.

    Returns:
        The array produced by ``get_cluster_labels``: one cluster label per
        sample, taken from the most recent assignment step. When
        ``max_iterations`` is 0 or negative, the labels come from a single
        assignment against the randomly chosen initial centroids.
    """
    # 1. Choose the initial centroids.
    centroids = centroids_init(k, X)
    clusters = None
    for _ in range(max_iterations):
        # 2. Assign every sample to the cluster of its nearest centroid.
        clusters = create_clusters(centroids, k, X)
        # Remember the current centroids for the convergence check.
        prev_centroids = centroids
        # 3. Recompute the centroids from the new assignment.
        centroids = calculate_centroids(clusters, k, X)
        # 4. Converged once no centroid coordinate has changed.
        if not (centroids - prev_centroids).any():
            break
    if clusters is None:
        # The loop body never ran (max_iterations <= 0); without this the
        # return below would fail on an unbound ``clusters``.
        clusters = create_clusters(centroids, k, X)
    # Return the final cluster labels.
    return get_cluster_labels(clusters, X)
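"""
Overview of the k-means steps implemented above:
1. Compute the Euclidean distance between two points.
2. Initialise the centroids.
3. Find the closest centroid (cluster) for a single sample.
4. Assign every sample to its closest centroid.
5. Recompute the centroids from the assignments.
6. Compare the centroids before and after the update.
7. Produce the final cluster label of every sample.
8. Iterate.
"""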
# Test data: five two-dimensional points.
X = np.array([
    [0, 2],
    [0, 0],
    [1, 0],
    [5, 0],
    [5, 2],
])
# Ask for 2 clusters and allow at most 10 iterations.
labels = kmeans(X, k=2, max_iterations=10)
# Print the cluster label assigned to each sample.
print(labels)