基于 K-means clustering 算法,实现对消费者的聚类

    1. import numpy as np
    2. import pandas as pd
    3. from tqdm import tqdm
    # Load the order data; row 0 of the CSV is used as the header.
    csv_path = r"dataset/order.csv"
    data = pd.read_csv(csv_path, header=0)
    # Disabled in the original listing:
    # data.drop(["Id"], axis=1, inplace=True)
    # Keep only the last eight columns as the clustering features.
    t = data.iloc[:, -8:]
    t
    1. Food% Fresh% Drinks% Home% Beauty% Health% Baby% Pets%
    2. 0 9.46 87.06 3.48 0.00 0.00 0.00 0.0 0.0
    3. 1 15.87 75.80 6.22 2.12 0.00 0.00 0.0 0.0
    4. 2 16.88 56.75 3.37 16.48 6.53 0.00 0.0 0.0
    5. 3 28.81 35.99 11.78 4.62 2.87 15.92 0.0 0.0
    6. 4 24.13 60.38 7.78 7.72 0.00 0.00 0.0 0.0
    7. ... ... ... ... ... ... ... ... ...
    8. 29995 5.80 0.00 51.30 0.00 0.00 0.00 0.0 42.9
    9. 29996 0.00 0.00 0.00 0.00 100.00 0.00 0.0 0.0
    10. 29997 9.25 0.00 77.48 13.27 0.00 0.00 0.0 0.0
    11. 29998 0.00 0.00 100.00 0.00 0.00 0.00 0.0 0.0
    12. 29999 0.00 0.00 0.00 0.00 0.00 0.00 100.0 0.0

    30000 rows × 8 columns

    class KMeans:
        """K-means clustering using Euclidean distance and a fixed iteration count.

        Attributes set by ``fit``:
            cluster_centers_: float array, one row per cluster center.
            labels_: float array with one entry per training sample; entry ``i``
                is the index (0 .. k-1) of the center nearest to sample ``i``.
        """

        def __init__(self, k, times):
            """Store the clustering configuration.

            Parameters
            ----------
            k : int
                Number of clusters.
            times : int
                Number of assign/update iterations performed by ``fit``.
            """
            self.k = k
            self.times = times

        def _nearest_center(self, X):
            """Return, for each row of the 2-D float array X, the index of the nearest center.

            The result is a float array so that it has the same dtype as the
            zero-initialised label arrays used by the original implementation.
            """
            centers = np.asarray(self.cluster_centers_, dtype=float)
            # Broadcast (n_samples, 1, n_features) against (1, k, n_features) so all
            # sample-to-center distances come out of one array expression instead
            # of a Python loop over the samples.
            diff = X[:, np.newaxis, :] - centers[np.newaxis, :, :]
            dis = np.sqrt(np.sum(diff ** 2, axis=2))
            return dis.argmin(axis=1).astype(float)

        def fit(self, X):
            """Cluster the training data.

            Parameters
            ----------
            X : array-like, shaped [n_samples, n_features]

            Raises
            ------
            ValueError
                If X is not a non-empty 2-D array.
            """
            # Cast to float: with integer input the centers array would inherit the
            # integer dtype and every mean written into it would be truncated.
            X = np.asarray(X, dtype=float)
            if X.ndim != 2 or len(X) == 0:
                raise ValueError("X must be a non-empty 2-D array of shape [n_samples, n_features]")
            # Fixed seed so repeated runs pick the same initial centers.
            # NOTE: this reseeds NumPy's global random state.
            np.random.seed(0)
            # Pick k samples at random as the initial centers (fancy indexing copies,
            # so updating the centers below never modifies X).
            self.cluster_centers_ = X[np.random.randint(0, len(X), self.k)]
            self.labels_ = np.zeros(len(X))
            for _ in tqdm(range(self.times)):
                # Assignment step: label every sample with its nearest center.
                self.labels_ = self._nearest_center(X)
                # Update step: move each center to the mean of its members.
                for i in range(self.k):
                    members = X[self.labels_ == i]
                    # randint can draw the same sample twice, which leaves a cluster
                    # without members; the mean of nothing is NaN, so keep the
                    # previous center in that case.
                    if len(members):
                        self.cluster_centers_[i] = members.mean(axis=0)

        def predict(self, X):
            """Return the cluster index of each sample.

            Parameters
            ----------
            X : array-like, shaped [n_samples, n_features]

            Returns
            -------
            result : float array of length n_samples holding, for each sample,
                the index of the nearest cluster center.

            Raises
            ------
            ValueError
                If X is non-empty but not 2-D.
            """
            X = np.asarray(X, dtype=float)
            if len(X) == 0:
                return np.zeros(0)
            if X.ndim != 2:
                raise ValueError("X must be a 2-D array of shape [n_samples, n_features]")
            return self._nearest_center(X)
    # Cluster the selected feature columns into 3 groups using 50 iterations.
    kmeans = KMeans(k=3, times=50)
    kmeans.fit(t)
    # Display the resulting cluster centers.
    kmeans.cluster_centers_
    1. 100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:43<00:00, 1.15it/s]

    array([[46.33977936, 8.93380516, 23.19047005, 13.11741633, 4.8107557 ,
    1.17283735, 1.35704647, 0.95392773],
    [19.5308009 , 50.42856608, 14.70652695, 7.89437019, 3.69829234,
    0.91000428, 1.92515077, 0.82113238],
    [ 7.93541008, 4.56182052, 30.65583437, 18.57726789, 8.61597195,
    1.28482514, 26.81950293, 1.30158264]])

    # Show every sample that was assigned to cluster 0.
    in_cluster_0 = kmeans.labels_ == 0
    t[in_cluster_0]
    1. Food% Fresh% Drinks% Home% Beauty% Health% Baby% Pets%
    2. 15 48.23 20.37 15.38 8.29 7.73 0.0 0.0 0.0
    3. 23 24.10 22.29 38.69 14.92 0.00 0.0 0.0 0.0
    4. 24 36.51 31.93 27.18 4.38 0.00 0.0 0.0 0.0
    5. 40 22.76 0.00 0.00 77.24 0.00 0.0 0.0 0.0
    6. 43 65.64 12.36 21.99 0.00 0.00 0.0 0.0 0.0
    7. ... ... ... ... ... ... ... ... ...
    8. 29974 33.93 0.00 17.46 41.46 7.15 0.0 0.0 0.0
    9. 29977 45.10 0.00 26.68 28.22 0.00 0.0 0.0 0.0
    10. 29988 28.21 0.00 48.34 23.44 0.00 0.0 0.0 0.0
    11. 29989 61.32 0.00 23.34 15.34 0.00 0.0 0.0 0.0
    12. 29990 29.74 28.72 19.52 22.02 0.00 0.0 0.0 0.0

    9382 rows × 8 columns

    # Assign three hand-written samples (eight feature values each) to clusters.
    new_samples = [
        [30, 30, 40, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 30, 30, 40],
        [30, 30, 0, 0, 0, 0, 20, 20],
    ]
    kmeans.predict(new_samples)

    array([0., 2., 1.])

    # .loc slices by label and includes the end label, so this selects the
    # columns from "Food%" through "Fresh%".
    t2 = data.loc[:, "Food%":"Fresh%"]
    # Re-run the clustering on just those columns.
    kmeans = KMeans(k=3, times=50)
    kmeans.fit(t2)
    1. 100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:43<00:00, 1.15it/s]
    1. import matplotlib as mpl
    2. import matplotlib.pyplot as plt
    # Font setup for the plot below, whose title and labels are Chinese text
    # (presumably SimHei is chosen because it can render those glyphs).
    mpl.rcParams.update({"font.family": "SimHei", "axes.unicode_minus": False})
    plt.figure(figsize=(10, 10))
    # Draw one scatter series per cluster. Each series gets its own legend entry;
    # the original listing labelled all three series "类别1".
    for cluster in range(kmeans.k):
        members = t2[kmeans.labels_ == cluster]
        plt.scatter(members.iloc[:, 0], members.iloc[:, 1], label="类别{}".format(cluster + 1))
    # Mark the cluster centers with large "+" markers.
    plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker="+", s=300)
    plt.title("食物与肉类购买聚类分析")
    plt.xlabel("食物")
    plt.ylabel("肉类")
    plt.legend()
    plt.show()

    image.png