image.png
    image.png
    image.png

    image.png

    # -*- coding: utf-8 -*-
    # Data normalization: three common column-wise rescalings of a numeric table.
    import pandas as pd
    import numpy as np

    datafile = '../data/normalization_data.xls'  # input workbook
    # header=None: the first row is read as data, not as column labels.
    data = pd.read_excel(datafile, header=None)

    # Each result is bound to a name so it can be inspected or reused; a bare
    # expression is evaluated and thrown away when this runs as a script.

    # Min-max normalization: subtract the column minimum and divide by the
    # column range (max - min).
    min_max_scaled = (data - data.min()) / (data.max() - data.min())

    # Zero-mean normalization: subtract the column mean and divide by the column
    # standard deviation as computed by DataFrame.std().
    zero_mean_scaled = (data - data.mean()) / data.std()

    # Decimal scaling: divide each column by 10**j, where j is
    # ceil(log10(max |x|)) of that column.
    decimal_scaled = data / 10 ** np.ceil(np.log10(data.abs().max()))
    # -*- coding: utf-8 -*-
    # Data discretization: equal-width and equal-frequency binning of one column.
    import pandas as pd

    datafile = '../data/discretization_data.xls'  # input workbook
    data = pd.read_excel(datafile)
    data = data[u'肝气郁结证型系数'].copy()  # keep only the column to discretize (a Series)
    k = 4  # number of bins produced by every method below

    # Equal-width binning: split the value range into k intervals of equal
    # length; the resulting categories are labelled 0 .. k-1.
    d1 = pd.cut(data, k, labels=range(k))

    # Equal-frequency binning: use the 0/k, 1/k, ..., k/k quantiles as bin edges.
    # Series.quantile returns exactly the requested quantiles, whereas slicing
    # describe() output by position breaks whenever describe() inserts the
    # median on its own (any k for which 0.5 is not a multiple of 1/k).
    w = data.quantile([i / k for i in range(k + 1)])
    # Bins are right-closed, so the minimum would fall outside the first bin;
    # include_lowest=True closes the first interval on the left. This also
    # works when the minimum is zero or negative, which scaling the lower
    # edge by (1 - 1e-10) does not.
    d2 = pd.cut(data, w, labels=range(k), include_lowest=True)
    from sklearn.cluster import KMeans

    # Cluster-based discretization: run 1-D k-means on the values and use the
    # midpoints between neighbouring cluster centres as bin edges.
    # NOTE: the n_jobs argument was removed from KMeans in scikit-learn 1.0 and
    # passing it raises TypeError, so it is not used here.
    kmodel = KMeans(n_clusters=k)
    # A Series has no reshape(); go through .values to build the 2-D
    # (n_samples, 1) array passed to fit().
    kmodel.fit(data.values.reshape((len(data), 1)))

    # The centres come back in arbitrary order, so sort them by VALUE.
    # sort_index() only orders the row labels (already 0..k-1) and leaves the
    # centres unsorted, which makes the "midpoints" below meaningless.
    c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)

    # Mean of each pair of neighbouring centres; the first row is NaN (no
    # predecessor) and is dropped, leaving k-1 interior edges.
    w = c.rolling(2).mean().iloc[1:]

    # Add the outer edges: k-1 midpoints plus 2 outer edges give k+1 edges,
    # i.e. exactly k bins, matching the k labels passed to pd.cut.
    # NOTE(review): the lower edge 0 assumes the column is non-negative; values
    # below 0 would fall outside every bin -- confirm against the data.
    w = [0] + list(w[0]) + [data.max()]

    # include_lowest=True keeps values equal to the lower edge in the first bin
    # (bins are right-closed, so they would otherwise become NaN).
    d3 = pd.cut(data, w, labels=range(k), include_lowest=True)
    def cluster_plot(d, k):
        """Plot the discretization result, one horizontal row of points per bin.

        Args:
            d: Bin label (0 .. k-1) for each value of the module-level ``data``
                Series, e.g. the result of ``pd.cut``.
            k: Number of bins.

        Returns:
            The ``matplotlib.pyplot`` module, so the caller can call ``.show()``
            on the returned object.

        Note:
            The values plotted on the x axis are read from the module-level
            ``data`` variable, not from an argument.
        """
        import matplotlib.pyplot as plt

        plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
        plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly with that font

        plt.figure(figsize=(8, 3))
        for j in range(0, k):
            members = data[d == j]
            # All points of bin j share the same height y = j.
            plt.plot(members, [j] * len(members), 'o')

        plt.ylim(-0.5, k - 0.5)  # half a unit of padding below row 0 and above row k-1
        return plt
    # Show one figure per discretization result, in order: d1, d2, d3.
    for bin_labels in (d1, d2, d3):
        cluster_plot(bin_labels, k).show()