import pandas as pd
from math import log
import treePlotter
import operator
# Compute the Shannon entropy of a data set (the class label is the last column).
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
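# A quick sanity check for calcShannonEnt (illustrative numbers, not taken from data.csv):
# a data set with 8 positive and 9 negative samples has
#   Ent = -(8/17) * log2(8/17) - (9/17) * log2(9/17) ≈ 0.998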
# Partition on a discrete feature: keep every sample whose value in column
# `axis` equals `value`, and drop that column from the returned samples.
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
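# A minimal sketch of the behaviour on toy data (not from data.csv):
#   splitDataSet([[1, 'a', 'yes'], [2, 'b', 'no']], 1, 'a')  ->  [[1, 'yes']]
# The matching column value 'a' is removed from the returned sample.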
# Partition on a continuous feature by bisection: samples not greater than `value`
# and samples greater than `value` form the two subsets.
# `direction` selects which side is returned: 0 keeps samples > value, 1 keeps samples <= value.
def splitContinuousDataSet(dataSet, axis, value, direction):
    retDataSet = []
    for featVec in dataSet:
        if direction == 0:
            if featVec[axis] > value:
                retDataSet.append(featVec)  # the continuous feature and its values are kept, not removed
        else:
            if featVec[axis] <= value:
                retDataSet.append(featVec)
    return retDataSet
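# Toy illustration of the two directions (made-up values):
#   splitContinuousDataSet([[0.2, 'yes'], [0.7, 'no']], 0, 0.5, 0) -> [[0.7, 'no']]   # > 0.5
#   splitContinuousDataSet([[0.2, 'yes'], [0.7, 'no']], 0, 0.5, 1) -> [[0.2, 'yes']]  # <= 0.5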
# Choose the best feature (column) to split on, by information gain.
def chooseBestFeatureToSplit(dataSet, labels):
    numFeatures = len(dataSet[0]) - 1  # number of features; the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)  # empirical entropy Ent(D)
    bestInfoGain = 0.0  # best information gain so far
    bestFeature = -1  # index of the best feature
    bestSplitDict = {}  # {continuous feature label: best split point}
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]  # all values in column i (the i-th feature)
        # Continuous feature: find its best split point and the information
        # gain obtained when splitting at that point.
        if type(featList[0]).__name__ == 'float' or type(featList[0]).__name__ == 'int':  # check whether the attribute is continuous (equivalent to type(featList[0]) == float)
            # Generate n-1 candidate split points.
            sortfeatList = sorted(featList)  # bisection: sort the values in ascending order
            splitList = []
            for j in range(len(sortfeatList) - 1):  # each candidate is the midpoint of two adjacent values
                splitList.append((sortfeatList[j] + sortfeatList[j + 1]) / 2.0)
            bestSplitEntropy = 10000
            slen = len(splitList)  # number of candidate split points
            # Compute the weighted entropy for each candidate split point and remember the best one.
            for j in range(slen):
                value = splitList[j]  # split point: <= value vs. > value
                newEntropy = 0.0
                subDataSet0 = splitContinuousDataSet(dataSet, i, value, 0)  # subset with values > value
                subDataSet1 = splitContinuousDataSet(dataSet, i, value, 1)  # subset with values <= value
                prob0 = len(subDataSet0) / float(len(dataSet))  # proportion of samples > value
                prob1 = len(subDataSet1) / float(len(dataSet))
                newEntropy = prob0 * calcShannonEnt(subDataSet0) + prob1 * calcShannonEnt(subDataSet1)  # the subtrahend in Eq. (4.2), p. 75
                if newEntropy < bestSplitEntropy:  # smaller is better, since Ent(D) is fixed and gain = Ent(D) - newEntropy
                    bestSplitEntropy = newEntropy
                    bestSplit = j
            # Record the best split point of the current feature in the dict.
            bestSplitDict[labels[i]] = splitList[bestSplit]
            infoGain = baseEntropy - bestSplitEntropy  # information gain at the best split point of this continuous feature
        # Discrete feature.
        else:
            uniqueVals = set(featList)  # distinct feature values
            newEntropy = 0.0
            # For each feature value, split the data set and accumulate the weighted entropy of the subsets.
            for value in uniqueVals:
                subDataSet = splitDataSet(dataSet, i, value)  # samples whose i-th feature equals value
                prob = len(subDataSet) / float(len(dataSet))  # |Dv| / |D|
                newEntropy += prob * calcShannonEnt(subDataSet)  # weighted entropy, accumulated over all values
            infoGain = baseEntropy - newEntropy  # information gain of the discrete feature
        # Keep the feature with the largest information gain (e.g. "texture" in the textbook example).
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    # Return the column index of the best feature and, for continuous features,
    # the dict {feature label: best split point}.
    return bestFeature, bestSplitDict
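# The gain computed above follows the standard information-gain formula
# (the "P75(4.2)" note presumably refers to Eq. (4.2) in Zhou Zhihua's "Machine Learning"):
#   Gain(D, a) = Ent(D) - sum_v (|D^v| / |D|) * Ent(D^v)
# For a continuous feature the candidate thresholds are the midpoints of adjacent
# sorted values, and the threshold with the smallest weighted entropy (largest gain) is chosen.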
# Decide the class of a leaf node by majority vote.
def majorityCnt(classList):
    classCount = {}  # frequency of each class label: {class label: count}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)  # sort the dict items by count, descending; result is a list [(k, v), ...]
    return sortedClassCount[0][0]
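# e.g. majorityCnt(['yes', 'no', 'yes']) returns 'yes' (toy labels for illustration).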
# Main routine: recursively build the decision tree.
def createTree(dataSet, labels, data_full, labels_full):
    classList = [example[-1] for example in dataSet]  # the last column of the data set: the class labels
    if classList.count(classList[0]) == len(classList):  # 1) all samples belong to the same class, so there is nothing left to split
        return classList[0]
    if len(dataSet[0]) == 1:  # 2) all features have been used up but the class labels still differ: return the most common class
        return majorityCnt(classList)
    bestFeat, bestSplitDict = chooseBestFeatureToSplit(dataSet, labels)  # bestFeat: column index of the best feature
    if bestFeat == -1:  # 3) no best feature could be chosen: return the most common class
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]  # label of the chosen column
    # Case 1: the best feature is discrete.
    if type(dataSet[0][bestFeat]).__name__ == 'str':
        # Note: myTree cannot be initialised above the if, because for a continuous
        # feature the key differs, e.g. 'density' vs. 'density<=0.381'.
        myTree = {bestFeatLabel: {}}  # the tree is represented as nested dicts keyed by feature labels
        featValues = [example[bestFeat] for example in dataSet]  # values of the best feature in the current subset
        uniqueVals = set(featValues)  # distinct values present in the subset, which shrinks as the data is split
        # A subset may no longer contain every value of a feature: e.g. the 5 samples
        # under texture == 'clear' may contain only two of the three colour values,
        # so some branches would be missing.
        # Therefore data_full (the full data set) is used to obtain all feature values.
        # Because the chosen feature column is deleted when the data is split, the column
        # index bestFeat differs between dataSet and data_full; so look up bestFeatLabel
        # in labels_full first, then read that column of data_full to get all of its values.
        bestFeatIndexInFull = labels_full.index(bestFeatLabel)  # list.index(key) returns the first position of key
        featValuesFull = [example[bestFeatIndexInFull] for example in data_full]
        uniqueValsFull = set(featValuesFull)  # all values of this feature in the full data set
        del (labels[bestFeat])  # a discrete feature is not reused once it has been chosen
        for value in uniqueValsFull:  # e.g. best feature "texture" with values clear / slightly blurry / blurry
            if value in uniqueVals:
                subLabels = labels[:]  # remaining feature labels after removing the chosen one
                valueDataSet = splitDataSet(dataSet, bestFeat, value)  # samples whose best feature equals value
                myTree[bestFeatLabel][value] = createTree(valueDataSet, subLabels, data_full, labels_full)
            else:
                myTree[bestFeatLabel][value] = majorityCnt(classList)  # value missing from the subset: use the most common class
    # Case 2: the best feature is continuous; it is not removed from the feature set.
    else:
        bestSplitValue = bestSplitDict[bestFeatLabel]  # best split point of this feature
        bestFeatLabel = labels[bestFeat] + '<=' + str(bestSplitValue)  # e.g. 'density' becomes 'density<=0.381'
        myTree = {bestFeatLabel: {}}  # the tree is represented as nested dicts keyed by feature labels
        subDataSet0 = splitContinuousDataSet(dataSet, bestFeat, bestSplitValue, 0)  # > value
        subDataSet1 = splitContinuousDataSet(dataSet, bestFeat, bestSplitValue, 1)  # <= value
        myTree[bestFeatLabel]['no'] = createTree(subDataSet0, labels, data_full, labels_full)
        myTree[bestFeatLabel]['yes'] = createTree(subDataSet1, labels, data_full, labels_full)
    return myTree
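# The returned tree is a nested dict; its shape looks roughly like the sketch
# below (illustrative labels only, not actual output from data.csv):
#   {'texture': {'clear': {'density<=0.381': {'yes': 'bad', 'no': 'good'}},
#                'slightly blurry': 'good',
#                'blurry': 'bad'}}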
# Classify a single sample by walking the decision tree.
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    classLabel = None
    # Continuous-feature nodes are labelled like 'density<=0.381': recover the
    # feature name and threshold, then follow the 'yes' / 'no' branch.
    if '<=' in firstStr:
        featLabel, splitValue = firstStr.split('<=')
        featIndex = featLabels.index(featLabel)
        key = 'yes' if float(testVec[featIndex]) <= float(splitValue) else 'no'
        branch = secondDict[key]
        return classify(branch, featLabels, testVec) if type(branch).__name__ == 'dict' else branch
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel
# Classify every sample in a test set and collect the predicted labels.
def classifyAll(inputTree, featLabels, testDataSet):
    classLabelAll = []
    for testVec in testDataSet:
        classLabelAll.append(classify(inputTree, featLabels, testVec))
    return classLabelAll
if __name__ == '__main__':
    file = pd.read_csv('data.csv')
    data = file.values[:, 3:].tolist()
    # Convert feature values that look numeric to float in place (the class label
    # in the last column is left untouched), so that continuous features are
    # detected by type in chooseBestFeatureToSplit.
    for row in data:
        for k in range(len(row) - 1):
            try:
                row[k] = float(row[k])
            except (TypeError, ValueError):
                pass
    label = file.columns.values[3:].tolist()
    label_full = label[:]  # keep an unmodified copy: createTree deletes chosen discrete features from `label`
    train_data = data[1:200]
    train_data_full = train_data
    test_data = data[200:245]
    decisionTree = createTree(train_data, label, train_data_full, label_full)
    print('decisionTree:\n', decisionTree)
    treePlotter.createPlot(decisionTree)
    print('classifyResult:\n', classifyAll(decisionTree, label_full, test_data))