import operator
from math import log

import pandas as pd

import treePlotter


# Compute the Shannon entropy of the given dataset (the class label is in the last column)
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


# Split on a discrete feature: keep every sample whose feature `axis` equals `value`,
# and drop that feature column from the returned samples
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


# Split on a continuous feature using bi-partition: samples are separated into
# "> value" and "<= value" groups. `direction` selects which side is returned:
# 0 keeps samples greater than value, 1 keeps samples less than or equal to value.
# The continuous feature column itself is kept (not deleted).
def splitContinuousDataSet(dataSet, axis, value, direction):
    retDataSet = []
    for featVec in dataSet:
        if direction == 0:
            if featVec[axis] > value:
                retDataSet.append(featVec)
        else:
            if featVec[axis] <= value:
                retDataSet.append(featVec)
    return retDataSet


# Choose the best feature to split on (maximum information gain)
def chooseBestFeatureToSplit(dataSet, labels):
    numFeatures = len(dataSet[0]) - 1      # number of features; the last column is the class
    baseEntropy = calcShannonEnt(dataSet)  # empirical entropy Ent(D)
    bestInfoGain = 0.0                     # best information gain found so far
    bestFeature = -1                       # index of the best feature
    bestSplitDict = {}                     # {continuous feature label: best split point}
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]  # all values of the i-th feature
        # Continuous feature: find its best split point and the information gain at that point
        if type(featList[0]).__name__ == 'float' or type(featList[0]).__name__ == 'int':
            # Bi-partition: sort the values and generate n-1 candidate split points,
            # each the midpoint of two adjacent values
            sortfeatList = sorted(featList)
            splitList = []
            for j in range(len(sortfeatList) - 1):
                splitList.append((sortfeatList[j] + sortfeatList[j + 1]) / 2.0)
            bestSplitEntropy = float('inf')  # smallest weighted entropy seen so far
            slen = len(splitList)            # number of candidate split points
            # Evaluate each candidate split point and remember the best one
            for j in range(slen):
                value = splitList[j]  # split point: "<= value" vs. "> value"
                subDataSet0 = splitContinuousDataSet(dataSet, i, value, 0)  # > value
                subDataSet1 = splitContinuousDataSet(dataSet, i, value, 1)  # <= value
                prob0 = len(subDataSet0) / float(len(dataSet))  # proportion of "> value" samples
                prob1 = len(subDataSet1) / float(len(dataSet))
                newEntropy = prob0 * calcShannonEnt(subDataSet0) + prob1 * calcShannonEnt(subDataSet1)
                # Smaller weighted entropy is better: Ent(D) is fixed and the gain is
                # Ent(D) - newEntropy (the subtrahend in Eq. (4.2), p. 75)
                if newEntropy < bestSplitEntropy:
                    bestSplitEntropy = newEntropy
                    bestSplit = j
            # Record the best split point of the current continuous feature
            bestSplitDict[labels[i]] = splitList[bestSplit]
            infoGain = baseEntropy - bestSplitEntropy  # gain at the best split point
        # Discrete feature
        else:
            uniqueVals = set(featList)  # distinct feature values
            newEntropy = 0.0
            # For each value, split the dataset and accumulate the weighted entropy
            for value in uniqueVals:
                subDataSet = splitDataSet(dataSet, i, value)  # samples whose i-th feature == value
                prob = len(subDataSet) / float(len(dataSet))  # |Dv| / |D|
                newEntropy += prob * calcShannonEnt(subDataSet)
            infoGain = baseEntropy - newEntropy  # information gain of the discrete feature
        # Keep the feature with the largest information gain (e.g. "texture")
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    # Return the column index of the best feature and, for continuous features,
    # the {feature label: best split point} mapping
    return bestFeature, bestSplitDict


# Decide the class of a leaf node by majority vote
def majorityCnt(classList):
    classCount = {}  # {class label: occurrence count}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # Sort by count, descending; the result is a list of (label, count) pairs
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# Recursively build the decision tree
def createTree(dataSet, labels, data_full, labels_full):
    classList = [example[-1] for example in dataSet]  # class labels (last column)
    # Case 1: all samples belong to the same class (e.g. all good melons), so stop splitting
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Case 2: all features have been used but the class is still mixed; return the majority class
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # Pick the best splitting feature; bestFeat is its column index
    bestFeat, bestSplitDict = chooseBestFeatureToSplit(dataSet, labels)
    # Case 3: no feature could be selected; return the majority class
    if bestFeat == -1:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]  # label of the chosen feature
    # 1. The best feature is discrete
    if type(dataSet[0][bestFeat]).__name__ == 'str':
        # Note: myTree must be initialised inside each branch, because for a continuous
        # feature the key changes, e.g. 'density' becomes 'density<=0.381'
        myTree = {bestFeatLabel: {}}  # the tree is represented as nested dicts
        featValues = [example[bestFeat] for example in dataSet]
        uniqueVals = set(featValues)  # distinct values of the feature in the current subset
        # As the dataset is split, a subset may no longer contain every value of a feature
        # (e.g. under texture == 'clear' the 5 remaining samples may include no 'pale' colour),
        # so the full dataset data_full is used to recover the complete value set.
        # Because the chosen feature column is deleted during splitting, its index differs
        # between dataSet and data_full; look the label up in labels_full instead.
        bestFeatIndexInFull = labels_full.index(bestFeatLabel)  # first position of the label
        featValuesFull = [example[bestFeatIndexInFull] for example in data_full]
        uniqueValsFull = set(featValuesFull)  # all values of this feature in the full dataset
        del labels[bestFeat]  # a discrete feature is not reused once it has been chosen
        for value in uniqueValsFull:  # e.g. texture: clear, slightly blurry, blurry
            if value in uniqueVals:
                subLabels = labels[:]  # remaining labels after removing the chosen feature
                valueDataSet = splitDataSet(dataSet, bestFeat, value)  # samples with this value
                myTree[bestFeatLabel][value] = createTree(valueDataSet, subLabels, data_full, labels_full)
            else:
                # The value does not occur in the subset; fall back to the majority class
                myTree[bestFeatLabel][value] = majorityCnt(classList)
    # 2. The best feature is continuous; it is not deleted and may be reused
    else:
        bestSplitValue = bestSplitDict[bestFeatLabel]  # best split point of the feature
        bestFeatLabel = labels[bestFeat] + '<=' + str(bestSplitValue)  # e.g. 'density<=0.381'
        myTree = {bestFeatLabel: {}}
        subDataSet0 = splitContinuousDataSet(dataSet, bestFeat, bestSplitValue, 0)  # > value
        subDataSet1 = splitContinuousDataSet(dataSet, bestFeat, bestSplitValue, 1)  # <= value
        myTree[bestFeatLabel]['no'] = createTree(subDataSet0, labels, data_full, labels_full)
        myTree[bestFeatLabel]['yes'] = createTree(subDataSet1, labels, data_full, labels_full)
    return myTree


# Classify a single sample by walking down the decision tree
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    classLabel = None
    # Continuous split nodes carry keys such as 'density<=0.381' with 'yes'/'no' branches
    if '<=' in firstStr:
        featLabel, splitValue = firstStr.split('<=')
        featIndex = featLabels.index(featLabel)
        key = 'yes' if float(testVec[featIndex]) <= float(splitValue) else 'no'
        if type(secondDict[key]).__name__ == 'dict':
            classLabel = classify(secondDict[key], featLabels, testVec)
        else:
            classLabel = secondDict[key]
        return classLabel
    # Discrete split nodes: follow the branch whose key equals the sample's feature value
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel


# Classify a whole dataset with the decision tree
def classifyAll(inputTree, featLabels, testDataSet):
    classLabelAll = []
    for testVec in testDataSet:
        classLabelAll.append(classify(inputTree, featLabels, testVec))
    return classLabelAll


if __name__ == '__main__':
    file = pd.read_csv('data.csv')
    data = file.values[:, 3:].tolist()
    # Convert numeric-looking feature values to float so they are treated as continuous;
    # discrete (string) features are left unchanged
    for row in data:
        for k in range(len(row) - 1):
            try:
                row[k] = float(row[k])
            except (TypeError, ValueError):
                pass
    label = file.columns.values[3:].tolist()
    label_full = label[:]            # untouched copy: createTree deletes used discrete labels in place
    train_data = data[1:200]
    train_data_full = train_data[:]  # full training set, used to recover feature values missing from a subset
    test_data = data[200:245]
    desicionTree = createTree(train_data, label, train_data_full, label_full)
    print('desicionTree:\n', desicionTree)
    treePlotter.createPlot(desicionTree)
    print('classifyResult:\n', classifyAll(desicionTree, label_full, test_data))
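

# --- Illustrative sketch (not part of the original workflow) ---
# A minimal, self-contained example of how createTree/classify are meant to be used,
# assuming a tiny hand-made dataset with one discrete feature ('color'), one continuous
# feature ('density') and the class label in the last column. The dataset, feature names
# and values below are hypothetical and exist only for illustration; the real script
# reads 'data.csv' and plots with treePlotter instead.
def _toy_example():
    toy_data = [
        ['dark', 0.70, 'good'],
        ['dark', 0.66, 'good'],
        ['green', 0.50, 'good'],
        ['pale', 0.34, 'bad'],
        ['pale', 0.24, 'bad'],
        ['green', 0.26, 'bad'],
    ]
    toy_labels = ['color', 'density']
    # Pass copies, since createTree may delete labels of used discrete features in place
    tree = createTree([row[:] for row in toy_data], toy_labels[:], toy_data, toy_labels)
    print('toy tree:', tree)                                   # e.g. {'density<=0.42': {'no': 'good', 'yes': 'bad'}}
    print('prediction:', classify(tree, toy_labels, ['green', 0.61]))  # expected: 'good'

# Uncomment to try the sketch above:
# _toy_example()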