import pandas as pd
from math import log
import treePlotter
import operator
def calcShannonEnt(dataSet):
    """Return the Shannon entropy of the class labels (last column of each sample)."""
    total = len(dataSet)
    # tally occurrences of each class label
    freq = {}
    for sample in dataSet:
        label = sample[-1]
        freq[label] = freq.get(label, 0) + 1
    # H = -sum(p * log2(p)) over the label distribution
    entropy = 0.0
    for count in freq.values():
        p = count / total
        entropy -= p * log(p, 2)
    return entropy
def splitDataSet(dataSet, axis, value):
    """Discrete-feature split: return the samples whose feature at `axis` equals
    `value`, with that feature column removed from each returned sample."""
    return [vec[:axis] + vec[axis + 1:] for vec in dataSet if vec[axis] == value]
def splitContinuousDataSet(dataSet, axis, value, direction):
    """Continuous-feature binary split at `value`.

    direction == 0 -> keep samples with feature > value;
    otherwise      -> keep samples with feature <= value.
    The feature column is NOT removed (continuous features may be reused).
    """
    if direction == 0:
        return [vec for vec in dataSet if vec[axis] > value]
    return [vec for vec in dataSet if vec[axis] <= value]
# Choose the best feature to split on, by maximum information gain.
def chooseBestFeatureToSplit(dataSet, labels):
    """Pick the feature with the highest information gain.

    Continuous features are handled by binary splits at midpoints between
    adjacent sorted values; discrete features by multiway splits.

    Returns:
        (bestFeature, bestSplitDict) where bestFeature is the column index of
        the best feature (-1 if no feature improves on the base entropy) and
        bestSplitDict maps {continuous feature label: best split value} for
        every continuous feature examined.
    """
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    bestSplitDict = {}
    total = float(len(dataSet))  # hoisted: invariant across all features
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        if isinstance(featList[0], (int, float)):
            # Continuous feature: candidate thresholds are midpoints of
            # adjacent sorted values.
            sortedVals = sorted(featList)
            splitList = [(sortedVals[j] + sortedVals[j + 1]) / 2.0
                         for j in range(len(sortedVals) - 1)]
            if not splitList:
                # Fewer than 2 samples: no valid threshold exists. The original
                # code hit a NameError (unbound bestSplit) here; skip instead.
                continue
            bestSplitEntropy = float('inf')
            bestSplitValue = splitList[0]
            for value in splitList:
                subDataSet0 = splitContinuousDataSet(dataSet, i, value, 0)  # > value
                subDataSet1 = splitContinuousDataSet(dataSet, i, value, 1)  # <= value
                newEntropy = (len(subDataSet0) / total) * calcShannonEnt(subDataSet0) \
                    + (len(subDataSet1) / total) * calcShannonEnt(subDataSet1)
                if newEntropy < bestSplitEntropy:
                    bestSplitEntropy = newEntropy
                    bestSplitValue = value
            # Record this feature's best threshold (keyed by its label).
            bestSplitDict[labels[i]] = bestSplitValue
            infoGain = baseEntropy - bestSplitEntropy
        else:
            # Discrete feature: weighted entropy over each distinct value.
            newEntropy = 0.0
            for value in set(featList):
                subDataSet = splitDataSet(dataSet, i, value)
                newEntropy += (len(subDataSet) / total) * calcShannonEnt(subDataSet)
            infoGain = baseEntropy - newEntropy
        # Keep the feature with the largest information gain.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature, bestSplitDict
def majorityCnt(classList):
    """Majority vote: return the most frequent class label in `classList`.

    Ties are broken in favor of the label encountered first, matching the
    stable-sort behavior of the original implementation.
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max over insertion-ordered keys -> first-seen label wins ties
    return max(tally, key=tally.get)
# Main routine: recursively grow the decision tree.
def createTree(dataSet, labels, data_full, labels_full):
    """Recursively build a decision tree as nested dicts.

    Discrete nodes: {featureLabel: {value: subtree_or_leaf, ...}}.
    Continuous nodes: {"featureLabel<=threshold": {'yes': ..., 'no': ...}}.

    `data_full`/`labels_full` carry the complete training set so that discrete
    values absent from the current subset still get a (majority-vote) branch.

    Fix vs. original: the caller's `labels` list is never mutated. The original
    `del labels[bestFeat]` leaked into the caller (and, via aliasing in main,
    into labels_full), and the continuous branch passed the SAME list into both
    recursive calls, so one subtree's deletions corrupted the other's labels.
    """
    classList = [example[-1] for example in dataSet]
    # 1) All samples share one class: return it as a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # 2) No features left: return the majority class.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    labels = labels[:]  # local copy: never mutate the caller's list
    bestFeat, bestSplitDict = chooseBestFeatureToSplit(dataSet, labels)
    # 3) No feature yields any information gain: return the majority class.
    if bestFeat == -1:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    if isinstance(dataSet[0][bestFeat], str):
        # Discrete feature: one branch per value seen in the FULL data set.
        myTree = {bestFeatLabel: {}}
        uniqueVals = set(example[bestFeat] for example in dataSet)
        bestFeatIndexInFull = labels_full.index(bestFeatLabel)
        uniqueValsFull = set(example[bestFeatIndexInFull] for example in data_full)
        del labels[bestFeat]  # discrete features are consumed once used
        for value in uniqueValsFull:
            if value in uniqueVals:
                myTree[bestFeatLabel][value] = createTree(
                    splitDataSet(dataSet, bestFeat, value),
                    labels[:], data_full, labels_full)
            else:
                # Value never occurs in this subset: majority-vote leaf.
                myTree[bestFeatLabel][value] = majorityCnt(classList)
    else:
        # Continuous feature: binary split at the precomputed threshold.
        # The feature column is NOT removed, so it may be reused deeper down.
        bestSplitValue = bestSplitDict[bestFeatLabel]
        bestFeatLabel = labels[bestFeat] + '<=' + str(bestSplitValue)
        myTree = {bestFeatLabel: {}}
        subDataSet0 = splitContinuousDataSet(dataSet, bestFeat, bestSplitValue, 0)  # > value
        subDataSet1 = splitContinuousDataSet(dataSet, bestFeat, bestSplitValue, 1)  # <= value
        # Guard empty partitions (all values equal) with majority-vote leaves;
        # pass label-list copies so the two subtrees cannot interfere.
        if subDataSet0:
            myTree[bestFeatLabel]['no'] = createTree(subDataSet0, labels[:],
                                                     data_full, labels_full)
        else:
            myTree[bestFeatLabel]['no'] = majorityCnt(classList)
        if subDataSet1:
            myTree[bestFeatLabel]['yes'] = createTree(subDataSet1, labels[:],
                                                      data_full, labels_full)
        else:
            myTree[bestFeatLabel]['yes'] = majorityCnt(classList)
    return myTree
# Classify a single sample by walking the decision tree.
def classify(inputTree, featLabels, testVec):
    """Walk `inputTree` for one sample `testVec`; return the predicted label.

    Fix vs. original: handles continuous nodes, whose keys have the form
    "featureLabel<=threshold" with 'yes'/'no' children (as built by
    createTree). The original only handled discrete nodes and raised
    ValueError on featLabels.index for continuous keys. Also returns None
    instead of raising UnboundLocalError when no discrete branch matches.
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    if '<=' in firstStr:
        # Continuous node: compare the sample's value against the threshold.
        featLabel, splitValue = firstStr.rsplit('<=', 1)
        featIndex = featLabels.index(featLabel)
        branch = 'yes' if float(testVec[featIndex]) <= float(splitValue) else 'no'
        subTree = secondDict[branch]
        if isinstance(subTree, dict):
            return classify(subTree, featLabels, testVec)
        return subTree
    # Discrete node: follow the branch whose key equals the sample's value.
    featIndex = featLabels.index(firstStr)
    classLabel = None  # returned if the sample's value matches no branch
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel
def classifyAll(inputTree, featLabels, testDataSet):
    """Classify every sample in `testDataSet`; return the list of predictions."""
    return [classify(inputTree, featLabels, sample) for sample in testDataSet]
if __name__ == '__main__':
    file = pd.read_csv('data.csv')
    data = file.values[:, 3:].tolist()
    # Convert continuous feature values to float IN PLACE, leaving discrete
    # (string) values untouched. The original loop (`j = float(j)`) only
    # rebound a loop variable and never modified `data` at all.
    for row in data:
        for k in range(len(row) - 1):  # last column is the class label
            try:
                row[k] = float(row[k])
            except (TypeError, ValueError):
                pass  # discrete feature value: keep as-is
    label = file.columns.values[3:].tolist()
    # Must be an independent copy: the original aliased the same list, so any
    # mutation of `label` during tree building corrupted the full label list.
    label_full = label[:]
    train_data = data[1:200]
    train_data_full = train_data  # read-only in createTree; sharing is safe
    test_data = data[200:245]
    # Pass a copy of `label` so it stays intact for classifyAll below.
    desicionTree = createTree(train_data, label[:], train_data_full, label_full)
    print('desicionTree:\n', desicionTree)
    treePlotter.createPlot(desicionTree)
    print('classifyResult:\n', classifyAll(desicionTree, label, test_data))