1. Constructing the Decision Tree
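The splitter used below is the classic ID3 criterion: at every node, choose the feature whose split yields the largest information gain. For a dataset $D$ with $K$ class labels, where $p_k$ is the proportion of samples in class $k$ and $D_v$ is the subset of $D$ whose feature $A$ takes value $v$:

$$H(D) = -\sum_{k=1}^{K} p_k \log_2 p_k, \qquad g(D, A) = H(D) - \sum_{v \in \mathrm{values}(A)} \frac{|D_v|}{|D|}\, H(D_v)$$

In the code, calcShannonEnt computes $H(D)$ and chooseBestFeatureToSplit picks the feature maximizing $g(D, A)$. As a quick sanity check on the five-sample fish dataset defined below (two 'yes', three 'no'): $H(D) = -(2/5)\log_2(2/5) - (3/5)\log_2(3/5) \approx 0.971$.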


from math import log
import copy
import operator
import pickle

import decisionTreePlot as dtPlot


def createDataSet():
    # Toy "is it a fish?" dataset:
    # [can survive without surfacing, has flippers, class label]
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    # Shannon entropy from the proportion of each class label
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def splitDataSet(dataSet, index, value):
    # keep the rows whose feature `index` equals `value`, with that column removed
    retDataSet = []
    for featVec in dataSet:
        if featVec[index] == value:
            reducedFeatVec = featVec[:index]
            reducedFeatVec.extend(featVec[index + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain, bestFeature = 0.0, -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        # conditional entropy after splitting on feature i
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        print('infoGain=', infoGain, 'bestFeature=', i, baseEntropy, newEntropy)
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    # majority vote among the remaining class labels
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    # stop if every remaining sample shares one class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # stop if no features are left; fall back to a majority vote
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so recursion does not mutate the caller's list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    print('+++', firstStr, 'xxx', secondDict, '---', key, '>>>', valueOfFeat)
    if isinstance(valueOfFeat, dict):
        # internal node: descend into the subtree
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:
        # leaf node: the value is the class label
        classLabel = valueOfFeat
    return classLabel


def storeTree(inputTree, filename):
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    with open(filename, 'rb') as fr:
        return pickle.load(fr)


def fishTest():
    myDat, labels = createDataSet()
    myTree = createTree(myDat, copy.deepcopy(labels))
    print(myTree)
    print(classify(myTree, labels, [1, 1]))
    # get the height of the tree
    print(get_tree_height(myTree))
    # visualize the tree
    dtPlot.createPlot(myTree)


def ContactLensesTest():
    with open('data/3.DecisionTree/lenses.txt') as fr:
        lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lensesTree = createTree(lenses, lensesLabels)
    print(lensesTree)
    # visualize the tree
    dtPlot.createPlot(lensesTree)


def get_tree_height(tree):
    # a leaf (plain label) has height 1
    if not isinstance(tree, dict):
        return 1
    # children of the root's single feature key
    child_trees = list(tree.values())[0].values()
    # the height is one more than the tallest subtree
    max_height = 0
    for child_tree in child_trees:
        child_tree_height = get_tree_height(child_tree)
        if child_tree_height > max_height:
            max_height = child_tree_height
    return max_height + 1


if __name__ == "__main__":
    fishTest()
    # ContactLensesTest()
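A minimal usage sketch, assuming the code above is saved as decisionTree.py with decisionTreePlot importable alongside it (the module name and the fishTree.pkl filename are illustrative, not from the source): it builds the tree on the toy dataset, persists it with storeTree, reloads it with grabTree, and classifies a fresh sample.

# usage sketch (hypothetical module name decisionTree, hypothetical file fishTree.pkl)
import copy
import decisionTree as dt

myDat, labels = dt.createDataSet()
tree = dt.createTree(myDat, copy.deepcopy(labels))
# for this dataset: {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
print(tree)
dt.storeTree(tree, 'fishTree.pkl')            # serialize the trained tree with pickle
restored = dt.grabTree('fishTree.pkl')        # reload it later without retraining
print(dt.classify(restored, labels, [1, 0]))  # -> 'no' (flippers absent)

Because the tree is just a nested dict, pickling it this way gives a cheap persistence path: train once, then grabTree on every later run instead of rebuilding.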