import pandas as pd
from math import log
import treePlotter
import operator
def calcShannonEnt(dataSet):
    """Return the Shannon entropy of the class labels (last column of each sample)."""
    total = len(dataSet)
    # tally occurrences of each class label
    freq = {}
    for sample in dataSet:
        label = sample[-1]
        freq[label] = freq.get(label, 0) + 1
    # H = -sum(p * log2(p)) over the label distribution
    entropy = 0.0
    for count in freq.values():
        p = count / total
        entropy -= p * log(p, 2)
    return entropy
def splitDataSet(dataSet, axis, value):
    """Discrete-feature split: return the samples whose feature at `axis` equals
    `value`, with that feature column removed from each returned sample."""
    return [vec[:axis] + vec[axis + 1:] for vec in dataSet if vec[axis] == value]
def splitContinuousDataSet(dataSet, axis, value, direction):
    """Continuous-feature binary split at `value`.

    direction == 0 -> keep samples with feature > value;
    otherwise      -> keep samples with feature <= value.
    The feature column is NOT removed (continuous features may be reused).
    """
    if direction == 0:
        return [vec for vec in dataSet if vec[axis] > value]
    return [vec for vec in dataSet if vec[axis] <= value]
# Choose the best feature to split on, by maximum information gain.
def chooseBestFeatureToSplit(dataSet, labels):
    """Pick the feature with the highest information gain.

    Continuous features are handled by binary splits at midpoints between
    adjacent sorted values; discrete features by multiway splits.

    Returns:
        (bestFeature, bestSplitDict) where bestFeature is the column index of
        the best feature (-1 if no feature improves on the base entropy) and
        bestSplitDict maps {continuous feature label: best split value} for
        every continuous feature examined.
    """
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    bestSplitDict = {}
    total = float(len(dataSet))  # hoisted: invariant across all features
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        if isinstance(featList[0], (int, float)):
            # Continuous feature: candidate thresholds are midpoints of
            # adjacent sorted values.
            sortedVals = sorted(featList)
            splitList = [(sortedVals[j] + sortedVals[j + 1]) / 2.0
                         for j in range(len(sortedVals) - 1)]
            if not splitList:
                # Fewer than 2 samples: no valid threshold exists. The original
                # code hit a NameError (unbound bestSplit) here; skip instead.
                continue
            bestSplitEntropy = float('inf')
            bestSplitValue = splitList[0]
            for value in splitList:
                subDataSet0 = splitContinuousDataSet(dataSet, i, value, 0)  # > value
                subDataSet1 = splitContinuousDataSet(dataSet, i, value, 1)  # <= value
                newEntropy = (len(subDataSet0) / total) * calcShannonEnt(subDataSet0) \
                    + (len(subDataSet1) / total) * calcShannonEnt(subDataSet1)
                if newEntropy < bestSplitEntropy:
                    bestSplitEntropy = newEntropy
                    bestSplitValue = value
            # Record this feature's best threshold (keyed by its label).
            bestSplitDict[labels[i]] = bestSplitValue
            infoGain = baseEntropy - bestSplitEntropy
        else:
            # Discrete feature: weighted entropy over each distinct value.
            newEntropy = 0.0
            for value in set(featList):
                subDataSet = splitDataSet(dataSet, i, value)
                newEntropy += (len(subDataSet) / total) * calcShannonEnt(subDataSet)
            infoGain = baseEntropy - newEntropy
        # Keep the feature with the largest information gain.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature, bestSplitDict
def majorityCnt(classList):
    """Majority vote: return the most frequent class label in `classList`.

    Ties are broken in favor of the label encountered first, matching the
    stable-sort behavior of the original implementation.
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max over insertion-ordered keys -> first-seen label wins ties
    return max(tally, key=tally.get)
# Main routine: recursively grow the decision tree.
def createTree(dataSet, labels, data_full, labels_full):
    """Recursively build a decision tree as nested dicts.

    Discrete nodes: {featureLabel: {value: subtree_or_leaf, ...}}.
    Continuous nodes: {"featureLabel<=threshold": {'yes': ..., 'no': ...}}.

    `data_full`/`labels_full` carry the complete training set so that discrete
    values absent from the current subset still get a (majority-vote) branch.

    Fix vs. original: the caller's `labels` list is never mutated. The original
    `del labels[bestFeat]` leaked into the caller (and, via aliasing in main,
    into labels_full), and the continuous branch passed the SAME list into both
    recursive calls, so one subtree's deletions corrupted the other's labels.
    """
    classList = [example[-1] for example in dataSet]
    # 1) All samples share one class: return it as a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # 2) No features left: return the majority class.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    labels = labels[:]  # local copy: never mutate the caller's list
    bestFeat, bestSplitDict = chooseBestFeatureToSplit(dataSet, labels)
    # 3) No feature yields any information gain: return the majority class.
    if bestFeat == -1:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    if isinstance(dataSet[0][bestFeat], str):
        # Discrete feature: one branch per value seen in the FULL data set.
        myTree = {bestFeatLabel: {}}
        uniqueVals = set(example[bestFeat] for example in dataSet)
        bestFeatIndexInFull = labels_full.index(bestFeatLabel)
        uniqueValsFull = set(example[bestFeatIndexInFull] for example in data_full)
        del labels[bestFeat]  # discrete features are consumed once used
        for value in uniqueValsFull:
            if value in uniqueVals:
                myTree[bestFeatLabel][value] = createTree(
                    splitDataSet(dataSet, bestFeat, value),
                    labels[:], data_full, labels_full)
            else:
                # Value never occurs in this subset: majority-vote leaf.
                myTree[bestFeatLabel][value] = majorityCnt(classList)
    else:
        # Continuous feature: binary split at the precomputed threshold.
        # The feature column is NOT removed, so it may be reused deeper down.
        bestSplitValue = bestSplitDict[bestFeatLabel]
        bestFeatLabel = labels[bestFeat] + '<=' + str(bestSplitValue)
        myTree = {bestFeatLabel: {}}
        subDataSet0 = splitContinuousDataSet(dataSet, bestFeat, bestSplitValue, 0)  # > value
        subDataSet1 = splitContinuousDataSet(dataSet, bestFeat, bestSplitValue, 1)  # <= value
        # Guard empty partitions (all values equal) with majority-vote leaves;
        # pass label-list copies so the two subtrees cannot interfere.
        if subDataSet0:
            myTree[bestFeatLabel]['no'] = createTree(subDataSet0, labels[:],
                                                     data_full, labels_full)
        else:
            myTree[bestFeatLabel]['no'] = majorityCnt(classList)
        if subDataSet1:
            myTree[bestFeatLabel]['yes'] = createTree(subDataSet1, labels[:],
                                                      data_full, labels_full)
        else:
            myTree[bestFeatLabel]['yes'] = majorityCnt(classList)
    return myTree
# Classify a single sample by walking the decision tree.
def classify(inputTree, featLabels, testVec):
    """Walk `inputTree` for one sample `testVec`; return the predicted label.

    Fix vs. original: handles continuous nodes, whose keys have the form
    "featureLabel<=threshold" with 'yes'/'no' children (as built by
    createTree). The original only handled discrete nodes and raised
    ValueError on featLabels.index for continuous keys. Also returns None
    instead of raising UnboundLocalError when no discrete branch matches.
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    if '<=' in firstStr:
        # Continuous node: compare the sample's value against the threshold.
        featLabel, splitValue = firstStr.rsplit('<=', 1)
        featIndex = featLabels.index(featLabel)
        branch = 'yes' if float(testVec[featIndex]) <= float(splitValue) else 'no'
        subTree = secondDict[branch]
        if isinstance(subTree, dict):
            return classify(subTree, featLabels, testVec)
        return subTree
    # Discrete node: follow the branch whose key equals the sample's value.
    featIndex = featLabels.index(firstStr)
    classLabel = None  # returned if the sample's value matches no branch
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel
def classifyAll(inputTree, featLabels, testDataSet):
    """Classify every sample in `testDataSet`; return the list of predictions."""
    return [classify(inputTree, featLabels, sample) for sample in testDataSet]
if __name__ == '__main__':
    file = pd.read_csv('data.csv')
    data = file.values[:, 3:].tolist()
    # Convert continuous feature values to float IN PLACE, leaving discrete
    # (string) values untouched. The original loop (`j = float(j)`) only
    # rebound a loop variable and never modified `data` at all.
    for row in data:
        for k in range(len(row) - 1):  # last column is the class label
            try:
                row[k] = float(row[k])
            except (TypeError, ValueError):
                pass  # discrete feature value: keep as-is
    label = file.columns.values[3:].tolist()
    # Must be an independent copy: the original aliased the same list, so any
    # mutation of `label` during tree building corrupted the full label list.
    label_full = label[:]
    train_data = data[1:200]
    train_data_full = train_data  # read-only in createTree; sharing is safe
    test_data = data[200:245]
    # Pass a copy of `label` so it stays intact for classifyAll below.
    desicionTree = createTree(train_data, label[:], train_data_full, label_full)
    print('desicionTree:\n', desicionTree)
    treePlotter.createPlot(desicionTree)
    print('classifyResult:\n', classifyAll(desicionTree, label, test_data))