ID3
    Reference (CSDN post with the code, visualizations, etc.): https://blog.csdn.net/asialee_bird/article/details/81118245?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522165577243116781432948069%2522%252C%2522scm%2522%253A%252220140713.130102334..%2522%257D&request_id=165577243116781432948069&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~top_positive~default-1-81118245-null-null.142^v18^rank_v32,157^v15^new_3&utm_term=ID3&spm=1018.2226.3001.4187
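    ID3 grows the tree greedily: at each node it splits on the feature with the largest information gain. In the standard formulation (the two quantities the code below computes):

        H(D) = -\sum_k p_k \log_2 p_k
        \mathrm{Gain}(D, A) = H(D) - \sum_v \frac{|D_v|}{|D|}\, H(D_v)

    where p_k is the fraction of rows in D with class k, and D_v is the subset of D where feature A takes value v.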

    from math import log
    import operator

    def calcShannonEnt(dataSet):  # compute the Shannon entropy of the dataset
        numEntries = len(dataSet)  # number of rows
        labelCounts = {}
        for featVec in dataSet:
            currentLabel = featVec[-1]  # the last field of each row is the class label
            if currentLabel not in labelCounts:
                labelCounts[currentLabel] = 0
            labelCounts[currentLabel] += 1  # count the classes and how many rows fall in each
        shannonEnt = 0
        for key in labelCounts:
            prob = float(labelCounts[key]) / numEntries  # probability of a single class
            shannonEnt -= prob * log(prob, 2)  # accumulate -p*log2(p) over all classes
        return shannonEnt
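
    # Quick sanity check (illustrative, not part of the original listing):
    # an even 50/50 class split has the maximum entropy of 1 bit, e.g.
    # calcShannonEnt([['长', '粗', '男'], ['长', '粗', '女']]) == 1.0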

    def createDataSet1():  # build a small example dataset
        dataSet = [['长', '粗', '男'],   # long hair, deep voice, male
                   ['短', '粗', '男'],   # short hair, deep voice, male
                   ['短', '粗', '男'],
                   ['长', '细', '女'],   # long hair, thin voice, female
                   ['短', '细', '女'],
                   ['短', '粗', '女'],
                   ['长', '粗', '女'],
                   ['长', '粗', '女']]
        labels = ['头发', '声音']  # the two features: hair length, voice
        return dataSet, labels

    def splitDataSet(dataSet, axis, value):  # rows where feature `axis` equals `value`, with that column removed
        retDataSet = []
        for featVec in dataSet:
            if featVec[axis] == value:
                reducedFeatVec = featVec[:axis]
                reducedFeatVec.extend(featVec[axis + 1:])  # drop the column that was split on
                retDataSet.append(reducedFeatVec)
        return retDataSet
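
    # For example (illustrative): splitting on feature 1 (voice) with value '细'
    # keeps only the two '细' rows and strips that column:
    # splitDataSet(dataSet, 1, '细') -> [['长', '女'], ['短', '女']]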

    def chooseBestFeatureToSplit(dataSet):  # pick the feature with the largest information gain
        numFeatures = len(dataSet[0]) - 1
        baseEntropy = calcShannonEnt(dataSet)  # entropy before splitting
        bestInfoGain = 0
        bestFeature = -1
        for i in range(numFeatures):
            featList = [example[i] for example in dataSet]
            uniqueVals = set(featList)
            newEntropy = 0
            for value in uniqueVals:
                subDataSet = splitDataSet(dataSet, i, value)
                prob = len(subDataSet) / float(len(dataSet))
                newEntropy += prob * calcShannonEnt(subDataSet)  # weighted entropy after splitting on feature i
            infoGain = baseEntropy - newEntropy  # information gain: drop in entropy from the split
            if infoGain > bestInfoGain:  # the feature that reduces entropy the most is the best split
                bestInfoGain = infoGain
                bestFeature = i
        return bestFeature
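
    # Worked numbers for the example data (computed by hand, as a check):
    # H(D) = -(3/8)log2(3/8) - (5/8)log2(5/8) ≈ 0.954
    # gain(头发/hair)  ≈ 0.954 - 0.906 ≈ 0.049
    # gain(声音/voice) ≈ 0.954 - 0.750 ≈ 0.204, so index 1 (声音) is chosen first.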

    def majorityCnt(classList):  # majority vote on a leaf, e.g. 2 male vs 1 female -> male
        classCount = {}
        for vote in classList:
            if vote not in classCount:
                classCount[vote] = 0
            classCount[vote] += 1
        sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
        return sortedClassCount[0][0]
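
    # Illustrative check, not in the original listing:
    # majorityCnt(['男', '男', '女']) -> '男'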

    def createTree(dataSet, labels):
        classList = [example[-1] for example in dataSet]  # class column: 男 or 女
        if classList.count(classList[0]) == len(classList):
            return classList[0]  # all rows share one class: return a leaf
        if len(dataSet[0]) == 1:
            return majorityCnt(classList)  # no features left: fall back to a majority vote
        bestFeat = chooseBestFeatureToSplit(dataSet)  # pick the best feature
        bestFeatLabel = labels[bestFeat]
        myTree = {bestFeatLabel: {}}  # the tree is stored as a nested dict
        del(labels[bestFeat])  # note: this consumes the caller's labels list
        featValues = [example[bestFeat] for example in dataSet]
        uniqueVals = set(featValues)
        for value in uniqueVals:
            subLabels = labels[:]  # copy so sibling branches don't interfere
            myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
        return myTree

    if __name__ == '__main__':
        dataSet, labels = createDataSet1()  # build the example data
        print(createTree(dataSet, labels))  # print the resulting decision tree
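
    On the example data this prints a nested dict like the following (branch order may vary between runs, since uniqueVals is a set):

        {'声音': {'细': '女', '粗': {'头发': {'长': '女', '短': '男'}}}}

    The listing stops at building the tree. Below is a minimal classification sketch for this nested-dict layout; classify is a hypothetical helper, not part of the original code, and since createTree consumes labels, a fresh copy of the feature names must be passed in:

    def classify(tree, featLabels, testVec):
        # walk the nested dict until a leaf (a bare class label) is reached;
        # raises KeyError for a feature value never seen during training
        node = tree
        while isinstance(node, dict):
            featLabel = next(iter(node))             # the feature tested at this node
            featIndex = featLabels.index(featLabel)  # that feature's column in a data row
            node = node[featLabel][testVec[featIndex]]
        return node

    # e.g. classify(tree, ['头发', '声音'], ['长', '细']) should return '女'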