def calc_ent(datasets):
    """Compute the Shannon entropy (base 2) of the label distribution.

    Args:
        datasets: sequence of rows; the last element of each row is the
            class label.

    Returns:
        float: entropy H(D) of the labels; 0.0 for an empty dataset.
    """
    data_length = len(datasets)
    if data_length == 0:
        # An empty dataset carries no information; also avoids a
        # ZeroDivisionError in the probability below.
        return 0.0
    # Count label frequencies in one pass instead of a manual dict loop.
    label_count = Counter(row[-1] for row in datasets)
    return -sum((cnt / data_length) * log(cnt / data_length, 2)
                for cnt in label_count.values())
  13. def cond_ent(datasets, axis=0):
  14. """条件熵"""
  15. data_length = len(datasets)
  16. feature_sets = {}
  17. for i in range(data_length):
  18. feature = datasets[i][axis]
  19. if feature not in feature_sets:
  20. feature_sets[feature] = []
  21. feature_sets[feature].append(datasets[i])
  22. cond_ent = sum(
  23. [(len(p) / data_length) * calc_ent(p) for p in feature_sets.values()])
  24. return cond_ent
  25. def info_gain_train(datasets):
  26. count = len(datasets[0]) - 1
  27. ent = calc_ent(datasets)
  28. # ent = entropy(datasets)
  29. best_feature = []
  30. for c in range(count):
  31. c_info_gain = info_gain(ent, cond_ent(datasets, axis=c))
  32. best_feature.append((c, c_info_gain))
  33. print('特征({}) - info_gain - {:.3f}'.format(labels[c], c_info_gain))
  34. # 比较大小
  35. best_ = max(best_feature, key=lambda x: x[-1])
  36. return '特征({})的信息增益最大,选择为根节点特征'.format(labels[best_[0]])
  1. print("-----------------------------------------")
  2. print(info_gain_train(np.array(datasets)))
  3. print("-----------------------------------------")
  4. print("-----------------------------------------")
  5. print(info_gain_train(np.array(datasets)))
  6. print("-----------------------------------------")

每次分类的信息增益

image.png
image.png

分类结果如下

image.png
image.png
image.png

决策树如下

实现代码 - 图6
实现代码 - 图7