%% I. Clear environment
clear all
clc
warning off

%% II. Load data
% data.mat holds the case table: column 1 = ID (unused here),
% column 2 = class label (1 = benign, 2 = malignant -- inferred from the
% analysis section below; confirm against the dataset docs),
% columns 3..end = features.
load data.mat

%%
% 1. Randomly split into training / test sets.
% Derive the case count from the data instead of hard-coding 569 so the
% script still works if the dataset changes size.
num_cases = size(data, 1);
num_train = 500;                      % first 500 shuffled rows -> training set
a = randperm(num_cases);
Train = data(a(1:num_train), :);
Test  = data(a(num_train+1:end), :);

%%
% 2. Training data: features and labels
P_train = Train(:, 3:end);
T_train = Train(:, 2);

%%
% 3. Test data: features and labels
P_test = Test(:, 3:end);
T_test = Test(:, 2);
    %% III. Create the decision-tree classifier
    % ClassificationTree.fit is the legacy entry point (fitctree is the
    % modern equivalent from R2014a on); kept for compatibility with the
    % rest of this script.
    ctree = ClassificationTree.fit(P_train, T_train);

    %%
    % 1. Inspect the fitted tree: text view, then graphical view
    view(ctree);
    view(ctree, 'mode', 'graph');

    %% IV. Simulation test: predict labels for the held-out test set
    T_sim = predict(ctree, P_test);
    %% V. Results analysis
    % Class convention in column 2 / labels: 1 = benign (B), 2 = malignant (M).
    count_B = length(find(T_train == 1));    % benign cases in training set
    count_M = length(find(T_train == 2));    % malignant cases in training set
    num_train_cases = length(T_train);       % avoid hard-coding 500
    num_test_cases  = length(T_test);        % avoid hard-coding 69
    rate_B = count_B / num_train_cases;      % benign share of training set
    rate_M = count_M / num_train_cases;      % malignant share of training set
    total_B = length(find(data(:,2) == 1));  % benign cases in whole dataset
    total_M = length(find(data(:,2) == 2));  % malignant cases in whole dataset
    number_B = length(find(T_test == 1));    % benign cases in test set
    number_M = length(find(T_test == 2));    % malignant cases in test set
    % Correctly classified test cases, per class
    number_B_sim = length(find(T_sim == 1 & T_test == 1));
    number_M_sim = length(find(T_sim == 2 & T_test == 2));
    disp(['病例总数:' num2str(size(data, 1))...
          ' 良性:' num2str(total_B)...
          ' 恶性:' num2str(total_M)]);
    disp(['训练集病例总数:' num2str(num_train_cases)...
          ' 良性:' num2str(count_B)...
          ' 恶性:' num2str(count_M)]);
    disp(['测试集病例总数:' num2str(num_test_cases)...
          ' 良性:' num2str(number_B)...
          ' 恶性:' num2str(number_M)]);
    disp(['良性乳腺肿瘤确诊:' num2str(number_B_sim)...
          ' 误诊:' num2str(number_B - number_B_sim)...
          ' 确诊率p1=' num2str(number_B_sim/number_B*100) '%']);
    disp(['恶性乳腺肿瘤确诊:' num2str(number_M_sim)...
          ' 误诊:' num2str(number_M - number_M_sim)...
          ' 确诊率p2=' num2str(number_M_sim/number_M*100) '%']);
    %% VI. Effect of minimum leaf size on tree performance
    % Sweep the minimum-leaf-size parameter over 10 log-spaced values in
    % [10, 100] and record the cross-validation error for each setting.
    leafs = logspace(1, 2, 10);
    N = numel(leafs);
    err = zeros(N, 1);
    for n = 1:N
        % 'crossval','on' builds a cross-validated (partitioned) tree, so
        % kfoldLoss can report its out-of-fold classification error.
        t = ClassificationTree.fit(P_train, T_train, ...
            'crossval', 'on', 'minleaf', leafs(n));
        err(n) = kfoldLoss(t);
    end
    figure;   % fresh window so the sweep plot does not clobber prior views
    plot(leafs, err);
    xlabel('叶子节点含有的最小样本数');
    ylabel('交叉验证误差');
    title('叶子节点含有的最小样本数对决策树性能的影响')
    %% VII. Build the optimized tree with minleaf = 13
    % 13 is the minimum-leaf-size picked from the sweep above (near the
    % low point of the cross-validation error curve -- confirm on your run,
    % since the random split changes the curve).
    OptimalTree = ClassificationTree.fit(P_train, T_train, 'minleaf', 13);
    view(OptimalTree, 'mode', 'graph')

    %%
    % 1. Resubstitution error and cross-validation error of the optimized
    %    tree (no trailing semicolons: values are printed on purpose)
    resubOpt = resubLoss(OptimalTree)
    lossOpt = kfoldLoss(crossval(OptimalTree))

    %%
    % 2. Same two errors for the default (unoptimized) tree, for comparison
    resubDefault = resubLoss(ctree)
    lossDefault = kfoldLoss(crossval(ctree))
    %% VIII. Pruning
    % cvLoss with 'treesize','min' returns (4th output) the pruning level
    % that minimizes cross-validation loss; prune the default tree to it.
    [~, ~, ~, bestlevel] = cvLoss(ctree, 'subtrees', 'all', 'treesize', 'min')
    cptree = prune(ctree, 'Level', bestlevel);
    view(cptree, 'mode', 'graph')

    %%
    % 1. Resubstitution error and cross-validation error of the pruned
    %    tree (printed intentionally -- no trailing semicolons)
    resubPrune = resubLoss(cptree)
    lossPrune = kfoldLoss(crossval(cptree))

    % Data file: data.zip (companion download containing data.mat)
    % Source: original article link (原文链接)