Implementing Loan Risk Approval with the Decision Tree ID3 Algorithm

    import numpy as np
    import pandas as pd
    data = pd.read_csv('dataset/loans.csv').sample(40000)
    display(data.sample(10))
    display(data.shape)
    # grade: loan grade
    # sub_grade: loan sub-grade
    # short_emp: employed for one year or less
    # emp_length_num: years of employment
    # home_ownership: housing status (own, mortgage, rent)
    # dti: debt-to-income ratio
    # purpose: purpose of the loan
    # term: loan term
    # last_delinq_none: whether the applicant has no delinquency record
    # last_major_derog_none: whether the applicant has no record of payments 90+ days overdue
    # revol_util: revolving credit used as a share of the credit limit
    # total_rec_late_fee: total late fees received
    # safe_loans: whether the loan is safe (1 = safe, -1 = risky)
A 10-row sample of the raw data:

    grade sub_grade short_emp emp_length_num home_ownership dti purpose term last_delinq_none last_major_derog_none revol_util total_rec_late_fee safe_loans
    19847 C C3 0 11 MORTGAGE 20.18 debt_consolidation 36 months 1 1 80.2 0.0000 1
    24186 D D2 0 5 RENT 21.19 debt_consolidation 36 months 0 1 47.6 0.0000 1
    40013 B B5 0 11 MORTGAGE 30.90 debt_consolidation 36 months 0 1 90.1 0.0000 -1
    5405 B B1 1 1 MORTGAGE 22.17 debt_consolidation 36 months 1 1 40.1 0.0000 -1
    20203 F F4 0 3 RENT 21.94 debt_consolidation 60 months 0 1 34.5 0.0000 1
    12408 B B5 0 7 RENT 8.56 debt_consolidation 36 months 1 1 34.4 0.0000 -1
    31935 B B2 0 3 RENT 9.68 debt_consolidation 36 months 1 1 78.6 0.0000 1
    4621 D D1 0 3 MORTGAGE 3.78 other 36 months 1 1 70.4 14.9409 -1
    30060 C C3 0 6 MORTGAGE 14.60 debt_consolidation 60 months 0 1 39.0 0.0000 1
    8089 D D3 0 7 OWN 17.85 debt_consolidation 36 months 1 1 70.7 0.3500 -1

    (40000, 13)
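Note that .sample(40000) draws a different random subset on every run, so the exact rows shown above will vary between executions. For a reproducible run, a seed can be passed (the value 0 below is an arbitrary choice):

    data = pd.read_csv('dataset/loans.csv').sample(40000, random_state=0)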

    from sklearn.preprocessing import LabelEncoder
    from collections import defaultdict
    # Encode every column as integers. Non-numeric columns become integer
    # codes; a separate LabelEncoder is kept per column in d.
    d = defaultdict(LabelEncoder)
    data = data.apply(lambda x: d[x.name].fit_transform(x))
    # Use the first 800 rows for training and hold out the rest for testing.
    X_train = data.iloc[:800, :-1]
    y_train = data.iloc[:800, -1]
    test_X = data.iloc[800:, :-1]
    test_y = data.iloc[800:, -1]
    display(X_train)
    # Binarize the features.
    # The encoded values are too fine-grained to split on directly,
    # so each column is thresholded at its mean.
    for i in X_train.columns:
        mean = np.mean(X_train[i])
        for j in range(len(X_train[i])):
            X_train[i].values[j] = 1 if X_train[i].values[j] > mean else 0
    for i in test_X.columns:
        mean = np.mean(test_X[i])
        for j in range(len(test_X[i])):
            test_X[i].values[j] = 1 if test_X[i].values[j] > mean else 0
    display(X_train)
X_train after label encoding (the first display call, before binarization):

    grade sub_grade short_emp emp_length_num home_ownership dti purpose term last_delinq_none last_major_derog_none revol_util total_rec_late_fee
    4104 2 12 1 1 0 1764 2 0 0 1 320 0
    18915 0 3 0 2 3 1608 1 0 0 1 240 0
    44721 1 8 0 7 3 3240 1 0 1 1 561 0
    21933 0 2 0 3 0 2328 1 0 1 1 325 0
    20111 2 14 0 8 0 1048 2 0 0 1 280 0
    ... ... ... ... ... ... ... ... ... ... ... ... ...
    13839 3 16 0 11 0 2025 2 1 1 1 817 0
    15773 3 17 0 11 2 1483 2 0 0 1 958 0
    27211 2 14 0 6 3 1701 2 1 1 1 157 0
    17372 2 11 0 11 0 489 2 1 0 0 279 0
    5410 2 14 0 11 0 1785 1 1 0 0 755 0

    800 rows × 12 columns

X_train after mean-thresholded binarization (the second display call):

    grade sub_grade short_emp emp_length_num home_ownership dti purpose term last_delinq_none last_major_derog_none revol_util total_rec_late_fee
    4104 1 1 1 0 0 1 0 0 0 1 0 0
    18915 0 0 0 0 1 1 0 0 0 1 0 0
    44721 0 0 0 1 1 1 0 0 1 1 0 0
    21933 0 0 0 0 0 1 0 0 1 1 0 0
    20111 1 1 0 1 0 0 0 0 0 1 0 0
    ... ... ... ... ... ... ... ... ... ... ... ... ...
    13839 1 1 0 1 0 1 0 1 1 1 1 0
    15773 1 1 0 1 1 0 0 0 0 1 1 0
    27211 1 1 0 0 1 1 0 1 1 1 0 0
    17372 1 0 0 1 0 0 0 1 0 0 0 0
    5410 1 1 0 1 0 1 0 1 0 0 1 0

    800 rows × 12 columns
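Two notes on the preprocessing above, with a minimal sketch of each. First, defaultdict(LabelEncoder) lazily creates one encoder per column and keeps it in d under the column name, so the original categories stay recoverable. Second, the element-wise binarization loops can be replaced by one vectorized expression per column (shown commented out, since the loops above have already modified X_train in place):

    # Each column's fitted LabelEncoder is still stored in d, keyed by column name.
    display(d['home_ownership'].classes_)  # the original category strings

    # Vectorized equivalent of the binarization loops, producing the same 0/1 output:
    # for i in X_train.columns:
    #     X_train[i] = (X_train[i] > X_train[i].mean()).astype(int)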

    from collections import Counter
    from tqdm import tqdm
    class DecisionTreeID3:
        def __init__(self):
            pass

        def calc_entropy(self):
            '''Compute the entropy, the conditional entropies and the information gains.'''
            # Append the labels as the last column so create_tree can work on a single frame.
            self.X = pd.concat([self.X, self.y], axis=1)
            # ************ entropy ************
            # After label encoding, safe_loans == 1 means safe and 0 means risky.
            yes = np.asarray(self.X[self.X[self.X.columns[-1]] == 1])
            no = np.asarray(self.X[self.X[self.X.columns[-1]] == 0])
            P_yes = len(yes) / len(self.X)
            P_no = len(no) / len(self.X)
            self.HX = -P_yes * np.log2(P_yes) - P_no * np.log2(P_no)
            # display("entropy = " + str(self.HX))
            # ************ conditional entropy ************
            # self.Gda collects the information gain of every feature.
            self.Gda = []
            # Iterate over every feature column, excluding the label column.
            for i in self.X.columns[:-1]:
                # Conditional entropy of the labels given feature i.
                Hi = 0
                # Count how often each value of the feature occurs, which gives
                # the probability of each value.
                condProbCollections = Counter(self.X[i]).items()
                # A feature may take N distinct values; accumulate over all of them.
                for k in condProbCollections:
                    # All samples (all columns) where feature i takes the value k[0].
                    samples_of_current = self.X[self.X[i] == k[0]]
                    # The same samples, restricted to feature i.
                    samples_of_current_features = samples_of_current[i]
                    # Number of samples with this feature value.
                    total = len(samples_of_current_features)
                    # How many of them are labelled safe...
                    k_safe = len(samples_of_current[samples_of_current[samples_of_current.columns[-1]] == 1])
                    # ...and how many are not.
                    k_unsafe = total - k_safe
                    # Probabilities of safe / unsafe given this feature value.
                    P_k_safe = k_safe / total
                    P_k_unsafe = k_unsafe / total
                    # Guard against log2(0) before accumulating the conditional entropy.
                    log_P_k_safe = 0 if P_k_safe == 0 else np.log2(P_k_safe)
                    log_P_k_unsafe = 0 if P_k_unsafe == 0 else np.log2(P_k_unsafe)
                    Hi += -(total / len(self.X)) * (P_k_safe * log_P_k_safe + P_k_unsafe * log_P_k_unsafe)
                # Store the information gain of feature i.
                self.Gda.append({"value": self.HX - Hi, "feature": i})
            # ID3 splits on the feature with the largest information gain first,
            # so sort the gains in descending order.
            self.Gda.sort(key=lambda g: g["value"], reverse=True)
            # print("information gains:")
            # print(self.Gda)

        def create_tree(self, node=None):
            '''Build the decision tree, stored as a nested dict.
            Parameters
            -----
            node: DataFrame holding the samples that reach the current node
            Return
            -----
            tree: the finished tree as a nested dict
            '''
            # Recursion exit: no features left, so emit a leaf carrying the
            # majority label of the samples that reached this node.
            if len(self.Gda) == 0:
                return node.iloc[:, -1].mode()[0]
            # Take the remaining feature with the highest information gain.
            feature = self.Gda[0]['feature']
            # Remove it so deeper recursion levels do not reuse it. Note the
            # feature order is fixed globally by gain, not recomputed per node.
            del self.Gda[0]
            # Enumerate the values this feature takes in the training data.
            condProbCollections = Counter(self.X[feature]).items()
            # The tree dict must be keyed by the feature name.
            tree = {feature: {}}
            for [value, counts] in condProbCollections:
                tree[feature][value] = self.create_tree(self.X[self.X[feature] == value])
            # Keep the finished tree; the outermost call assigns last, so
            # self.tree ends up holding the complete tree.
            self.tree = tree
            return tree

        def fit(self, X, y):
            '''Train the tree.
            Parameters
            -----
            X: training data, shaped [n_samples, n_features]
            y: array-like, shaped [n_samples]
            '''
            self.X = X
            self.y = y
            self.calc_entropy()
            self.create_tree()

        def predict_item(self, x, node=False):
            '''Predict the label of a single sample by walking the tree.
            Parameters
            -----
            x: single-row DataFrame holding the sample
            node: the current subtree (a dict), or a leaf label (0 or 1)
            Return
            -----
            label: the predicted label
            '''
            # Reached a leaf: return its label.
            if node == 0 or node == 1:
                return node
            label = -1
            # The node's single key is the feature it splits on.
            key = next(iter(node))
            # Follow the 0 branch if the sample's feature value is 0, otherwise the 1 branch.
            if x[key].values[0] == 0:
                label = self.predict_item(x, node=node[key][0])
            else:
                label = self.predict_item(x, node=node[key][1])
            return label

        def predict(self, X):
            '''Predict the labels of a batch of samples.
            Parameters:
            X: array-like (list or ndarray), shaped [n_samples, n_features]
            Returns:
            array of predictions
            '''
            result = []
            for i in range(len(X)):
                result.append(self.predict_item(X.iloc[i:i+1], node=self.tree))
            return result
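Before using the class, it can help to sanity-check the formulas behind calc_entropy on a toy column. The sketch below is standalone and its numbers are made up purely for illustration: H(D) is the entropy of the label distribution, and the information gain of a feature A is G(D, A) = H(D) - H(D|A).

    import numpy as np
    from collections import Counter

    def entropy(labels):
        '''Shannon entropy: H = -sum(p * log2(p)) over the label distribution.'''
        counts = np.array(list(Counter(labels).values()))
        p = counts / counts.sum()
        return -(p * np.log2(p)).sum()

    def info_gain(feature, labels):
        '''Information gain: G(D, A) = H(D) - H(D|A).'''
        feature, labels = np.asarray(feature), np.asarray(labels)
        h_cond = 0.0
        for v in np.unique(feature):
            mask = feature == v
            # Weight each branch's entropy by the fraction of samples it holds.
            h_cond += mask.mean() * entropy(labels[mask])
        return entropy(labels) - h_cond

    # Toy data: 4 applicants, one binary feature, labels 1 = safe / 0 = risky.
    f = [0, 0, 1, 1]
    y = [0, 1, 1, 1]
    print(entropy(y))       # H(D)    = 0.811...
    print(info_gain(f, y))  # G(D, A) = 0.311...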
    dt = DecisionTreeID3()
    dt.fit(X_train, y_train)
    result = dt.predict(test_X)
    display(np.sum(result == test_y) / len(result))
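An aside on the next(iter(node)) call in predict_item: iterating a dict yields its keys, so next(iter(node)) returns the node's single key, i.e. the name of the feature it splits on. The cell below demonstrates how iter and next walk a sequence: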
    a = [{"a": 1}, {"b": 2}, {"c": 3}]
    c = iter(a)
    display(next(c))
    display(next(c))
    display(next(c))
    {'a': 1}
    {'b': 2}
    {'c': 3}
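As a rough cross-check (not part of the original experiment), the same binarized split can be fed to scikit-learn's decision tree with the entropy criterion. Its accuracy will generally differ from the hand-rolled tree, since sklearn re-evaluates the best split at every node rather than consuming features in one fixed gain-sorted order:

    from sklearn.tree import DecisionTreeClassifier

    # criterion='entropy' selects information-gain (ID3-style) splits.
    clf = DecisionTreeClassifier(criterion='entropy')
    clf.fit(X_train, y_train)
    display(clf.score(test_X, test_y))  # mean accuracy on the held-out rows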