Loan Risk Approval with the Decision Tree ID3 Algorithm
import numpy as np
import pandas as pd
# Load the loan data and draw a 40,000-row sample
# (no random_state is set, so the sample changes between runs)
data = pd.read_csv('dataset/loans.csv').sample(40000)
display(data.sample(10))
display(data.shape)
# grade: loan grade
# sub_grade: loan sub-grade
# short_emp: employed for less than one year
# emp_length_num: years of employment
# home_ownership: housing status (own, mortgage, rent)
# dti: debt-to-income ratio
# purpose: purpose of the loan
# term: loan term
# last_delinq_none: whether the applicant has a delinquency record
# last_major_derog_none: whether the applicant has payments more than 90 days overdue
# revol_util: share of the credit limit drawn on
# total_rec_late_fee: total late fees received
# safe_loans: whether the loan is safe
grade sub_grade short_emp emp_length_num home_ownership dti purpose term last_delinq_none last_major_derog_none revol_util total_rec_late_fee safe_loans
19847 C C3 0 11 MORTGAGE 20.18 debt_consolidation 36 months 1 1 80.2 0.0000 1
24186 D D2 0 5 RENT 21.19 debt_consolidation 36 months 0 1 47.6 0.0000 1
40013 B B5 0 11 MORTGAGE 30.90 debt_consolidation 36 months 0 1 90.1 0.0000 -1
5405 B B1 1 1 MORTGAGE 22.17 debt_consolidation 36 months 1 1 40.1 0.0000 -1
20203 F F4 0 3 RENT 21.94 debt_consolidation 60 months 0 1 34.5 0.0000 1
12408 B B5 0 7 RENT 8.56 debt_consolidation 36 months 1 1 34.4 0.0000 -1
31935 B B2 0 3 RENT 9.68 debt_consolidation 36 months 1 1 78.6 0.0000 1
4621 D D1 0 3 MORTGAGE 3.78 other 36 months 1 1 70.4 14.9409 -1
30060 C C3 0 6 MORTGAGE 14.60 debt_consolidation 60 months 0 1 39.0 0.0000 1
8089 D D3 0 7 OWN 17.85 debt_consolidation 36 months 1 1 70.7 0.3500 -1
(40000, 13)
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
# Map the non-numeric columns above to numeric codes.
# defaultdict(LabelEncoder) lazily creates one encoder per column name,
# so every column keeps its own fitted mapping inside d.
d = defaultdict(LabelEncoder)
data = data.apply(lambda x: d[x.name].fit_transform(x))
# Train on the first 800 rows and hold out the rest for testing
X_train = data.iloc[:800, :-1].copy()
y_train = data.iloc[:800, -1]
X_test = data.iloc[800:, :-1].copy()
y_test = data.iloc[800:, -1]
display(X_train)
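Because d keeps one fitted LabelEncoder per column, the numeric codes can be
mapped back to the original categories later. A minimal sketch (the 'grade'
column name is taken from the table above):
# recover the original letter grades from their numeric codes
original_grade = d['grade'].inverse_transform(data['grade'])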
# Binarize the features
# The encoded columns take many distinct values, which is awkward for the
# simple tree below, so binarize each column against its mean. The thresholds
# come from the training set and are reused on the test set, so both splits
# are binarized consistently.
means = X_train.mean()
for col in X_train.columns:
    X_train[col] = (X_train[col] > means[col]).astype(int)
    X_test[col] = (X_test[col] > means[col]).astype(int)
display(X_train)
Output of the first display(X_train), before binarization:
grade sub_grade short_emp emp_length_num home_ownership dti purpose term last_delinq_none last_major_derog_none revol_util total_rec_late_fee
4104 2 12 1 1 0 1764 2 0 0 1 320 0
18915 0 3 0 2 3 1608 1 0 0 1 240 0
44721 1 8 0 7 3 3240 1 0 1 1 561 0
21933 0 2 0 3 0 2328 1 0 1 1 325 0
20111 2 14 0 8 0 1048 2 0 0 1 280 0
... ... ... ... ... ... ... ... ... ... ... ... ...
13839 3 16 0 11 0 2025 2 1 1 1 817 0
15773 3 17 0 11 2 1483 2 0 0 1 958 0
27211 2 14 0 6 3 1701 2 1 1 1 157 0
17372 2 11 0 11 0 489 2 1 0 0 279 0
5410 2 14 0 11 0 1785 1 1 0 0 755 0
800 rows × 12 columns
Output of the second display(X_train), after binarization:
grade sub_grade short_emp emp_length_num home_ownership dti purpose term last_delinq_none last_major_derog_none revol_util total_rec_late_fee
4104 1 1 1 0 0 1 0 0 0 1 0 0
18915 0 0 0 0 1 1 0 0 0 1 0 0
44721 0 0 0 1 1 1 0 0 1 1 0 0
21933 0 0 0 0 0 1 0 0 1 1 0 0
20111 1 1 0 1 0 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ...
13839 1 1 0 1 0 1 0 1 1 1 1 0
15773 1 1 0 1 1 0 0 0 0 1 1 0
27211 1 1 0 0 1 1 0 1 1 1 0 0
17372 1 0 0 1 0 0 0 1 0 0 0 0
5410 1 1 0 1 0 1 0 1 0 0 1 0
800 rows × 12 columns
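The class below implements ID3: score every feature A by its information gain
over the label D, and split on the highest-scoring feature first. In plain-text
notation (HX and Hi in the code correspond to H(D) and H(D|A)):
H(D) = -sum_c P(c) * log2(P(c))
H(D|A) = sum_v P(A=v) * H(D | A=v)
G(D, A) = H(D) - H(D|A)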
from collections import Counter
class DecisionTreeID3:
    '''A small ID3-style decision tree for the binarized loan data (labels 0/1).'''
    def calc_entropy(self):
        '''Compute the entropy, the conditional entropies and the information gains.'''
        # Append the labels as the last column so create_tree can work on a single frame
        self.X = pd.concat([self.X, self.y], axis=1)
        # ************ entropy of the label distribution ************
        yes = np.asarray(self.X[self.X[self.X.columns[-1]] == 1])
        no = np.asarray(self.X[self.X[self.X.columns[-1]] == 0])
        P_yes = len(yes) / len(self.X)
        P_no = len(no) / len(self.X)
        self.HX = -P_yes * np.log2(P_yes) - P_no * np.log2(P_no)
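        # Worked example (hypothetical counts): with 240 safe and 160 unsafe
        # samples, P_yes = 0.6 and P_no = 0.4, so
        # HX = -0.6*log2(0.6) - 0.4*log2(0.4) ≈ 0.971 bits.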
# display("信息熵 = " +str(self.HX))
# ************ 计算条件熵 **************
# H存放条件熵
self.Gda = []
# 遍历每一特征列。除了标签列
for i in self.X.columns[:-1]:
# 存放条件熵
Hi = 0
# 获取当前特征每种情况出现的次数,以便计算每个情况各自的概率
condProbCollections = Counter(self.X[i]).items()
# 每种特征可能有N中情况,累加
for k in condProbCollections:
# 获取当前条件X发生的总样本(包含所有列)
samples_of_current = self.X[self.X[i]==k[0]]
# 获取当前条件X发生的总样本(仅包含当前特征列)
samples_of_current_features = samples_of_current[i]
# 获取当前条件X发生下,被判定为安全和不安全的总次数
total = len(samples_of_current_features)
# 安全总次数
k_safe = len(samples_of_current[samples_of_current[samples_of_current.columns[-1]]==1])
# 不安全总次数
k_unsafe = total - k_safe
# 计算安全和不安全的概率
P_k_safe = k_safe/total
P_k_unsafe = k_unsafe/total
# 累加条件熵
log_P_k_safe = 0 if P_k_safe==0 else np.log2(P_k_safe) # 防止出现0值报错
log_P_k_unsafe = 0 if P_k_unsafe==0 else np.log2(P_k_unsafe) # 防止出现0值报错
Hi += - (total/len(self.X))*(P_k_safe * log_P_k_safe + P_k_unsafe * log_P_k_unsafe)
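            # Worked example (same hypothetical 400 rows, 240 safe): a binary
            # feature with 300 rows of value 0 (150 safe, entropy 1.0 bit) and
            # 100 rows of value 1 (90 safe, entropy ≈ 0.469) gives
            # H(D|A) = 0.75*1.0 + 0.25*0.469 ≈ 0.867,
            # so its gain is 0.971 - 0.867 ≈ 0.104.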
            # record the information gain G(D, A) = H(D) - H(D|A)
            self.Gda.append({"value": self.HX - Hi, "feature": i})
        # ID3 always splits on the feature with the largest remaining gain,
        # so rank the features by gain, descending
        self.Gda.sort(key=lambda g: g["value"], reverse=True)
    def create_tree(self, node=None):
        '''Build the decision tree recursively; it is stored as a nested dict.

        Parameters
        -----
        node: DataFrame with the samples that reach the current node

        Return
        -----
        tree: the finished tree as a nested dict
        '''
        # The root works on the full training frame
        if node is None:
            node = self.X
        # Recursion exit: no features left, or the node is pure; emit a leaf
        # carrying the majority label among the samples at this node
        if len(self.Gda) == 0 or node[node.columns[-1]].nunique() == 1:
            return Counter(node[node.columns[-1]]).most_common(1)[0][0]
        # Take the remaining feature with the highest information gain
        feature = self.Gda[0]['feature']
        # Remove it so the recursion below cannot reuse it. Note: self.Gda is
        # shared across sibling branches, so each feature is used at most once
        # in the whole tree; full ID3 would recompute the gains per subset.
        del self.Gda[0]
        # Branch on every value this feature takes at the current node
        condProbCollections = Counter(node[feature]).items()
        # The tree dict must be keyed by the feature name
        tree = {feature: {}}
        for value, counts in condProbCollections:
            tree[feature][value] = self.create_tree(node[node[feature] == value])
        # Every level stores its subtree; the root's assignment runs last,
        # so self.tree ends up holding the whole tree
        self.tree = tree
        return tree
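    # A tree produced this way is a nested dict; schematically (hypothetical
    # feature names and labels):
    # {'grade': {0: {'sub_grade': {0: 1, 1: 0}}, 1: 0}}
    # Internal keys are feature names, the 0/1 keys are binarized feature
    # values, and bare 0/1 leaves are the predicted labels.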
    def fit(self, X, y):
        '''Train on the given data.

        Parameters
        -----
        X: training data, shaped [n_samples, n_features]
        y: array-like labels, shaped [n_samples]
        '''
        self.X = X
        self.y = y
        self.calc_entropy()
        self.create_tree()
    def predict_item(self, x, node=False):
        '''Walk the tree for a single sample.

        Parameters
        -----
        x: one-row DataFrame holding the sample
        node: the current subtree (a nested dict) or a leaf label (0/1)

        Return
        -----
        label: the predicted class, 0 or 1
        '''
        # Leaf reached: node is a bare label
        if node == 0 or node == 1:
            return node
        # An internal node has exactly one key: the feature it splits on
        key = next(iter(node))
        value = x[key].values[0]
        # If this feature value never reached this branch during training,
        # fall back to whichever branch exists
        if value not in node[key]:
            value = next(iter(node[key]))
        # Descend into the branch matching the sample's feature value
        return self.predict_item(x, node=node[key][value])
    def predict(self, X):
        '''Predict a batch of samples.

        Parameters:
        X: DataFrame shaped [n_samples, n_features]
        Returns:
        list of predicted labels
        '''
        result = []
        for i in range(len(X)):
            result.append(self.predict_item(X.iloc[i:i+1, :], node=self.tree))
        return result
dt = DecisionTreeID3()
dt.fit(X_train, y_train)
result = dt.predict(X_test)
# Fraction of held-out samples predicted correctly
display(np.sum(result == y_test) / len(result))
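As a sanity check, the same split can be fed to scikit-learn's entropy-based
tree. This is only a sketch for comparison, not part of the pipeline above, and
its accuracy will differ since sklearn recomputes the gain at every split:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X_train, y_train)
display(clf.score(X_test, y_test))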
# Aside: next(iter(...)) is how predict_item reads the single feature key of a
# tree node; a quick demonstration on a list of one-key dicts:
a = [{"a": 1}, {"b": 2}, {"c": 3}]
c = iter(a)
display(next(c))
display(next(c))
display(next(c))
{'a': 1}
{'b': 2}
{'c': 3}