阿里云算法 - DecisionTreeID3 - 《人工智能》

基于 Decision Tree ID3 算法实现贷款风险审批

import numpy as np
import pandas as pd

data = pd.read_csv('dataset/loans.csv').sample(40000)
display(data.sample(10))
display(data.shape)
# grade: 贷款级别
# sub_grade: 贷款细分级别
# short_emp: 一年以内短期雇佣
# emp_length_num:　受雇年限
# home_ownership:居住状态（自有，按揭，租住）
# dti：贷款占收入比例
# purpose:贷款用途
# term:贷款周期
# last_delinq_none:贷款申请人是否有不良记录　
# last_major_derog_none:贷款申请人是否有还款逾期90天以上记录
# reｖol_util：透支额度占信用比例
# total_rec_late_fee:逾期罚款总额
# safe_loans:贷款是否安全


        grade    sub_grade    short_emp    emp_length_num    home_ownership    dti        purpose                term        last_delinq_none    last_major_derog_none    revol_util    total_rec_late_fee    safe_loans
19847    C        C3            0            11                MORTGAGE        20.18    debt_consolidation    36 months    1                    1                        80.2        0.0000                1
24186    D        D2            0            5                RENT            21.19    debt_consolidation    36 months    0                    1                        47.6        0.0000                1
40013    B        B5            0            11                MORTGAGE        30.90    debt_consolidation    36 months    0                    1                        90.1        0.0000                -1
5405    B        B1            1            1                MORTGAGE        22.17    debt_consolidation    36 months    1                    1                        40.1        0.0000                -1
20203    F        F4            0            3                RENT            21.94    debt_consolidation    60 months    0                    1                        34.5        0.0000                1
12408    B        B5            0            7                RENT            8.56    debt_consolidation    36 months    1                    1                        34.4        0.0000                -1
31935    B        B2            0            3                RENT            9.68    debt_consolidation    36 months    1                    1                        78.6        0.0000                1
4621    D        D1            0            3                MORTGAGE        3.78    other                36 months    1                    1                        70.4        14.9409                -1
30060    C        C3            0            6                MORTGAGE        14.60    debt_consolidation    60 months    0                    1                        39.0        0.0000                1
8089    D        D3            0            7                OWN                17.85    debt_consolidation    36 months    1                    1                        70.7        0.3500                -1

(40000, 13)

from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

# 将以上非数值数据映射为数值型。
d = defaultdict(LabelEncoder)
data = data.apply(lambda x : d[x.name].fit_transform(x))
X_train = data.iloc[:800, :-1]
y_train = data.iloc[:800, -1]
test_X = data.iloc[800:, :-1]
test_y = data.iloc[800:, -1]
display(X_train)
# 特征二值化
# 由于特征值太过复杂，不利于处理。我们根据均值来二值化
for i in X_train.columns:
    mean = np.mean(X_train[i])
    for j in range(len(X_train[i])):
        X_train[i].values[j] = 1 if X_train[i].values[j]>mean else 0
for i in test_X.columns:
    mean = np.mean(test_X[i])
    for j in range(len(test_X[i])):
        test_X[i].values[j] = 1 if test_X[i].values[j]>mean else 0
display(X_train)

        grade    sub_grade    short_emp    emp_length_num    home_ownership    dti        purpose    term    last_delinq_none    last_major_derog_none    revol_util    total_rec_late_fee
4104    2        12            1            1                0                1764    2        0        0                    1                        320            0
18915    0        3            0            2                3                1608    1        0        0                    1                        240            0
44721    1        8            0            7                3                3240    1        0        1                    1                        561            0
21933    0        2            0            3                0                2328    1        0        1                    1                        325            0
20111    2        14            0            8                0                1048    2        0        0                    1                        280            0
...        ...        ...            ...            ...                ...                ...        ...        ...        ...                    ...                        ...            ...
13839    3        16            0            11                0                2025    2        1        1                    1                        817            0
15773    3        17            0            11                2                1483    2        0        0                    1                        958            0
27211    2        14            0            6                3                1701    2        1        1                    1                        157            0
17372    2        11            0            11                0                489        2        1        0                    0                        279            0
5410    2        14            0            11                0                1785    1        1        0                    0                        755            0

800 rows × 12 columns

        grade    sub_grade    short_emp    emp_length_num    home_ownership    dti    purpose    term    last_delinq_none    last_major_derog_none    revol_util    total_rec_late_fee
4104    1        1            1            0                0                1    0        0        0                    1                        0            0
18915    0        0            0            0                1                1    0        0        0                    1                        0            0
44721    0        0            0            1                1                1    0        0        1                    1                        0            0
21933    0        0            0            0                0                1    0        0        1                    1                        0            0
20111    1        1            0            1                0                0    0        0        0                    1                        0            0
...        ...        ...            ...            ...                ...                ...    ...        ...        ...                    ...                        ...            ...
13839    1        1            0            1                0                1    0        1        1                    1                        1            0
15773    1        1            0            1                1                0    0        0        0                    1                        1            0
27211    1        1            0            0                1                1    0        1        1                    1                        0            0
17372    1        0            0            1                0                0    0        1        0                    0                        0            0
5410    1        1            0            1                0                1    0        1        0                    0                        1            0

800 rows × 12 columns

from collections import  Counter
from tqdm import tqdm

class DecisionTreeID3:
    def __init__(self):
        pass
    def calc_entropy(self):
        '''计算信息熵、条件熵和信息增益'''
        # 把标签放在数据最后一列，方便create_tree处理
        self.X = pd.concat([self.X, self.y], axis=1)
        # ************ 计算信息熵 **************
        yes = np.asarray(self.X[self.X[self.X.columns[-1]]==1])
        no = np.asarray(self.X[self.X[self.X.columns[-1]]==0])
        P_yes = len(yes)/len(self.X)
        P_no = len(no)/len(self.X)
        self.HX = - P_yes*np.log2(P_yes)- P_no*np.log2(P_no)
#         display("信息熵 = " +str(self.HX))
        # ************ 计算条件熵 **************
        # H存放条件熵
        self.Gda = []
        # 遍历每一特征列。除了标签列
        for i in self.X.columns[:-1]:
            # 存放条件熵
            Hi = 0
            # 获取当前特征每种情况出现的次数，以便计算每个情况各自的概率
            condProbCollections = Counter(self.X[i]).items()
            # 每种特征可能有N中情况，累加
            for k in condProbCollections:
                # 获取当前条件X发生的总样本(包含所有列)
                samples_of_current = self.X[self.X[i]==k[0]]
                # 获取当前条件X发生的总样本(仅包含当前特征列)
                samples_of_current_features = samples_of_current[i]
                # 获取当前条件X发生下，被判定为安全和不安全的总次数
                total = len(samples_of_current_features)
                # 安全总次数
                k_safe = len(samples_of_current[samples_of_current[samples_of_current.columns[-1]]==1])
                # 不安全总次数
                k_unsafe = total - k_safe
                # 计算安全和不安全的概率
                P_k_safe = k_safe/total
                P_k_unsafe = k_unsafe/total
                # 累加条件熵
                log_P_k_safe = 0 if P_k_safe==0 else np.log2(P_k_safe) # 防止出现0值报错
                log_P_k_unsafe = 0 if P_k_unsafe==0 else np.log2(P_k_unsafe) # 防止出现0值报错
                Hi +=  - (total/len(self.X))*(P_k_safe * log_P_k_safe + P_k_unsafe * log_P_k_unsafe)
            # 保存信息增益
            self.Gda.append({"value":self.HX - Hi, "feature":i})
#         print("信息增益为")
#         print(self.Gda)
    def create_tree(self, node=False):
        '''构建决策树结构。决策树信息存储在JSON字典当中
        Parameters
        -----
        node: Series类型，用于传递当前节点包含的信息
        Return
        -----
        tree: 构建好的树json结构
        '''
        # 递归出口
        if len(self.Gda)==0:
            return node.iloc[:1,-1].values[0]
        # 获取第一个特征列
        feature = self.Gda[0]['feature']
        # 删除该列，以便递归时不会重复到达这里
        del self.Gda[0]
        # 获取当前特征每种情况出现的次数，以便计算每个情况各自的概率
        condProbCollections = Counter(self.X[feature]).items()
#         print(feature)
#         print(condProbCollections)
        # 定义树字典。必须使用特征feature作为键名
        tree = {feature:{}}
        for [value,counts] in condProbCollections:
#             print(condProbCollections)
#             print(value)
            tree[feature][value] = self.create_tree(self.X[self.X[feature]==value])
        # 保存树结构
        self.tree = tree
        return tree
    def fit(self, X, y):
        '''训练
        Parameters
        -----
        X: 训练数据，形如 [样本数量，特征数量]
        y: 类数组类型，形状为：[样本数量]
        '''
        self.X = X
        self.y = y
        self.calc_entropy()
        self.create_tree()
    def predict_item(self, x, node=False):
        '''构建决策树结构。决策树信息存储在JSON字典当中
        Parameters
        -----
        node: Series类型，用于传递当前节点包含的信息
        Return
        -----
        tree: 构建好的树json结构
        '''
        if node==0 or node==1:
            return node
        label = -1
        # 获取当前节点名
        key = next(iter(node))
        # 如果当前节点的值等于0，递归0下面的分支，否则1
        if x[key].values[0] == 0:
            label = self.predict_item(x, node=node[key][0])
        else:
            label = self.predict_item(x, node=node[key][1])
        return label
    def predict(self, X):
        '''对样本进行预测
        Parameters:
        X: 类数组类型，可以是List也可以是Ndarray，形状为： [样本数量,特征数量]
        Returns:
        数组类型，预测结果
        '''
        result = []
        for i in range(len(X)):
            result.append(self.predict_item(X.iloc[i:i+1,], node=self.tree))
        return result

dt = DecisionTreeID3()
dt.fit(X_train, y_train)
result = dt.predict(test_X)
display(np.sum(result==test_y)/len(result))

a=[{"a":1},{"b":2},{"c":3}]
c = iter(a)
display(next(c))
display(next(c))
display(next(c))

{'a': 1}
{'b': 2}
{'c': 3}