# 离散型变量切尾.py -- tail trimming / binning of discrete variables

# -*- coding: utf-8 -*-
# Requires the third-party package: pip install toad

import toad
import pandas as pd
import numpy as np
import xlwt
from toad.plot import bin_plot
from matplotlib import pyplot as plt

# IV计算函数 (IV calculation function)
def cal_iv(x, y):
    """Summarise bad rate, lift and Information Value for one variable.

    Args:
        x: Series of (already binned / categorical) feature values. The
            Series name ends up in the ``varname`` column of the result.
        y: Binary target Series aligned with ``x``. After sorting, the first
            class is labelled "good" and the second "bad" (e.g. 0 / 1).

    Returns:
        DataFrame with one row per level of ``x`` plus the ``'All'`` margin
        row, and the columns ``varname``, ``var_level``, ``bad``, ``total``,
        ``factor_per``, ``bad_per``, ``IV`` and ``lift``. ``IV`` is the same
        scalar repeated on every row; ratios are rounded to 4 decimals.

    Raises:
        ValueError: if ``y`` does not have exactly two observed classes
            (raised by the column relabelling below).
    """
    crtab = pd.crosstab(x, y, margins=True)
    crtab.columns = ['good', 'bad', 'total']

    crtab['factor_per'] = crtab['total'] / len(y)
    crtab['bad_per'] = crtab['bad'] / crtab['total']
    crtab['lift'] = crtab['bad_per'] / crtab.loc['All', 'bad_per']

    # Share of all bads / all goods that fall into each level.
    crtab['p'] = crtab['bad'] / crtab.loc['All', 'bad']
    crtab['q'] = crtab['good'] / crtab.loc['All', 'good']

    # Levels with no goods or no bads give +/-inf (or NaN) here; that is
    # expected, so silence numpy's warnings and filter those rows out below.
    with np.errstate(divide='ignore', invalid='ignore'):
        crtab['woe'] = np.log(crtab['p'] / crtab['q'])

    # Only levels with a finite WOE contribute to IV. The 'All' row has
    # p == q == 1 and therefore contributes exactly 0.
    finite = crtab[np.isfinite(crtab['woe'])]
    crtab['IV'] = ((finite['p'] - finite['q']) * finite['woe']).sum()

    crtab = crtab.reset_index()
    level_col = crtab.columns[0]  # name of the former index, i.e. the variable
    crtab['varname'] = level_col
    crtab = crtab.rename(columns={level_col: 'var_level'})
    # The original assigned to a misspelled attribute (``val_level``), so the
    # levels were never actually converted to strings.
    crtab['var_level'] = crtab['var_level'].astype(str)

    for col in ('lift', 'factor_per', 'bad_per', 'IV'):
        crtab[col] = crtab[col].round(4)

    return crtab[['varname', 'var_level', 'bad', 'total',
                  'factor_per', 'bad_per', 'IV', 'lift']]

# ---------------------------------------------------------------------------
# Script section: load the sample, bin every variable, compute per-variable
# IV tables and export them to Excel.
# ---------------------------------------------------------------------------

# Column layout of every IV report written below (matches cal_iv's output).
RESULT_COLUMNS = ['varname', 'var_level', 'bad', 'total',
                  'factor_per', 'bad_per', 'IV', 'lift']


def _stack_iv_tables(tables):
    """Concatenate IV tables, inserting one blank row after each table.

    Replaces the ``DataFrame.append`` calls of the original script
    (``append`` was removed in pandas 2.0). The blank row acts as a visual
    separator between variables in the exported sheet.

    Args:
        tables: list of DataFrames as returned by ``cal_iv``.

    Returns:
        One DataFrame with a fresh 0..n-1 index; an empty frame with
        ``RESULT_COLUMNS`` when ``tables`` is empty.
    """
    if not tables:
        return pd.DataFrame(columns=RESULT_COLUMNS)
    padded = []
    for table in tables:
        # object dtype keeps integer counts as integers next to the NaN row.
        body = table.reset_index(drop=True).astype(object)
        # Re-indexing one position past the end appends an all-NaN row.
        padded.append(body.reindex(range(len(body) + 1)))
    return pd.concat(padded, ignore_index=True)


if __name__ == '__main__':
    # --- Data loading ------------------------------------------------------
    data = pd.read_excel(r'..\外部数据测试/综合指数V2_sample.xlsx')
    data_columns = data.columns  # all column names, printed for inspection
    print(data_columns)

    # --- Binning -----------------------------------------------------------
    # Drop the look-back date column before fitting the binner.
    data = data.drop(['回溯日期'], axis=1)

    # Quantile binning (the original comment said "chi", but the method
    # actually passed here is 'quantile').
    bin_transformer = toad.transform.Combiner()
    bin_transformer.fit(data, y='bad_label', n_bins=5, method='quantile',
                        empty_separate=True)
    trans_data = bin_transformer.transform(data, labels=True)

    # --- IV for every binned variable --------------------------------------
    var_list = data.columns.drop(['bad_label'])
    dd = _stack_iv_tables(
        [cal_iv(trans_data[i], trans_data['bad_label']) for i in var_list]
    )
    dd.to_excel(r'../综合指数V2_quantile_sample_result.xlsx')

    # --- Multi-valued "reason" fields --------------------------------------
    # Each cell holds several space-separated codes; split them so that every
    # code becomes its own row before binning.
    var_list = ['身份证-事实类原因', '身份证-预警类原因', '身份证-综合类原因',
                '身份证-近期不良行为类原因',
                '手机号-事实类原因', '手机号-预警类原因', '手机号-综合类原因',
                '手机号-近期不良行为类原因']

    reason_tables = []
    # The original iterated over the undefined name ``varlist``.
    for i in var_list:
        # .copy() so the assignments below do not write into a slice of data.
        data1 = data[[i, 'bad_label']].copy()
        data1[i] = data1[i].fillna('NA').str.split(' ')
        # ignore_index avoids duplicated index labels after exploding.
        data1 = data1.explode(i, ignore_index=True)

        # Chi-square binning on the exploded codes.
        bin_transformer = toad.transform.Combiner()
        bin_transformer.fit(data1, y='bad_label', n_bins=5, method='chi',
                            empty_separate=True)
        trans_data = bin_transformer.transform(data1, labels=True)

        reason_tables.append(cal_iv(trans_data[i], trans_data['bad_label']))

    dd = _stack_iv_tables(reason_tables)
    # NOTE(review): the original literal was broken across two lines
    # ("...风险名单v3" / "原因_result.xlsx"); it is joined here without a
    # separator -- confirm the intended file name.
    dd.to_excel(r'..\【外部数据测试】度小满/风险名单v3原因_result.xlsx')