Feature Engineering: Extracting Effective Risk Features_20211018
#------------------------------------------------------------------------------
"""
功能说明:
本代码是第5章特征工程提取有效的风险特征配套代码。
算法流程:
1、特征组合多项式特征
2、非负矩阵分解
3、featuretools包
4、TSFresh包
输入数据:
使用代码自带数据,无需额外的外部数据
输出数据:
各代码段输出相应结果变量
版本历史:
20211018:定稿提交出版
"""
#------------------------------------------------------------------------------
# Feature crossing: polynomial features
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
X=np.arange(9).reshape(3,3)
poly=PolynomialFeatures(2) #degree-2 polynomial
poly.fit_transform(X)
poly=PolynomialFeatures(degree=3, interaction_only=True) #degree-3 polynomial, keep interaction terms only
poly.fit_transform(X)
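#For reference, with inputs x0,x1,x2, degree=2 expands to
#[1, x0, x1, x2, x0^2, x0*x1, x0*x2, x1^2, x1*x2, x2^2]; with degree=3 and
#interaction_only=True only the bias, linear and cross terms are kept
#(x0*x1, x0*x2, x1*x2, x0*x1*x2), with no squared or cubed powers.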
#------------------------------------------------------------------------------
# Non-negative matrix factorization (NMF)
import numpy as np
from sklearn.decomposition import NMF
from sklearn.datasets import load_iris
X, _ = load_iris(return_X_y=True)
#Define the model
nmf = NMF(n_components=2, # n_components is k in the factorization described earlier; if unset, all components are kept
init=None, # initialization method for W and H: 'random', 'nndsvd' (default), 'nndsvda', 'nndsvdar', 'custom'
solver='cd', #either 'cd' or 'mu'
beta_loss='frobenius', #one of {'frobenius','kullback-leibler','itakura-saito'}; usually left at the default
tol=1e-4, # tolerance of the stopping condition
max_iter=1000, #maximum number of iterations
random_state=None,
alpha=0., #regularization strength
l1_ratio=0., #L1/L2 regularization mixing ratio
verbose=0, #verbosity
shuffle=False #applies to the 'cd' solver
)
#Model parameters
print('params:', nmf.get_params()) #values of the constructor parameters; also readable via nmf.<attr>
#Fit the model and obtain the factor matrices
W = nmf.fit_transform(X) # W matrix
nmf.inverse_transform(W) #reconstruct the data from W
H = nmf.components_ # H matrix
X_= np.dot(W,H) #reconstruction X ≈ WH
print('reconstruction_err_', nmf.reconstruction_err_) #value of the loss function
print('n_iter_', nmf.n_iter_) #actual number of iterations
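#Sanity check (assuming beta_loss='frobenius' as above): reconstruction_err_ should
#match the Frobenius norm of the residual X - WH.
print(np.linalg.norm(X - X_, 'fro'))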
#------------------------------------------------------------------------------
# Feature derivation with the featuretools package
#Import the package
import featuretools as ft
#Inspect the bundled demo data
es = ft.demo.load_mock_customer(return_entityset=True)
es.plot() #requires the graphviz library
#Load the data
data=ft.demo.load_mock_customer()
customers_df=data["customers"]
sessions_df=data["sessions"]
transactions_df=data["transactions"]
#Define the dataframes (entities) and the relationships between them
dataframes={"customers": (customers_df,"customer_id"),
"sessions": (sessions_df,"session_id","session_start"),
"transactions":(transactions_df,"transaction_id","transaction_time")
}
relationships=[("sessions","session_id","transactions","session_id"),
("customers","customer_id","sessions","customer_id")
]
#Run Deep Feature Synthesis (DFS) to derive features
feature_matrix_customers, features_defs=ft.dfs(
dataframes=dataframes,
relationships=relationships,
target_dataframe_name="customers")
#Inspect the derived features
feature_matrix_customers_columnslst=list(feature_matrix_customers.columns)
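#DFS can be restricted to chosen primitives to control the size of the feature space.
#A minimal sketch (the primitive names are standard featuretools primitives; adjust as needed):
feature_matrix_small, _ = ft.dfs(
    dataframes=dataframes,
    relationships=relationships,
    target_dataframe_name="customers",
    agg_primitives=["mean", "sum", "count"],
    trans_primitives=["month", "weekday"],
    max_depth=2)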
#------------------------------------------------------------------------------
# Feature derivation with the tsfresh package
#Import the relevant modules
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
#Download and read the data
download_robot_execution_failures() #download the data
timeseries, y = load_robot_execution_failures() #load the data
#Show the first rows of the data
print(timeseries.head())
print(y.head())
#Plot the time series
import matplotlib.pyplot as plt
timeseries[timeseries['id'] == 3].plot(subplots=True, sharex=True, figsize=(10,10))
y[3] #True: no failure (normal)
plt.show()
timeseries[timeseries['id'] == 21].plot(subplots=True, sharex=True, figsize=(10,10))
y[21] #False: failure
plt.show()
#Feature extraction
from tsfresh import extract_features
extracted_features = extract_features(timeseries, column_id="id", column_sort="time")
#Feature selection based on the extracted features; NaN values are not allowed, so impute first
from tsfresh.utilities.dataframe_functions import impute
impute(extracted_features) #replaces NaN with the column median and ±inf with the column min/max
from tsfresh import select_features
features_filtered = select_features(extracted_features, y)
#Feature extraction + feature selection in one step
from tsfresh import extract_relevant_features
features_filtered_direct = extract_relevant_features(timeseries, y, column_id='id', column_sort='time')
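#Extraction can be limited to a cheaper feature set through a settings object. A minimal
#sketch using tsfresh's built-in EfficientFCParameters:
from tsfresh.feature_extraction import EfficientFCParameters
extracted_features_eff = extract_features(timeseries,
                                          column_id="id", column_sort="time",
                                          default_fc_parameters=EfficientFCParameters())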
Scorecard Model Development_20211018
#------------------------------------------------------------------------------
"""
功能说明:
本代码是第7章评分卡模型开发配套代码
算法流程:
1、使用scikit-learn包的LogisticRegression类建立逻辑回归模型
2、使用statsmodels包的Logit类建立逻辑回归模型
3、使用scorecardpy包建模
4、使用toad包建模
输入数据:
使用代码自带数据,无需额外的外部数据
输出数据:
各代码段输出相应结果变量
版本历史:
20211018:定稿提交出版
"""
#------------------------------------------------------------------------------
# Build a logistic regression model with scikit-learn's LogisticRegression class
#Import the relevant packages and modules
import pandas as pd
from sklearn.datasets import load_breast_cancer #breast cancer data; binary target
from sklearn.linear_model import LogisticRegression #classification
from sklearn.model_selection import train_test_split #dataset partitioning
#Prepare the data; the target is binary
ds_cancer = load_breast_cancer()
data=pd.DataFrame(ds_cancer.data).add_prefix('X')
target = pd.DataFrame(ds_cancer.target,columns=['y'])
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.3)
#Define the classifier
clf=LogisticRegression(fit_intercept=True, random_state=123) #model with an intercept term
clf.get_params()
#Fit the model
clf.fit(X_train,y_train)
#Fitted model coefficients
clf.coef_
clf.intercept_
'''
Note: the code above only demonstrates the LogisticRegression class. After fitting,
the model still needs to be evaluated and validated, iterating on variable selection
where necessary.
'''
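#A minimal evaluation sketch on the held-out split (illustrative, not a full validation workflow):
from sklearn.metrics import roc_auc_score
print('test AUC:', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))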
#------------------------------------------------------------------------------
# Build a logistic regression model with the statsmodels Logit class
#Import the package
import statsmodels.api as sm #regression-type models
#Variable screening via correlation
X_train_corr=X_train.corr()[X_train.corr()>0.9] #the correlation matrix shows X0 is highly correlated with X1/X2, and X20 with X22/X23
X_train1=X_train.drop(['X0','X2','X3','X10','X12','X13','X20','X22','X23'],axis=1)
#Add a constant term
X_train1=sm.add_constant(X_train1)
#Fit the model
model = sm.Logit(y_train, X_train1)
results = model.fit()
#Model results
print(results.summary())
print(results.params)
'''
Note: the code above only demonstrates the Logit class. The fitted model still requires
statistical testing; here several variables have p-values > 0.05 and are therefore not
significant, so the insignificant variables should be removed iteratively (a sketch of
one such step follows).
'''
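#A minimal sketch of one backward-elimination step (an illustrative procedure, not the book's exact recipe):
pvals = results.pvalues.drop('const')
if pvals.max() > 0.05:
    X_train1 = X_train1.drop(pvals.idxmax(), axis=1) #drop the least significant variable
    results = sm.Logit(y_train, X_train1).fit()      #refit and re-examine the summary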
#------------------------------------------------------------------------------
# Build a model with the scorecardpy package
"""
Description:
This program builds a scorecard with scorecardpy.
Workflow:
Load data, variable screening, data partitioning, variable binning, bin adjustment,
WOE transformation, model training, model evaluation, model validation, score scaling.
Input data:
No external data is needed; sc.germancredit() ships with the modeling data.
Output data:
Scorecard model results.
Version history:
"""
#Import the relevant packages
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
import scorecardpy as sc
#1. Load the data
data = sc.germancredit()
#Data overview
data.info()
data.describe()
#2. Variable screening
data_s = sc.var_filter(data,
y="creditability",
iv_limit=0.02,
missing_limit=0.95,
identical_limit=0.95,
var_rm=None,
var_kp=None,
return_rm_reason=False,
positive='bad|1')
#3. Data partitioning
train, test = sc.split_df(data_s, 'creditability', ratio=0.7, seed=123).values()
#4. Variable binning
#Automatic binning
bins = sc.woebin(train, y="creditability")
#Plots of the fine binning results
sc.woebin_plot(bins)
#5. Bin adjustment
#Adjust bins interactively by entering cutpoints when prompted
#breaks_adj = sc.woebin_adj(train, "creditability", bins)
#Alternatively, set the breaks manually
breaks_adj = {'age.in.years': [22, 35, 40, 60],
'other.debtors.or.guarantors': ["none", "co-applicant%,%guarantor"]}
bins_adj = sc.woebin(train, y="creditability", breaks_list=breaks_adj)
#6. WOE transformation
train_woe = sc.woebin_ply(train, bins_adj)
test_woe = sc.woebin_ply(test, bins_adj)
#7. Model training
#Prepare the data
X_train = train_woe.loc[:,train_woe.columns != 'creditability']
y_train = train_woe.loc[:,'creditability']
X_test = test_woe.loc[:,train_woe.columns != 'creditability']
y_test = test_woe.loc[:,'creditability']
#Define the classifier
lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
lr.get_params()
#Fit the model
lr.fit(X_train, y_train)
#Fitted coefficients
lr.coef_
lr.intercept_
#8. Model evaluation
# predicted probability on the training set
y_train_pred = lr.predict_proba(X_train)[:,1]
# plot the KS, ROC, PR and lift curves
train_perf = sc.perf_eva(y_train, y_train_pred, plot_type=["ks", "roc","pr","lift"], title = "train")
plot_roc_curve(lr,X_train,y_train)
plot_precision_recall_curve(lr,X_train,y_train)
#9. Model validation
# predicted probability on the test set
y_test_pred = lr.predict_proba(X_test)[:,1]
# plot the KS, ROC, PR and lift curves
test_perf = sc.perf_eva(y_test, y_test_pred, plot_type=["ks", "roc","pr","lift"], title = "test")
plot_roc_curve(lr,X_test,y_test)
plot_precision_recall_curve(lr,X_test,y_test)
#10. Score scaling
card = sc.scorecard(bins_adj,
lr,
X_train.columns,
points0=600,
odds0=1/19,
pdo=50,
basepoints_eq0=True)
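#The points0/odds0/pdo arguments encode the textbook scaling score = A - B*ln(odds).
#A minimal arithmetic sketch of that relation (not scorecardpy's internal code):
import numpy as np
B = 50 / np.log(2)         #B = pdo/ln(2), about 72.13
A = 600 + B * np.log(1/19) #A = points0 + B*ln(odds0), about 387.6
#At odds = odds0 the score equals points0 = 600, and halving the odds adds pdo = 50 points.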
#Score the data with the scaled scorecard
train_score = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)
#Check whether the train/test score distributions agree by computing the score PSI
sc.perf_psi(
score = {'train':train_score, 'test':test_score},
label = {'train':y_train, 'test':y_test})
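#PSI background: with expected (train) bin shares e_i and actual (test) shares a_i,
#PSI = sum_i (a_i - e_i) * ln(a_i / e_i). A minimal numpy sketch with illustrative shares:
import numpy as np
e = np.array([0.20, 0.30, 0.50])
a = np.array([0.22, 0.28, 0.50])
print('PSI:', np.sum((a - e) * np.log(a / e)))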
#------------------------------------------------------------------------------
# Build a model with the toad package
"""
Description:
This program builds a scorecard with toad.
Workflow:
Load data, sample partitioning, EDA report, feature analysis, feature pre-screening,
feature binning, bin adjustment and merging, feature selection, model training,
model evaluation, model validation, score scaling.
Input data:
Download the data from https://www.kaggle.com/c/GiveMeSomeCredit/data
Output data:
Scoring model results.
Version history:
"""
#Import the relevant packages
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import toad
from toad.plot import badrate_plot, proportion_plot, bin_plot
from toad.metrics import KS, F1, AUC
#1. Load the data
data = pd.read_csv(r'D:\cs-training.csv')
#Data description
data.info()
data.describe()
data.head()
#2. Sample partitioning
Xtr,Xts,Ytr,Yts = train_test_split(data.drop('SeriousDlqin2yrs',axis=1),
data['SeriousDlqin2yrs'],
test_size=0.25,
random_state=450)
data_tr = pd.concat([Xtr,Ytr],axis=1)
data_tr['type'] = 'train'
data_ts = pd.concat([Xts,Yts],axis=1)
data_ts['type'] = 'test'
#3. EDA report
toad.detector.detect(data_tr).to_excel(r'D:\EDA_report.xlsx')
#4. Feature analysis: compute IV, gini, entropy and unique counts per feature
quality = toad.quality(data,'SeriousDlqin2yrs')
quality.head(6)
#5. Feature pre-screening
selected_train, drop_lst= toad.selection.select(data_tr,target = 'SeriousDlqin2yrs',
empty = 0.5,
iv = 0.05,
corr = 0.7,
return_drop=True,
exclude='type')
selected_test = data_ts[selected_train.columns]
selected_train.shape
drop_lst #variables that were dropped
#6. Feature binning; must be fitted on the train set
# Initialize a Combiner instance
combiner = toad.transform.Combiner()
# Fit on the training data with the chosen binning method; 7 variables to bin
combiner.fit(selected_train,
y='SeriousDlqin2yrs',
method='chi',
min_samples = 0.05,
exclude='type')
# Export the binning result as a dict
bins = combiner.export()
#Inspect the cutpoints of each binned feature
binned_cols = ['DebtRatio','MonthlyIncome','NumberOfOpenCreditLinesAndLoans',
               'NumberOfTimes90DaysLate','NumberRealEstateLoansOrLines',
               'RevolvingUtilizationOfUnsecuredLines','age']
for col in binned_cols:
    print(col, 'bin cuts:', bins[col])
#Apply the binning with combiner.transform
selected_train_bin = combiner.transform(selected_train)
#Binning plots; bin_plot is a dual-axis chart showing bin proportions and bin bad rates together
for col in binned_cols:
    proportion_plot(selected_train_bin[col])
for col in binned_cols:
    badrate_plot(selected_train_bin, target='SeriousDlqin2yrs', x='type', by=col)
for col in binned_cols:
    bin_plot(selected_train_bin, x=col, target='SeriousDlqin2yrs')
#7. Adjust and merge bins
#Manually adjust the bin cutpoints
bins_adj=bins
bins_adj["age"]=[22, 35, 45, 60]
bins_adj["NumberOfOpenCreditLinesAndLoans"]=[2]
bins_adj["DebtRatio"]=[0.02,0.4,0.5,2]
#Define a new Combiner and load the adjusted rules
combiner2 = toad.transform.Combiner()
combiner2.set_rules(bins_adj) #set the binning rules to apply
#Apply the adjusted binning
selected_train_binadj = combiner2.transform(selected_train)
#Plot bin proportions and bad rates for the adjusted bins
for col in binned_cols:
    proportion_plot(selected_train_binadj[col])
for col in binned_cols:
    badrate_plot(selected_train_binadj, target='SeriousDlqin2yrs', x='type', by=col)
#8. WOE transformation
#Load the adjusted binning rules into the original combiner
combiner.set_rules(bins_adj)
#Map feature values to bin indices
selected_train_binadj = combiner.transform(selected_train)
selected_test_binadj = combiner.transform(selected_test)
#Define the WOE transformer
WOETransformer = toad.transform.WOETransformer()
#Map bin indices to WOE values; use fit_transform on the train set and transform on the test set
data_tr_woe = WOETransformer.fit_transform(selected_train_binadj,
selected_train_binadj['SeriousDlqin2yrs'],
exclude=['SeriousDlqin2yrs','type'])
data_ts_woe = WOETransformer.transform(selected_test_binadj)
#9. Feature selection with stepwise regression
train_final = toad.selection.stepwise(data_tr_woe.drop('type',axis=1),
target = 'SeriousDlqin2yrs',
direction = 'both',
criterion = 'aic')
test_final = data_ts_woe[train_final.columns]
print(train_final.shape) #7 features reduced to 5
#10. Model training
#Prepare the data
Xtr = train_final.drop('SeriousDlqin2yrs',axis=1)
Ytr = train_final['SeriousDlqin2yrs']
#Fit a logistic regression model
lr = LogisticRegression()
lr.fit(Xtr, Ytr)
#Print the fitted parameters
lr.coef_
lr.intercept_
#11. Model evaluation
#Model performance on the training set
EYtr_proba = lr.predict_proba(Xtr)[:,1]
EYtr = lr.predict(Xtr)
print('train F1:', F1(EYtr_proba,Ytr))
print('train KS:', KS(EYtr_proba,Ytr))
print('train AUC:', AUC(EYtr_proba,Ytr))
#Rank-ordering of scores
tr_bucket = toad.metrics.KS_bucket(EYtr_proba,Ytr,bucket=10,method='quantile') #equal-frequency buckets
tr_bucket
#12. Model validation
#Model performance on the test set
Xts = test_final.drop('SeriousDlqin2yrs',axis=1)
Yts = test_final['SeriousDlqin2yrs']
EYts_proba = lr.predict_proba(Xts)[:,1]
EYts = lr.predict(Xts)
print('test F1:', F1(EYts_proba,Yts))
print('test KS:', KS(EYts_proba,Yts))
print('test AUC:', AUC(EYts_proba,Yts))
#Check train/test feature stability (based on the binned data) for significant drift
psi = toad.metrics.PSI(train_final,test_final)
psi.sort_values(0,ascending=False)
#13. Score scaling
scorecard = toad.scorecard.ScoreCard(combiner = combiner, transer = WOETransformer, C = 0.1)
scorecard.fit(Xtr, Ytr)
scorecard.export(to_frame = True)
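#Usage sketch: toad's ScoreCard is typically applied to raw, pre-binning records,
#since it holds the combiner and WOE transer internally (this assumes a toad version
#whose ScoreCard.predict returns scores; check your installed version):
scores_tr = scorecard.predict(selected_train)
print(scores_tr[:5])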
Scorecard Model Deployment_20211018
#------------------------------------------------------------------------------
"""
功能说明:
本代码是第9章评分卡模型部署配套代码。
算法流程:
- 训练模型并将模型持久化为PKL文件
- 本地加载模型PKL文件
- 训练模型并将模型持久化为PMML文件
- 本地加载模型PMML文件
- 在服务器部署模型PMML,然后在客户端调用打分服务
输入数据:
使用代码自带数据,无需额外的外部数据
输出数据:
各代码段输出相应结果变量
版本历史:
20211018:定稿提交出版
"""
#------------------------------------------------------------------------------
# Train a model and persist it as a PKL file
#Import the relevant packages
import pandas as pd
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn2pmml import PMMLPipeline
#Load the data
iris = load_iris()
X_train=pd.DataFrame(iris.data,columns=['sepal_length','sepal_width','petal_length', 'petal_width'])
y_train=pd.DataFrame(iris.target,columns=['series'])
#Train the model pipeline
clf = tree.DecisionTreeClassifier(max_depth=2) #define the classifier
pipeline = PMMLPipeline([("classifier", clf)]) #define the pipeline
pipeline.fit(X_train, y_train) #train with a DataFrame that carries column names
#Method 1: save the model as PKL with the pickle package
import pickle
with open("D:\\mdl.pkl", "wb") as f:
    pickle.dump(pipeline, f)
#Method 2: export the model as PKL with the joblib package
#from sklearn.externals import joblib #newer scikit-learn versions no longer bundle joblib
import joblib
joblib.dump(pipeline, "d:\\mdl.pkl", compress = 9)
#------------------------------------------------------------------------------
# Load and use the model PKL file locally
#Read the pickle with the pickle package
with open('D:\\mdl.pkl', 'rb') as f:
    mdl_in = pickle.load(f)
y_pred=mdl_in.predict(iris.data)
#Read the pickle with the joblib package
mdl_in=joblib.load("d:\\mdl.pkl")
y_pred=mdl_in.predict(iris.data)
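#Quick consistency check: the reloaded model should reproduce the in-memory pipeline's predictions.
import numpy as np
assert np.array_equal(pipeline.predict(X_train), mdl_in.predict(X_train))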
#------------------------------------------------------------------------------
# Persist the model as a PMML file
# Method 1: export the PMML file with the sklearn2pmml package
#Import the relevant packages
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn2pmml import PMMLPipeline
#Load the data
iris = load_iris()
X_train=pd.DataFrame(iris.data,columns=['sepal_length','sepal_width','petal_length', 'petal_width'])
y_train=pd.DataFrame(iris.target,columns=['series'])
#Train the model pipeline
clf = tree.DecisionTreeClassifier(max_depth=2) #define the classifier
pipeline = PMMLPipeline([("classifier", clf)]) #define the pipeline
pipeline.fit(X_train, y_train) #train with a DataFrame that carries column names
#Export the model as PMML
from sklearn2pmml import sklearn2pmml
sklearn2pmml(pipeline, "d:\\DecisionTree_Iris_sklearn2pmml.pmml", with_repr = True) #the generated PMML keeps variable names
# Method 2: export the PMML file with the nyoka package
#Import the relevant packages
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn2pmml import PMMLPipeline
#Load the data
iris = load_iris()
features = iris.feature_names
target = 'Species'
#Create the pipeline and train the model
clf_pipeline=PMMLPipeline([('clf',DecisionTreeClassifier(max_depth=2))])
clf_pipeline.fit(iris.data, iris.target) #here the model is trained on arrays without variable names
#Export the model as PMML with nyoka
from nyoka import skl_to_pmml
skl_to_pmml(clf_pipeline, features, target, "d:\\DecisionTree_iris_nyoka.pmml") #the generated PMML keeps variable names
#------------------------------------------------------------------------------
# Load and use the PMML model file locally
#Load the PMML
from pypmml import Model
model = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")
#Score a whole dataset with the PMML model
y_train_pred=model.predict(X_train) #note: whether the scored DataFrame carries variable names must match how the PMML was trained
#Score a single record with the PMML model; several input formats are accepted
model.predict({'sepal_length': 5.1, 'sepal_width': 3.5, 'petal_length': 1.4, 'petal_width': 0.2})
model.predict('[{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}]')
model.predict('{"columns": ["sepal_length", "sepal_width", "petal_length", "petal_width"], "data": [[5.1, 3.5, 1.4, 0.2]]}')
model.predict(pd.Series({'sepal_length': 5.1, 'sepal_width': 3.5, 'petal_length': 1.4, 'petal_width': 0.2}))
#------------------------------------------------------------------------------
# Deploy the model on a server with the FastAPI package
# (1) Save the code below on the server as main.py, cd to its directory, then run: uvicorn main:app --reload
#Import the relevant packages and modules
from fastapi import FastAPI
from pypmml import Model
#Define the FastAPI application
app = FastAPI()
@app.get("/items/{item_id}")
async def read_item(item_id: int, x: str=''):
    #load the model PMML
    mdl = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")
    #feed the input string x to predict to obtain the prediction
    y_predict=mdl.predict(x)
    #return the result to the client
    return {"item_id": item_id, "x":x, "y_predict": y_predict}
# (2) Run the code below on the client; in server mode replace 127.0.0.1 with the server's IP address
import requests
URL_str='http://127.0.0.1:8000/items/5?x='+'[{"sepal_length":5.1,"sepal_width":3.5,"petal_length":1.4,"petal_width":0.2}]'
res=requests.get(URL_str)
returnjson=res.text
print(returnjson)
#------------------------------------------------------------------------------
# Deploy the model on a server with the Flask package
# (1) Save the code below on the server as main.py, then run on the server: python main.py
#Import the relevant packages and modules
import numpy as np
import pandas as pd
from pypmml import Model
from flask import Flask
from flask import request
from flask import jsonify
#Load the model
model = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")
app = Flask(__name__)
@app.route('/',methods=['POST','GET'])
def scoring():
    text=request.args.get('inputdata')
    if text:
        temp = [float(x) for x in text.split(',')]
        temp = pd.DataFrame(data=np.array(temp).reshape((1, -1)),columns=["sepal_length", "sepal_width", "petal_length", "petal_width"])
        outputdata = model.predict(temp) #outputdata is a DataFrame
        return jsonify(dict(outputdata.iloc[0])) #serialize to JSON
if __name__ == '__main__':
    app.config['JSON_AS_ASCII'] = False
    app.run(host='127.0.0.1',port=5003) #127.0.0.1 is the local loopback address
# (2) Run the code below on the client; in server mode replace 127.0.0.1 with the server's IP address
import requests
base = 'http://127.0.0.1:5003/?inputdata=5.1,3.5,1.4,2'
response = requests.get(base)
print(response.text)
answer = response.json()
print('Prediction:', answer)
Scorecard Model Interpretability_20211018
#------------------------------------------------------------------------------
"""
功能说明:
本代码是第13章评分卡模型可解释性配套代码。
算法流程:
- PDP与ICE
- 变量重要性方法:XGBoost和LightGBM的plot_importance
- SKlearn模型解释工具treeinterpreter包
- 特征随机置换Permutation Importance,使用eli5包
- LIME
- SHAP
输入数据:
使用代码自带数据,无需额外的外部数据
输出数据:
各代码段输出相应结果变量
版本历史:
20211018:定稿提交出版
"""
#------------------------------------------------------------------------------
# Import the relevant packages and modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.datasets import fetch_california_housing
from sklearn.tree import DecisionTreeClassifier #classification
from sklearn.tree import DecisionTreeRegressor #regression
from sklearn.ensemble import RandomForestRegressor #random forest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#------------------------------------------------------------------------------
# PDP and ICE
"""
Two packages provide PDP tooling:
- sklearn.inspection
- pdpbox
"""
#Load the data
from sklearn.datasets import fetch_california_housing
cal_housing=fetch_california_housing()
X=pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names)
y=cal_housing.target
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
#Train the model
from sklearn.ensemble import GradientBoostingRegressor
gbdt=GradientBoostingRegressor()
gbdt.fit(X_train,y_train)
#Method 1: PDP analysis with sklearn.inspection
from sklearn.inspection import plot_partial_dependence
fig,ax=plt.subplots(figsize=(12,4))
plot_partial_dependence(gbdt,
X_train,
['MedInc','AveOccup','HouseAge'],
method="brute",
ax=ax)
#A two-way PDP can also be plotted to examine the interaction between two variables
fig,ax=plt.subplots(figsize=(9,6))
plot_partial_dependence(gbdt,
X_train,
[('HouseAge','AveOccup')],
grid_resolution=50,
method="brute",
ax=ax)
#Method 2: the pdpbox package
from pdpbox import pdp
pdp_MedInc=pdp.pdp_isolate(model=gbdt,
dataset=X_train,
model_features=X_train.columns.tolist(),
feature='MedInc',
num_grid_points=30)
pdp.pdp_plot(pdp_MedInc,
'MedInc',
center=False
)
#Plot per-instance ICE curves with pdpbox
pdp.pdp_plot(pdp_MedInc,
'MedInc',
center=False,
plot_lines=True,
frac_to_plot=10,
plot_pts_dist=True)
#------------------------------------------------------------------------------
#Variable importance: plot_importance in XGBoost and LightGBM
#Import the packages
from sklearn.datasets import load_boston
import xgboost as xgb
import lightgbm as lgb
#Load the data
ds=load_boston()
df=pd.DataFrame(data=ds.data)
df=df.add_prefix('X')
df=df.join(pd.DataFrame(ds.target,columns=['y']))
#Define the XGBoost regressor
clf=xgb.XGBRegressor()
clf.get_params()
#Fit the model
clf.fit(df.iloc[:,0:13],df.iloc[:,-1])
#Evaluate the model
clf.score(df.iloc[:,0:13],df.iloc[:,-1])
#Plot variable importance
xgb.plot_importance(clf,importance_type='gain')
#Define the LightGBM regressor
lgbdata=lgb.Dataset(df.iloc[:,0:13],df.iloc[:,-1])
#Write the parameters as a dict
params = {
'task': 'train',
'boosting_type': 'gbdt', # boosting type
'objective': 'regression', # objective function
'metric': {'l2', 'l1'}, # evaluation metrics (AUC is not valid for a continuous regression target)
'num_leaves': 31, # number of leaves
'learning_rate': 0.05, # learning rate
'feature_fraction': 0.9, # fraction of features sampled per tree
'bagging_fraction': 0.8, # fraction of samples bagged per tree
'bagging_freq': 5, # k means bagging is performed every k iterations
'verbose': 1 # <0: fatal only, =0: errors (warnings), >0: info
}
clf_lgb=lgb.train(params,lgbdata)
#Plot variable importance
clf_lgb.feature_importance()
plt.bar(height=clf_lgb.feature_importance(),x=df.iloc[:,0:13].columns)
#------------------------------------------------------------------------------
# treeinterpreter: an interpretation tool for scikit-learn tree models
#Import the treeinterpreter package
from treeinterpreter import treeinterpreter as ti
#Load the data
ds=load_boston()
#Define the regressor
rf=RandomForestRegressor(random_state=123)
#Fit the model
rf.fit(ds.data,ds.target)
#Take one sample
spl=ds.data[0].reshape(1,-1)
#Score it with the model
rf.predict(spl)
#Explain with treeinterpreter: prediction is the predicted value, bias is the average target over the training sample (the tree-root value)
prediction,bias,contributions=ti.predict(rf,spl)
#Per-variable contributions
df_contributions=pd.DataFrame(data=np.hstack([ds.feature_names.reshape(-1,1),
contributions.reshape(-1,1)]),
columns=['Feature','contribution'])
df_contributions.sort_values(by=['contribution'],ascending=False)
#Verify the accounting
print(ds.target.mean()) #mean of the true target over all samples
print(rf.predict(ds.data).mean()) #mean of the rf predictions
print(rf.predict(spl))
print(prediction)
print(bias)
print(prediction-np.sum(contributions)) #equals bias: prediction = bias + sum of per-variable contributions
#------------------------------------------------------------------------------
#Permutation importance with the eli5 package
#Import the packages
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor #random forest
import eli5
from eli5.sklearn import PermutationImportance
#Load the data
ds=load_boston()
#Define the regressor
rf=RandomForestRegressor(random_state=123)
#Fit the model
rf.fit(pd.DataFrame(ds.data,columns=ds.feature_names),ds.target)
rf.feature_importances_
#Compute permutation importances
perm=PermutationImportance(rf).fit(pd.DataFrame(ds.data,columns=ds.feature_names),ds.target)
df_perm=pd.DataFrame(data=np.hstack([ds.feature_names.reshape(-1,1),
perm.feature_importances_.reshape(-1,1).round(4),
perm.feature_importances_std_.reshape(-1,1).round(4)]),
columns=['Feature','mean','std'])
df_perm.sort_values(by=['mean'],ascending=False,inplace=True)
#Render the permutation-importance table; this call only displays inside a notebook
eli5.show_weights(perm,feature_names=ds.feature_names.tolist())
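#Outside a notebook, a plain-text rendering can be printed instead (this assumes eli5's
#top-level explain_weights/format_as_text helpers, available in recent eli5 versions):
print(eli5.format_as_text(eli5.explain_weights(perm, feature_names=ds.feature_names.tolist())))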
#------------------------------------------------------------------------------
#LIME
#Import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor #random forest
from sklearn.model_selection import train_test_split
import lime
import lime.lime_tabular
#Load the data
ds=load_boston()
#Define the regressor
rf=RandomForestRegressor(random_state=123)
#Fit the model
rf.fit(ds.data,ds.target)
#Treat variables with 10 or fewer distinct values as categorical
categorical_features=np.argwhere(np.array([len(set(ds.data[:,i])) for i in range(ds.data.shape[1])])<=10).flatten()
#Create the explainer (categorical_features is left as None here; pass the array above to use it)
explainer=lime.lime_tabular.LimeTabularExplainer(
ds.data,
feature_names=ds.feature_names,
class_names=['house_price'],
categorical_features=None,
verbose=True,
mode='regression'
)
#Pick one sample
spl=ds.data[0]
#Generate the explanation
exp=explainer.explain_instance(
spl,
rf.predict,
num_features=5
)
#Output the per-variable contributions
exp.as_list()
#Visualize; this call only displays inside a Jupyter notebook
exp.show_in_notebook(show_table=True)
#------------------------------------------------------------------------------
# SHAP
#Import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
import shap
#Initialize the plotting environment
shap.initjs()
#Load the data
ds=load_boston()
#Take one sample for the single-record SHAP example below
spl=ds.data[0].reshape(1,-1)
#Define the base regressor
rf=RandomForestRegressor(random_state=123)
#Fit the model
rf.fit(ds.data,ds.target)
#Define the SHAP tree explainer
explainer=shap.TreeExplainer(rf,data=ds.data)
#The mean prediction over the background data serves as the base value
explainer.expected_value #22.28338
#------------------------------------
#SHAP values of each variable for the single sample
splshapvalues=explainer.shap_values(spl).round(4)
df_splshapvalues=pd.DataFrame(data=np.hstack([ds.feature_names.reshape(-1,1),
splshapvalues.reshape(-1,1),
abs(splshapvalues).reshape(-1,1)]),
columns=['Feature','shap','shapabs'])
df_splshapvalues.sort_values(by=['shapabs'],ascending=False,inplace=True) #sort by absolute SHAP value, descending
df_splshapvalues.drop(['shapabs'],axis=1,inplace=True) #df_splshapvalues stores the values in long form
df_splshapvaluescol=pd.DataFrame(data=splshapvalues,columns=ds.feature_names) #df_splshapvaluescol stores them in wide form
df_splshapvalues #show the per-variable SHAP values for the sample
#Verify the accounting for this sample
ds.target.mean() #mean of the true target over all samples: 22.533
rf.predict(ds.data).mean() #mean rf prediction over all samples: 22.535
explainer.expected_value #SHAP base value, i.e. the mean rf prediction over the background data: 22.28338
rf.predict(spl) #prediction for the sample: 25.421
rf.predict(spl)-splshapvalues.sum() #22.2835, approximately equal to explainer.expected_value
#Force plot of the sample's SHAP values; this call only displays inside a notebook
shap.force_plot(explainer.expected_value,
splshapvalues,
features=spl,
feature_names=ds.feature_names)
#------------------------------------
#SHAP values of each variable over the whole sample set
shapvalues=explainer.shap_values(ds.data)
#Force plot over the sample set; this call only displays inside a Jupyter notebook
shap.force_plot(explainer.expected_value,
shapvalues,
features=ds.data,
feature_names=ds.feature_names)
#Decision-path plot; this call only displays inside a Jupyter notebook
shap.decision_plot(explainer.expected_value,
shapvalues[:12],
ds.feature_names)
#Feature-dependence plot; this call only displays inside a Jupyter notebook
shap.dependence_plot(ds.feature_names.tolist().index('LSTAT'),
shapvalues,
ds.data)
#Global feature importance; this call only displays inside a Jupyter notebook
shap.summary_plot(shapvalues,
ds.data,
feature_names=ds.feature_names,
max_display=5)
#Bar chart of the mean absolute SHAP value per variable
shap.summary_plot(shapvalues,
feature_names=ds.feature_names,
plot_type='bar',
max_display=5)
From Scorecard Models to High-Dimensional Machine Learning Models_20211018
#------------------------------------------------------------------------------
"""
功能说明:
本代码是第15章从评分卡模型到高维机器学习模型配套代码。
算法流程:
- 使用XGBoost建立预测模型
- 使用LightGBM建立预测模型
输入数据:
使用代码自带数据,无需额外的外部数据
输出数据:
各代码段输出相应结果变量
版本历史:
20211018:定稿提交出版
"""
#------------------------------------------------------------------------------
# Build a predictive model with XGBoost
#Import the packages
import pandas as pd
from sklearn.datasets import load_breast_cancer #breast cancer data; binary target
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
import xgboost
from xgboost import XGBClassifier
#Prepare the data
ds_cancer = load_breast_cancer()
data = pd.DataFrame(data=ds_cancer.data,columns=ds_cancer.feature_names)
target = pd.DataFrame(data=ds_cancer.target,columns=['target'])
#Data partitioning
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.3)
#Define the XGBoost model
xgb = XGBClassifier(n_estimators=3, max_depth=2)
#Show the model parameters
xgb.get_params()
#Fit the model
xgb.fit(X_train, y_train)
#Attributes and methods of the fitted model
xgb.score(X_train, y_train)
xgb.feature_importances_
#Model prediction
y_train_predict=xgb.predict(X_train)
y_train_predict_proba=xgb.predict_proba(X_train)
#Model evaluation
accuracy_score(y_train,y_train_predict) #accuracy compares true labels vs predicted labels
roc_auc_score(y_train, y_train_predict_proba[:,1]) #AUC compares true labels vs predicted probabilities
plot_roc_curve(xgb,X_train,y_train) #plot the ROC curve
plot_precision_recall_curve(xgb,X_train,y_train) #plot the PR curve
#Plot variable importance
xgboost.plot_importance(xgb)
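#A sketch of monitoring a validation set during fitting (parameter values are illustrative):
xgb_es = XGBClassifier(n_estimators=100, max_depth=2)
xgb_es.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
print('test AUC:', roc_auc_score(y_test, xgb_es.predict_proba(X_test)[:,1]))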
#------------------------------------------------------------------------------
# Build a predictive model with LightGBM
#Import the packages
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer #breast cancer data; binary target
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
import lightgbm as lgb
from lightgbm import LGBMClassifier
#Prepare the data
ds_cancer = load_breast_cancer()
data = pd.DataFrame(data=ds_cancer.data,columns=ds_cancer.feature_names)
target = pd.DataFrame(data=ds_cancer.target,columns=['target'])
#Data partitioning
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.3)
#Define the classifier
lgbm = LGBMClassifier(boosting_type="gbdt", class_weight=None, colsample_bytree=0.7,
is_unbalance=True, learning_rate=0.01, max_bin=15,
max_depth=1, min_child_samples=100, min_child_weight=1,
min_split_gain=0.04, n_estimators=100, num_leaves=32,
objective="binary", random_state=27, subsample=0.8, subsample_freq=1)
#Show the model parameters
lgbm.get_params()
#Fit the model
lgbm.fit(X_train,y_train)
#Attributes of the fitted model
lgbm.classes_
lgbm.feature_importances_
lgbm.n_classes_
lgbm.n_features_
lgbm.objective_
#Model prediction
y_train_predict=lgbm.predict(X_train)
y_train_predict_proba=lgbm.predict_proba(X_train)
#Model evaluation
fpr,tpr,pct = roc_curve(y_train, y_train_predict_proba[:,1]) #FPR and TPR sequences along the ROC curve
ks=abs(fpr-tpr).max() #KS statistic
plt.plot(tpr,"b-",fpr,"r-") #KS curve
accuracy_score(y_train,y_train_predict) #accuracy compares true labels vs predicted labels
roc_auc_score(y_train, y_train_predict_proba[:,1]) #AUC compares true labels vs predicted probabilities
plot_precision_recall_curve(lgbm,X_train,y_train) #plot the PR curve
plot_roc_curve(lgbm,X_train,y_train) #plot the ROC curve
#LightGBM's own plotting helpers
lgb.create_tree_digraph(lgbm,tree_index=1)
lgb.plot_importance(lgbm)
lgb.plot_tree(lgbm,tree_index=1,figsize=(12,9))