Feature Engineering: Extracting Effective Risk Features_20211018
#------------------------------------------------------------------------------
"""
功能说明:
本代码是第5章特征工程提取有效的风险特征配套代码。
算法流程:
1、特征组合多项式特征
2、非负矩阵分解
3、featuretools包
4、TSFresh包
输入数据:
使用代码自带数据,无需额外的外部数据
输出数据:
各代码段输出相应结果变量
版本历史:
20211018:定稿提交出版
"""
#------------------------------------------------------------------------------
# Feature crossing: polynomial features
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
X=np.arange(9).reshape(3,3)
poly=PolynomialFeatures(2) #degree-2 polynomial
poly.fit_transform(X)
poly=PolynomialFeatures(degree=3, interaction_only=True) #degree-3 polynomial, keep interaction terms only
poly.fit_transform(X)
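#For reference, with inputs x0,x1,x2, degree=2 expands to
#[1, x0, x1, x2, x0^2, x0*x1, x0*x2, x1^2, x1*x2, x2^2]; with degree=3 and
#interaction_only=True only the bias, linear and cross terms are kept
#(x0*x1, x0*x2, x1*x2, x0*x1*x2), with no squared or cubed powers.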
#------------------------------------------------------------------------------
# Non-negative matrix factorization (NMF)
import numpy as np
from sklearn.decomposition import NMF
from sklearn.datasets import load_iris
X, _ = load_iris(return_X_y=True)
#Define the model
nmf = NMF(n_components=2, # n_components is k in the factorization described earlier; if unset, all components are kept
init=None, # initialization method for W and H: 'random', 'nndsvd' (default), 'nndsvda', 'nndsvdar', 'custom'
solver='cd', #either 'cd' or 'mu'
beta_loss='frobenius', #one of {'frobenius','kullback-leibler','itakura-saito'}; usually left at the default
tol=1e-4, # tolerance of the stopping condition
max_iter=1000, #maximum number of iterations
random_state=None,
alpha=0., #regularization strength
l1_ratio=0., #L1/L2 regularization mixing ratio
verbose=0, #verbosity
shuffle=False #applies to the 'cd' solver
)
#Model parameters
print('params:', nmf.get_params()) #values of the constructor parameters; also readable via nmf.<attr>
#Fit the model and obtain the factor matrices
W = nmf.fit_transform(X) # W matrix
nmf.inverse_transform(W) #reconstruct the data from W
H = nmf.components_ # H matrix
X_= np.dot(W,H) #reconstruction X ≈ WH
print('reconstruction_err_', nmf.reconstruction_err_) #value of the loss function
print('n_iter_', nmf.n_iter_) #actual number of iterations
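#Sanity check (assuming beta_loss='frobenius' as above): reconstruction_err_ should
#match the Frobenius norm of the residual X - WH.
print(np.linalg.norm(X - X_, 'fro'))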
#------------------------------------------------------------------------------
# Feature derivation with the featuretools package
#Import the package
import featuretools as ft
#Inspect the bundled demo data
es = ft.demo.load_mock_customer(return_entityset=True)
es.plot() #requires the graphviz library
#Load the data
data=ft.demo.load_mock_customer()
customers_df=data["customers"]
sessions_df=data["sessions"]
transactions_df=data["transactions"]
#Define the dataframes (entities) and the relationships between them
dataframes={"customers": (customers_df,"customer_id"),
"sessions": (sessions_df,"session_id","session_start"),
"transactions":(transactions_df,"transaction_id","transaction_time")
}
relationships=[("sessions","session_id","transactions","session_id"),
("customers","customer_id","sessions","customer_id")
]
#Run Deep Feature Synthesis (DFS) to derive features
feature_matrix_customers, features_defs=ft.dfs(
dataframes=dataframes,
relationships=relationships,
target_dataframe_name="customers")
#Inspect the derived features
feature_matrix_customers_columnslst=list(feature_matrix_customers.columns)
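#DFS can be restricted to chosen primitives to control the size of the feature space.
#A minimal sketch (the primitive names are standard featuretools primitives; adjust as needed):
feature_matrix_small, _ = ft.dfs(
    dataframes=dataframes,
    relationships=relationships,
    target_dataframe_name="customers",
    agg_primitives=["mean", "sum", "count"],
    trans_primitives=["month", "weekday"],
    max_depth=2)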
#------------------------------------------------------------------------------
# Feature derivation with the tsfresh package
#Import the relevant modules
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
#Download and read the data
download_robot_execution_failures() #download the data
timeseries, y = load_robot_execution_failures() #load the data
#Show the first rows of the data
print(timeseries.head())
print(y.head())
#Plot the time series
import matplotlib.pyplot as plt
timeseries[timeseries['id'] == 3].plot(subplots=True, sharex=True, figsize=(10,10))
y[3] #True: no failure (normal)
plt.show()
timeseries[timeseries['id'] == 21].plot(subplots=True, sharex=True, figsize=(10,10))
y[21] #False: failure
plt.show()
#Feature extraction
from tsfresh import extract_features
extracted_features = extract_features(timeseries, column_id="id", column_sort="time")
#Feature selection based on the extracted features; NaN values are not allowed, so impute first
from tsfresh.utilities.dataframe_functions import impute
impute(extracted_features) #replaces NaN with the column median and ±inf with the column min/max
from tsfresh import select_features
features_filtered = select_features(extracted_features, y)
#Feature extraction + feature selection in one step
from tsfresh import extract_relevant_features
features_filtered_direct = extract_relevant_features(timeseries, y, column_id='id', column_sort='time')
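#Extraction can be limited to a cheaper feature set through a settings object. A minimal
#sketch using tsfresh's built-in EfficientFCParameters:
from tsfresh.feature_extraction import EfficientFCParameters
extracted_features_eff = extract_features(timeseries,
                                          column_id="id", column_sort="time",
                                          default_fc_parameters=EfficientFCParameters())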
Scorecard Model Development_20211018
#------------------------------------------------------------------------------
"""
功能说明:
本代码是第7章评分卡模型开发配套代码
算法流程:
1、使用scikit-learn包的LogisticRegression类建立逻辑回归模型
2、使用statsmodels包的Logit类建立逻辑回归模型
3、使用scorecardpy包建模
4、使用toad包建模
输入数据:
使用代码自带数据,无需额外的外部数据
输出数据:
各代码段输出相应结果变量
版本历史:
20211018:定稿提交出版
"""
#------------------------------------------------------------------------------
# Build a logistic regression model with scikit-learn's LogisticRegression class
#Import the relevant packages and modules
import pandas as pd
from sklearn.datasets import load_breast_cancer #breast cancer data; binary target
from sklearn.linear_model import LogisticRegression #classification
from sklearn.model_selection import train_test_split #dataset partitioning
#Prepare the data; the target is binary
ds_cancer = load_breast_cancer()
data=pd.DataFrame(ds_cancer.data).add_prefix('X')
target = pd.DataFrame(ds_cancer.target,columns=['y'])
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.3)
#Define the classifier
clf=LogisticRegression(fit_intercept=True, random_state=123) #model with an intercept term
clf.get_params()
#Fit the model
clf.fit(X_train,y_train)
#Fitted model coefficients
clf.coef_
clf.intercept_
'''
Note: the code above only demonstrates the LogisticRegression class. After fitting,
the model still needs to be evaluated and validated, iterating on variable selection
where necessary.
'''
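#A minimal evaluation sketch on the held-out split (illustrative, not a full validation workflow):
from sklearn.metrics import roc_auc_score
print('test AUC:', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))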
#------------------------------------------------------------------------------
# Build a logistic regression model with the statsmodels Logit class
#Import the package
import statsmodels.api as sm #regression-type models
#Variable screening via correlation
X_train_corr=X_train.corr()[X_train.corr()>0.9] #the correlation matrix shows X0 is highly correlated with X1/X2, and X20 with X22/X23
X_train1=X_train.drop(['X0','X2','X3','X10','X12','X13','X20','X22','X23'],axis=1)
#Add a constant term
X_train1=sm.add_constant(X_train1)
#Fit the model
model = sm.Logit(y_train, X_train1)
results = model.fit()
#Model results
print(results.summary())
print(results.params)
'''
Note: the code above only demonstrates the Logit class. The fitted model still requires
statistical testing; here several variables have p-values > 0.05 and are therefore not
significant, so the insignificant variables should be removed iteratively (a sketch of
one such step follows).
'''
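#A minimal sketch of one backward-elimination step (an illustrative procedure, not the book's exact recipe):
pvals = results.pvalues.drop('const')
if pvals.max() > 0.05:
    X_train1 = X_train1.drop(pvals.idxmax(), axis=1) #drop the least significant variable
    results = sm.Logit(y_train, X_train1).fit()      #refit and re-examine the summary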
#------------------------------------------------------------------------------
# Build a model with the scorecardpy package
"""
Description:
This program builds a scorecard with scorecardpy.
Workflow:
Load data, variable screening, data partitioning, variable binning, bin adjustment,
WOE transformation, model training, model evaluation, model validation, score scaling.
Input data:
No external data is needed; sc.germancredit() ships with the modeling data.
Output data:
Scorecard model results.
Version history:
"""
#Import the relevant packages
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
import scorecardpy as sc
#1. Load the data
data = sc.germancredit()
#Data overview
data.info()
data.describe()
#2. Variable screening
data_s = sc.var_filter(data,
y="creditability",
iv_limit=0.02,
missing_limit=0.95,
identical_limit=0.95,
var_rm=None,
var_kp=None,
return_rm_reason=False,
positive='bad|1')
#3. Data partitioning
train, test = sc.split_df(data_s, 'creditability', ratio=0.7, seed=123).values()
#4. Variable binning
#Automatic binning
bins = sc.woebin(train, y="creditability")
#Plots of the fine binning results
sc.woebin_plot(bins)
#5. Bin adjustment
#Adjust bins interactively by entering cutpoints when prompted
#breaks_adj = sc.woebin_adj(train, "creditability", bins)
#Alternatively, set the breaks manually
breaks_adj = {'age.in.years': [22, 35, 40, 60],
'other.debtors.or.guarantors': ["none", "co-applicant%,%guarantor"]}
bins_adj = sc.woebin(train, y="creditability", breaks_list=breaks_adj)
#6. WOE transformation
train_woe = sc.woebin_ply(train, bins_adj)
test_woe = sc.woebin_ply(test, bins_adj)
#7. Model training
#Prepare the data
X_train = train_woe.loc[:,train_woe.columns != 'creditability']
y_train = train_woe.loc[:,'creditability']
X_test = test_woe.loc[:,train_woe.columns != 'creditability']
y_test = test_woe.loc[:,'creditability']
#Define the classifier
lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
lr.get_params()
#Fit the model
lr.fit(X_train, y_train)
#Fitted coefficients
lr.coef_
lr.intercept_
#8. Model evaluation
# predicted probability on the training set
y_train_pred = lr.predict_proba(X_train)[:,1]
# plot the KS, ROC, PR and lift curves
train_perf = sc.perf_eva(y_train, y_train_pred, plot_type=["ks", "roc","pr","lift"], title = "train")
plot_roc_curve(lr,X_train,y_train)
plot_precision_recall_curve(lr,X_train,y_train)
#9. Model validation
# predicted probability on the test set
y_test_pred = lr.predict_proba(X_test)[:,1]
# plot the KS, ROC, PR and lift curves
test_perf = sc.perf_eva(y_test, y_test_pred, plot_type=["ks", "roc","pr","lift"], title = "test")
plot_roc_curve(lr,X_test,y_test)
plot_precision_recall_curve(lr,X_test,y_test)
#10. Score scaling
card = sc.scorecard(bins_adj,
lr,
X_train.columns,
points0=600,
odds0=1/19,
pdo=50,
basepoints_eq0=True)
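#The points0/odds0/pdo arguments encode the textbook scaling score = A - B*ln(odds).
#A minimal arithmetic sketch of that relation (not scorecardpy's internal code):
import numpy as np
B = 50 / np.log(2)         #B = pdo/ln(2), about 72.13
A = 600 + B * np.log(1/19) #A = points0 + B*ln(odds0), about 387.6
#At odds = odds0 the score equals points0 = 600, and halving the odds adds pdo = 50 points.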
#Score the data with the scaled scorecard
train_score = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)
#Check whether the train/test score distributions agree by computing the score PSI
sc.perf_psi(
score = {'train':train_score, 'test':test_score},
label = {'train':y_train, 'test':y_test})
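#PSI background: with expected (train) bin shares e_i and actual (test) shares a_i,
#PSI = sum_i (a_i - e_i) * ln(a_i / e_i). A minimal numpy sketch with illustrative shares:
import numpy as np
e = np.array([0.20, 0.30, 0.50])
a = np.array([0.22, 0.28, 0.50])
print('PSI:', np.sum((a - e) * np.log(a / e)))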
#------------------------------------------------------------------------------
# Build a model with the toad package
"""
Description:
This program builds a scorecard with toad.
Workflow:
Load data, sample partitioning, EDA report, feature analysis, feature pre-screening,
feature binning, bin adjustment and merging, feature selection, model training,
model evaluation, model validation, score scaling.
Input data:
Download the data from https://www.kaggle.com/c/GiveMeSomeCredit/data
Output data:
Scoring model results.
Version history:
"""
#Import the relevant packages
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import toad
from toad.plot import badrate_plot, proportion_plot, bin_plot
from toad.metrics import KS, F1, AUC
#1. Load the data
data = pd.read_csv(r'D:\cs-training.csv')
#Data description
data.info()
data.describe()
data.head()
#2. Sample partitioning
Xtr,Xts,Ytr,Yts = train_test_split(data.drop('SeriousDlqin2yrs',axis=1),
data['SeriousDlqin2yrs'],
test_size=0.25,
random_state=450)
data_tr = pd.concat([Xtr,Ytr],axis=1)
data_tr['type'] = 'train'
data_ts = pd.concat([Xts,Yts],axis=1)
data_ts['type'] = 'test'
#3. EDA report
toad.detector.detect(data_tr).to_excel(r'D:\EDA_report.xlsx')
#4. Feature analysis: compute IV, gini, entropy and unique counts per feature
quality = toad.quality(data,'SeriousDlqin2yrs')
quality.head(6)
#5. Feature pre-screening
selected_train, drop_lst= toad.selection.select(data_tr,target = 'SeriousDlqin2yrs',
empty = 0.5,
iv = 0.05,
corr = 0.7,
return_drop=True,
exclude='type')
selected_test = data_ts[selected_train.columns]
selected_train.shape
drop_lst #variables that were dropped
#6. Feature binning; must be fitted on the train set
# Initialize a Combiner instance
combiner = toad.transform.Combiner()
# Fit on the training data with the chosen binning method; 7 variables to bin
combiner.fit(selected_train,
y='SeriousDlqin2yrs',
method='chi',
min_samples = 0.05,
exclude='type')
# Export the binning result as a dict
bins = combiner.export()
#Inspect the cutpoints of each binned feature
binned_cols = ['DebtRatio','MonthlyIncome','NumberOfOpenCreditLinesAndLoans',
               'NumberOfTimes90DaysLate','NumberRealEstateLoansOrLines',
               'RevolvingUtilizationOfUnsecuredLines','age']
for col in binned_cols:
    print(col, 'bin cuts:', bins[col])
#Apply the binning with combiner.transform
selected_train_bin = combiner.transform(selected_train)
#Binning plots; bin_plot is a dual-axis chart showing bin proportions and bin bad rates together
for col in binned_cols:
    proportion_plot(selected_train_bin[col])
for col in binned_cols:
    badrate_plot(selected_train_bin, target='SeriousDlqin2yrs', x='type', by=col)
for col in binned_cols:
    bin_plot(selected_train_bin, x=col, target='SeriousDlqin2yrs')
#7. Adjust and merge bins
#Manually adjust the bin cutpoints
bins_adj=bins
bins_adj["age"]=[22, 35, 45, 60]
bins_adj["NumberOfOpenCreditLinesAndLoans"]=[2]
bins_adj["DebtRatio"]=[0.02,0.4,0.5,2]
#Define a new Combiner and load the adjusted rules
combiner2 = toad.transform.Combiner()
combiner2.set_rules(bins_adj) #set the binning rules to apply
#Apply the adjusted binning
selected_train_binadj = combiner2.transform(selected_train)
#Plot bin proportions and bad rates for the adjusted bins
for col in binned_cols:
    proportion_plot(selected_train_binadj[col])
for col in binned_cols:
    badrate_plot(selected_train_binadj, target='SeriousDlqin2yrs', x='type', by=col)
#8. WOE transformation
#Load the adjusted binning rules into the original combiner
combiner.set_rules(bins_adj)
#Map feature values to bin indices
selected_train_binadj = combiner.transform(selected_train)
selected_test_binadj = combiner.transform(selected_test)
#Define the WOE transformer
WOETransformer = toad.transform.WOETransformer()
#Map bin indices to WOE values; use fit_transform on the train set and transform on the test set
data_tr_woe = WOETransformer.fit_transform(selected_train_binadj,
selected_train_binadj['SeriousDlqin2yrs'],
exclude=['SeriousDlqin2yrs','type'])
data_ts_woe = WOETransformer.transform(selected_test_binadj)
#9. Feature selection with stepwise regression
train_final = toad.selection.stepwise(data_tr_woe.drop('type',axis=1),
target = 'SeriousDlqin2yrs',
direction = 'both',
criterion = 'aic')
test_final = data_ts_woe[train_final.columns]
print(train_final.shape) #7 features reduced to 5
#10. Model training
#Prepare the data
Xtr = train_final.drop('SeriousDlqin2yrs',axis=1)
Ytr = train_final['SeriousDlqin2yrs']
#Fit a logistic regression model
lr = LogisticRegression()
lr.fit(Xtr, Ytr)
#Print the fitted parameters
lr.coef_
lr.intercept_
#11. Model evaluation
#Model performance on the training set
EYtr_proba = lr.predict_proba(Xtr)[:,1]
EYtr = lr.predict(Xtr)
print('train F1:', F1(EYtr_proba,Ytr))
print('train KS:', KS(EYtr_proba,Ytr))
print('train AUC:', AUC(EYtr_proba,Ytr))
#Rank-ordering of scores
tr_bucket = toad.metrics.KS_bucket(EYtr_proba,Ytr,bucket=10,method='quantile') #equal-frequency buckets
tr_bucket
#12. Model validation
#Model performance on the test set
Xts = test_final.drop('SeriousDlqin2yrs',axis=1)
Yts = test_final['SeriousDlqin2yrs']
EYts_proba = lr.predict_proba(Xts)[:,1]
EYts = lr.predict(Xts)
print('test F1:', F1(EYts_proba,Yts))
print('test KS:', KS(EYts_proba,Yts))
print('test AUC:', AUC(EYts_proba,Yts))
#Check train/test feature stability (based on the binned data) for significant drift
psi = toad.metrics.PSI(train_final,test_final)
psi.sort_values(0,ascending=False)
#13. Score scaling
scorecard = toad.scorecard.ScoreCard(combiner = combiner, transer = WOETransformer, C = 0.1)
scorecard.fit(Xtr, Ytr)
scorecard.export(to_frame = True)
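#Usage sketch: toad's ScoreCard is typically applied to raw, pre-binning records,
#since it holds the combiner and WOE transer internally (this assumes a toad version
#whose ScoreCard.predict returns scores; check your installed version):
scores_tr = scorecard.predict(selected_train)
print(scores_tr[:5])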
Scorecard Model Deployment_20211018
#------------------------------------------------------------------------------
"""
功能说明:
本代码是第9章评分卡模型部署配套代码。
算法流程:
- 训练模型并将模型持久化为PKL文件
- 本地加载模型PKL文件
- 训练模型并将模型持久化为PMML文件
- 本地加载模型PMML文件
- 在服务器部署模型PMML,然后在客户端调用打分服务
输入数据:
使用代码自带数据,无需额外的外部数据
输出数据:
各代码段输出相应结果变量
版本历史:
20211018:定稿提交出版
"""
#------------------------------------------------------------------------------
# Train a model and persist it as a PKL file
#Import the relevant packages
import pandas as pd
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn2pmml import PMMLPipeline
#Load the data
iris = load_iris()
X_train=pd.DataFrame(iris.data,columns=['sepal_length','sepal_width','petal_length', 'petal_width'])
y_train=pd.DataFrame(iris.target,columns=['series'])
#Train the model pipeline
clf = tree.DecisionTreeClassifier(max_depth=2) #define the classifier
pipeline = PMMLPipeline([("classifier", clf)]) #define the pipeline
pipeline.fit(X_train, y_train) #train with a DataFrame that carries column names
#Method 1: save the model as PKL with the pickle package
import pickle
with open("D:\\mdl.pkl", "wb") as f:
    pickle.dump(pipeline, f)
#Method 2: export the model as PKL with the joblib package
#from sklearn.externals import joblib #newer scikit-learn versions no longer bundle joblib
import joblib
joblib.dump(pipeline, "d:\\mdl.pkl", compress = 9)
#------------------------------------------------------------------------------
# Load and use the model PKL file locally
#Read the pickle with the pickle package
with open('D:\\mdl.pkl', 'rb') as f:
    mdl_in = pickle.load(f)
y_pred=mdl_in.predict(iris.data)
#Read the pickle with the joblib package
mdl_in=joblib.load("d:\\mdl.pkl")
y_pred=mdl_in.predict(iris.data)
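#Quick consistency check: the reloaded model should reproduce the in-memory pipeline's predictions.
import numpy as np
assert np.array_equal(pipeline.predict(X_train), mdl_in.predict(X_train))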
#------------------------------------------------------------------------------
# Persist the model as a PMML file
# Method 1: export the PMML file with the sklearn2pmml package
#Import the relevant packages
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn2pmml import PMMLPipeline
#Load the data
iris = load_iris()
X_train=pd.DataFrame(iris.data,columns=['sepal_length','sepal_width','petal_length', 'petal_width'])
y_train=pd.DataFrame(iris.target,columns=['series'])
#Train the model pipeline
clf = tree.DecisionTreeClassifier(max_depth=2) #define the classifier
pipeline = PMMLPipeline([("classifier", clf)]) #define the pipeline
pipeline.fit(X_train, y_train) #train with a DataFrame that carries column names
#Export the model as PMML
from sklearn2pmml import sklearn2pmml
sklearn2pmml(pipeline, "d:\\DecisionTree_Iris_sklearn2pmml.pmml", with_repr = True) #the generated PMML keeps variable names
# Method 2: export the PMML file with the nyoka package
#Import the relevant packages
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn2pmml import PMMLPipeline
#Load the data
iris = load_iris()
features = iris.feature_names
target = 'Species'
#Create the pipeline and train the model
clf_pipeline=PMMLPipeline([('clf',DecisionTreeClassifier(max_depth=2))])
clf_pipeline.fit(iris.data, iris.target) #here the model is trained on arrays without variable names
#Export the model as PMML with nyoka
from nyoka import skl_to_pmml
skl_to_pmml(clf_pipeline, features, target, "d:\\DecisionTree_iris_nyoka.pmml") #the generated PMML keeps variable names
#------------------------------------------------------------------------------
# Load and use the PMML model file locally
#Load the PMML
from pypmml import Model
model = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")
#Score a whole dataset with the PMML model
y_train_pred=model.predict(X_train) #note: whether the scored DataFrame carries variable names must match how the PMML was trained
#Score a single record with the PMML model; several input formats are accepted
model.predict({'sepal_length': 5.1, 'sepal_width': 3.5, 'petal_length': 1.4, 'petal_width': 0.2})
model.predict('[{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}]')
model.predict('{"columns": ["sepal_length", "sepal_width", "petal_length", "petal_width"], "data": [[5.1, 3.5, 1.4, 0.2]]}')
model.predict(pd.Series({'sepal_length': 5.1, 'sepal_width': 3.5, 'petal_length': 1.4, 'petal_width': 0.2}))
#------------------------------------------------------------------------------
# Deploy the model on a server with the FastAPI package
# (1) Save the code below on the server as main.py, cd to its directory, then run: uvicorn main:app --reload
#Import the relevant packages and modules
from fastapi import FastAPI
from pypmml import Model
#Define the FastAPI application
app = FastAPI()
@app.get("/items/{item_id}")
async def read_item(item_id: int, x: str=''):
    #load the model PMML
    mdl = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")
    #feed the input string x to predict to obtain the prediction
    y_predict=mdl.predict(x)
    #return the result to the client
    return {"item_id": item_id, "x":x, "y_predict": y_predict}
# (2) Run the code below on the client; in server mode replace 127.0.0.1 with the server's IP address
import requests
URL_str='http://127.0.0.1:8000/items/5?x='+'[{"sepal_length":5.1,"sepal_width":3.5,"petal_length":1.4,"petal_width":0.2}]'
res=requests.get(URL_str)
returnjson=res.text
print(returnjson)
#------------------------------------------------------------------------------
# Deploy the model on a server with the Flask package
# (1) Save the code below on the server as main.py, then run on the server: python main.py
#Import the relevant packages and modules
import numpy as np
import pandas as pd
from pypmml import Model
from flask import Flask
from flask import request
from flask import jsonify
#Load the model
model = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")
app = Flask(__name__)
@app.route('/',methods=['POST','GET'])
def scoring():
    text=request.args.get('inputdata')
    if text:
        temp = [float(x) for x in text.split(',')]
        temp = pd.DataFrame(data=np.array(temp).reshape((1, -1)),columns=["sepal_length", "sepal_width", "petal_length", "petal_width"])
        outputdata = model.predict(temp) #outputdata is a DataFrame
        return jsonify(dict(outputdata.iloc[0])) #serialize to JSON
if __name__ == '__main__':
    app.config['JSON_AS_ASCII'] = False
    app.run(host='127.0.0.1',port=5003) #127.0.0.1 is the local loopback address
# (2) Run the code below on the client; in server mode replace 127.0.0.1 with the server's IP address
import requests
base = 'http://127.0.0.1:5003/?inputdata=5.1,3.5,1.4,2'
response = requests.get(base)
print(response.text)
answer = response.json()
print('Prediction:', answer)
Scorecard Model Interpretability_20211018
#------------------------------------------------------------------------------
"""
功能说明:
本代码是第13章评分卡模型可解释性配套代码。
算法流程:
- PDP与ICE
- 变量重要性方法:XGBoost和LightGBM的plot_importance
- SKlearn模型解释工具treeinterpreter包
- 特征随机置换Permutation Importance,使用eli5包
- LIME
- SHAP
输入数据:
使用代码自带数据,无需额外的外部数据
输出数据:
各代码段输出相应结果变量
版本历史:
20211018:定稿提交出版
"""
#------------------------------------------------------------------------------
# Import the relevant packages and modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.datasets import fetch_california_housing
from sklearn.tree import DecisionTreeClassifier #classification
from sklearn.tree import DecisionTreeRegressor #regression
from sklearn.ensemble import RandomForestRegressor #random forest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#------------------------------------------------------------------------------
# PDP and ICE
"""
Two packages provide PDP tooling:
- sklearn.inspection
- pdpbox
"""
#Load the data
from sklearn.datasets import fetch_california_housing
cal_housing=fetch_california_housing()
X=pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names)
y=cal_housing.target
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
#Train the model
from sklearn.ensemble import GradientBoostingRegressor
gbdt=GradientBoostingRegressor()
gbdt.fit(X_train,y_train)
#Method 1: PDP analysis with sklearn.inspection
from sklearn.inspection import plot_partial_dependence
fig,ax=plt.subplots(figsize=(12,4))
plot_partial_dependence(gbdt,
X_train,
['MedInc','AveOccup','HouseAge'],
method="brute",
ax=ax)
#A two-way PDP can also be plotted to examine the interaction between two variables
fig,ax=plt.subplots(figsize=(9,6))
plot_partial_dependence(gbdt,
X_train,
[('HouseAge','AveOccup')],
grid_resolution=50,
method="brute",
ax=ax)
#Method 2: the pdpbox package
from pdpbox import pdp
pdp_MedInc=pdp.pdp_isolate(model=gbdt,
dataset=X_train,
model_features=X_train.columns.tolist(),
feature='MedInc',
num_grid_points=30)
pdp.pdp_plot(pdp_MedInc,
'MedInc',
center=False
)
#Plot per-instance ICE curves with pdpbox
pdp.pdp_plot(pdp_MedInc,
'MedInc',
center=False,
plot_lines=True,
frac_to_plot=10,
plot_pts_dist=True)
#------------------------------------------------------------------------------
#Variable importance: plot_importance in XGBoost and LightGBM
#Import the packages
from sklearn.datasets import load_boston
import xgboost as xgb
import lightgbm as lgb
#Load the data
ds=load_boston()
df=pd.DataFrame(data=ds.data)
df=df.add_prefix('X')
df=df.join(pd.DataFrame(ds.target,columns=['y']))
#Define the XGBoost regressor
clf=xgb.XGBRegressor()
clf.get_params()
#Fit the model
clf.fit(df.iloc[:,0:13],df.iloc[:,-1])
#Evaluate the model
clf.score(df.iloc[:,0:13],df.iloc[:,-1])
#Plot variable importance
xgb.plot_importance(clf,importance_type='gain')
#Define the LightGBM regressor
lgbdata=lgb.Dataset(df.iloc[:,0:13],df.iloc[:,-1])
#Write the parameters as a dict
params = {
'task': 'train',
'boosting_type': 'gbdt', # boosting type
'objective': 'regression', # objective function
'metric': {'l2', 'l1'}, # evaluation metrics (AUC is not valid for a continuous regression target)
'num_leaves': 31, # number of leaves
'learning_rate': 0.05, # learning rate
'feature_fraction': 0.9, # fraction of features sampled per tree
'bagging_fraction': 0.8, # fraction of samples bagged per tree
'bagging_freq': 5, # k means bagging is performed every k iterations
'verbose': 1 # <0: fatal only, =0: errors (warnings), >0: info
}
clf_lgb=lgb.train(params,lgbdata)
#Plot variable importance
clf_lgb.feature_importance()
plt.bar(height=clf_lgb.feature_importance(),x=df.iloc[:,0:13].columns)
#------------------------------------------------------------------------------
# treeinterpreter: an interpretation tool for scikit-learn tree models
#Import the treeinterpreter package
from treeinterpreter import treeinterpreter as ti
#Load the data
ds=load_boston()
#Define the regressor
rf=RandomForestRegressor(random_state=123)
#Fit the model
rf.fit(ds.data,ds.target)
#Take one sample
spl=ds.data[0].reshape(1,-1)
#Score it with the model
rf.predict(spl)
#Explain with treeinterpreter: prediction is the predicted value, bias is the average target over the training sample (the tree-root value)
prediction,bias,contributions=ti.predict(rf,spl)
#Per-variable contributions
df_contributions=pd.DataFrame(data=np.hstack([ds.feature_names.reshape(-1,1),
contributions.reshape(-1,1)]),
columns=['Feature','contribution'])
df_contributions.sort_values(by=['contribution'],ascending=False)
#Verify the accounting
print(ds.target.mean()) #mean of the true target over all samples
print(rf.predict(ds.data).mean()) #mean of the rf predictions
print(rf.predict(spl))
print(prediction)
print(bias)
print(prediction-np.sum(contributions)) #equals bias: prediction = bias + sum of per-variable contributions
#------------------------------------------------------------------------------
#Permutation importance with the eli5 package
#Import the packages
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor #random forest
import eli5
from eli5.sklearn import PermutationImportance
#Load the data
ds=load_boston()
#Define the regressor
rf=RandomForestRegressor(random_state=123)
#Fit the model
rf.fit(pd.DataFrame(ds.data,columns=ds.feature_names),ds.target)
rf.feature_importances_
#Compute permutation importances
perm=PermutationImportance(rf).fit(pd.DataFrame(ds.data,columns=ds.feature_names),ds.target)
df_perm=pd.DataFrame(data=np.hstack([ds.feature_names.reshape(-1,1),
perm.feature_importances_.reshape(-1,1).round(4),
perm.feature_importances_std_.reshape(-1,1).round(4)]),
columns=['Feature','mean','std'])
df_perm.sort_values(by=['mean'],ascending=False,inplace=True)
#Render the permutation-importance table; this call only displays inside a notebook
eli5.show_weights(perm,feature_names=ds.feature_names.tolist())
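#Outside a notebook, a plain-text rendering can be printed instead (this assumes eli5's
#top-level explain_weights/format_as_text helpers, available in recent eli5 versions):
print(eli5.format_as_text(eli5.explain_weights(perm, feature_names=ds.feature_names.tolist())))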
#------------------------------------------------------------------------------
#LIME
#Import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor #random forest
from sklearn.model_selection import train_test_split
import lime
import lime.lime_tabular
#Load the data
ds=load_boston()
#Define the regressor
rf=RandomForestRegressor(random_state=123)
#Fit the model
rf.fit(ds.data,ds.target)
#Treat variables with 10 or fewer distinct values as categorical
categorical_features=np.argwhere(np.array([len(set(ds.data[:,i])) for i in range(ds.data.shape[1])])<=10).flatten()
#Create the explainer (categorical_features is left as None here; pass the array above to use it)
explainer=lime.lime_tabular.LimeTabularExplainer(
ds.data,
feature_names=ds.feature_names,
class_names=['house_price'],
categorical_features=None,
verbose=True,
mode='regression'
)
#Pick one sample
spl=ds.data[0]
#Generate the explanation
exp=explainer.explain_instance(
spl,
rf.predict,
num_features=5
)
#Output the per-variable contributions
exp.as_list()
#Visualize; this call only displays inside a Jupyter notebook
exp.show_in_notebook(show_table=True)
#------------------------------------------------------------------------------
# SHAP
#Import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
import shap
#Initialize the plotting environment
shap.initjs()
#Load the data
ds=load_boston()
#Take one sample for the single-record SHAP example below
spl=ds.data[0].reshape(1,-1)
#Define the base regressor
rf=RandomForestRegressor(random_state=123)
#Fit the model
rf.fit(ds.data,ds.target)
#Define the SHAP tree explainer
explainer=shap.TreeExplainer(rf,data=ds.data)
#The mean prediction over the background data serves as the base value
explainer.expected_value #22.28338
#------------------------------------
#SHAP values of each variable for the single sample
splshapvalues=explainer.shap_values(spl).round(4)
df_splshapvalues=pd.DataFrame(data=np.hstack([ds.feature_names.reshape(-1,1),
splshapvalues.reshape(-1,1),
abs(splshapvalues).reshape(-1,1)]),
columns=['Feature','shap','shapabs'])
df_splshapvalues.sort_values(by=['shapabs'],ascending=False,inplace=True) #sort by absolute SHAP value, descending
df_splshapvalues.drop(['shapabs'],axis=1,inplace=True) #df_splshapvalues stores the values in long form
df_splshapvaluescol=pd.DataFrame(data=splshapvalues,columns=ds.feature_names) #df_splshapvaluescol stores them in wide form
df_splshapvalues #show the per-variable SHAP values for the sample
#Verify the accounting for this sample
ds.target.mean() #mean of the true target over all samples: 22.533
rf.predict(ds.data).mean() #mean rf prediction over all samples: 22.535
explainer.expected_value #SHAP base value, i.e. the mean rf prediction over the background data: 22.28338
rf.predict(spl) #prediction for the sample: 25.421
rf.predict(spl)-splshapvalues.sum() #22.2835, approximately equal to explainer.expected_value
#Force plot of the sample's SHAP values; this call only displays inside a notebook
shap.force_plot(explainer.expected_value,
splshapvalues,
features=spl,
feature_names=ds.feature_names)
#------------------------------------
#SHAP values of each variable over the whole sample set
shapvalues=explainer.shap_values(ds.data)
#Force plot over the sample set; this call only displays inside a Jupyter notebook
shap.force_plot(explainer.expected_value,
shapvalues,
features=ds.data,
feature_names=ds.feature_names)
#Decision-path plot; this call only displays inside a Jupyter notebook
shap.decision_plot(explainer.expected_value,
shapvalues[:12],
ds.feature_names)
#Feature-dependence plot; this call only displays inside a Jupyter notebook
shap.dependence_plot(ds.feature_names.tolist().index('LSTAT'),
shapvalues,
ds.data)
#Global feature importance; this call only displays inside a Jupyter notebook
shap.summary_plot(shapvalues,
ds.data,
feature_names=ds.feature_names,
max_display=5)
#Bar chart of the mean absolute SHAP value per variable
shap.summary_plot(shapvalues,
feature_names=ds.feature_names,
plot_type='bar',
max_display=5)
From Scorecard Models to High-Dimensional Machine Learning Models_20211018
#------------------------------------------------------------------------------
"""
功能说明:
本代码是第15章从评分卡模型到高维机器学习模型配套代码。
算法流程:
- 使用XGBoost建立预测模型
- 使用LightGBM建立预测模型
输入数据:
使用代码自带数据,无需额外的外部数据
输出数据:
各代码段输出相应结果变量
版本历史:
20211018:定稿提交出版
"""
#------------------------------------------------------------------------------
# Build a predictive model with XGBoost
#Import the packages
import pandas as pd
from sklearn.datasets import load_breast_cancer #breast cancer data; binary target
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
import xgboost
from xgboost import XGBClassifier
#Prepare the data
ds_cancer = load_breast_cancer()
data = pd.DataFrame(data=ds_cancer.data,columns=ds_cancer.feature_names)
target = pd.DataFrame(data=ds_cancer.target,columns=['target'])
#Data partitioning
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.3)
#Define the XGBoost model
xgb = XGBClassifier(n_estimators=3, max_depth=2)
#Show the model parameters
xgb.get_params()
#Fit the model
xgb.fit(X_train, y_train)
#Attributes and methods of the fitted model
xgb.score(X_train, y_train)
xgb.feature_importances_
#Model prediction
y_train_predict=xgb.predict(X_train)
y_train_predict_proba=xgb.predict_proba(X_train)
#Model evaluation
accuracy_score(y_train,y_train_predict) #accuracy compares true labels vs predicted labels
roc_auc_score(y_train, y_train_predict_proba[:,1]) #AUC compares true labels vs predicted probabilities
plot_roc_curve(xgb,X_train,y_train) #plot the ROC curve
plot_precision_recall_curve(xgb,X_train,y_train) #plot the PR curve
#Plot variable importance
xgboost.plot_importance(xgb)
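#A sketch of monitoring a validation set during fitting (parameter values are illustrative):
xgb_es = XGBClassifier(n_estimators=100, max_depth=2)
xgb_es.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
print('test AUC:', roc_auc_score(y_test, xgb_es.predict_proba(X_test)[:,1]))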
#------------------------------------------------------------------------------
# Build a predictive model with LightGBM
#Import the packages
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer #breast cancer data; binary target
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
import lightgbm as lgb
from lightgbm import LGBMClassifier
#Prepare the data
ds_cancer = load_breast_cancer()
data = pd.DataFrame(data=ds_cancer.data,columns=ds_cancer.feature_names)
target = pd.DataFrame(data=ds_cancer.target,columns=['target'])
#Data partitioning
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.3)
#Define the classifier
lgbm = LGBMClassifier(boosting_type="gbdt", class_weight=None, colsample_bytree=0.7,
is_unbalance=True, learning_rate=0.01, max_bin=15,
max_depth=1, min_child_samples=100, min_child_weight=1,
min_split_gain=0.04, n_estimators=100, num_leaves=32,
objective="binary", random_state=27, subsample=0.8, subsample_freq=1)
#Show the model parameters
lgbm.get_params()
#Fit the model
lgbm.fit(X_train,y_train)
#Attributes of the fitted model
lgbm.classes_
lgbm.feature_importances_
lgbm.n_classes_
lgbm.n_features_
lgbm.objective_
#Model prediction
y_train_predict=lgbm.predict(X_train)
y_train_predict_proba=lgbm.predict_proba(X_train)
#Model evaluation
fpr,tpr,pct = roc_curve(y_train, y_train_predict_proba[:,1]) #FPR and TPR sequences along the ROC curve
ks=abs(fpr-tpr).max() #KS statistic
plt.plot(tpr,"b-",fpr,"r-") #KS curve
accuracy_score(y_train,y_train_predict) #accuracy compares true labels vs predicted labels
roc_auc_score(y_train, y_train_predict_proba[:,1]) #AUC compares true labels vs predicted probabilities
plot_precision_recall_curve(lgbm,X_train,y_train) #plot the PR curve
plot_roc_curve(lgbm,X_train,y_train) #plot the ROC curve
#LightGBM's own plotting helpers
lgb.create_tree_digraph(lgbm,tree_index=1)
lgb.plot_importance(lgbm)
lgb.plot_tree(lgbm,tree_index=1,figsize=(12,9))