# 特征工程提取有效的风险特征_20211018 (Chapter 5: feature engineering - extracting effective risk features)
#------------------------------------------------------------------------------
"""
Purpose:
    Companion code for Chapter 5, "Feature engineering: extracting effective
    risk features".
Workflow:
    1. Feature crosses / polynomial features
    2. Non-negative matrix factorization (NMF)
    3. The featuretools package
    4. The tsfresh package
Input data:
    Uses the datasets bundled with the libraries; no external data is needed.
Output data:
    Each section produces its own result variables.
Version history:
    20211018: final version submitted for publication
"""

#------------------------------------------------------------------------------
# Feature crosses: polynomial features
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.arange(9).reshape(3, 3)

poly = PolynomialFeatures(2)  # second-degree polynomial
poly.fit_transform(X)

# Third-degree polynomial that keeps only the interaction (cross) terms.
poly = PolynomialFeatures(degree=3, interaction_only=True)
poly.fit_transform(X)

#------------------------------------------------------------------------------
# Non-negative matrix factorization
import numpy as np
from sklearn.decomposition import NMF
from sklearn.datasets import load_iris

# return_X_y must be passed by keyword: recent scikit-learn releases reject
# the positional form load_iris(True).
X, _ = load_iris(return_X_y=True)

# Define the model.
# NOTE: the regularisation strength (formerly ``alpha=0.``) is left at its
# default of 0; the ``alpha`` keyword itself was removed from later
# scikit-learn releases, so it is no longer passed explicitly.
nmf = NMF(
    n_components=2,         # the k of the factorisation; if unset, all features are kept
    init=None,              # initialisation of W and H: 'random', 'nndsvd', 'nndsvda',
                            # 'nndsvdar', 'custom'; None lets scikit-learn choose
    solver='cd',            # one of 'cd', 'mu'
    beta_loss='frobenius',  # {'frobenius', 'kullback-leibler', 'itakura-saito'}; usually left as default
    tol=1e-4,               # tolerance of the stopping condition
    max_iter=1000,          # maximum number of iterations
    random_state=None,
    l1_ratio=0.,            # regularisation mixing parameter
    verbose=0,              # verbosity
    shuffle=False,          # only relevant for the 'cd' solver
)

# Model parameters: constructor arguments (also reachable as nmf.<attr>).
print('params:', nmf.get_params())

# Fit the model. fit_transform() already fits, so a separate nmf.fit(X) call
# would only repeat the same work.
W = nmf.fit_transform(X)
nmf.inverse_transform(W)
H = nmf.components_  # the H matrix
X_ = np.dot(W, H)    # reconstruction of X from the two factors
print('reconstruction_err_', nmf.reconstruction_err_)  # value of the loss function
print('n_iter_', nmf.n_iter_)                          # number of iterations run

#------------------------------------------------------------------------------
# Feature derivation with the featuretools package
# Import the package.
import featuretools as ft

# Inspect the bundled demo data.
es = ft.demo.load_mock_customer(return_entityset=True)
es.plot()

# Load the data.
data = ft.demo.load_mock_customer()
customers_df = data["customers"]
sessions_df = data["sessions"]
transactions_df = data["transactions"]

# Declare the dataframes (with index and, where present, time index columns)
# and the relationships between them.
dataframes = {
    "customers": (customers_df, "customer_id"),
    "sessions": (sessions_df, "session_id", "session_start"),
    "transactions": (transactions_df, "transaction_id", "transaction_time"),
}
relationships = [
    ("sessions", "session_id", "transactions", "session_id"),
    ("customers", "customer_id", "sessions", "customer_id"),
]

# Run Deep Feature Synthesis to derive features for the customers table.
feature_matrix_customers, features_defs = ft.dfs(
    dataframes=dataframes,
    relationships=relationships,
    target_dataframe_name="customers",
)

# List the derived variables.
feature_matrix_customers_columnslst = list(feature_matrix_customers.columns)

#------------------------------------------------------------------------------
# Feature derivation with the tsfresh package
# Import the package.
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures

# Download and read the data.
download_robot_execution_failures()              # download the data
timeseries, y = load_robot_execution_failures()  # load the data
timeseries.head()
y.head()

# Show the first rows.
print(timeseries.head())
print(y.head())

# Plot two of the time series.
import matplotlib.pyplot as plt

timeseries[timeseries['id'] == 3].plot(subplots=True, sharex=True, figsize=(10, 10))
y[3]  # True: normal execution
plt.show()

timeseries[timeseries['id'] == 21].plot(subplots=True, sharex=True, figsize=(10, 10))
y[21]  # False: failure
plt.show()

# Feature extraction.
from tsfresh import extract_features

extracted_features = extract_features(timeseries, column_id="id", column_sort="time")

# Feature selection, based on the extraction result above. NaN values are not
# allowed, so impute() is applied first; it modifies the frame in place
# (see the tsfresh documentation for the exact replacement rules).
from tsfresh.utilities.dataframe_functions import impute

impute(extracted_features)

from tsfresh import select_features

features_filtered = select_features(extracted_features, y)

# Feature extraction and feature selection in a single step.
from tsfresh import extract_relevant_features

features_filtered_direct = extract_relevant_features(timeseries, y, column_id='id', column_sort='time')
# 评分卡模型开发_20211018 (Chapter 7: scorecard model development)
#------------------------------------------------------------------------------
"""
Purpose:
    Companion code for Chapter 7, "Scorecard model development".
Workflow:
    1. Logistic regression with scikit-learn's LogisticRegression class
    2. Logistic regression with the Logit class of statsmodels
    3. Scorecard modelling with the scorecardpy package
    4. Scorecard modelling with the toad package
Input data:
    Uses bundled data unless stated otherwise in a section.
Output data:
    Each section produces its own result variables.
Version history:
    20211018: final version submitted for publication
"""

#------------------------------------------------------------------------------
# Logistic regression with scikit-learn's LogisticRegression class
# Import the packages and modules.
import pandas as pd
from sklearn.datasets import load_breast_cancer  # breast-cancer data, binary target
from sklearn.linear_model import LogisticRegression  # classifier
from sklearn.model_selection import train_test_split  # train/test split

# Prepare the data (binary target).
ds_cancer = load_breast_cancer()
data = pd.DataFrame(ds_cancer.data).add_prefix('X')
target = pd.DataFrame(ds_cancer.target, columns=['y'])
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)

# Define the classifier (model with an intercept term).
clf = LogisticRegression(fit_intercept=True, random_state=123)
clf.get_params()

# Fit the model. The target is passed as a 1-D Series; handing over the
# one-column DataFrame makes scikit-learn emit a DataConversionWarning.
clf.fit(X_train, y_train['y'])

# Fitted coefficients.
clf.coef_
clf.intercept_

# NOTE: the code above only demonstrates how to use LogisticRegression. After
# fitting, the result still has to be evaluated and validated, and variable
# selection has to be iterated when necessary.

#------------------------------------------------------------------------------
# Logistic regression with the Logit class of statsmodels
# Import the package.
import statsmodels.api as sm  # regression models

# Variable screening by correlation. Per the author, the matrix shows that
# X0 is highly correlated with X1/X2 and X20 with X22/X23.
corr_matrix = X_train.corr()  # computed once and reused for the mask
X_train_corr = corr_matrix[corr_matrix > 0.9]
X_train1 = X_train.drop(['X0', 'X2', 'X3', 'X10', 'X12', 'X13', 'X20', 'X22', 'X23'], axis=1)

# Add the constant term.
X_train1 = sm.add_constant(X_train1)

# Fit the model.
model = sm.Logit(y_train, X_train1)
results = model.fit()

# Model results.
print(results.summary())
print(results.params)

# NOTE: the code above only demonstrates how to use Logit. Further statistical
# tests are needed afterwards; per the author, several variables have
# p-values > 0.05 and are not significant, so they should be removed
# iteratively.

#------------------------------------------------------------------------------
# Scorecard modelling with the scorecardpy package
"""
Purpose:
    Build a scorecard with scorecardpy.
Workflow:
    Read data, variable filtering, data partition, binning, bin adjustment,
    WOE transformation, model training, model evaluation, model validation,
    score scaling.
Input data:
    No extra input is required; sc.germancredit ships the modelling data.
Output data:
    The scorecard model results.
Version history:
"""
# Import the packages.
# NOTE(review): plot_roc_curve / plot_precision_recall_curve were removed from
# recent scikit-learn releases; this section targets the release the book was
# written against.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
import scorecardpy as sc

# 1. Read the data.
data = sc.germancredit()
# Data information.
data.info()
data.describe()

# 2. Variable filtering.
data_s = sc.var_filter(data,
                       y="creditability",
                       iv_limit=0.02,
                       missing_limit=0.95,
                       identical_limit=0.95,
                       var_rm=None,
                       var_kp=None,
                       return_rm_reason=False,
                       positive='bad|1')

# 3. Data partition.
train, test = sc.split_df(data_s, 'creditability', ratio=0.7, seed=123).values()

# 4. Binning.
# Automatic binning.
bins = sc.woebin(train, y="creditability")
# Report of the fine binning result.
sc.woebin_plot(bins)

# 5. Bin adjustment.
# Interactive alternative:
#   breaks_adj = sc.woebin_adj(train, "creditability", bins)
# Here the breaks are set by hand instead.
breaks_adj = {'age.in.years': [22, 35, 40, 60],
              'other.debtors.or.guarantors': ["none", "co-applicant%,%guarantor"]}
bins_adj = sc.woebin(train, y="creditability", breaks_list=breaks_adj)

# 6. WOE transformation.
train_woe = sc.woebin_ply(train, bins_adj)
test_woe = sc.woebin_ply(test, bins_adj)

# 7. Model training.
# Prepare the data.
X_train = train_woe.loc[:, train_woe.columns != 'creditability']
y_train = train_woe.loc[:, 'creditability']
# Select the test features by the training column names, so the test matrix
# is guaranteed to have the same columns in the same order as the training
# matrix (the original reused a boolean mask built from train_woe.columns).
X_test = test_woe.loc[:, X_train.columns]
y_test = test_woe.loc[:, 'creditability']

# Define the classifier.
lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
lr.get_params()
# Fit the model.
lr.fit(X_train, y_train)
# Fitted parameters.
lr.coef_
lr.intercept_

# 8. Model evaluation.
# Predicted probability on the training set.
y_train_pred = lr.predict_proba(X_train)[:, 1]
# KS, ROC, PR and lift curves.
train_perf = sc.perf_eva(y_train, y_train_pred, plot_type=["ks", "roc", "pr", "lift"], title="train")
plot_roc_curve(lr, X_train, y_train)
plot_precision_recall_curve(lr, X_train, y_train)

# 9. Model validation.
# Predicted probability on the test set.
y_test_pred = lr.predict_proba(X_test)[:, 1]
# KS, ROC, PR and lift curves.
test_perf = sc.perf_eva(y_test, y_test_pred, plot_type=["ks", "roc", "pr", "lift"], title="test")
plot_roc_curve(lr, X_test, y_test)
plot_precision_recall_curve(lr, X_test, y_test)

# 10. Score scaling.
card = sc.scorecard(bins_adj,
                    lr,
                    X_train.columns,
                    points0=600,
                    odds0=1/19,
                    pdo=50,
                    basepoints_eq0=True)
# Score both samples with the scorecard.
train_score = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)
# Compare the train/test score distributions by computing the score PSI.
sc.perf_psi(
    score={'train': train_score, 'test': test_score},
    label={'train': y_train, 'test': y_test}
)

#------------------------------------------------------------------------------
# Scorecard modelling with the toad package
"""
Purpose:
    Build a scorecard with toad.
Workflow:
    Read data, sample partition, EDA report, feature analysis, feature
    pre-screening, binning, bin adjustment/merging, feature selection, model
    training, model evaluation, model validation, score scaling.
Input data:
    Download from https://www.kaggle.com/c/GiveMeSomeCredit/data
Output data:
    The scoring model results.
Version history:
"""
# Import the packages.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import toad
from toad.plot import badrate_plot, proportion_plot, bin_plot
from toad.metrics import KS, F1, AUC

# 1. Read the data.
data = pd.read_csv(r'D:\cs-training.csv')
# Describe the data.
data.info()
data.describe()
data.head()

# 2. Sample partition.
Xtr, Xts, Ytr, Yts = train_test_split(data.drop('SeriousDlqin2yrs', axis=1),
                                      data['SeriousDlqin2yrs'],
                                      test_size=0.25,
                                      random_state=450)
data_tr = pd.concat([Xtr, Ytr], axis=1)
data_tr['type'] = 'train'
data_ts = pd.concat([Xts, Yts], axis=1)
data_ts['type'] = 'test'

# 3. EDA report.
toad.detector.detect(data_tr).to_excel(r'D:\数据EDA结果.xlsx')

# 4. Feature analysis: IV, gini, entropy and unique count per feature.
quality = toad.quality(data, 'SeriousDlqin2yrs')
quality.head(6)

# 5. Feature pre-screening.
selected_train, drop_lst = toad.selection.select(data_tr,
                                                 target='SeriousDlqin2yrs',
                                                 empty=0.5,
                                                 iv=0.05,
                                                 corr=0.7,
                                                 return_drop=True,
                                                 exclude='type')
selected_test = data_ts[selected_train.columns]
selected_train.shape
drop_lst  # the variables that were removed

# 6. Binning; must be derived from the training set only.
# The seven features that are binned, printed and plotted below.
bin_features = [
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
]

# Initialise a combiner.
combiner = toad.transform.Combiner()
# Fit on the training data with the chosen binning method.
combiner.fit(selected_train,
             y='SeriousDlqin2yrs',
             method='chi',
             min_samples=0.05,
             exclude='type')
# Export the binning result as a dict.
bins = combiner.export()
# Show the cut points of every feature.
for feature in bin_features:
    print(f'{feature}分箱cut:', bins[feature])

# Apply the binning with combiner.transform.
selected_train_bin = combiner.transform(selected_train)

# Binning plots: proportion per bin, bad rate per bin, and bin_plot, which
# draws both on a dual-axis chart.
for feature in bin_features:
    proportion_plot(selected_train_bin[feature])
for feature in bin_features:
    badrate_plot(selected_train_bin, target='SeriousDlqin2yrs', x='type', by=feature)
for feature in bin_features:
    bin_plot(selected_train_bin, x=feature, target='SeriousDlqin2yrs')

# 7. Adjust / merge bins.
# Work on a copy so that overriding cut points does not silently mutate the
# automatically derived `bins` as well.
bins_adj = dict(bins)
bins_adj["age"] = [22, 35, 45, 60]
bins_adj["NumberOfOpenCreditLinesAndLoans"] = [2]
bins_adj["DebtRatio"] = [0.02, 0.4, 0.5, 2]

# A second combiner that carries the adjusted rules.
combiner2 = toad.transform.Combiner()
combiner2.set_rules(bins_adj)
# Apply the adjusted binning.
selected_train_binadj = combiner2.transform(selected_train)

# Proportion and bad-rate plots for the adjusted bins.
for feature in bin_features:
    proportion_plot(selected_train_binadj[feature])
for feature in bin_features:
    badrate_plot(selected_train_binadj, target='SeriousDlqin2yrs', x='type', by=feature)

# 8. WOE transformation.
# Install the adjusted rules on the original combiner.
combiner.set_rules(bins_adj)
# Convert feature values to bin numbers.
selected_train_binadj = combiner.transform(selected_train)
selected_test_binadj = combiner.transform(selected_test)
# Define the WOE transformer.
WOETransformer = toad.transform.WOETransformer()
# Map bins to WOE values: fit_transform on the training set, transform on the
# test set.
data_tr_woe = WOETransformer.fit_transform(selected_train_binadj,
                                           selected_train_binadj['SeriousDlqin2yrs'],
                                           exclude=['SeriousDlqin2yrs', 'type'])
data_ts_woe = WOETransformer.transform(selected_test_binadj)

# 9. Feature selection with stepwise regression.
train_final = toad.selection.stepwise(data_tr_woe.drop('type', axis=1),
                                      target='SeriousDlqin2yrs',
                                      direction='both',
                                      criterion='aic')
test_final = data_ts_woe[train_final.columns]
print(train_final.shape)  # in the author's run the 7 features shrink to 5

# 10. Model training.
# Prepare the data.
Xtr = train_final.drop('SeriousDlqin2yrs', axis=1)
Ytr = train_final['SeriousDlqin2yrs']
# Fit the logistic regression.
lr = LogisticRegression()
lr.fit(Xtr, Ytr)
# Fitted parameters.
lr.coef_
lr.intercept_

# 11. Model evaluation.
# Performance on the training set.
EYtr_proba = lr.predict_proba(Xtr)[:, 1]
EYtr = lr.predict(Xtr)
print('train F1:', F1(EYtr_proba, Ytr))
print('train KS:', KS(EYtr_proba, Ytr))
print('train AUC:', AUC(EYtr_proba, Ytr))
# Rank ordering of the score, using equal-frequency buckets.
tr_bucket = toad.metrics.KS_bucket(EYtr_proba, Ytr, bucket=10, method='quantile')
tr_bucket

# 12. Model validation.
# Performance on the test set.
Xts = test_final.drop('SeriousDlqin2yrs', axis=1)
Yts = test_final['SeriousDlqin2yrs']
EYts_proba = lr.predict_proba(Xts)[:, 1]
EYts = lr.predict(Xts)
print('test F1:', F1(EYts_proba, Yts))
print('test KS:', KS(EYts_proba, Yts))
print('test AUC:', AUC(EYts_proba, Yts))

# Compare the train/test variable distributions (PSI) on the transformed data.
psi = toad.metrics.PSI(train_final, test_final)
# Only `ascending` is passed: a positional axis argument is rejected by
# recent pandas releases.
psi.sort_values(ascending=False)

# 13. Score scaling.
scorecard = toad.scorecard.ScoreCard(combiner=combiner, transer=WOETransformer, C=0.1)
scorecard.fit(Xtr, Ytr)
scorecard.export(to_frame=True,)
# 评分卡模型部署_20211018 (Chapter 9: scorecard model deployment)
#------------------------------------------------------------------------------
"""
Purpose:
    Companion code for Chapter 9, "Scorecard model deployment".
Workflow:
    - Train a model and persist it as a PKL file
    - Load the PKL file locally
    - Train a model and persist it as a PMML file
    - Load the PMML file locally
    - Deploy the PMML model on a server and call the scoring service from a
      client
Input data:
    Uses the datasets bundled with the libraries; no external data is needed.
Output data:
    Each section produces its own result variables.
Version history:
    20211018: final version submitted for publication
"""

#------------------------------------------------------------------------------
# Train a model and persist it as a PKL file
# Import the packages.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn2pmml import PMMLPipeline

# Read the data.
iris = load_iris()
X_train = pd.DataFrame(iris.data, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
y_train = pd.DataFrame(iris.target, columns=['series'])

# Train the model pipeline.
clf = tree.DecisionTreeClassifier(max_depth=2)  # define the classifier
pipeline = PMMLPipeline([("classifier", clf)])  # define the pipeline
pipeline.fit(X_train, y_train)  # trained on a DataFrame that carries column names

# Option 1: save the model as pkl with the pickle package.
import pickle

with open("D:\\mdl.pkl", "wb") as f:
    pickle.dump(pipeline, f)

# Option 2: export the model as pkl with the joblib package.
# `from sklearn.externals import joblib` is no longer supported by recent
# scikit-learn releases, so joblib is imported directly.
import joblib

joblib.dump(pipeline, "d:\\mdl.pkl", compress=9)

#------------------------------------------------------------------------------
# Load and use the PKL model file locally
# NOTE: unpickling executes arbitrary code; only load files you created
# yourself or otherwise trust.
# Read the pickle with the pickle package.
with open('D:\\mdl.pkl', 'rb') as f:
    mdl_in = pickle.load(f)
# Score the named-column DataFrame the model was trained on (same values as
# iris.data, but consistent with the feature names seen during fit).
y_pred = mdl_in.predict(X_train)

# Read the pickle with the joblib package.
mdl_in = joblib.load("d:\\mdl.pkl")
y_pred = mdl_in.predict(X_train)

#------------------------------------------------------------------------------
# Persist the model as a PMML file
# Option 1: export the PMML file with the sklearn2pmml package.
# Import the packages.
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn2pmml import PMMLPipeline

# Read the data.
iris = load_iris()
X_train = pd.DataFrame(iris.data, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
y_train = pd.DataFrame(iris.target, columns=['series'])

# Train the model pipeline.
clf = tree.DecisionTreeClassifier(max_depth=2)  # define the classifier
pipeline = PMMLPipeline([("classifier", clf)])  # define the pipeline
pipeline.fit(X_train, y_train)  # trained on a DataFrame that carries column names

# Export the model as PMML (variable names are included).
from sklearn2pmml import sklearn2pmml

sklearn2pmml(pipeline, "d:\\DecisionTree_Iris_sklearn2pmml.pmml", with_repr=True)

# Option 2: export the PMML file with the nyoka package.
# Import the packages.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn2pmml import PMMLPipeline

# Read the data.
iris = load_iris()
features = iris.feature_names
target = 'Species'

# Create the pipeline and train the model; here the model is trained on a
# plain array without variable names.
clf_pipeline = PMMLPipeline([('clf', DecisionTreeClassifier(max_depth=2))])
clf_pipeline.fit(iris.data, iris.target)

# Export the model as PMML with nyoka (variable names are supplied here).
from nyoka import skl_to_pmml

skl_to_pmml(clf_pipeline, features, target, "d:\\DecisionTree_iris_nyoka.pmml")

#------------------------------------------------------------------------------
# Load and use the PMML model file locally
# Load the PMML.
from pypmml import Model

model = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")

# Score the whole data set with the PMML model.
# NOTE: whether the scored DataFrame carries variable names must match how
# the PMML model was trained.
y_train_pred = model.predict(X_train)

# Score a single record with the PMML model (four accepted input formats).
model.predict({'sepal_length': 5.1, 'sepal_width': 3.5, 'petal_length': 1.4, 'petal_width': 0.2})
model.predict('[{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}]')
model.predict('{"columns": ["sepal_length", "sepal_width", "petal_length", "petal_width"], "data": [[5.1, 3.5, 1.4, 0.2]]}')
model.predict(pd.Series({'sepal_length': 5.1, 'sepal_width': 3.5, 'petal_length': 1.4, 'petal_width': 0.2}))

#------------------------------------------------------------------------------
# Deploy the model on a server with FastAPI
# (1) Save the code below on the server as main.py, change to the directory
#     that contains main.py, and run:
#         uvicorn main:app --reload
# Import the packages and modules.
from fastapi import FastAPI
from pypmml import Model

# Define the FastAPI object.
app = FastAPI()

# Load the PMML model once at start-up instead of re-reading the file on
# every request.
mdl = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")


@app.get("/items/{item_id}")
async def read_item(item_id: int, x: str = ''):
    """Score the JSON string ``x`` with the PMML model.

    Args:
        item_id: Identifier taken from the URL path; echoed back unchanged.
        x: Query-string payload passed straight to ``mdl.predict``.

    Returns:
        A dict with the echoed ``item_id`` and ``x`` plus ``y_predict``, the
        value returned by ``mdl.predict(x)``.
    """
    # Feed the received string to predict() to obtain the prediction.
    y_predict = mdl.predict(x)
    # Return the result to the client.
    return {"item_id": item_id, "x": x, "y_predict": y_predict}


# (2) Run the following on the client; when talking to a real server replace
#     127.0.0.1 with the server's IP address.
import requests  # the client snippet uses requests, so it is imported here

URL_str = 'http://127.0.0.1:8000/items/5?x=' + '[{"sepal_length":5.1,"sepal_width":3.5,"petal_length":1.4,"petal_width":0.2}]'
res = requests.get(URL_str)
returnjson = res.text
print(returnjson)

#------------------------------------------------------------------------------
# Deploy the model on a server with Flask
# (1) Save the code below on the server as main.py and run:
#         python main.py
# Import the packages and modules.
import numpy as np
import pandas as pd
from pypmml import Model
from flask import Flask
from flask import request
from flask import jsonify

# Load the model.
model = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")
app = Flask(__name__)


@app.route('/', methods=['POST', 'GET'])
def scoring():
    """Score one comma-separated record passed as ``?inputdata=...``.

    Returns:
        The first row of the model output as JSON, or a JSON error with
        HTTP status 400 when the ``inputdata`` query parameter is missing or
        empty (without this guard the view would return None, which Flask
        reports as a server error).
    """
    text = request.args.get('inputdata')
    if not text:
        return jsonify({'error': "missing 'inputdata' query parameter"}), 400
    temp = [float(x) for x in text.split(',')]
    temp = pd.DataFrame(data=np.array(temp).reshape((1, -1)),
                        columns=["sepal_length", "sepal_width", "petal_length", "petal_width"])
    outputdata = model.predict(temp)  # outputdata is a DataFrame
    return jsonify(dict(outputdata.iloc[0]))  # serialise as JSON


if __name__ == '__main__':
    app.config['JSON_AS_ASCII'] = False
    app.run(host='127.0.0.1', port=5003)  # 127.0.0.1 is the local IP

# (2) Run the following on the client; when talking to a real server replace
#     127.0.0.1 with the server's IP address.
import requests

base = 'http://127.0.0.1:5003/?inputdata=5.1,3.5,1.4,2'
response = requests.get(base)
print(response.text)
answer = response.json()
print('预测结果', answer)
# 评分卡模型可解释性_20211018 (Chapter 13: scorecard model interpretability)
#------------------------------------------------------------------------------
"""
Purpose:
    Companion code for Chapter 13, "Scorecard model interpretability".
Workflow:
    - PDP and ICE
    - Variable importance: plot_importance in XGBoost and LightGBM
    - treeinterpreter, an explanation tool for scikit-learn models
    - Permutation Importance, using the eli5 package
    - LIME
    - SHAP
Input data:
    Uses the datasets bundled with the libraries; no external data is needed.
Output data:
    Each section produces its own result variables.
Version history:
    20211018: final version submitted for publication
"""

#------------------------------------------------------------------------------
# Import the packages and modules
# NOTE(review): load_boston and plot_partial_dependence were removed from
# recent scikit-learn releases; this script targets the release the book was
# written against.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.datasets import fetch_california_housing
from sklearn.tree import DecisionTreeClassifier  # classification
from sklearn.tree import DecisionTreeRegressor  # regression
from sklearn.ensemble import RandomForestRegressor  # random forest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#------------------------------------------------------------------------------
# PDP and ICE
"""
Two packages are available for PDP:
-- sklearn.inspection
-- pdpbox
"""
# Read the data.
from sklearn.datasets import fetch_california_housing

cal_housing = fetch_california_housing()
X = pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names)
y = cal_housing.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Train the model.
from sklearn.ensemble import GradientBoostingRegressor

gbdt = GradientBoostingRegressor()
gbdt.fit(X_train, y_train)

# Option 1: PDP analysis with sklearn.inspection.
from sklearn.inspection import plot_partial_dependence

fig, ax = plt.subplots(figsize=(12, 4))
plot_partial_dependence(gbdt,
                        X_train,
                        ['MedInc', 'AveOccup', 'HouseAge'],
                        method="brute",
                        ax=ax)

# A two-feature plot can also be drawn to inspect the interaction between
# two variables.
fig, ax = plt.subplots(figsize=(9, 6))
plot_partial_dependence(gbdt,
                        X_train,
                        [('HouseAge', 'AveOccup')],
                        grid_resolution=50,
                        method="brute",
                        ax=ax)

# Option 2: the pdpbox package.
# The module is called `pdp`; the original imported a non-existent `pdb` and
# then mixed both names, which fails before any plot is drawn.
from pdpbox import pdp

pdp_MedInc = pdp.pdp_isolate(model=gbdt,
                             dataset=X_train,
                             model_features=X_train.columns.tolist(),
                             feature='MedInc',
                             num_grid_points=30)
pdp.pdp_plot(pdp_MedInc, 'MedInc', center=False)

# Per-instance ICE plot with pdpbox.
pdp.pdp_plot(pdp_MedInc,
             'MedInc',
             center=False,
             plot_lines=True,
             frac_to_plot=10,
             plot_pts_dist=True)

#------------------------------------------------------------------------------
# Variable importance: plot_importance in XGBoost and LightGBM
# Import the packages.
from sklearn.datasets import load_boston
import xgboost as xgb
import lightgbm as lgb

# Read the data.
ds = load_boston()
df = pd.DataFrame(data=ds.data)
df = df.add_prefix('X')
df = df.join(pd.DataFrame(ds.target, columns=['y']))

# Define the XGBoost regressor.
clf = xgb.XGBRegressor()
clf.get_params()
# Fit the model.
clf.fit(df.iloc[:, 0:13], df.iloc[:, -1])
# Evaluate the model.
clf.score(df.iloc[:, 0:13], df.iloc[:, -1])
# Plot the variable importance.
xgb.plot_importance(clf, importance_type='gain')

# Define the LightGBM model.
lgbdata = lgb.Dataset(df.iloc[:, 0:13], df.iloc[:, -1])
# Parameters written as a dict.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',    # boosting type
    'objective': 'regression',  # objective function
    'metric': {'l2', 'auc'},    # evaluation metrics
    'num_leaves': 31,           # number of leaves
    'learning_rate': 0.05,      # learning rate
    'feature_fraction': 0.9,    # fraction of features sampled per tree
    'bagging_fraction': 0.8,    # fraction of samples used per tree
    'bagging_freq': 5,          # k means bagging is performed every k iterations
    'verbose': 1                # <0 fatal only, =0 errors (warnings), >0 info
}
clf_lgb = lgb.train(params, lgbdata)
# Plot the model's feature importance.
clf_lgb.feature_importance()
plt.bar(height=clf_lgb.feature_importance(), x=df.iloc[:, 0:13].columns)

#------------------------------------------------------------------------------
# treeinterpreter, an explanation tool for scikit-learn models
# Import the treeinterpreter package.
from treeinterpreter import treeinterpreter as ti

# Load the data.
ds = load_boston()
# Define the regressor.
rf = RandomForestRegressor(random_state=123)
# Fit the model.
rf.fit(ds.data, ds.target)
# Take one sample.
spl = ds.data[0].reshape(1, -1)
# Score it with the model.
rf.predict(spl)
# Explain it with treeinterpreter; per the author, prediction is the
# predicted value and bias is the mean of Y over all samples.
prediction, bias, contributions = ti.predict(rf, spl)
# Contribution of every variable. The frame is built column by column so
# that 'contribution' stays numeric; stacking names and numbers with
# np.hstack turns everything into strings and makes the sort lexicographic.
df_contributions = pd.DataFrame({
    'Feature': ds.feature_names,
    'contribution': contributions.reshape(-1),
})
df_contributions.sort_values(by=['contribution'], ascending=False)

# Verify the calculation logic.
print(ds.target.mean())            # mean of the true target over all samples
print(rf.predict(ds.data).mean())  # mean of the rf predictions
print(rf.predict(spl))
print(prediction)
print(bias)
# prediction is the sum of bias and every variable's contribution, so this
# difference should reproduce bias.
print(prediction - np.sum(contributions))

#------------------------------------------------------------------------------
# Permutation Importance, using the eli5 package
# Import the packages.
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor  # random forest
import eli5
from eli5.sklearn import PermutationImportance

# Load the data.
ds = load_boston()
# Define the regressor.
rf = RandomForestRegressor(random_state=123)
# Fit the model.
rf.fit(pd.DataFrame(ds.data, columns=ds.feature_names), ds.target)
rf.feature_importances_
# Compute the permutation importance.
perm = PermutationImportance(rf).fit(pd.DataFrame(ds.data, columns=ds.feature_names), ds.target)
# Built column by column so 'mean' and 'std' stay numeric and the sort below
# orders by value rather than by string.
df_perm = pd.DataFrame({
    'Feature': ds.feature_names,
    'mean': perm.feature_importances_.round(4),
    'std': perm.feature_importances_std_.round(4),
})
df_perm.sort_values(by=['mean'], ascending=False, inplace=True)
# Show the permutation importance; only renders inside a notebook.
eli5.show_weights(perm, feature_names=ds.feature_names)

#------------------------------------------------------------------------------
# LIME
# Import the packages.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor  # random forest
from sklearn.model_selection import train_test_split
import lime
import lime.lime_tabular

# Load the data.
ds = load_boston()
# Define the regressor.
rf = RandomForestRegressor(random_state=123)
# Fit the model.
rf.fit(ds.data, ds.target)
# Columns with at most 10 distinct values are regarded as categorical.
# Each column i is inspected; the original always looked at column 1.
categorical_features = np.argwhere(
    np.array([len(set(ds.data[:, i])) for i in range(ds.data.shape[1])]) <= 10
).flatten()
# Create the explainer.
# NOTE(review): the explainer is still created with categorical_features=None
# as in the original; pass the array computed above if those columns should
# be treated as categorical.
explainer = lime.lime_tabular.LimeTabularExplainer(
    ds.data,
    feature_names=ds.feature_names,
    class_names=['house_price'],
    categorical_features=None,
    verbose=True,
    mode='regression'
)
# Pick one sample.
spl = ds.data[0]
# Generate the explanation.
exp = explainer.explain_instance(
    spl,
    rf.predict,
    num_features=5
)
# Contribution of each variable.
exp.as_list()
# Visualise; only renders inside a Jupyter notebook.
exp.show_in_notebook(show_table=True)

#------------------------------------------------------------------------------
# SHAP
# Import the packages.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
import shap

# Initialise the plotting environment.
shap.initjs()
# Load the data.
ds = load_boston()
# Take one sample as the single-sample SHAP example used below.
spl = ds.data[0].reshape(1, -1)
# Define the baseline regressor.
rf = RandomForestRegressor(random_state=123)
# Fit the model.
rf.fit(ds.data, ds.target)
# Define the SHAP tree explainer.
explainer = shap.TreeExplainer(rf, data=ds.data)
# Baseline value: per the author, the mean prediction over the training
# samples (22.28338 in the author's run).
explainer.expected_value

#------------------------------------
# SHAP values of every variable for the single sample.
splshapvalues = explainer.shap_values(spl).round(4)
# Built column by column so 'shap' and 'shapabs' stay numeric; with string
# columns the sort by absolute value would be lexicographic.
df_splshapvalues = pd.DataFrame({
    'Feature': ds.feature_names,
    'shap': splshapvalues.reshape(-1),
    'shapabs': np.abs(splshapvalues).reshape(-1),
})
df_splshapvalues.sort_values(by=['shapabs'], ascending=False, inplace=True)  # descending by |SHAP|
df_splshapvalues.drop(['shapabs'], axis=1, inplace=True)  # df_splshapvalues is the long layout
df_splshapvaluescol = pd.DataFrame(data=splshapvalues, columns=ds.feature_names)  # wide layout
df_splshapvalues  # SHAP value of every variable for the single sample

# Verify the calculation logic on the single sample (the figures in the
# comments are from the author's run).
ds.target.mean()                          # mean true value over all samples, 22.533
rf.predict(ds.data).mean()                # mean rf prediction over all samples, 22.535
explainer.expected_value                  # SHAP baseline value, 22.28338
rf.predict(spl)                           # prediction for the sample, 25.421
rf.predict(spl) - splshapvalues.sum()     # 22.2835, approximately explainer.expected_value
# Single-sample SHAP plot; only renders inside a notebook.
shap.force_plot(explainer.expected_value,
                splshapvalues,
                features=spl,
                feature_names=ds.feature_names)

#------------------------------------
# SHAP values of every variable over the whole sample set.
shapvalues = explainer.shap_values(ds.data)
# SHAP plot over the sample set; only renders inside a Jupyter notebook.
shap.force_plot(explainer.expected_value,
                shapvalues,
                features=ds.data,
                feature_names=ds.feature_names)
# Decision plot; only renders inside a Jupyter notebook.
shap.decision_plot(explainer.expected_value,
                   shapvalues[:12],
                   ds.feature_names)
# Dependence plot; only renders inside a Jupyter notebook.
shap.dependence_plot(ds.feature_names.tolist().index('LSTAT'),
                     shapvalues,
                     ds.data)
# Global feature importance; only renders inside a Jupyter notebook.
shap.summary_plot(shapvalues,
                  ds.data,
                  feature_names=ds.feature_names,
                  max_display=5)
# Mean absolute SHAP value of every variable as a bar chart.
shap.summary_plot(shapvalues,
                  feature_names=ds.feature_names,
                  plot_type='bar',
                  max_display=5)
# 从评分卡模型到高维机器学习模型_20211018 (Chapter 15: from scorecard models to high-dimensional machine-learning models)
#------------------------------------------------------------------------------
"""
Purpose:
    Companion code for Chapter 15, "From scorecard models to
    high-dimensional machine-learning models".
Workflow:
    - Build a predictive model with XGBoost
    - Build a predictive model with LightGBM
Input data:
    Uses the datasets bundled with the libraries; no external data is needed.
Output data:
    Each section produces its own result variables.
Version history:
    20211018: final version submitted for publication
"""

#------------------------------------------------------------------------------
# Build a predictive model with XGBoost
# Import the packages.
# NOTE(review): plot_roc_curve / plot_precision_recall_curve were removed from
# recent scikit-learn releases; this script targets the release the book was
# written against.
import pandas as pd
from sklearn.datasets import load_breast_cancer  # breast-cancer data, binary target
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
import xgboost
from xgboost import XGBClassifier

# Prepare the data.
ds_cancer = load_breast_cancer()
data = pd.DataFrame(data=ds_cancer.data, columns=ds_cancer.feature_names)
target = pd.DataFrame(data=ds_cancer.target, columns=['target'])

# Data partition.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)

# Define the XGBoost model.
xgb = XGBClassifier(n_estimators=3, max_depth=2)
# Show the model parameters.
xgb.get_params()
# Fit the model.
xgb.fit(X_train, y_train)
# Attributes and methods of the fitted model object.
xgb.score(X_train, y_train)
xgb.feature_importances_

# Model prediction.
y_train_predict = xgb.predict(X_train)
y_train_predict_proba = xgb.predict_proba(X_train)

# Model evaluation.
accuracy_score(y_train, y_train_predict)             # accuracy: true labels vs predicted labels
roc_auc_score(y_train, y_train_predict_proba[:, 1])  # AUC: true labels vs predicted probabilities
plot_roc_curve(xgb, X_train, y_train)                # ROC curve
plot_precision_recall_curve(xgb, X_train, y_train)   # PR curve

# Plot the variable importance.
xgboost.plot_importance(xgb)

#------------------------------------------------------------------------------
# Build a predictive model with LightGBM
# Import the packages.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer  # breast-cancer data, binary target
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Prepare the data.
ds_cancer = load_breast_cancer()
data = pd.DataFrame(data=ds_cancer.data, columns=ds_cancer.feature_names)
target = pd.DataFrame(data=ds_cancer.target, columns=['target'])

# Data partition.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)

# Define the classifier.
# `is_unbalance` is the LightGBM parameter name; the original spelling
# `isunbalance` is not a recognised parameter, so the class rebalancing the
# author asked for was never applied.
lgbm = LGBMClassifier(boosting_type="gbdt",
                      class_weight=None,
                      colsample_bytree=0.7,
                      is_unbalance=True,
                      learning_rate=0.01,
                      max_bin=15,
                      max_depth=1,
                      min_child_samples=100,
                      min_child_weight=1,
                      min_split_gain=0.04,
                      n_estimators=100,
                      num_leaves=32,
                      objective="binary",
                      random_state=27,
                      subsample=0.8,
                      subsample_freq=1)
# Show the parameters of the model object.
lgbm.get_params()
# Fit the model.
lgbm.fit(X_train, y_train)
# Attributes of the fitted model object.
lgbm.classes_
lgbm.feature_importances_
lgbm.n_classes_
lgbm.n_features_
lgbm.objective_

# Model prediction.
y_train_predict = lgbm.predict(X_train)
y_train_predict_proba = lgbm.predict_proba(X_train)

# Model evaluation.
fpr, tpr, pct = roc_curve(y_train, y_train_predict_proba[:, 1])  # FPR and TPR series of the ROC curve
ks = abs(fpr - tpr).max()                                        # KS statistic
plt.plot(tpr, "b-", fpr, "r-")                                   # KS curve
accuracy_score(y_train, y_train_predict)             # accuracy: true labels vs predicted labels
roc_auc_score(y_train, y_train_predict_proba[:, 1])  # AUC: true labels vs predicted probabilities
plot_precision_recall_curve(lgbm, X_train, y_train)  # PR curve
plot_roc_curve(lgbm, X_train, y_train)               # ROC curve

# Plots provided by LightGBM itself.
lgb.create_tree_digraph(lgbm, tree_index=1)
lgb.plot_importance(lgbm)
lgb.plot_tree(lgbm, tree_index=1, figsize=(12, 9))