Feature Engineering: Extracting Effective Risk Features_20211018

#------------------------------------------------------------------------------
"""
Description:
    Companion code for Chapter 5, "Feature Engineering: Extracting Effective Risk Features".
Workflow:
    1. Feature crosses via polynomial features
    2. Non-negative matrix factorization (NMF)
    3. The featuretools package
    4. The TSFresh package
Input data:
    Uses data bundled with the code; no external data is required.
Output data:
    Each code segment outputs its corresponding result variables.
Version history:
    20211018: final version submitted for publication
"""
#------------------------------------------------------------------------------
# Feature crosses via polynomial features
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
X = np.arange(9).reshape(3, 3)
poly = PolynomialFeatures(2)  # degree-2 polynomial
poly.fit_transform(X)
poly = PolynomialFeatures(degree=3, interaction_only=True)  # degree-3, interaction terms only
poly.fit_transform(X)
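# Sketch (not from the book): inspect which combined terms the transformer
# actually generates. get_feature_names_out() assumes scikit-learn >= 1.0;
# older versions expose get_feature_names() instead.
poly2 = PolynomialFeatures(2).fit(X)
print(poly2.get_feature_names_out())  # e.g. ['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1', ...]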
#------------------------------------------------------------------------------
# Non-negative matrix factorization
import numpy as np
from sklearn.decomposition import NMF
from sklearn.datasets import load_iris
X, _ = load_iris(return_X_y=True)
# Define the model
nmf = NMF(n_components=2,  # the k in the matrix factorization above; defaults to all features if unset
          init=None,  # initialization of W and H: 'random', 'nndsvd' (default), 'nndsvda', 'nndsvdar', 'custom'
          solver='cd',  # 'cd' or 'mu'
          beta_loss='frobenius',  # {'frobenius', 'kullback-leibler', 'itakura-saito'}; usually left at the default
          tol=1e-4,  # stopping tolerance
          max_iter=1000,  # maximum number of iterations
          random_state=None,
          alpha=0.,  # regularization strength (split into alpha_W/alpha_H in scikit-learn 1.2+)
          l1_ratio=0.,  # regularization mixing parameter
          verbose=0,  # verbosity
          shuffle=False  # only used by the 'cd' solver
          )
# Model parameters
print('params:', nmf.get_params())  # constructor parameter values; also available as nmf.<attr>
# Fit the model
nmf.fit(X)
W = nmf.fit_transform(X)
nmf.inverse_transform(W)
H = nmf.components_  # the H matrix
X_ = np.dot(W, H)
print('reconstruction_err_', nmf.reconstruction_err_)  # loss function value
print('n_iter_', nmf.n_iter_)  # number of iterations
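# Sketch (not from the book): with the 'frobenius' beta loss, reconstruction_err_
# is the Frobenius norm of the residual X - WH, which we can verify directly.
print(np.linalg.norm(X - W @ H))  # should be close to nmf.reconstruction_err_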
#------------------------------------------------------------------------------
# Feature derivation with the featuretools package
# Import the package
import featuretools as ft
# Inspect the bundled demo data
es = ft.demo.load_mock_customer(return_entityset=True)
es.plot()  # requires graphviz
# Load the data
data = ft.demo.load_mock_customer()
customers_df = data["customers"]
sessions_df = data["sessions"]
transactions_df = data["transactions"]
# Create the dataframes and the relationships between them
dataframes = {"customers": (customers_df, "customer_id"),
              "sessions": (sessions_df, "session_id", "session_start"),
              "transactions": (transactions_df, "transaction_id", "transaction_time")
              }
relationships = [("sessions", "session_id", "transactions", "session_id"),
                 ("customers", "customer_id", "sessions", "customer_id")
                 ]
# Run DFS to derive features
feature_matrix_customers, features_defs = ft.dfs(
    dataframes=dataframes,
    relationships=relationships,
    target_dataframe_name="customers")
# Inspect the derived variables
feature_matrix_customers_columnslst = list(feature_matrix_customers.columns)
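# Sketch (not from the book): a quick look at what DFS produced, e.g.
# aggregation features such as COUNT(sessions) or MEAN(transactions.amount).
print(feature_matrix_customers.shape)
print(features_defs[:10])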
#------------------------------------------------------------------------------
# Feature derivation with the tsfresh package
# Import the relevant modules
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
# Download and read the data
download_robot_execution_failures()  # download the data
timeseries, y = load_robot_execution_failures()  # load the data
# Show the first few rows
print(timeseries.head())
print(y.head())
# Plot the time series
import matplotlib.pyplot as plt
timeseries[timeseries['id'] == 3].plot(subplots=True, sharex=True, figsize=(10, 10))
y[3]  # True: normal
plt.show()
timeseries[timeseries['id'] == 21].plot(subplots=True, sharex=True, figsize=(10, 10))
y[21]  # False: failure
plt.show()
# Feature extraction
from tsfresh import extract_features
extracted_features = extract_features(timeseries, column_id="id", column_sort="time")
# Feature selection, based on the extraction above; NaN values are not allowed,
# so impute them first
from tsfresh.utilities.dataframe_functions import impute
impute(extracted_features)  # replaces NaN/inf in place (tsfresh uses column medians and extreme values)
from tsfresh import select_features
features_filtered = select_features(extracted_features, y)
# Feature extraction + feature selection in one step
from tsfresh import extract_relevant_features
features_filtered_direct = extract_relevant_features(timeseries, y, column_id='id', column_sort='time')
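# Sketch (not from the book): extract_features computes hundreds of features
# and can be slow; tsfresh ships reduced parameter sets to speed this up.
from tsfresh.feature_extraction import EfficientFCParameters
extracted_features_fast = extract_features(timeseries, column_id="id", column_sort="time",
                                           default_fc_parameters=EfficientFCParameters())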

Scorecard Model Development_20211018

#------------------------------------------------------------------------------
"""
Description:
    Companion code for Chapter 7, "Scorecard Model Development".
Workflow:
    1. Build a logistic regression model with scikit-learn's LogisticRegression class
    2. Build a logistic regression model with statsmodels' Logit class
    3. Build a scorecard with the scorecardpy package
    4. Build a scorecard with the toad package
Input data:
    Uses data bundled with the code; no external data is required.
Output data:
    Each code segment outputs its corresponding result variables.
Version history:
    20211018: final version submitted for publication
"""
#------------------------------------------------------------------------------
# Build a logistic regression model with scikit-learn's LogisticRegression class
# Import the relevant packages and modules
import pandas as pd
from sklearn.datasets import load_breast_cancer  # breast cancer data; binary target
from sklearn.linear_model import LogisticRegression  # classification
from sklearn.model_selection import train_test_split  # dataset partitioning
# Prepare the data; the target is binary
ds_cancer = load_breast_cancer()
data = pd.DataFrame(ds_cancer.data).add_prefix('X')
target = pd.DataFrame(ds_cancer.target, columns=['y'])
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)
# Define the classifier
clf = LogisticRegression(fit_intercept=True, random_state=123)  # model with an intercept term
clf.get_params()
# Fit the model
clf.fit(X_train, y_train)
# Retrieve the fitted coefficients
clf.coef_
clf.intercept_
'''
Note: the code above only demonstrates the use of the LogisticRegression class.
After fitting, the model still needs to be evaluated and validated, and variable
selection should be iterated where necessary.
'''
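# Sketch (not from the book): logistic regression coefficients live on the
# log-odds scale; exponentiating them gives odds ratios per unit change.
import numpy as np
odds_ratios = pd.Series(np.exp(clf.coef_[0]), index=X_train.columns)
print(odds_ratios.sort_values(ascending=False).head())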
#------------------------------------------------------------------------------
# Build a logistic regression model with statsmodels' Logit class
# Import the package
import statsmodels.api as sm  # regression-type models
# Variable screening via correlation
X_train_corr = X_train.corr()[X_train.corr() > 0.9]  # the correlation matrix shows X0 is highly correlated with X1/X2, and X20 with X22/X23
X_train1 = X_train.drop(['X0', 'X2', 'X3', 'X10', 'X12', 'X13', 'X20', 'X22', 'X23'], axis=1)
# Add a constant term
X_train1 = sm.add_constant(X_train1)
# Fit the model
model = sm.Logit(y_train, X_train1)
results = model.fit()
# Model results
print(results.summary())
print(results.params)
'''
Note: the code above only demonstrates the use of the Logit class. The fitted
model still needs further statistical testing; several variables have p-values
> 0.05 and are therefore not significant, so insignificant variables should be
removed iteratively.
'''
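# Sketch (assumption: a simple backward-elimination loop by p-value; the book
# only states that insignificant variables should be removed iteratively).
X_bw = X_train1.copy()
while True:
    res = sm.Logit(y_train, X_bw).fit(disp=0)
    pvals = res.pvalues.drop('const')
    if pvals.max() < 0.05:
        break
    X_bw = X_bw.drop(pvals.idxmax(), axis=1)  # drop the least significant variable
print(res.summary())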
#------------------------------------------------------------------------------
# Build a scorecard with the scorecardpy package
"""
Description:
    This program builds a scorecard with scorecardpy.
Workflow:
    Read the data, screen variables, partition the data, bin variables, adjust
    the bins, apply the WOE transformation, train the model, evaluate the model,
    validate the model, and build the scaled scorecard.
Input data:
    No external input is needed; sc.germancredit() provides the modeling data.
Output data:
    The scorecard model results.
Version history:
"""
# Import the relevant packages
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_precision_recall_curve  # removed in scikit-learn 1.2; use PrecisionRecallDisplay there
from sklearn.metrics import plot_roc_curve  # removed in scikit-learn 1.2; use RocCurveDisplay there
import scorecardpy as sc
# 1. Read the data
data = sc.germancredit()
# Data overview
data.info()
data.describe()
# 2. Variable screening
data_s = sc.var_filter(data,
                       y="creditability",
                       iv_limit=0.02,
                       missing_limit=0.95,
                       identical_limit=0.95,
                       var_rm=None,
                       var_kp=None,
                       return_rm_reason=False,
                       positive='bad|1')
# 3. Data partitioning
train, test = sc.split_df(data_s, 'creditability', ratio=0.7, seed=123).values()
# 4. Variable binning
# Automatic binning
bins = sc.woebin(train, y="creditability")
# Plots of the fine binning results
sc.woebin_plot(bins)
# 5. Bin adjustment
# Interactively enter the cut points
#breaks_adj = sc.woebin_adj(train, "creditability", bins)
# Or set them manually
breaks_adj = {'age.in.years': [22, 35, 40, 60],
              'other.debtors.or.guarantors': ["none", "co-applicant%,%guarantor"]}
bins_adj = sc.woebin(train, y="creditability", breaks_list=breaks_adj)
# 6. WOE transformation
train_woe = sc.woebin_ply(train, bins_adj)
test_woe = sc.woebin_ply(test, bins_adj)
# 7. Model training
# Prepare the data
X_train = train_woe.loc[:, train_woe.columns != 'creditability']
y_train = train_woe.loc[:, 'creditability']
X_test = test_woe.loc[:, train_woe.columns != 'creditability']
y_test = test_woe.loc[:, 'creditability']
# Define the classifier
lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
lr.get_params()
# Fit the model
lr.fit(X_train, y_train)
# Fitted parameters
lr.coef_
lr.intercept_
# 8. Model evaluation
# Predicted probabilities on the training set
y_train_pred = lr.predict_proba(X_train)[:, 1]
# Plot the KS, ROC, PR, and lift curves
train_perf = sc.perf_eva(y_train, y_train_pred, plot_type=["ks", "roc", "pr", "lift"], title="train")
plot_roc_curve(lr, X_train, y_train)
plot_precision_recall_curve(lr, X_train, y_train)
# 9. Model validation
# Predicted probabilities on the test set
y_test_pred = lr.predict_proba(X_test)[:, 1]
# Plot the KS, ROC, PR, and lift curves
test_perf = sc.perf_eva(y_test, y_test_pred, plot_type=["ks", "roc", "pr", "lift"], title="test")
plot_roc_curve(lr, X_test, y_test)
plot_precision_recall_curve(lr, X_test, y_test)
# 10. Score scaling
card = sc.scorecard(bins_adj,
                    lr,
                    X_train.columns,
                    points0=600,
                    odds0=1/19,
                    pdo=50,
                    basepoints_eq0=True)
# Score with the scaled scorecard
train_score = sc.scorecard_ply(train, card, print_step=0)
test_score = sc.scorecard_ply(test, card, print_step=0)
# Compare the train/test score distributions via the PSI of the score distribution
sc.perf_psi(
    score={'train': train_score, 'test': test_score},
    label={'train': y_train, 'test': y_test})
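# Sketch (not from the book): the standard PDO scaling behind sc.scorecard.
# score = offset - factor * ln(odds), with factor = pdo / ln(2) and offset
# chosen so that odds0 maps to points0.
import numpy as np
factor = 50 / np.log(2)                  # pdo = 50
offset = 600 + factor * np.log(1 / 19)   # points0 = 600 at odds0 = 1/19
print(offset - factor * np.log(1 / 19))  # recovers 600 at the anchor odds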
#------------------------------------------------------------------------------
# Build a scorecard with the toad package
"""
Description:
    This program builds a scorecard with toad.
Workflow:
    Read the data, partition the sample, generate the EDA report, analyze
    features, pre-screen features, bin features, adjust and merge bins, select
    features, train the model, evaluate the model, validate the model, and
    build the scaled scorecard.
Input data:
    Download the data from https://www.kaggle.com/c/GiveMeSomeCredit/data
Output data:
    The scoring model results.
Version history:
"""
# Import the relevant packages
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import toad
from toad.plot import badrate_plot, proportion_plot, bin_plot
from toad.metrics import KS, F1, AUC
# 1. Read the data
data = pd.read_csv(r'D:\cs-training.csv')
# Data description
data.info()
data.describe()
data.head()
# 2. Sample partitioning
Xtr, Xts, Ytr, Yts = train_test_split(data.drop('SeriousDlqin2yrs', axis=1),
                                      data['SeriousDlqin2yrs'],
                                      test_size=0.25,
                                      random_state=450)
data_tr = pd.concat([Xtr, Ytr], axis=1)
data_tr['type'] = 'train'
data_ts = pd.concat([Xts, Yts], axis=1)
data_ts['type'] = 'test'
# 3. EDA report
toad.detector.detect(data_tr).to_excel(r'D:\EDA_report.xlsx')
# 4. Feature analysis: compute IV, gini, entropy, and unique counts per feature
quality = toad.quality(data, 'SeriousDlqin2yrs')
quality.head(6)
# 5. Feature pre-screening
selected_train, drop_lst = toad.selection.select(data_tr, target='SeriousDlqin2yrs',
                                                 empty=0.5,
                                                 iv=0.05,
                                                 corr=0.7,
                                                 return_drop=True,
                                                 exclude='type')
selected_test = data_ts[selected_train.columns]
selected_train.shape
drop_lst  # the dropped variables
# 6. Feature binning; must be fit on the train set
# Initialize a Combiner
combiner = toad.transform.Combiner()
# Fit on the training data with the chosen binning method; 7 variables need binning
combiner.fit(selected_train,
             y='SeriousDlqin2yrs',
             method='chi',
             min_samples=0.05,
             exclude='type')
# Save the binning result as a dict
bins = combiner.export()
# Inspect each feature's bins
print('DebtRatio cut points:', bins['DebtRatio'])
print('MonthlyIncome cut points:', bins['MonthlyIncome'])
print('NumberOfOpenCreditLinesAndLoans cut points:', bins['NumberOfOpenCreditLinesAndLoans'])
print('NumberOfTimes90DaysLate cut points:', bins['NumberOfTimes90DaysLate'])
print('NumberRealEstateLoansOrLines cut points:', bins['NumberRealEstateLoansOrLines'])
print('RevolvingUtilizationOfUnsecuredLines cut points:', bins['RevolvingUtilizationOfUnsecuredLines'])
print('age cut points:', bins['age'])
# Apply the binning with combiner.transform
selected_train_bin = combiner.transform(selected_train)
# Binning plots; bin_plot is a dual-axis chart showing bin proportion and bin bad rate together
proportion_plot(selected_train_bin['DebtRatio'])
proportion_plot(selected_train_bin['MonthlyIncome'])
proportion_plot(selected_train_bin['NumberOfOpenCreditLinesAndLoans'])
proportion_plot(selected_train_bin['NumberOfTimes90DaysLate'])
proportion_plot(selected_train_bin['NumberRealEstateLoansOrLines'])
proportion_plot(selected_train_bin['RevolvingUtilizationOfUnsecuredLines'])
proportion_plot(selected_train_bin['age'])
badrate_plot(selected_train_bin, target='SeriousDlqin2yrs', x='type', by='DebtRatio')
badrate_plot(selected_train_bin, target='SeriousDlqin2yrs', x='type', by='MonthlyIncome')
badrate_plot(selected_train_bin, target='SeriousDlqin2yrs', x='type', by='NumberOfOpenCreditLinesAndLoans')
badrate_plot(selected_train_bin, target='SeriousDlqin2yrs', x='type', by='NumberOfTimes90DaysLate')
badrate_plot(selected_train_bin, target='SeriousDlqin2yrs', x='type', by='NumberRealEstateLoansOrLines')
badrate_plot(selected_train_bin, target='SeriousDlqin2yrs', x='type', by='RevolvingUtilizationOfUnsecuredLines')
badrate_plot(selected_train_bin, target='SeriousDlqin2yrs', x='type', by='age')
bin_plot(selected_train_bin, x='DebtRatio', target='SeriousDlqin2yrs')
bin_plot(selected_train_bin, x='MonthlyIncome', target='SeriousDlqin2yrs')
bin_plot(selected_train_bin, x='NumberOfOpenCreditLinesAndLoans', target='SeriousDlqin2yrs')
bin_plot(selected_train_bin, x='NumberOfTimes90DaysLate', target='SeriousDlqin2yrs')
bin_plot(selected_train_bin, x='NumberRealEstateLoansOrLines', target='SeriousDlqin2yrs')
bin_plot(selected_train_bin, x='RevolvingUtilizationOfUnsecuredLines', target='SeriousDlqin2yrs')
bin_plot(selected_train_bin, x='age', target='SeriousDlqin2yrs')
# 7. Adjust and merge bins
# Define the adjusted cut points
bins_adj = bins
bins_adj["age"] = [22, 35, 45, 60]
bins_adj["NumberOfOpenCreditLinesAndLoans"] = [2]
bins_adj["DebtRatio"] = [0.02, 0.4, 0.5, 2]
# Define a new Combiner
combiner2 = toad.transform.Combiner()  # define the binning combiner
combiner2.set_rules(bins_adj)  # load the bins to apply
# Apply the adjusted bins
selected_train_binadj = combiner2.transform(selected_train)
# Plot the bin proportions and bad rates
proportion_plot(selected_train_binadj['DebtRatio'])
proportion_plot(selected_train_binadj['MonthlyIncome'])
proportion_plot(selected_train_binadj['NumberOfOpenCreditLinesAndLoans'])
proportion_plot(selected_train_binadj['NumberOfTimes90DaysLate'])
proportion_plot(selected_train_binadj['NumberRealEstateLoansOrLines'])
proportion_plot(selected_train_binadj['RevolvingUtilizationOfUnsecuredLines'])
proportion_plot(selected_train_binadj['age'])
badrate_plot(selected_train_binadj, target='SeriousDlqin2yrs', x='type', by='DebtRatio')
badrate_plot(selected_train_binadj, target='SeriousDlqin2yrs', x='type', by='MonthlyIncome')
badrate_plot(selected_train_binadj, target='SeriousDlqin2yrs', x='type', by='NumberOfOpenCreditLinesAndLoans')
badrate_plot(selected_train_binadj, target='SeriousDlqin2yrs', x='type', by='NumberOfTimes90DaysLate')
badrate_plot(selected_train_binadj, target='SeriousDlqin2yrs', x='type', by='NumberRealEstateLoansOrLines')
badrate_plot(selected_train_binadj, target='SeriousDlqin2yrs', x='type', by='RevolvingUtilizationOfUnsecuredLines')
badrate_plot(selected_train_binadj, target='SeriousDlqin2yrs', x='type', by='age')
# 8. WOE transformation
# Load the adjusted bins into the combiner
combiner.set_rules(bins_adj)
# Convert feature values into bin indices
selected_train_binadj = combiner.transform(selected_train)
selected_test_binadj = combiner.transform(selected_test)
# Define the WOE transformer
WOETransformer = toad.transform.WOETransformer()
# Map the bins to WOE values on the original datasets: fit_transform on train, transform on test
data_tr_woe = WOETransformer.fit_transform(selected_train_binadj,
                                           selected_train_binadj['SeriousDlqin2yrs'],
                                           exclude=['SeriousDlqin2yrs', 'type'])
data_ts_woe = WOETransformer.transform(selected_test_binadj)
# 9. Feature selection via stepwise regression
train_final = toad.selection.stepwise(data_tr_woe.drop('type', axis=1),
                                      target='SeriousDlqin2yrs',
                                      direction='both',
                                      criterion='aic')
test_final = data_ts_woe[train_final.columns]
print(train_final.shape)  # 7 features reduced to 5
# 10. Model training
# Prepare the data
Xtr = train_final.drop('SeriousDlqin2yrs', axis=1)
Ytr = train_final['SeriousDlqin2yrs']
# Fit the logistic regression model
lr = LogisticRegression()
lr.fit(Xtr, Ytr)
# Print the fitted parameters
lr.coef_
lr.intercept_
# 11. Model evaluation
# Performance on the training set
EYtr_proba = lr.predict_proba(Xtr)[:, 1]
EYtr = lr.predict(Xtr)
print('train F1:', F1(EYtr_proba, Ytr))
print('train KS:', KS(EYtr_proba, Ytr))
print('train AUC:', AUC(EYtr_proba, Ytr))
# Score rank-ordering
tr_bucket = toad.metrics.KS_bucket(EYtr_proba, Ytr, bucket=10, method='quantile')  # equal-frequency buckets
tr_bucket
# 12. Model validation
# Performance on the test set
Xts = test_final.drop('SeriousDlqin2yrs', axis=1)
Yts = test_final['SeriousDlqin2yrs']
EYts_proba = lr.predict_proba(Xts)[:, 1]
EYts = lr.predict(Xts)
print('test F1:', F1(EYts_proba, Yts))
print('test KS:', KS(EYts_proba, Yts))
print('test AUC:', AUC(EYts_proba, Yts))
# Check whether the train/test variable distributions differ significantly, via PSI on the binned data
psi = toad.metrics.PSI(train_final, test_final)
psi.sort_values(0, ascending=False)
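# Sketch (assumption: the standard PSI formula, computed by hand for one binned
# variable, to make explicit what toad.metrics.PSI reports):
# PSI = sum over bins of (actual_pct - expected_pct) * ln(actual_pct / expected_pct)
import numpy as np
def psi_manual(expected, actual):
    e = expected.value_counts(normalize=True).sort_index()
    a = actual.value_counts(normalize=True).reindex(e.index).fillna(1e-6)
    return ((a - e) * np.log(a / e)).sum()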
# 13. Score scaling
scorecard = toad.scorecard.ScoreCard(combiner=combiner, transer=WOETransformer, C=0.1)
scorecard.fit(Xtr, Ytr)
scorecard.export(to_frame=True)
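# Sketch (not from the book; assumes extra columns are ignored): since toad's
# ScoreCard wraps the combiner and WOE transformer, it can score raw, un-binned
# data directly.
train_scores = scorecard.predict(selected_train)
print(train_scores[:5])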

Scorecard Model Deployment_20211018

#------------------------------------------------------------------------------
"""
Description:
    Companion code for Chapter 9, "Scorecard Model Deployment".
Workflow:
    - Train a model and persist it as a PKL file
    - Load the model PKL file locally
    - Train a model and persist it as a PMML file
    - Load the model PMML file locally
    - Deploy the PMML model on a server, then call the scoring service from a client
Input data:
    Uses data bundled with the code; no external data is required.
Output data:
    Each code segment outputs its corresponding result variables.
Version history:
    20211018: final version submitted for publication
"""
#------------------------------------------------------------------------------
# Train a model and persist it as a PKL file
# Import the relevant packages
import pandas as pd
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn2pmml import PMMLPipeline
# Read the data
iris = load_iris()
X_train = pd.DataFrame(iris.data, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
y_train = pd.DataFrame(iris.target, columns=['series'])
# Train the model pipeline
clf = tree.DecisionTreeClassifier(max_depth=2)  # define the classifier
pipeline = PMMLPipeline([("classifier", clf)])  # define the pipeline
pipeline.fit(X_train, y_train)  # train on a DataFrame with named columns
# Method 1: save the model as a pkl with the pickle package
import pickle
with open("D:\\mdl.pkl", "wb") as f:
    pickle.dump(pipeline, f)
# Method 2: export the model as a pkl with the joblib package
#from sklearn.externals import joblib  # newer scikit-learn no longer bundles joblib
import joblib
joblib.dump(pipeline, "d:\\mdl.pkl", compress=9)
#------------------------------------------------------------------------------
# Load and use the model PKL file locally
# Read the pickle with the pickle package
with open('D:\\mdl.pkl', 'rb') as f:
    mdl_in = pickle.load(f)
y_pred = mdl_in.predict(iris.data)
# Read the pickle with the joblib package
mdl_in = joblib.load("d:\\mdl.pkl")
y_pred = mdl_in.predict(iris.data)
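# Sketch (not from the book): sanity-check that the reloaded model reproduces
# the in-memory pipeline's predictions on the same data.
import numpy as np
assert np.array_equal(mdl_in.predict(iris.data), pipeline.predict(X_train))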
#------------------------------------------------------------------------------
# Persist the model as a PMML file
# Method 1: export the PMML file with the sklearn2pmml package
# Import the relevant packages
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn2pmml import PMMLPipeline
# Read the data
iris = load_iris()
X_train = pd.DataFrame(iris.data, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
y_train = pd.DataFrame(iris.target, columns=['series'])
# Train the model pipeline
clf = tree.DecisionTreeClassifier(max_depth=2)  # define the classifier
pipeline = PMMLPipeline([("classifier", clf)])  # define the pipeline
pipeline.fit(X_train, y_train)  # train on a DataFrame with named columns
# Export the model as PMML
from sklearn2pmml import sklearn2pmml
sklearn2pmml(pipeline, "d:\\DecisionTree_Iris_sklearn2pmml.pmml", with_repr=True)  # the PMML keeps the variable names
# Method 2: export the PMML file with the nyoka package
# Import the relevant packages
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn2pmml import PMMLPipeline
# Read the data
iris = load_iris()
features = iris.feature_names
target = 'Species'
# Create the pipeline and train the model
clf_pipeline = PMMLPipeline([('clf', DecisionTreeClassifier(max_depth=2))])
clf_pipeline.fit(iris.data, iris.target)  # trained on a plain array without variable names
# Export the model as PMML with nyoka
from nyoka import skl_to_pmml
skl_to_pmml(clf_pipeline, features, target, "d:\\DecisionTree_iris_nyoka.pmml")  # the PMML keeps the variable names
#------------------------------------------------------------------------------
# Load and use the PMML model file locally
# Load the PMML
from pypmml import Model
model = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")
# Score a whole dataset with the PMML model
y_train_pred = model.predict(X_train)  # note: whether the scoring DataFrame carries variable names must match how the PMML was trained
# Score single records with the PMML model
model.predict({'sepal_length': 5.1, 'sepal_width': 3.5, 'petal_length': 1.4, 'petal_width': 0.2})
model.predict('[{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}]')
model.predict('{"columns": ["sepal_length", "sepal_width", "petal_length", "petal_width"], "data": [[5.1, 3.5, 1.4, 0.2]]}')
model.predict(pd.Series({'sepal_length': 5.1, 'sepal_width': 3.5, 'petal_length': 1.4, 'petal_width': 0.2}))
#------------------------------------------------------------------------------
# Deploy the model on a server with the FastAPI package
# (1) Save the code below on the server as main.py, cd to its directory, then run:
#     uvicorn main:app --reload
# Import the relevant packages and modules
from fastapi import FastAPI
from pypmml import Model
# Define the FastAPI app
app = FastAPI()

@app.get("/items/{item_id}")
async def read_item(item_id: int, x: str = ''):
    # Load the model PMML
    mdl = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")
    # Feed the input string x into predict() to get the prediction
    y_predict = mdl.predict(x)
    # Return the result to the client
    return {"item_id": item_id, "x": x, "y_predict": y_predict}
# (2) Run the code below on the client; replace 127.0.0.1 with the server's IP when running in server mode
import requests
URL_str = 'http://127.0.0.1:8000/items/5?x=' + '[{"sepal_length":5.1,"sepal_width":3.5,"petal_length":1.4,"petal_width":0.2}]'
res = requests.get(URL_str)
returnjson = res.text
print(returnjson)
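# Sketch (not from the book; a common FastAPI pattern): load the PMML once at
# module import instead of inside the handler, so each request skips the file read.
app = FastAPI()
mdl = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")

@app.get("/items/{item_id}")
async def read_item(item_id: int, x: str = ''):
    return {"item_id": item_id, "x": x, "y_predict": mdl.predict(x)}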
#------------------------------------------------------------------------------
# Deploy the model on a server with the Flask package
# (1) Save the code below on the server as main.py, then run on the server: python main.py
# Import the relevant packages and modules
import numpy as np
import pandas as pd
from pypmml import Model
from flask import Flask
from flask import request
from flask import jsonify
# Load the model
model = Model.fromFile("d:\\DecisionTree_Iris_sklearn2pmml.pmml")
app = Flask(__name__)

@app.route('/', methods=['POST', 'GET'])
def scoring():
    text = request.args.get('inputdata')
    if text:
        temp = [float(x) for x in text.split(',')]
        temp = pd.DataFrame(data=np.array(temp).reshape((1, -1)),
                            columns=["sepal_length", "sepal_width", "petal_length", "petal_width"])
        outputdata = model.predict(temp)  # outputdata is a DataFrame
        return jsonify(dict(outputdata.iloc[0]))  # serialize to JSON

if __name__ == '__main__':
    app.config['JSON_AS_ASCII'] = False
    app.run(host='127.0.0.1', port=5003)  # 127.0.0.1 is the local IP
# (2) Run the code below on the client; replace 127.0.0.1 with the server's IP when running in server mode
import requests
base = 'http://127.0.0.1:5003/?inputdata=5.1,3.5,1.4,2'
response = requests.get(base)
print(response.text)
answer = response.json()
print('prediction:', answer)

Scorecard Model Interpretability_20211018

#------------------------------------------------------------------------------
"""
Description:
    Companion code for Chapter 13, "Scorecard Model Interpretability".
Workflow:
    - PDP and ICE
    - Variable importance: plot_importance in XGBoost and LightGBM
    - The treeinterpreter package for explaining scikit-learn tree models
    - Permutation importance with the eli5 package
    - LIME
    - SHAP
Input data:
    Uses data bundled with the code; no external data is required.
Output data:
    Each code segment outputs its corresponding result variables.
Version history:
    20211018: final version submitted for publication
"""
#------------------------------------------------------------------------------
# Import the relevant packages and modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2
from sklearn.datasets import fetch_california_housing
from sklearn.tree import DecisionTreeClassifier  # classification
from sklearn.tree import DecisionTreeRegressor  # regression
from sklearn.ensemble import RandomForestRegressor  # random forest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#------------------------------------------------------------------------------
# PDP and ICE
"""
Two packages provide PDP tooling:
    - sklearn.inspection
    - pdpbox
"""
# Read the data
from sklearn.datasets import fetch_california_housing
cal_housing = fetch_california_housing()
X = pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names)
y = cal_housing.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Train the model
from sklearn.ensemble import GradientBoostingRegressor
gbdt = GradientBoostingRegressor()
gbdt.fit(X_train, y_train)
# Method 1: PDP analysis with sklearn.inspection
from sklearn.inspection import plot_partial_dependence  # removed in scikit-learn 1.2; use PartialDependenceDisplay there
fig, ax = plt.subplots(figsize=(12, 4))
plot_partial_dependence(gbdt,
                        X_train,
                        ['MedInc', 'AveOccup', 'HouseAge'],
                        method="brute",
                        ax=ax)
# A 3D plot can also be drawn to examine the interaction between two variables
fig, ax = plt.subplots(figsize=(9, 6))
plot_partial_dependence(gbdt,
                        X_train,
                        [('HouseAge', 'AveOccup')],
                        grid_resolution=50,
                        method="brute",
                        ax=ax)
# Method 2: the pdpbox package
from pdpbox import pdp
pdp_MedInc = pdp.pdp_isolate(model=gbdt,
                             dataset=X_train,
                             model_features=X_train.columns.tolist(),
                             feature='MedInc',
                             num_grid_points=30)
pdp.pdp_plot(pdp_MedInc,
             'MedInc',
             center=False
             )
# Draw per-instance ICE curves with pdpbox
pdp.pdp_plot(pdp_MedInc,
             'MedInc',
             center=False,
             plot_lines=True,
             frac_to_plot=10,
             plot_pts_dist=True)
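# Sketch (not from the book): the numeric PDP values without plotting, via
# sklearn.inspection.partial_dependence (kind='average' assumes scikit-learn >= 0.24).
from sklearn.inspection import partial_dependence
pdp_vals = partial_dependence(gbdt, X_train, ['MedInc'], kind='average')
print(pdp_vals['average'].shape)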
#------------------------------------------------------------------------------
# Variable importance: plot_importance in XGBoost and LightGBM
# Import the packages
from sklearn.datasets import load_boston
import xgboost as xgb
import lightgbm as lgb
# Read the data
ds = load_boston()
df = pd.DataFrame(data=ds.data)
df = df.add_prefix('X')
df = df.join(pd.DataFrame(ds.target, columns=['y']))
# Define the XGBoost regressor
clf = xgb.XGBRegressor()
clf.get_params()
# Fit the model
clf.fit(df.iloc[:, 0:13], df.iloc[:, -1])
# Evaluate the model
clf.score(df.iloc[:, 0:13], df.iloc[:, -1])
# Plot variable importance
xgb.plot_importance(clf, importance_type='gain')
# Define the LightGBM estimator
lgbdata = lgb.Dataset(df.iloc[:, 0:13], df.iloc[:, -1])
# Write the parameters as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # boosting type
    'objective': 'regression',  # objective function
    'metric': {'l2'},  # evaluation metric (the original also listed 'auc', which only applies to binary targets)
    'num_leaves': 31,  # number of leaves
    'learning_rate': 0.05,  # learning rate
    'feature_fraction': 0.9,  # fraction of features used per tree
    'bagging_fraction': 0.8,  # fraction of samples used per tree
    'bagging_freq': 5,  # k means bagging is performed every k iterations
    'verbose': 1  # <0: fatal only, =0: errors (warnings), >0: info
}
clf_lgb = lgb.train(params, lgbdata)
# Plot model importance
clf_lgb.feature_importance()
plt.bar(height=clf_lgb.feature_importance(), x=df.iloc[:, 0:13].columns)
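# Sketch (not from the book): Booster.feature_importance also supports
# importance_type='gain', which often ranks features differently from the
# default split counts.
plt.bar(height=clf_lgb.feature_importance(importance_type='gain'),
        x=df.iloc[:, 0:13].columns)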
#------------------------------------------------------------------------------
# The treeinterpreter package for explaining scikit-learn tree models
# Import treeinterpreter
from treeinterpreter import treeinterpreter as ti
# Load the data
ds = load_boston()
# Define the regressor
rf = RandomForestRegressor(random_state=123)
# Fit the model
rf.fit(ds.data, ds.target)
# Take one sample
spl = ds.data[0].reshape(1, -1)
# Score it with the model
rf.predict(spl)
# Explain with treeinterpreter; prediction is the predicted value, bias is the mean of y over the full sample
prediction, bias, contributions = ti.predict(rf, spl)
# Per-variable contributions
df_contributions = pd.DataFrame(data=np.hstack([ds.feature_names.reshape(-1, 1),
                                                contributions.reshape(-1, 1)]),
                                columns=['Feature', 'contribution'])
df_contributions.sort_values(by=['contribution'], ascending=False)
# Verify the decomposition
print(ds.target.mean())  # mean of the true target over all samples
print(rf.predict(ds.data).mean())  # mean of the rf predictions
print(rf.predict(spl))
print(prediction)
print(bias)
print(prediction - np.sum(contributions))  # equals bias: prediction = bias + sum of the per-variable contributions
#------------------------------------------------------------------------------
# Permutation importance with the eli5 package
# Import the packages
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor  # random forest
import eli5
from eli5.sklearn import PermutationImportance
# Load the data
ds = load_boston()
# Define the regressor
rf = RandomForestRegressor(random_state=123)
# Fit the model
rf.fit(pd.DataFrame(ds.data, columns=ds.feature_names), ds.target)
rf.feature_importances_
# Compute permutation importance
perm = PermutationImportance(rf).fit(pd.DataFrame(ds.data, columns=ds.feature_names), ds.target)
df_perm = pd.DataFrame(data=np.hstack([ds.feature_names.reshape(-1, 1),
                                       perm.feature_importances_.reshape(-1, 1).round(4),
                                       perm.feature_importances_std_.reshape(-1, 1).round(4)]),
                       columns=['Feature', 'mean', 'std'])
df_perm.sort_values(by=['mean'], ascending=False, inplace=True)
# Plot the permutation importances; this line renders only in a notebook
eli5.show_weights(perm, feature_names=ds.feature_names)
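# Sketch (not from the book): the same idea by hand — shuffle one column and
# measure how much the model's R^2 drops.
rng = np.random.RandomState(0)
Xp = pd.DataFrame(ds.data, columns=ds.feature_names)
base_score = rf.score(Xp, ds.target)
Xs = Xp.copy()
Xs['LSTAT'] = rng.permutation(Xs['LSTAT'].values)
print('LSTAT importance:', base_score - rf.score(Xs, ds.target))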
#------------------------------------------------------------------------------
# LIME
# Import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor  # random forest
from sklearn.model_selection import train_test_split
import lime
import lime.lime_tabular
# Load the data
ds = load_boston()
# Define the regressor
rf = RandomForestRegressor(random_state=123)
# Fit the model
rf.fit(ds.data, ds.target)
# Treat variables with fewer than 10 distinct values as categorical
categorical_features = np.argwhere(np.array([len(set(ds.data[:, i])) for i in range(ds.data.shape[1])]) <= 10).flatten()
# Create the explainer (categorical_features is computed above but not passed in here)
explainer = lime.lime_tabular.LimeTabularExplainer(
    ds.data,
    feature_names=ds.feature_names,
    class_names=['house_price'],
    categorical_features=None,
    verbose=True,
    mode='regression'
)
# Pick one sample
spl = ds.data[0]
# Generate the explanation
exp = explainer.explain_instance(
    spl,
    rf.predict,
    num_features=5
)
# Output each variable's contribution
exp.as_list()
# Visualize; this line renders only in a Jupyter notebook
exp.show_in_notebook(show_table=True)
#------------------------------------------------------------------------------
# SHAP
# Import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
import shap
# Initialize the plotting environment
shap.initjs()
# Load the data
ds = load_boston()
# Take one sample as the instance for the single-sample SHAP example below
spl = ds.data[0].reshape(1, -1)
# Define the base regressor
rf = RandomForestRegressor(random_state=123)
# Fit the model
rf.fit(ds.data, ds.target)
# Define the SHAP tree explainer
explainer = shap.TreeExplainer(rf, data=ds.data)
# The mean prediction over the background data serves as the base value
explainer.expected_value  # 22.28338
#------------------------------------
# SHAP values of each variable for the single sample
splshapvalues = explainer.shap_values(spl).round(4)
df_splshapvalues = pd.DataFrame(data=np.hstack([ds.feature_names.reshape(-1, 1),
                                                splshapvalues.reshape(-1, 1),
                                                abs(splshapvalues).reshape(-1, 1)]),
                                columns=['Feature', 'shap', 'shapabs'])
df_splshapvalues.sort_values(by=['shapabs'], ascending=False, inplace=True)  # sort by absolute SHAP value, descending
df_splshapvalues.drop(['shapabs'], axis=1, inplace=True)  # df_splshapvalues stores the values in long format
df_splshapvaluescol = pd.DataFrame(data=splshapvalues, columns=ds.feature_names)  # df_splshapvaluescol stores them in wide format
df_splshapvalues  # show the per-variable SHAP values for this sample
# Verify the decomposition for this sample
ds.target.mean()  # mean true value over all samples: 22.533
rf.predict(ds.data).mean()  # mean rf prediction over all samples: 22.535
explainer.expected_value  # the SHAP base value, i.e. the mean rf prediction over the background data: 22.28338
rf.predict(spl)  # prediction for this sample: 25.421
rf.predict(spl) - splshapvalues.sum()  # 22.2835, approximately explainer.expected_value
# Force plot for the single sample; renders only in a notebook
shap.force_plot(explainer.expected_value,
                splshapvalues,
                features=spl,
                feature_names=ds.feature_names)
#------------------------------------
# SHAP values over the whole sample set
shapvalues = explainer.shap_values(ds.data)
# Force plot over the sample set; renders only in a Jupyter notebook
shap.force_plot(explainer.expected_value,
                shapvalues,
                features=ds.data,
                feature_names=ds.feature_names)
# Decision plot; renders only in a Jupyter notebook
shap.decision_plot(explainer.expected_value,
                   shapvalues[:12],
                   ds.feature_names)
# Dependence plot; renders only in a Jupyter notebook
shap.dependence_plot(ds.feature_names.tolist().index('LSTAT'),
                     shapvalues,
                     ds.data)
# Global feature importance; renders only in a Jupyter notebook
shap.summary_plot(shapvalues,
                  ds.data,
                  feature_names=ds.feature_names,
                  max_display=5)
# Bar chart of the mean absolute SHAP value per variable
shap.summary_plot(shapvalues,
                  feature_names=ds.feature_names,
                  plot_type='bar',
                  max_display=5)

From Scorecard Models to High-Dimensional Machine Learning Models_20211018

#------------------------------------------------------------------------------
"""
Description:
    Companion code for Chapter 15, "From Scorecard Models to High-Dimensional
    Machine Learning Models".
Workflow:
    - Build a predictive model with XGBoost
    - Build a predictive model with LightGBM
Input data:
    Uses data bundled with the code; no external data is required.
Output data:
    Each code segment outputs its corresponding result variables.
Version history:
    20211018: final version submitted for publication
"""
#------------------------------------------------------------------------------
# Build a predictive model with XGBoost
# Import the packages
import pandas as pd
from sklearn.datasets import load_breast_cancer  # breast cancer data; binary target
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve  # removed in scikit-learn 1.2
from sklearn.metrics import plot_roc_curve  # removed in scikit-learn 1.2
import xgboost
from xgboost import XGBClassifier
# Prepare the data
ds_cancer = load_breast_cancer()
data = pd.DataFrame(data=ds_cancer.data, columns=ds_cancer.feature_names)
target = pd.DataFrame(data=ds_cancer.target, columns=['target'])
# Partition the data
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)
# Define the XGBoost model
xgb = XGBClassifier(n_estimators=3, max_depth=2)
# Show the model parameters
xgb.get_params()
# Fit the model
xgb.fit(X_train, y_train)
# Attributes and methods of the fitted model
xgb.score(X_train, y_train)
xgb.feature_importances_
# Model prediction
y_train_predict = xgb.predict(X_train)
y_train_predict_proba = xgb.predict_proba(X_train)
# Model evaluation
accuracy_score(y_train, y_train_predict)  # accuracy: true labels vs predicted labels
roc_auc_score(y_train, y_train_predict_proba[:, 1])  # AUC: true labels vs predicted probabilities
plot_roc_curve(xgb, X_train, y_train)  # ROC curve
plot_precision_recall_curve(xgb, X_train, y_train)  # PR curve
# Plot variable importance
xgboost.plot_importance(xgb)
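# Sketch (not from the book): evaluate on the held-out test set as well, since
# the metrics above are all computed on the training data.
y_test_predict_proba = xgb.predict_proba(X_test)
print('test accuracy:', accuracy_score(y_test, xgb.predict(X_test)))
print('test AUC:', roc_auc_score(y_test, y_test_predict_proba[:, 1]))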
#------------------------------------------------------------------------------
# Build a predictive model with LightGBM
# Import the packages
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer  # breast cancer data; binary target
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
import lightgbm as lgb
from lightgbm import LGBMClassifier
# Prepare the data
ds_cancer = load_breast_cancer()
data = pd.DataFrame(data=ds_cancer.data, columns=ds_cancer.feature_names)
target = pd.DataFrame(data=ds_cancer.target, columns=['target'])
# Partition the data
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)
# Define the classifier
lgbm = LGBMClassifier(boosting_type="gbdt", class_weight=None, colsample_bytree=0.7,
                      is_unbalance=True, learning_rate=0.01, max_bin=15,
                      max_depth=1, min_child_samples=100, min_child_weight=1,
                      min_split_gain=0.04, n_estimators=100, num_leaves=32,
                      objective="binary", random_state=27, subsample=0.8, subsample_freq=1)
# Show the model parameters
lgbm.get_params()
# Fit the model
lgbm.fit(X_train, y_train)
# Attributes of the fitted model
lgbm.classes_
lgbm.feature_importances_
lgbm.n_classes_
lgbm.n_features_
lgbm.objective_
# Model prediction
y_train_predict = lgbm.predict(X_train)
y_train_predict_proba = lgbm.predict_proba(X_train)
# Model evaluation
fpr, tpr, pct = roc_curve(y_train, y_train_predict_proba[:, 1])  # FPR and TPR sequences for the ROC curve
ks = abs(fpr - tpr).max()  # KS statistic
plt.plot(tpr, "b-", fpr, "r-")  # KS curve
accuracy_score(y_train, y_train_predict)  # accuracy: true labels vs predicted labels
roc_auc_score(y_train, y_train_predict_proba[:, 1])  # AUC: true labels vs predicted probabilities
plot_precision_recall_curve(lgbm, X_train, y_train)  # PR curve
plot_roc_curve(lgbm, X_train, y_train)  # ROC curve
# LightGBM's own plotting utilities
lgb.create_tree_digraph(lgbm, tree_index=1)
lgb.plot_importance(lgbm)
lgb.plot_tree(lgbm, tree_index=1, figsize=(12, 9))
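# Sketch (not from the book): validate on the held-out test set with early
# stopping via an eval_set (the callbacks API assumes LightGBM >= 3.3).
lgbm_es = LGBMClassifier(n_estimators=500, learning_rate=0.01, random_state=27)
lgbm_es.fit(X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(stopping_rounds=50)])
print('best iteration:', lgbm_es.best_iteration_)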