import pandas as pdimport osimport gcimport lightgbm as lgbimport xgboost as xgbfrom catboost import CatBoostRegressorfrom sklearn.linear_model import SGDRegressor, LinearRegression, Ridgefrom sklearn.preprocessing import MinMaxScalerimport mathimport numpy as npfrom tqdm import tqdmfrom sklearn.model_selection import StratifiedKFold, KFoldfrom sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_lossimport matplotlib.pyplot as pltimport timeimport warningswarnings.filterwarnings('ignore')
# Read the training file and the "A" test file from the working directory.
train = pd.read_csv('train.csv')
testA = pd.read_csv('testA.csv')
train.head()

# Stack both frames row-wise under a fresh 0..n-1 index so that the
# preprocessing below is applied to train and test rows together.
data = pd.concat((train, testA), ignore_index=True)
数据预处理
- 可以看到很多变量不能直接训练,比如grade、subGrade、employmentLength、issueDate、earliesCreditLine,需要进行预处理
# Show the distinct values of the two grade columns in sorted order.
for grade_column in ('grade', 'subGrade'):
    print(sorted(data[grade_column].unique()))
['A', 'B', 'C', 'D', 'E', 'F', 'G']['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']
# Count how often each employmentLength value occurs (missing values
# included via dropna=False), ordered by the value label.
data['employmentLength'].value_counts(dropna=False).sort_index()
1 year 6567110+ years 3285252 years 905653 years 801634 years 598185 years 626456 years 465827 years 442308 years 451689 years 37866< 1 year 80226NaN 58541Name: employmentLength, dtype: int64
- 首先对employmentLength进行转换到数值
def employmentLength_to_int(s):
    """Convert an employment-length label to its leading integer.

    Args:
        s: A label whose first whitespace-separated token is an integer,
            e.g. ``'3 years'``, or a missing value.

    Returns:
        The leading number as ``np.int8``; missing values are returned
        unchanged so they stay missing in the column.
    """
    if pd.isnull(s):
        return s
    return np.int8(s.split()[0])


# Normalise the two labels that do not start with a plain integer.
# The result is assigned back to the frame: calling replace(inplace=True) on
# data['employmentLength'] is a chained in-place update, which does not modify
# `data` once pandas Copy-on-Write is active and would leave '10+ years' /
# '< 1 year' in place for the integer conversion below to choke on.
data['employmentLength'] = data['employmentLength'].replace(
    {'10+ years': '10 years', '< 1 year': '0 years'}
)
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

# Re-check the value distribution after the conversion.
data['employmentLength'].value_counts(dropna=False).sort_index()
0.0 802261.0 656712.0 905653.0 801634.0 598185.0 626456.0 465827.0 442308.0 451689.0 3786610.0 328525NaN 58541Name: employmentLength, dtype: int64
# Draw five random values of earliesCreditLine to inspect the raw format.
data['earliesCreditLine'].sample(5)
375743 Jun-2003361340 Jul-1999716602 Aug-1995893559 Oct-1982221525 Nov-2004Name: earliesCreditLine, dtype: object
# Keep only the last four characters of each value (the year) as an integer.
data['earliesCreditLine'] = data['earliesCreditLine'].map(
    lambda credit_line: int(credit_line[-4:])
)

# Summary statistics of the extracted year.
data['earliesCreditLine'].describe()
count 1000000.000000mean 1998.688632std 7.606231min 1944.00000025% 1995.00000050% 2000.00000075% 2004.000000max 2015.000000Name: earliesCreditLine, dtype: float64
data.head()

# A selection of the categorical features.
cate_features = [
    'grade',
    'subGrade',
    'employmentTitle',
    'homeOwnership',
    'verificationStatus',
    'purpose',
    'postCode',
    'regionCode',
    'applicationType',
    'initialListStatus',
    'title',
    'policyCode',
]

# Report the number of distinct (non-null) values of each listed column.
for feature_name, n_categories in data[cate_features].nunique().items():
    print(feature_name, '类型数:', n_categories)
grade 类型数: 7subGrade 类型数: 35employmentTitle 类型数: 298101homeOwnership 类型数: 6verificationStatus 类型数: 3purpose 类型数: 14postCode 类型数: 935regionCode 类型数: 51applicationType 类型数: 2initialListStatus 类型数: 2title 类型数: 47903policyCode 类型数: 1
# Features with more than two categories that are not high-dimensional and
# sparse: one-hot encode them, dropping the first level of each.
data = pd.get_dummies(
    data,
    columns=[
        'grade',
        'subGrade',
        'homeOwnership',
        'verificationStatus',
        'purpose',
        'regionCode',
    ],
    drop_first=True,
)

# High-cardinality categorical features are replaced by two derived columns:
#   <name>_cnts - number of ids sharing the same category value
#   <name>_rank - descending rank of the row's id within that category
# after which the raw column is removed.
for high_card_col in ('employmentTitle', 'postCode', 'title'):
    ids_by_value = data.groupby([high_card_col])['id']
    data[high_card_col + '_cnts'] = ids_by_value.transform('count')
    data[high_card_col + '_rank'] = ids_by_value.rank(ascending=False).astype(int)
    data.drop(columns=[high_card_col], inplace=True)
训练数据/测试数据准备
# Model inputs: every column except the identifier, the issue date and the target.
features = [col for col in data.columns if col not in ('id', 'issueDate', 'isDefault')]

# Rows with a target value form the training set; rows without one are the
# test set. Both get a fresh 0..n-1 index.
has_label = data['isDefault'].notnull()
train = data.loc[has_label].reset_index(drop=True)
test = data.loc[~has_label].reset_index(drop=True)

x_train, x_test = train[features], test[features]
y_train = train['isDefault']
模型训练
def _fit_predict_lgb(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one LightGBM fold; return (validation predictions, test predictions)."""
    train_matrix = clf.Dataset(trn_x, label=trn_y)
    valid_matrix = clf.Dataset(val_x, label=val_y)

    # NOTE(review): 'nthread' and 'n_jobs' both look like aliases of LightGBM's
    # num_threads, so only one of 28 / 24 can take effect - confirm the intent.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.1,
        'seed': 2020,
        'nthread': 28,
        'n_jobs': 24,
        'silent': True,
        'verbose': -1,
    }

    # NOTE(review): the verbose_eval / early_stopping_rounds keywords belong to
    # the older lgb.train signature; newer releases expect callbacks instead.
    model = clf.train(params, train_matrix, 50000,
                      valid_sets=[train_matrix, valid_matrix],
                      verbose_eval=200, early_stopping_rounds=200)
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_x, num_iteration=model.best_iteration)
    return val_pred, test_pred


def _fit_predict_xgb(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one XGBoost fold; return (validation predictions, test predictions)."""
    train_matrix = clf.DMatrix(trn_x, label=trn_y)
    valid_matrix = clf.DMatrix(val_x, label=val_y)
    test_matrix = clf.DMatrix(test_x)

    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.04,
        'tree_method': 'exact',
        'seed': 2020,
        'nthread': 36,
        "silent": True,
    }

    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

    # NOTE(review): ntree_limit / best_ntree_limit are the older XGBoost API;
    # newer releases use iteration_range - confirm the installed version.
    model = clf.train(params, train_matrix, num_boost_round=50000,
                      evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
    val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
    return val_pred, test_pred


def _fit_predict_cat(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Train one CatBoost fold; return (validation predictions, test predictions)."""
    params = {
        'learning_rate': 0.05,
        'depth': 5,
        'l2_leaf_reg': 10,
        'bootstrap_type': 'Bernoulli',
        'od_type': 'Iter',
        'od_wait': 50,
        'random_seed': 11,
        'allow_writing_files': False,
    }

    # Here `clf` is an estimator class, not a module: instantiate it per fold.
    model = clf(iterations=20000, **params)
    model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
              cat_features=[], use_best_model=True, verbose=500)
    val_pred = model.predict(val_x)
    test_pred = model.predict(test_x)
    return val_pred, test_pred


# Maps the clf_name accepted by cv_model to the routine that trains one fold.
_FOLD_TRAINERS = {
    "lgb": _fit_predict_lgb,
    "xgb": _fit_predict_xgb,
    "cat": _fit_predict_cat,
}


def cv_model(clf, train_x, train_y, test_x, clf_name, folds=5, seed=2020):
    """Cross-validate one model family and predict the test set.

    For every fold a model is trained on the training part, scored with ROC AUC
    on the held-out part, and used to predict ``test_x``. Fold scores are
    printed as they become available, followed by their mean and standard
    deviation.

    Args:
        clf: The ``lightgbm`` module for ``"lgb"``, the ``xgboost`` module for
            ``"xgb"``, or an estimator class (called as
            ``clf(iterations=..., **params)``) for ``"cat"``.
        train_x: Training features; must support ``.iloc`` and ``.shape``.
        train_y: Training labels, one per row of ``train_x``.
        test_x: Test features with the same columns as ``train_x``.
        clf_name: One of ``"lgb"``, ``"xgb"`` or ``"cat"``.
        folds: Number of KFold splits.
        seed: Random state used to shuffle the KFold splits (the per-model
            seeds in the parameter dictionaries are separate and fixed).

    Returns:
        A tuple ``(oof_pred, test_pred)``: the out-of-fold prediction for every
        training row, and the test prediction averaged over all folds.

    Raises:
        ValueError: If ``clf_name`` is not a supported model name.
    """
    try:
        fit_predict = _FOLD_TRAINERS[clf_name]
    except KeyError:
        raise ValueError(
            "unknown clf_name %r; expected one of %s" % (clf_name, sorted(_FOLD_TRAINERS))
        ) from None

    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positional indices, so the labels are indexed positionally
    # as well; label-based `train_y[idx]` is only right for a 0..n-1 index.
    labels = np.asarray(train_y)

    oof_pred = np.zeros(train_x.shape[0])
    test_pred_mean = np.zeros(test_x.shape[0])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        val_pred, test_pred = fit_predict(clf, trn_x, trn_y, val_x, val_y, test_x)

        oof_pred[valid_index] = val_pred
        # Accumulate each fold's share; plain assignment here would keep only
        # the last fold's prediction divided by the number of folds.
        test_pred_mean += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    # The first label is kept exactly as it was so logs stay comparable with
    # earlier runs.
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_pred_mean
def lgb_model(x_train, y_train, x_test):
    """Run cv_model with the lightgbm module; return its (train, test) predictions."""
    oof_pred, test_pred = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return oof_pred, test_pred


def xgb_model(x_train, y_train, x_test):
    """Run cv_model with the xgboost module; return its (train, test) predictions."""
    oof_pred, test_pred = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return oof_pred, test_pred


def cat_model(x_train, y_train, x_test):
    """Run cv_model with CatBoostRegressor; return its (train, test) predictions."""
    oof_pred, test_pred = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return oof_pred, test_pred
# Cross-validate LightGBM; keeps the training-row and test-row predictions
# returned by lgb_model.
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
************************************ 1 ************************************Training until validation scores don't improve for 200 rounds.[200] training's auc: 0.742884 valid_1's auc: 0.73055[400] training's auc: 0.755686 valid_1's auc: 0.731888[600] training's auc: 0.766421 valid_1's auc: 0.731988[800] training's auc: 0.776244 valid_1's auc: 0.731868Early stopping, best iteration is:[656] training's auc: 0.769146 valid_1's auc: 0.732081[0.7320814878889421]************************************ 2 ************************************Training until validation scores don't improve for 200 rounds.[200] training's auc: 0.74372 valid_1's auc: 0.726466[400] training's auc: 0.756459 valid_1's auc: 0.727727[600] training's auc: 0.767156 valid_1's auc: 0.727776Early stopping, best iteration is:[520] training's auc: 0.762985 valid_1's auc: 0.727902[0.7320814878889421, 0.7279015876934286]************************************ 3 ************************************Training until validation scores don't improve for 200 rounds.[200] training's auc: 0.742884 valid_1's auc: 0.731466[400] training's auc: 0.755466 valid_1's auc: 0.732748[600] training's auc: 0.766313 valid_1's auc: 0.733069[800] training's auc: 0.776349 valid_1's auc: 0.732892Early stopping, best iteration is:[694] training's auc: 0.771133 valid_1's auc: 0.73312[0.7320814878889421, 0.7279015876934286, 0.7331203287449972]************************************ 4 ************************************Training until validation scores don't improve for 200 rounds.[200] training's auc: 0.742632 valid_1's auc: 0.730114[400] training's auc: 0.755357 valid_1's auc: 0.731443[600] training's auc: 0.765983 valid_1's auc: 0.731566[800] training's auc: 0.776112 valid_1's auc: 0.731805Early stopping, best iteration is:[706] training's auc: 0.771324 valid_1's auc: 0.731887[0.7320814878889421, 0.7279015876934286, 0.7331203287449972, 0.731886588682118]************************************ 5 ************************************Training until 
validation scores don't improve for 200 rounds.[200] training's auc: 0.743113 valid_1's auc: 0.729226[400] training's auc: 0.7559 valid_1's auc: 0.730816[600] training's auc: 0.766388 valid_1's auc: 0.73092[800] training's auc: 0.77627 valid_1's auc: 0.731029[1000] training's auc: 0.785791 valid_1's auc: 0.730933Early stopping, best iteration is:[883] training's auc: 0.780369 valid_1's auc: 0.731096[0.7320814878889421, 0.7279015876934286, 0.7331203287449972, 0.731886588682118, 0.7310960057774112]lgb_scotrainre_list: [0.7320814878889421, 0.7279015876934286, 0.7331203287449972, 0.731886588682118, 0.7310960057774112]lgb_score_mean: 0.7312171997573793lgb_score_std: 0.001779041696522632
# Cross-validate XGBoost; keeps the training-row and test-row predictions
# returned by xgb_model.
xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)
************************************ 1 ************************************[0] train-auc:0.677293 eval-auc:0.678869Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.Will train until eval-auc hasn't improved in 200 rounds.[200] train-auc:0.727527 eval-auc:0.723771[400] train-auc:0.73516 eval-auc:0.727725[600] train-auc:0.740458 eval-auc:0.729631[800] train-auc:0.744963 eval-auc:0.730829[1000] train-auc:0.748802 eval-auc:0.731495[1200] train-auc:0.752295 eval-auc:0.732074[1400] train-auc:0.755574 eval-auc:0.732421[1600] train-auc:0.758671 eval-auc:0.732674[1800] train-auc:0.761605 eval-auc:0.732964[2000] train-auc:0.764627 eval-auc:0.733111[2200] train-auc:0.767443 eval-auc:0.733201[2400] train-auc:0.770204 eval-auc:0.733224Stopping. Best iteration:[2328] train-auc:0.7692 eval-auc:0.733246[0.7332460852050292]************************************ 2 ************************************[0] train-auc:0.677718 eval-auc:0.672523Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.Will train until eval-auc hasn't improved in 200 rounds.[200] train-auc:0.728628 eval-auc:0.720255[400] train-auc:0.736149 eval-auc:0.724308[600] train-auc:0.741354 eval-auc:0.726443[800] train-auc:0.745611 eval-auc:0.72746[1000] train-auc:0.749627 eval-auc:0.728194[1200] train-auc:0.753176 eval-auc:0.728711[1400] train-auc:0.756476 eval-auc:0.72899[1600] train-auc:0.759574 eval-auc:0.729224[1800] train-auc:0.762608 eval-auc:0.729501[2000] train-auc:0.765549 eval-auc:0.729627[2200] train-auc:0.768304 eval-auc:0.729782[2400] train-auc:0.771131 eval-auc:0.729922[2600] train-auc:0.773769 eval-auc:0.729961[2800] train-auc:0.776371 eval-auc:0.72999Stopping. 
Best iteration:[2697] train-auc:0.775119 eval-auc:0.730036[0.7332460852050292, 0.7300358478747684]************************************ 3 ************************************[0] train-auc:0.676641 eval-auc:0.67765Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.Will train until eval-auc hasn't improved in 200 rounds.[200] train-auc:0.72757 eval-auc:0.724632[400] train-auc:0.735185 eval-auc:0.728571[600] train-auc:0.740671 eval-auc:0.73067[800] train-auc:0.745049 eval-auc:0.731899[1000] train-auc:0.748976 eval-auc:0.732787[1200] train-auc:0.752383 eval-auc:0.73321[1400] train-auc:0.75564 eval-auc:0.733548[1600] train-auc:0.758796 eval-auc:0.733825[1800] train-auc:0.761717 eval-auc:0.734007[2000] train-auc:0.76459 eval-auc:0.734193[2200] train-auc:0.767399 eval-auc:0.734261[2400] train-auc:0.770174 eval-auc:0.734362[2600] train-auc:0.772818 eval-auc:0.734369[2800] train-auc:0.775568 eval-auc:0.734391[3000] train-auc:0.777985 eval-auc:0.73444[3200] train-auc:0.780514 eval-auc:0.734477[3400] train-auc:0.782893 eval-auc:0.734427Stopping. 
Best iteration:[3207] train-auc:0.780621 eval-auc:0.734494[0.7332460852050292, 0.7300358478747684, 0.7344942212088965]************************************ 4 ************************************[0] train-auc:0.677768 eval-auc:0.677179Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.Will train until eval-auc hasn't improved in 200 rounds.[200] train-auc:0.727614 eval-auc:0.72295[400] train-auc:0.735165 eval-auc:0.726994[600] train-auc:0.740498 eval-auc:0.729116[800] train-auc:0.744884 eval-auc:0.730417[1000] train-auc:0.748782 eval-auc:0.731318[1200] train-auc:0.75225 eval-auc:0.731899[1400] train-auc:0.755505 eval-auc:0.732295[1600] train-auc:0.758618 eval-auc:0.732629[1800] train-auc:0.76176 eval-auc:0.733046[2000] train-auc:0.764736 eval-auc:0.733189[2200] train-auc:0.767476 eval-auc:0.733276[2400] train-auc:0.770154 eval-auc:0.733409[2600] train-auc:0.772874 eval-auc:0.733469[2800] train-auc:0.77541 eval-auc:0.733405Stopping. Best iteration:[2644] train-auc:0.773429 eval-auc:0.733488[0.7332460852050292, 0.7300358478747684, 0.7344942212088965, 0.7334876284761012]************************************ 5 ************************************[0] train-auc:0.677768 eval-auc:0.676353Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.Will train until eval-auc hasn't improved in 200 rounds.[200] train-auc:0.728072 eval-auc:0.722913[400] train-auc:0.735517 eval-auc:0.726582[600] train-auc:0.740782 eval-auc:0.728449[800] train-auc:0.745258 eval-auc:0.729653[1000] train-auc:0.749185 eval-auc:0.730489[1200] train-auc:0.752723 eval-auc:0.731038[1400] train-auc:0.755985 eval-auc:0.731466[1600] train-auc:0.759166 eval-auc:0.731758[1800] train-auc:0.762205 eval-auc:0.73199[2000] train-auc:0.765197 eval-auc:0.732145[2200] train-auc:0.767976 eval-auc:0.732194Stopping. 
Best iteration:[2191] train-auc:0.767852 eval-auc:0.732213[0.7332460852050292, 0.7300358478747684, 0.7344942212088965, 0.7334876284761012, 0.7322134048106561]xgb_scotrainre_list: [0.7332460852050292, 0.7300358478747684, 0.7344942212088965, 0.7334876284761012, 0.7322134048106561]xgb_score_mean: 0.7326954375150903xgb_score_std: 0.0015147392354657807
# Cross-validate CatBoost; keeps the training-row and test-row predictions
# returned by cat_model.
cat_train, cat_test = cat_model(x_train, y_train, x_test)
************************************ 1 ************************************0: learn: 0.4415198 test: 0.4387088 best: 0.4387088 (0) total: 111ms remaining: 37m 6s500: learn: 0.3772118 test: 0.3759665 best: 0.3759665 (500) total: 37.7s remaining: 24m 25s1000: learn: 0.3756709 test: 0.3752058 best: 0.3752058 (1000) total: 1m 14s remaining: 23m 41s1500: learn: 0.3745785 test: 0.3748423 best: 0.3748423 (1500) total: 1m 52s remaining: 23m 7s2000: learn: 0.3736834 test: 0.3746564 best: 0.3746564 (2000) total: 2m 29s remaining: 22m 28s2500: learn: 0.3728568 test: 0.3745180 best: 0.3745165 (2492) total: 3m 7s remaining: 21m 52s3000: learn: 0.3720793 test: 0.3744201 best: 0.3744198 (2998) total: 3m 44s remaining: 21m 14sStopped by overfitting detector (50 iterations wait)bestTest = 0.3744006318bestIteration = 3086Shrink model to first 3087 iterations.[0.7326058985428212]************************************ 2 ************************************0: learn: 0.4406928 test: 0.4420714 best: 0.4420714 (0) total: 53.3ms remaining: 17m 46s500: learn: 0.3765250 test: 0.3787287 best: 0.3787287 (500) total: 38.7s remaining: 25m 8s1000: learn: 0.3749822 test: 0.3779503 best: 0.3779503 (998) total: 1m 16s remaining: 24m 18s1500: learn: 0.3738772 test: 0.3775654 best: 0.3775654 (1500) total: 1m 54s remaining: 23m 34s2000: learn: 0.3729354 test: 0.3773407 best: 0.3773401 (1999) total: 2m 33s remaining: 22m 56s2500: learn: 0.3721077 test: 0.3771987 best: 0.3771971 (2496) total: 3m 10s remaining: 22m 15s3000: learn: 0.3713621 test: 0.3771114 best: 0.3771114 (3000) total: 3m 49s remaining: 21m 37sStopped by overfitting detector (50 iterations wait)bestTest = 0.3770400469bestIteration = 3382Shrink model to first 3383 iterations.[0.7326058985428212, 0.7292909146788396]************************************ 3 ************************************0: learn: 0.4408230 test: 0.4418939 best: 0.4418939 (0) total: 59.1ms remaining: 19m 42s500: learn: 0.3767851 test: 0.3776319 best: 0.3776319 (500) total: 
40.4s remaining: 26m 12s1000: learn: 0.3752331 test: 0.3768292 best: 0.3768292 (1000) total: 1m 20s remaining: 25m 19s1500: learn: 0.3741550 test: 0.3764926 best: 0.3764926 (1500) total: 2m remaining: 24m 39s2000: learn: 0.3732520 test: 0.3762840 best: 0.3762832 (1992) total: 2m 40s remaining: 24m 2s2500: learn: 0.3724303 test: 0.3761303 best: 0.3761279 (2490) total: 3m 20s remaining: 23m 22s3000: learn: 0.3716684 test: 0.3760402 best: 0.3760395 (2995) total: 4m remaining: 22m 42s3500: learn: 0.3709308 test: 0.3759509 best: 0.3759502 (3495) total: 4m 40s remaining: 22m 2s4000: learn: 0.3702269 test: 0.3759039 best: 0.3759027 (3993) total: 5m 20s remaining: 21m 20s4500: learn: 0.3695477 test: 0.3758698 best: 0.3758663 (4459) total: 6m remaining: 20m 40sStopped by overfitting detector (50 iterations wait)bestTest = 0.3758663409bestIteration = 4459Shrink model to first 4460 iterations.[0.7326058985428212, 0.7292909146788396, 0.7341207611812285]************************************ 4 ************************************0: learn: 0.4408778 test: 0.4413264 best: 0.4413264 (0) total: 46.6ms remaining: 15m 32s500: learn: 0.3768022 test: 0.3777678 best: 0.3777678 (500) total: 40.3s remaining: 26m 7s1000: learn: 0.3753097 test: 0.3769403 best: 0.3769403 (1000) total: 1m 20s remaining: 25m 24s1500: learn: 0.3742418 test: 0.3765698 best: 0.3765698 (1500) total: 2m remaining: 24m 41s2000: learn: 0.3733478 test: 0.3763500 best: 0.3763496 (1998) total: 2m 40s remaining: 23m 59s2500: learn: 0.3725263 test: 0.3762101 best: 0.3762093 (2488) total: 3m 20s remaining: 23m 19s3000: learn: 0.3717486 test: 0.3760966 best: 0.3760966 (2999) total: 3m 59s remaining: 22m 36sStopped by overfitting detector (50 iterations wait)bestTest = 0.3760182133bestIteration = 3432Shrink model to first 3433 iterations.[0.7326058985428212, 0.7292909146788396, 0.7341207611812285, 0.7324483603137153]************************************ 5 ************************************0: learn: 0.4409876 test: 0.4409159 
best: 0.4409159 (0) total: 52.3ms remaining: 17m 26s500: learn: 0.3768055 test: 0.3776229 best: 0.3776229 (500) total: 38s remaining: 24m 38s1000: learn: 0.3752600 test: 0.3768397 best: 0.3768397 (1000) total: 1m 15s remaining: 23m 57s1500: learn: 0.3741843 test: 0.3764855 best: 0.3764855 (1500) total: 1m 53s remaining: 23m 16s2000: learn: 0.3732691 test: 0.3762491 best: 0.3762490 (1998) total: 2m 31s remaining: 22m 40s2500: learn: 0.3724407 test: 0.3761154 best: 0.3761154 (2500) total: 3m 9s remaining: 22m 5s3000: learn: 0.3716764 test: 0.3760184 best: 0.3760184 (3000) total: 3m 47s remaining: 21m 26s3500: learn: 0.3709545 test: 0.3759453 best: 0.3759453 (3500) total: 4m 24s remaining: 20m 47sStopped by overfitting detector (50 iterations wait)bestTest = 0.3759421091bestIteration = 3544Shrink model to first 3545 iterations.[0.7326058985428212, 0.7292909146788396, 0.7341207611812285, 0.7324483603137153, 0.7312334660628076]cat_scotrainre_list: [0.7326058985428212, 0.7292909146788396, 0.7341207611812285, 0.7324483603137153, 0.7312334660628076]cat_score_mean: 0.7319398801558824cat_score_std: 0.001610863965629903---------------------------------------------------------------------------TypeError Traceback (most recent call last)<ipython-input-25-2e9bafef31e8> in <module>----> 1 cat_train, cat_test = cat_model(x_train, y_train, x_test)TypeError: 'NoneType' object is not iterable
# Blend the LightGBM and XGBoost test predictions with equal weights.
rh_test = 0.5 * lgb_test + 0.5 * xgb_test

# Attach the blended score to the test frame and write the id / isDefault
# pair to the submission file.
testA['isDefault'] = rh_test
testA.loc[:, ['id', 'isDefault']].to_csv('test_sub.csv', index=False)