import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
# Read the training file and the "testA" file from the working directory.
train, testA = (pd.read_csv(path) for path in ('train.csv', 'testA.csv'))
train.head()
# Stack both frames row-wise under a fresh 0..n-1 index so that the feature
# engineering that follows is applied once to all rows.
data = pd.concat((train, testA), ignore_index=True)
数据预处理
- 可以看到很多变量不能直接训练，比如grade、subGrade、employmentLength、issueDate、earliesCreditLine，需要先进行预处理
# Print the sorted distinct values of the grade and sub-grade columns.
for grade_col in ('grade', 'subGrade'):
    print(sorted(data[grade_col].unique()))
['A', 'B', 'C', 'D', 'E', 'F', 'G']
['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']
# Frequency of every employmentLength value, missing values included,
# ordered by the value itself rather than by count.
data['employmentLength'].value_counts(dropna=False).sort_index()
1 year 65671
10+ years 328525
2 years 90565
3 years 80163
4 years 59818
5 years 62645
6 years 46582
7 years 44230
8 years 45168
9 years 37866
< 1 year 80226
NaN 58541
Name: employmentLength, dtype: int64
- 首先将employmentLength转换为数值
# Rewrite the two labels that do not start with a plain number so that every
# non-missing value has the form "<n> year(s)".
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on `data['employmentLength']` modifies a temporary
# selection, which under pandas Copy-on-Write never reaches `data`.
data['employmentLength'] = data['employmentLength'].replace(
    {'10+ years': '10 years', '< 1 year': '0 years'}
)
def employmentLength_to_int(s):
    """Convert an employment-length label to its number of years.

    Args:
        s: A label such as ``'3 years'``, ``'10 years'``, ``'10+ years'`` or
            ``'< 1 year'``, or a missing value (``None`` / NaN).

    Returns:
        The leading number of the label as ``np.int8``. ``'< 1 year'`` maps to
        0 and ``'10+ years'`` to 10. Missing values are returned unchanged so
        they stay missing in the column.
    """
    if pd.isnull(s):
        return s
    token = s.split()[0]
    # '< 1 year' splits into ['<', '1', 'year']; treat it as zero years.
    if token == '<':
        return np.int8(0)
    # Tolerate a trailing '+' ('10+') in case the label was not normalised.
    return np.int8(int(token.rstrip('+')))
# Convert every label element-wise; missing entries pass through untouched.
data['employmentLength'] = data['employmentLength'].map(employmentLength_to_int)
# Re-check the distribution after the conversion (missing values included).
data['employmentLength'].value_counts(dropna=False).sort_index()
0.0 80226
1.0 65671
2.0 90565
3.0 80163
4.0 59818
5.0 62645
6.0 46582
7.0 44230
8.0 45168
9.0 37866
10.0 328525
NaN 58541
Name: employmentLength, dtype: int64
# Look at five randomly chosen earliesCreditLine values to see their format.
data['earliesCreditLine'].sample(5)
375743 Jun-2003
361340 Jul-1999
716602 Aug-1995
893559 Oct-1982
221525 Nov-2004
Name: earliesCreditLine, dtype: object
# Keep only the last four characters of each value, parsed as an integer.
data['earliesCreditLine'] = data['earliesCreditLine'].map(
    lambda label: int(label[-4:])
)
# Summary statistics of the resulting numeric column.
data['earliesCreditLine'].describe()
count 1000000.000000
mean 1998.688632
std 7.606231
min 1944.000000
25% 1995.000000
50% 2000.000000
75% 2004.000000
max 2015.000000
Name: earliesCreditLine, dtype: float64
# Display the first rows of the combined frame.
data.head()
# A subset of the categorical features.
cate_features = [
    'grade', 'subGrade', 'employmentTitle', 'homeOwnership',
    'verificationStatus', 'purpose', 'postCode', 'regionCode',
    'applicationType', 'initialListStatus', 'title', 'policyCode',
]
# Print how many distinct (non-missing) values each of them takes.
for name, n_unique in data[cate_features].nunique().items():
    print(name, '类型数:', n_unique)
grade 类型数: 7
subGrade 类型数: 35
employmentTitle 类型数: 298101
homeOwnership 类型数: 6
verificationStatus 类型数: 3
purpose 类型数: 14
postCode 类型数: 935
regionCode 类型数: 51
applicationType 类型数: 2
initialListStatus 类型数: 2
title 类型数: 47903
policyCode 类型数: 1
# Features with more than two categories that are not high-dimensional and
# sparse: one-hot encode them, dropping the first level of each.
one_hot_columns = [
    'grade', 'subGrade', 'homeOwnership',
    'verificationStatus', 'purpose', 'regionCode',
]
data = pd.get_dummies(data, columns=one_hot_columns, drop_first=True)
# High-cardinality categorical features need a different transformation:
# replace each one by two numeric columns derived from `id` within the group
# of rows that share the same value, then drop the raw column.
for col in ('employmentTitle', 'postCode', 'title'):
    # Number of non-missing ids in the row's group.
    data[col + '_cnts'] = data.groupby([col])['id'].transform('count')
    # Descending rank of the row's id inside its group, truncated to int.
    data[col + '_rank'] = data.groupby([col])['id'].rank(ascending=False).astype(int)
    data.drop(columns=col, inplace=True)
训练数据/测试数据准备
# Every column except the identifier, the issue date and the target is a
# model input.
excluded = {'id', 'issueDate', 'isDefault'}
features = [col for col in data.columns if col not in excluded]
# Rows with a target value form the training set; rows without one form the
# test set. Both get a fresh 0..n-1 index.
has_label = data['isDefault'].notnull()
train = data[has_label].reset_index(drop=True)
test = data[~has_label].reset_index(drop=True)
x_train, x_test = train[features], test[features]
y_train = train['isDefault']
模型训练
def _fit_predict_lgb(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Fit LightGBM on one fold; return (validation, test) predictions."""
    train_matrix = clf.Dataset(trn_x, label=trn_y)
    valid_matrix = clf.Dataset(val_x, label=val_y)
    # NOTE(review): 'nthread' and 'n_jobs' look like two spellings of the same
    # thread-count setting with different values -- confirm which is intended.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.1,
        'seed': 2020,
        'nthread': 28,
        'n_jobs': 24,
        'silent': True,
        'verbose': -1,
    }
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases; newer releases expect callbacks instead --
    # confirm against the installed version before upgrading.
    model = clf.train(params, train_matrix, 50000,
                      valid_sets=[train_matrix, valid_matrix],
                      verbose_eval=200, early_stopping_rounds=200)
    # Predict with the best iteration found by early stopping.
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_x, num_iteration=model.best_iteration)
    return val_pred, test_pred


def _fit_predict_xgb(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Fit XGBoost on one fold; return (validation, test) predictions."""
    train_matrix = clf.DMatrix(trn_x, label=trn_y)
    valid_matrix = clf.DMatrix(val_x, label=val_y)
    test_matrix = clf.DMatrix(test_x)
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.04,
        'tree_method': 'exact',
        'seed': 2020,
        'nthread': 36,
        "silent": True,
    }
    # The last entry of the watchlist is the one used for early stopping.
    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
    model = clf.train(params, train_matrix, num_boost_round=50000,
                      evals=watchlist, verbose_eval=200,
                      early_stopping_rounds=200)
    # NOTE(review): ntree_limit / best_ntree_limit belong to older XGBoost
    # releases; newer ones use iteration_range / best_iteration -- confirm
    # against the installed version before upgrading.
    val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
    return val_pred, test_pred


def _fit_predict_cat(clf, trn_x, trn_y, val_x, val_y, test_x):
    """Fit a CatBoost estimator on one fold; return (validation, test) predictions."""
    params = {
        'learning_rate': 0.05,
        'depth': 5,
        'l2_leaf_reg': 10,
        'bootstrap_type': 'Bernoulli',
        'od_type': 'Iter',
        'od_wait': 50,
        'random_seed': 11,
        'allow_writing_files': False,
    }
    model = clf(iterations=20000, **params)
    model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
              cat_features=[], use_best_model=True, verbose=500)
    return model.predict(val_x), model.predict(test_x)


def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Cross-validate one boosting backend and predict the test set.

    The training rows are split into 5 shuffled folds (fixed seed 2020). For
    each fold a model is fitted on the other four folds, scored with ROC AUC
    on the held-out fold, and used to predict ``test_x``. The running list of
    fold scores is printed after each fold, followed by a summary at the end.

    Args:
        clf: Backend handle matching ``clf_name``: a module exposing
            ``Dataset``/``train`` for ``"lgb"``, a module exposing
            ``DMatrix``/``train`` for ``"xgb"``, or an estimator class that is
            instantiated as ``clf(iterations=..., **params)`` for ``"cat"``.
        train_x: Training features; must support ``.iloc`` row selection.
        train_y: Training labels, aligned row by row with ``train_x``.
        test_x: Test features with the same columns as ``train_x``.
        clf_name: One of ``"lgb"``, ``"xgb"`` or ``"cat"``.

    Returns:
        A tuple ``(oof_pred, test_pred)`` of 1-D arrays: the out-of-fold
        prediction for every training row, and the test prediction averaged
        over all folds.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported backends.
    """
    fold_trainers = {
        "lgb": _fit_predict_lgb,
        "xgb": _fit_predict_xgb,
        "cat": _fit_predict_cat,
    }
    if clf_name not in fold_trainers:
        raise ValueError(
            "unknown clf_name %r; expected one of %s"
            % (clf_name, sorted(fold_trainers))
        )
    fit_predict = fold_trainers[clf_name]

    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positional indices, so select labels by position; indexing
    # a Series with them would be label-based and wrong for any index other
    # than 0..n-1.
    labels = np.asarray(train_y)
    oof_pred = np.zeros(train_x.shape[0])
    test_pred_mean = np.zeros(test_x.shape[0])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        val_pred, test_pred = fit_predict(clf, trn_x, trn_y, val_x, val_y, test_x)

        oof_pred[valid_index] = val_pred
        # Accumulate each fold's share so the result is the mean over folds
        # (plain assignment here would keep only the last fold's share).
        test_pred_mean += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_pred_mean
def lgb_model(x_train, y_train, x_test):
    """Run the cross-validated LightGBM pipeline.

    Returns the pair produced by ``cv_model``: predictions for the training
    rows and predictions for the test rows.
    """
    train_pred, test_pred = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return (train_pred, test_pred)
def xgb_model(x_train, y_train, x_test):
    """Run the cross-validated XGBoost pipeline.

    Returns the pair produced by ``cv_model``: predictions for the training
    rows and predictions for the test rows.
    """
    train_pred, test_pred = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return (train_pred, test_pred)
def cat_model(x_train, y_train, x_test):
    """Run the cross-validated CatBoost pipeline (``CatBoostRegressor``).

    Returns the pair produced by ``cv_model``: predictions for the training
    rows and predictions for the test rows.
    """
    train_pred, test_pred = cv_model(
        CatBoostRegressor, x_train, y_train, x_test, "cat"
    )
    return (train_pred, test_pred)
# Run the LightGBM cross-validation; keep the training-row predictions and
# the test-row predictions it returns.
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds.
[200] training's auc: 0.742884 valid_1's auc: 0.73055
[400] training's auc: 0.755686 valid_1's auc: 0.731888
[600] training's auc: 0.766421 valid_1's auc: 0.731988
[800] training's auc: 0.776244 valid_1's auc: 0.731868
Early stopping, best iteration is:
[656] training's auc: 0.769146 valid_1's auc: 0.732081
[0.7320814878889421]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds.
[200] training's auc: 0.74372 valid_1's auc: 0.726466
[400] training's auc: 0.756459 valid_1's auc: 0.727727
[600] training's auc: 0.767156 valid_1's auc: 0.727776
Early stopping, best iteration is:
[520] training's auc: 0.762985 valid_1's auc: 0.727902
[0.7320814878889421, 0.7279015876934286]
************************************ 3 ************************************
Training until validation scores don't improve for 200 rounds.
[200] training's auc: 0.742884 valid_1's auc: 0.731466
[400] training's auc: 0.755466 valid_1's auc: 0.732748
[600] training's auc: 0.766313 valid_1's auc: 0.733069
[800] training's auc: 0.776349 valid_1's auc: 0.732892
Early stopping, best iteration is:
[694] training's auc: 0.771133 valid_1's auc: 0.73312
[0.7320814878889421, 0.7279015876934286, 0.7331203287449972]
************************************ 4 ************************************
Training until validation scores don't improve for 200 rounds.
[200] training's auc: 0.742632 valid_1's auc: 0.730114
[400] training's auc: 0.755357 valid_1's auc: 0.731443
[600] training's auc: 0.765983 valid_1's auc: 0.731566
[800] training's auc: 0.776112 valid_1's auc: 0.731805
Early stopping, best iteration is:
[706] training's auc: 0.771324 valid_1's auc: 0.731887
[0.7320814878889421, 0.7279015876934286, 0.7331203287449972, 0.731886588682118]
************************************ 5 ************************************
Training until validation scores don't improve for 200 rounds.
[200] training's auc: 0.743113 valid_1's auc: 0.729226
[400] training's auc: 0.7559 valid_1's auc: 0.730816
[600] training's auc: 0.766388 valid_1's auc: 0.73092
[800] training's auc: 0.77627 valid_1's auc: 0.731029
[1000] training's auc: 0.785791 valid_1's auc: 0.730933
Early stopping, best iteration is:
[883] training's auc: 0.780369 valid_1's auc: 0.731096
[0.7320814878889421, 0.7279015876934286, 0.7331203287449972, 0.731886588682118, 0.7310960057774112]
lgb_scotrainre_list: [0.7320814878889421, 0.7279015876934286, 0.7331203287449972, 0.731886588682118, 0.7310960057774112]
lgb_score_mean: 0.7312171997573793
lgb_score_std: 0.001779041696522632
# Run the XGBoost cross-validation; keep the training-row predictions and
# the test-row predictions it returns.
xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)
************************************ 1 ************************************
[0] train-auc:0.677293 eval-auc:0.678869
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
Will train until eval-auc hasn't improved in 200 rounds.
[200] train-auc:0.727527 eval-auc:0.723771
[400] train-auc:0.73516 eval-auc:0.727725
[600] train-auc:0.740458 eval-auc:0.729631
[800] train-auc:0.744963 eval-auc:0.730829
[1000] train-auc:0.748802 eval-auc:0.731495
[1200] train-auc:0.752295 eval-auc:0.732074
[1400] train-auc:0.755574 eval-auc:0.732421
[1600] train-auc:0.758671 eval-auc:0.732674
[1800] train-auc:0.761605 eval-auc:0.732964
[2000] train-auc:0.764627 eval-auc:0.733111
[2200] train-auc:0.767443 eval-auc:0.733201
[2400] train-auc:0.770204 eval-auc:0.733224
Stopping. Best iteration:
[2328] train-auc:0.7692 eval-auc:0.733246
[0.7332460852050292]
************************************ 2 ************************************
[0] train-auc:0.677718 eval-auc:0.672523
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
Will train until eval-auc hasn't improved in 200 rounds.
[200] train-auc:0.728628 eval-auc:0.720255
[400] train-auc:0.736149 eval-auc:0.724308
[600] train-auc:0.741354 eval-auc:0.726443
[800] train-auc:0.745611 eval-auc:0.72746
[1000] train-auc:0.749627 eval-auc:0.728194
[1200] train-auc:0.753176 eval-auc:0.728711
[1400] train-auc:0.756476 eval-auc:0.72899
[1600] train-auc:0.759574 eval-auc:0.729224
[1800] train-auc:0.762608 eval-auc:0.729501
[2000] train-auc:0.765549 eval-auc:0.729627
[2200] train-auc:0.768304 eval-auc:0.729782
[2400] train-auc:0.771131 eval-auc:0.729922
[2600] train-auc:0.773769 eval-auc:0.729961
[2800] train-auc:0.776371 eval-auc:0.72999
Stopping. Best iteration:
[2697] train-auc:0.775119 eval-auc:0.730036
[0.7332460852050292, 0.7300358478747684]
************************************ 3 ************************************
[0] train-auc:0.676641 eval-auc:0.67765
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
Will train until eval-auc hasn't improved in 200 rounds.
[200] train-auc:0.72757 eval-auc:0.724632
[400] train-auc:0.735185 eval-auc:0.728571
[600] train-auc:0.740671 eval-auc:0.73067
[800] train-auc:0.745049 eval-auc:0.731899
[1000] train-auc:0.748976 eval-auc:0.732787
[1200] train-auc:0.752383 eval-auc:0.73321
[1400] train-auc:0.75564 eval-auc:0.733548
[1600] train-auc:0.758796 eval-auc:0.733825
[1800] train-auc:0.761717 eval-auc:0.734007
[2000] train-auc:0.76459 eval-auc:0.734193
[2200] train-auc:0.767399 eval-auc:0.734261
[2400] train-auc:0.770174 eval-auc:0.734362
[2600] train-auc:0.772818 eval-auc:0.734369
[2800] train-auc:0.775568 eval-auc:0.734391
[3000] train-auc:0.777985 eval-auc:0.73444
[3200] train-auc:0.780514 eval-auc:0.734477
[3400] train-auc:0.782893 eval-auc:0.734427
Stopping. Best iteration:
[3207] train-auc:0.780621 eval-auc:0.734494
[0.7332460852050292, 0.7300358478747684, 0.7344942212088965]
************************************ 4 ************************************
[0] train-auc:0.677768 eval-auc:0.677179
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
Will train until eval-auc hasn't improved in 200 rounds.
[200] train-auc:0.727614 eval-auc:0.72295
[400] train-auc:0.735165 eval-auc:0.726994
[600] train-auc:0.740498 eval-auc:0.729116
[800] train-auc:0.744884 eval-auc:0.730417
[1000] train-auc:0.748782 eval-auc:0.731318
[1200] train-auc:0.75225 eval-auc:0.731899
[1400] train-auc:0.755505 eval-auc:0.732295
[1600] train-auc:0.758618 eval-auc:0.732629
[1800] train-auc:0.76176 eval-auc:0.733046
[2000] train-auc:0.764736 eval-auc:0.733189
[2200] train-auc:0.767476 eval-auc:0.733276
[2400] train-auc:0.770154 eval-auc:0.733409
[2600] train-auc:0.772874 eval-auc:0.733469
[2800] train-auc:0.77541 eval-auc:0.733405
Stopping. Best iteration:
[2644] train-auc:0.773429 eval-auc:0.733488
[0.7332460852050292, 0.7300358478747684, 0.7344942212088965, 0.7334876284761012]
************************************ 5 ************************************
[0] train-auc:0.677768 eval-auc:0.676353
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
Will train until eval-auc hasn't improved in 200 rounds.
[200] train-auc:0.728072 eval-auc:0.722913
[400] train-auc:0.735517 eval-auc:0.726582
[600] train-auc:0.740782 eval-auc:0.728449
[800] train-auc:0.745258 eval-auc:0.729653
[1000] train-auc:0.749185 eval-auc:0.730489
[1200] train-auc:0.752723 eval-auc:0.731038
[1400] train-auc:0.755985 eval-auc:0.731466
[1600] train-auc:0.759166 eval-auc:0.731758
[1800] train-auc:0.762205 eval-auc:0.73199
[2000] train-auc:0.765197 eval-auc:0.732145
[2200] train-auc:0.767976 eval-auc:0.732194
Stopping. Best iteration:
[2191] train-auc:0.767852 eval-auc:0.732213
[0.7332460852050292, 0.7300358478747684, 0.7344942212088965, 0.7334876284761012, 0.7322134048106561]
xgb_scotrainre_list: [0.7332460852050292, 0.7300358478747684, 0.7344942212088965, 0.7334876284761012, 0.7322134048106561]
xgb_score_mean: 0.7326954375150903
xgb_score_std: 0.0015147392354657807
# Run the CatBoost cross-validation; keep the training-row predictions and
# the test-row predictions it returns.
# NOTE(review): the final blend at the end of the file uses only lgb_test and
# xgb_test, so cat_test is currently unused -- confirm whether that is intended.
cat_train, cat_test = cat_model(x_train, y_train, x_test)
************************************ 1 ************************************
0: learn: 0.4415198 test: 0.4387088 best: 0.4387088 (0) total: 111ms remaining: 37m 6s
500: learn: 0.3772118 test: 0.3759665 best: 0.3759665 (500) total: 37.7s remaining: 24m 25s
1000: learn: 0.3756709 test: 0.3752058 best: 0.3752058 (1000) total: 1m 14s remaining: 23m 41s
1500: learn: 0.3745785 test: 0.3748423 best: 0.3748423 (1500) total: 1m 52s remaining: 23m 7s
2000: learn: 0.3736834 test: 0.3746564 best: 0.3746564 (2000) total: 2m 29s remaining: 22m 28s
2500: learn: 0.3728568 test: 0.3745180 best: 0.3745165 (2492) total: 3m 7s remaining: 21m 52s
3000: learn: 0.3720793 test: 0.3744201 best: 0.3744198 (2998) total: 3m 44s remaining: 21m 14s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3744006318
bestIteration = 3086
Shrink model to first 3087 iterations.
[0.7326058985428212]
************************************ 2 ************************************
0: learn: 0.4406928 test: 0.4420714 best: 0.4420714 (0) total: 53.3ms remaining: 17m 46s
500: learn: 0.3765250 test: 0.3787287 best: 0.3787287 (500) total: 38.7s remaining: 25m 8s
1000: learn: 0.3749822 test: 0.3779503 best: 0.3779503 (998) total: 1m 16s remaining: 24m 18s
1500: learn: 0.3738772 test: 0.3775654 best: 0.3775654 (1500) total: 1m 54s remaining: 23m 34s
2000: learn: 0.3729354 test: 0.3773407 best: 0.3773401 (1999) total: 2m 33s remaining: 22m 56s
2500: learn: 0.3721077 test: 0.3771987 best: 0.3771971 (2496) total: 3m 10s remaining: 22m 15s
3000: learn: 0.3713621 test: 0.3771114 best: 0.3771114 (3000) total: 3m 49s remaining: 21m 37s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3770400469
bestIteration = 3382
Shrink model to first 3383 iterations.
[0.7326058985428212, 0.7292909146788396]
************************************ 3 ************************************
0: learn: 0.4408230 test: 0.4418939 best: 0.4418939 (0) total: 59.1ms remaining: 19m 42s
500: learn: 0.3767851 test: 0.3776319 best: 0.3776319 (500) total: 40.4s remaining: 26m 12s
1000: learn: 0.3752331 test: 0.3768292 best: 0.3768292 (1000) total: 1m 20s remaining: 25m 19s
1500: learn: 0.3741550 test: 0.3764926 best: 0.3764926 (1500) total: 2m remaining: 24m 39s
2000: learn: 0.3732520 test: 0.3762840 best: 0.3762832 (1992) total: 2m 40s remaining: 24m 2s
2500: learn: 0.3724303 test: 0.3761303 best: 0.3761279 (2490) total: 3m 20s remaining: 23m 22s
3000: learn: 0.3716684 test: 0.3760402 best: 0.3760395 (2995) total: 4m remaining: 22m 42s
3500: learn: 0.3709308 test: 0.3759509 best: 0.3759502 (3495) total: 4m 40s remaining: 22m 2s
4000: learn: 0.3702269 test: 0.3759039 best: 0.3759027 (3993) total: 5m 20s remaining: 21m 20s
4500: learn: 0.3695477 test: 0.3758698 best: 0.3758663 (4459) total: 6m remaining: 20m 40s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3758663409
bestIteration = 4459
Shrink model to first 4460 iterations.
[0.7326058985428212, 0.7292909146788396, 0.7341207611812285]
************************************ 4 ************************************
0: learn: 0.4408778 test: 0.4413264 best: 0.4413264 (0) total: 46.6ms remaining: 15m 32s
500: learn: 0.3768022 test: 0.3777678 best: 0.3777678 (500) total: 40.3s remaining: 26m 7s
1000: learn: 0.3753097 test: 0.3769403 best: 0.3769403 (1000) total: 1m 20s remaining: 25m 24s
1500: learn: 0.3742418 test: 0.3765698 best: 0.3765698 (1500) total: 2m remaining: 24m 41s
2000: learn: 0.3733478 test: 0.3763500 best: 0.3763496 (1998) total: 2m 40s remaining: 23m 59s
2500: learn: 0.3725263 test: 0.3762101 best: 0.3762093 (2488) total: 3m 20s remaining: 23m 19s
3000: learn: 0.3717486 test: 0.3760966 best: 0.3760966 (2999) total: 3m 59s remaining: 22m 36s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3760182133
bestIteration = 3432
Shrink model to first 3433 iterations.
[0.7326058985428212, 0.7292909146788396, 0.7341207611812285, 0.7324483603137153]
************************************ 5 ************************************
0: learn: 0.4409876 test: 0.4409159 best: 0.4409159 (0) total: 52.3ms remaining: 17m 26s
500: learn: 0.3768055 test: 0.3776229 best: 0.3776229 (500) total: 38s remaining: 24m 38s
1000: learn: 0.3752600 test: 0.3768397 best: 0.3768397 (1000) total: 1m 15s remaining: 23m 57s
1500: learn: 0.3741843 test: 0.3764855 best: 0.3764855 (1500) total: 1m 53s remaining: 23m 16s
2000: learn: 0.3732691 test: 0.3762491 best: 0.3762490 (1998) total: 2m 31s remaining: 22m 40s
2500: learn: 0.3724407 test: 0.3761154 best: 0.3761154 (2500) total: 3m 9s remaining: 22m 5s
3000: learn: 0.3716764 test: 0.3760184 best: 0.3760184 (3000) total: 3m 47s remaining: 21m 26s
3500: learn: 0.3709545 test: 0.3759453 best: 0.3759453 (3500) total: 4m 24s remaining: 20m 47s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3759421091
bestIteration = 3544
Shrink model to first 3545 iterations.
[0.7326058985428212, 0.7292909146788396, 0.7341207611812285, 0.7324483603137153, 0.7312334660628076]
cat_scotrainre_list: [0.7326058985428212, 0.7292909146788396, 0.7341207611812285, 0.7324483603137153, 0.7312334660628076]
cat_score_mean: 0.7319398801558824
cat_score_std: 0.001610863965629903
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-25-2e9bafef31e8> in <module>
----> 1 cat_train, cat_test = cat_model(x_train, y_train, x_test)
TypeError: 'NoneType' object is not iterable
# Blend the LightGBM and XGBoost test predictions with equal weights.
rh_test = 0.5 * lgb_test + 0.5 * xgb_test
# Attach the blend to the test frame and write the two submission columns,
# without the row index, to 'test_sub.csv'.
testA['isDefault'] = rh_test
testA.to_csv('test_sub.csv', columns=['id', 'isDefault'], index=False)