import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
train = pd.read_csv('train.csv')
testA = pd.read_csv('testA.csv')

train.head()

# concatenate train and test so the preprocessing below is applied to both consistently
data = pd.concat([train, testA], axis=0, ignore_index=True)

Data Preprocessing

  • Several variables, such as grade, subGrade, employmentLength, issueDate and earliesCreditLine, cannot be fed to the models as-is and need to be preprocessed first.
print(sorted(data['grade'].unique()))
print(sorted(data['subGrade'].unique()))
['A', 'B', 'C', 'D', 'E', 'F', 'G']
['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']
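  • grade and subGrade are ordinal (A through G), so an alternative to the one-hot encoding used later is a simple ordinal map. A minimal sketch, for illustration only (the grade_ord/subGrade_ord columns are hypothetical and not used in the pipeline below):

grade_map = {g: i for i, g in enumerate(['A', 'B', 'C', 'D', 'E', 'F', 'G'], start=1)}
data['grade_ord'] = data['grade'].map(grade_map)
# 'C4' -> (3 - 1) * 5 + 4 = 14, i.e. letter rank times 5 plus the digit
data['subGrade_ord'] = data['subGrade'].apply(lambda s: (grade_map[s[0]] - 1) * 5 + int(s[1]))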
data['employmentLength'].value_counts(dropna=False).sort_index()
1 year        65671
10+ years    328525
2 years       90565
3 years       80163
4 years       59818
5 years       62645
6 years       46582
7 years       44230
8 years       45168
9 years       37866
< 1 year      80226
NaN           58541
Name: employmentLength, dtype: int64
  • First, convert employmentLength to a numeric value.
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])

data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data['employmentLength'].value_counts(dropna=False).sort_index()
0.0      80226
1.0      65671
2.0      90565
3.0      80163
4.0      59818
5.0      62645
6.0      46582
7.0      44230
8.0      45168
9.0      37866
10.0    328525
NaN      58541
Name: employmentLength, dtype: int64
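  • For reference, the same conversion can also be written in vectorized form with pandas string methods. A minimal sketch, assuming it runs on the raw string column (emp_raw/emp_num are illustrative names):

# '10+ years' -> 10, '< 1 year' -> 0, 'n years' -> n; NaN stays NaN
emp_raw = data['employmentLength']
emp_num = (emp_raw
           .replace({'10+ years': '10 years', '< 1 year': '0 years'})
           .str.split().str[0]
           .astype('float'))  # float dtype so missing values survive as NaN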
  • Preprocess earliesCreditLine.
data['earliesCreditLine'].sample(5)
375743    Jun-2003
361340    Jul-1999
716602    Aug-1995
893559    Oct-1982
221525    Nov-2004
Name: earliesCreditLine, dtype: object
# keep only the year part of strings like 'Jun-2003'
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data['earliesCreditLine'].describe()
count    1000000.000000
mean        1998.688632
std            7.606231
min         1944.000000
25%         1995.000000
50%         2000.000000
75%         2004.000000
max         2015.000000
Name: earliesCreditLine, dtype: float64
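  • If the month is worth keeping as well, the column can be parsed as a proper date instead of slicing off the year. A minimal sketch, assuming it runs on the raw 'Mon-YYYY' strings before the conversion above (the *_year/*_month columns are hypothetical):

parsed = pd.to_datetime(data['earliesCreditLine'], format='%b-%Y')
data['earliesCreditLine_year'] = parsed.dt.year
data['earliesCreditLine_month'] = parsed.dt.month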
data.head()
  • Categorical feature processing
# a subset of the categorical features
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose',
                 'postCode', 'regionCode', 'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
    print(f, 'number of categories:', data[f].nunique())
grade number of categories: 7
subGrade number of categories: 35
employmentTitle number of categories: 298101
homeOwnership number of categories: 6
verificationStatus number of categories: 3
purpose number of categories: 14
postCode number of categories: 935
regionCode number of categories: 51
applicationType number of categories: 2
initialListStatus number of categories: 2
title number of categories: 47903
policyCode number of categories: 1
# one-hot encode the features with more than 2 categories that are not high-dimensional and sparse
data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
# high-cardinality categorical features are converted to count and rank encodings
for f in ['employmentTitle', 'postCode', 'title']:
    data[f+'_cnts'] = data.groupby([f])['id'].transform('count')
    data[f+'_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int)
    del data[f]
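  • To see what the count/rank encoding produces, here is a toy run of the same transform on made-up data (for illustration only):

toy = pd.DataFrame({'id': [1, 2, 3, 4], 'postCode': ['A', 'A', 'B', 'A']})
toy['postCode_cnts'] = toy.groupby('postCode')['id'].transform('count')   # group size
toy['postCode_rank'] = toy.groupby('postCode')['id'].rank(ascending=False).astype(int)  # rank of id within group
print(toy)
#    id postCode  postCode_cnts  postCode_rank
# 0   1        A              3              3
# 1   2        A              3              2
# 2   3        B              1              1
# 3   4        A              3              1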

Training/Test Data Preparation

features = [f for f in data.columns if f not in ['id', 'issueDate', 'isDefault']]

train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)

x_train = train[features]
x_test = test[features]
y_train = train['isDefault']
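  • A quick sanity check before training can catch any leftover non-numeric columns; a small sketch, not part of the original notebook:

# every remaining feature should be numeric, otherwise the tree models below would fail
assert x_train.select_dtypes(include='object').shape[1] == 0
print(x_train.shape, x_test.shape)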

Model Training

  • A single function is built that can call any of the three tree models, which keeps things quick and convenient.
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])  # out-of-fold predictions on the training set
    test = np.zeros(test_x.shape[0])    # test predictions, averaged over the folds
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])

        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)

            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      'silent': True,
                      }

            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist,
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}

            # a CatBoost regressor is used here; its raw outputs are scored with AUC below
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)

            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        train[valid_index] = val_pred
        test += test_pred / kf.n_splits  # accumulate so `test` ends up as the 5-fold average
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds.
[200] training's auc: 0.742884 valid_1's auc: 0.73055
[400] training's auc: 0.755686 valid_1's auc: 0.731888
[600] training's auc: 0.766421 valid_1's auc: 0.731988
[800] training's auc: 0.776244 valid_1's auc: 0.731868
Early stopping, best iteration is:
[656] training's auc: 0.769146 valid_1's auc: 0.732081
[0.7320814878889421]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds.
[200] training's auc: 0.74372 valid_1's auc: 0.726466
[400] training's auc: 0.756459 valid_1's auc: 0.727727
[600] training's auc: 0.767156 valid_1's auc: 0.727776
Early stopping, best iteration is:
[520] training's auc: 0.762985 valid_1's auc: 0.727902
[0.7320814878889421, 0.7279015876934286]
************************************ 3 ************************************
Training until validation scores don't improve for 200 rounds.
[200] training's auc: 0.742884 valid_1's auc: 0.731466
[400] training's auc: 0.755466 valid_1's auc: 0.732748
[600] training's auc: 0.766313 valid_1's auc: 0.733069
[800] training's auc: 0.776349 valid_1's auc: 0.732892
Early stopping, best iteration is:
[694] training's auc: 0.771133 valid_1's auc: 0.73312
[0.7320814878889421, 0.7279015876934286, 0.7331203287449972]
************************************ 4 ************************************
Training until validation scores don't improve for 200 rounds.
[200] training's auc: 0.742632 valid_1's auc: 0.730114
[400] training's auc: 0.755357 valid_1's auc: 0.731443
[600] training's auc: 0.765983 valid_1's auc: 0.731566
[800] training's auc: 0.776112 valid_1's auc: 0.731805
Early stopping, best iteration is:
[706] training's auc: 0.771324 valid_1's auc: 0.731887
[0.7320814878889421, 0.7279015876934286, 0.7331203287449972, 0.731886588682118]
************************************ 5 ************************************
Training until validation scores don't improve for 200 rounds.
[200] training's auc: 0.743113 valid_1's auc: 0.729226
[400] training's auc: 0.7559 valid_1's auc: 0.730816
[600] training's auc: 0.766388 valid_1's auc: 0.73092
[800] training's auc: 0.77627 valid_1's auc: 0.731029
[1000] training's auc: 0.785791 valid_1's auc: 0.730933
Early stopping, best iteration is:
[883] training's auc: 0.780369 valid_1's auc: 0.731096
[0.7320814878889421, 0.7279015876934286, 0.7331203287449972, 0.731886588682118, 0.7310960057774112]
lgb_score_list: [0.7320814878889421, 0.7279015876934286, 0.7331203287449972, 0.731886588682118, 0.7310960057774112]
lgb_score_mean: 0.7312171997573793
lgb_score_std: 0.001779041696522632
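  • The first value returned by cv_model holds the out-of-fold predictions, so a single overall CV score can be computed alongside the per-fold scores above; a quick check, assuming the cv_model defined earlier:

print('lgb OOF AUC:', roc_auc_score(y_train, lgb_train))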
xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)
************************************ 1 ************************************
[0] train-auc:0.677293 eval-auc:0.678869
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
Will train until eval-auc hasn't improved in 200 rounds.
[200] train-auc:0.727527 eval-auc:0.723771
[400] train-auc:0.73516 eval-auc:0.727725
[600] train-auc:0.740458 eval-auc:0.729631
[800] train-auc:0.744963 eval-auc:0.730829
[1000] train-auc:0.748802 eval-auc:0.731495
[1200] train-auc:0.752295 eval-auc:0.732074
[1400] train-auc:0.755574 eval-auc:0.732421
[1600] train-auc:0.758671 eval-auc:0.732674
[1800] train-auc:0.761605 eval-auc:0.732964
[2000] train-auc:0.764627 eval-auc:0.733111
[2200] train-auc:0.767443 eval-auc:0.733201
[2400] train-auc:0.770204 eval-auc:0.733224
Stopping. Best iteration:
[2328] train-auc:0.7692 eval-auc:0.733246
[0.7332460852050292]
************************************ 2 ************************************
[0] train-auc:0.677718 eval-auc:0.672523
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
Will train until eval-auc hasn't improved in 200 rounds.
[200] train-auc:0.728628 eval-auc:0.720255
[400] train-auc:0.736149 eval-auc:0.724308
[600] train-auc:0.741354 eval-auc:0.726443
[800] train-auc:0.745611 eval-auc:0.72746
[1000] train-auc:0.749627 eval-auc:0.728194
[1200] train-auc:0.753176 eval-auc:0.728711
[1400] train-auc:0.756476 eval-auc:0.72899
[1600] train-auc:0.759574 eval-auc:0.729224
[1800] train-auc:0.762608 eval-auc:0.729501
[2000] train-auc:0.765549 eval-auc:0.729627
[2200] train-auc:0.768304 eval-auc:0.729782
[2400] train-auc:0.771131 eval-auc:0.729922
[2600] train-auc:0.773769 eval-auc:0.729961
[2800] train-auc:0.776371 eval-auc:0.72999
Stopping. Best iteration:
[2697] train-auc:0.775119 eval-auc:0.730036
[0.7332460852050292, 0.7300358478747684]
************************************ 3 ************************************
[0] train-auc:0.676641 eval-auc:0.67765
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
Will train until eval-auc hasn't improved in 200 rounds.
[200] train-auc:0.72757 eval-auc:0.724632
[400] train-auc:0.735185 eval-auc:0.728571
[600] train-auc:0.740671 eval-auc:0.73067
[800] train-auc:0.745049 eval-auc:0.731899
[1000] train-auc:0.748976 eval-auc:0.732787
[1200] train-auc:0.752383 eval-auc:0.73321
[1400] train-auc:0.75564 eval-auc:0.733548
[1600] train-auc:0.758796 eval-auc:0.733825
[1800] train-auc:0.761717 eval-auc:0.734007
[2000] train-auc:0.76459 eval-auc:0.734193
[2200] train-auc:0.767399 eval-auc:0.734261
[2400] train-auc:0.770174 eval-auc:0.734362
[2600] train-auc:0.772818 eval-auc:0.734369
[2800] train-auc:0.775568 eval-auc:0.734391
[3000] train-auc:0.777985 eval-auc:0.73444
[3200] train-auc:0.780514 eval-auc:0.734477
[3400] train-auc:0.782893 eval-auc:0.734427
Stopping. Best iteration:
[3207] train-auc:0.780621 eval-auc:0.734494
[0.7332460852050292, 0.7300358478747684, 0.7344942212088965]
************************************ 4 ************************************
[0] train-auc:0.677768 eval-auc:0.677179
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
Will train until eval-auc hasn't improved in 200 rounds.
[200] train-auc:0.727614 eval-auc:0.72295
[400] train-auc:0.735165 eval-auc:0.726994
[600] train-auc:0.740498 eval-auc:0.729116
[800] train-auc:0.744884 eval-auc:0.730417
[1000] train-auc:0.748782 eval-auc:0.731318
[1200] train-auc:0.75225 eval-auc:0.731899
[1400] train-auc:0.755505 eval-auc:0.732295
[1600] train-auc:0.758618 eval-auc:0.732629
[1800] train-auc:0.76176 eval-auc:0.733046
[2000] train-auc:0.764736 eval-auc:0.733189
[2200] train-auc:0.767476 eval-auc:0.733276
[2400] train-auc:0.770154 eval-auc:0.733409
[2600] train-auc:0.772874 eval-auc:0.733469
[2800] train-auc:0.77541 eval-auc:0.733405
Stopping. Best iteration:
[2644] train-auc:0.773429 eval-auc:0.733488
[0.7332460852050292, 0.7300358478747684, 0.7344942212088965, 0.7334876284761012]
************************************ 5 ************************************
[0] train-auc:0.677768 eval-auc:0.676353
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.
Will train until eval-auc hasn't improved in 200 rounds.
[200] train-auc:0.728072 eval-auc:0.722913
[400] train-auc:0.735517 eval-auc:0.726582
[600] train-auc:0.740782 eval-auc:0.728449
[800] train-auc:0.745258 eval-auc:0.729653
[1000] train-auc:0.749185 eval-auc:0.730489
[1200] train-auc:0.752723 eval-auc:0.731038
[1400] train-auc:0.755985 eval-auc:0.731466
[1600] train-auc:0.759166 eval-auc:0.731758
[1800] train-auc:0.762205 eval-auc:0.73199
[2000] train-auc:0.765197 eval-auc:0.732145
[2200] train-auc:0.767976 eval-auc:0.732194
Stopping. Best iteration:
[2191] train-auc:0.767852 eval-auc:0.732213
[0.7332460852050292, 0.7300358478747684, 0.7344942212088965, 0.7334876284761012, 0.7322134048106561]
xgb_score_list: [0.7332460852050292, 0.7300358478747684, 0.7344942212088965, 0.7334876284761012, 0.7322134048106561]
xgb_score_mean: 0.7326954375150903
xgb_score_std: 0.0015147392354657807
cat_train, cat_test = cat_model(x_train, y_train, x_test)
************************************ 1 ************************************
0: learn: 0.4415198 test: 0.4387088 best: 0.4387088 (0) total: 111ms remaining: 37m 6s
500: learn: 0.3772118 test: 0.3759665 best: 0.3759665 (500) total: 37.7s remaining: 24m 25s
1000: learn: 0.3756709 test: 0.3752058 best: 0.3752058 (1000) total: 1m 14s remaining: 23m 41s
1500: learn: 0.3745785 test: 0.3748423 best: 0.3748423 (1500) total: 1m 52s remaining: 23m 7s
2000: learn: 0.3736834 test: 0.3746564 best: 0.3746564 (2000) total: 2m 29s remaining: 22m 28s
2500: learn: 0.3728568 test: 0.3745180 best: 0.3745165 (2492) total: 3m 7s remaining: 21m 52s
3000: learn: 0.3720793 test: 0.3744201 best: 0.3744198 (2998) total: 3m 44s remaining: 21m 14s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3744006318
bestIteration = 3086
Shrink model to first 3087 iterations.
[0.7326058985428212]
************************************ 2 ************************************
0: learn: 0.4406928 test: 0.4420714 best: 0.4420714 (0) total: 53.3ms remaining: 17m 46s
500: learn: 0.3765250 test: 0.3787287 best: 0.3787287 (500) total: 38.7s remaining: 25m 8s
1000: learn: 0.3749822 test: 0.3779503 best: 0.3779503 (998) total: 1m 16s remaining: 24m 18s
1500: learn: 0.3738772 test: 0.3775654 best: 0.3775654 (1500) total: 1m 54s remaining: 23m 34s
2000: learn: 0.3729354 test: 0.3773407 best: 0.3773401 (1999) total: 2m 33s remaining: 22m 56s
2500: learn: 0.3721077 test: 0.3771987 best: 0.3771971 (2496) total: 3m 10s remaining: 22m 15s
3000: learn: 0.3713621 test: 0.3771114 best: 0.3771114 (3000) total: 3m 49s remaining: 21m 37s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3770400469
bestIteration = 3382
Shrink model to first 3383 iterations.
[0.7326058985428212, 0.7292909146788396]
************************************ 3 ************************************
0: learn: 0.4408230 test: 0.4418939 best: 0.4418939 (0) total: 59.1ms remaining: 19m 42s
500: learn: 0.3767851 test: 0.3776319 best: 0.3776319 (500) total: 40.4s remaining: 26m 12s
1000: learn: 0.3752331 test: 0.3768292 best: 0.3768292 (1000) total: 1m 20s remaining: 25m 19s
1500: learn: 0.3741550 test: 0.3764926 best: 0.3764926 (1500) total: 2m remaining: 24m 39s
2000: learn: 0.3732520 test: 0.3762840 best: 0.3762832 (1992) total: 2m 40s remaining: 24m 2s
2500: learn: 0.3724303 test: 0.3761303 best: 0.3761279 (2490) total: 3m 20s remaining: 23m 22s
3000: learn: 0.3716684 test: 0.3760402 best: 0.3760395 (2995) total: 4m remaining: 22m 42s
3500: learn: 0.3709308 test: 0.3759509 best: 0.3759502 (3495) total: 4m 40s remaining: 22m 2s
4000: learn: 0.3702269 test: 0.3759039 best: 0.3759027 (3993) total: 5m 20s remaining: 21m 20s
4500: learn: 0.3695477 test: 0.3758698 best: 0.3758663 (4459) total: 6m remaining: 20m 40s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3758663409
bestIteration = 4459
Shrink model to first 4460 iterations.
[0.7326058985428212, 0.7292909146788396, 0.7341207611812285]
************************************ 4 ************************************
0: learn: 0.4408778 test: 0.4413264 best: 0.4413264 (0) total: 46.6ms remaining: 15m 32s
500: learn: 0.3768022 test: 0.3777678 best: 0.3777678 (500) total: 40.3s remaining: 26m 7s
1000: learn: 0.3753097 test: 0.3769403 best: 0.3769403 (1000) total: 1m 20s remaining: 25m 24s
1500: learn: 0.3742418 test: 0.3765698 best: 0.3765698 (1500) total: 2m remaining: 24m 41s
2000: learn: 0.3733478 test: 0.3763500 best: 0.3763496 (1998) total: 2m 40s remaining: 23m 59s
2500: learn: 0.3725263 test: 0.3762101 best: 0.3762093 (2488) total: 3m 20s remaining: 23m 19s
3000: learn: 0.3717486 test: 0.3760966 best: 0.3760966 (2999) total: 3m 59s remaining: 22m 36s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3760182133
bestIteration = 3432
Shrink model to first 3433 iterations.
[0.7326058985428212, 0.7292909146788396, 0.7341207611812285, 0.7324483603137153]
************************************ 5 ************************************
0: learn: 0.4409876 test: 0.4409159 best: 0.4409159 (0) total: 52.3ms remaining: 17m 26s
500: learn: 0.3768055 test: 0.3776229 best: 0.3776229 (500) total: 38s remaining: 24m 38s
1000: learn: 0.3752600 test: 0.3768397 best: 0.3768397 (1000) total: 1m 15s remaining: 23m 57s
1500: learn: 0.3741843 test: 0.3764855 best: 0.3764855 (1500) total: 1m 53s remaining: 23m 16s
2000: learn: 0.3732691 test: 0.3762491 best: 0.3762490 (1998) total: 2m 31s remaining: 22m 40s
2500: learn: 0.3724407 test: 0.3761154 best: 0.3761154 (2500) total: 3m 9s remaining: 22m 5s
3000: learn: 0.3716764 test: 0.3760184 best: 0.3760184 (3000) total: 3m 47s remaining: 21m 26s
3500: learn: 0.3709545 test: 0.3759453 best: 0.3759453 (3500) total: 4m 24s remaining: 20m 47s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.3759421091
bestIteration = 3544
Shrink model to first 3545 iterations.
[0.7326058985428212, 0.7292909146788396, 0.7341207611812285, 0.7324483603137153, 0.7312334660628076]
cat_score_list: [0.7326058985428212, 0.7292909146788396, 0.7341207611812285, 0.7324483603137153, 0.7312334660628076]
cat_score_mean: 0.7319398801558824
cat_score_std: 0.001610863965629903
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-25-2e9bafef31e8> in <module>
----> 1 cat_train, cat_test = cat_model(x_train, y_train, x_test)

TypeError: 'NoneType' object is not iterable

  • The TypeError means cat_model returned None: in the run that produced this log, cv_model evidently finished its prints but never reached the `return train, test` statement (most likely an indentation slip), so the tuple unpacking failed. With the corrected function above the call returns normally. Since cat_train/cat_test were unavailable here, only the LightGBM and XGBoost predictions are blended below.
# blend the LightGBM and XGBoost test predictions with equal weights
rh_test = lgb_test * 0.5 + xgb_test * 0.5

testA['isDefault'] = rh_test

testA[['id', 'isDefault']].to_csv('test_sub.csv', index=False)
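  • Because the two base models output probabilities on slightly different scales, a rank-average blend is a common variant of this weighted average; a minimal sketch, not part of the original submission (test_sub_rank.csv is an illustrative filename):

from scipy.stats import rankdata

# average the per-model ranks instead of the raw probabilities, then rescale to [0, 1]
rank_blend = (rankdata(lgb_test) + rankdata(xgb_test)) / (2 * len(lgb_test))
testA['isDefault'] = rank_blend
testA[['id', 'isDefault']].to_csv('test_sub_rank.csv', index=False)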