https://mp.weixin.qq.com/s/5HK-NbSp9pkoTpONwx56uQ

    import gc
    import re

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import lightgbm as lgb
    from lightgbm import LGBMClassifier
    from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
    from sklearn.model_selection import KFold, StratifiedKFold
    from sklearn.preprocessing import LabelEncoder
    from dateutil.relativedelta import relativedelta

    train_data = pd.read_csv('raw_data/train_public.csv')
    submit_example = pd.read_csv('raw_data/submit_example.csv')
    test_public = pd.read_csv('raw_data/test_public.csv')
    train_inte = pd.read_csv('raw_data/train_internet.csv')

    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', 200)
    pd.set_option('display.float_format', lambda x: '%.3f' % x)
    def train_model(data_, test_, y_, folds_):
        oof_preds = np.zeros(data_.shape[0])
        sub_preds = np.zeros(test_.shape[0])
        feature_importance_df = pd.DataFrame()
        feats = [f for f in data_.columns if f not in ['loan_id', 'user_id', 'isDefault']]

        for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_)):
            trn_x, trn_y = data_[feats].iloc[trn_idx], y_.iloc[trn_idx]
            val_x, val_y = data_[feats].iloc[val_idx], y_.iloc[val_idx]

            clf = LGBMClassifier(
                n_estimators=4000,
                learning_rate=0.08,
                num_leaves=2 ** 5,
                colsample_bytree=.65,
                subsample=.9,
                max_depth=5,
                # max_bin=250,
                reg_alpha=.3,
                reg_lambda=.3,
                min_split_gain=.01,
                min_child_weight=2,
                silent=-1,   # removed in lightgbm >= 4.0
                verbose=-1,
            )
            # With lightgbm >= 4.0 pass early stopping and logging via
            # callbacks=[lgb.early_stopping(40), lgb.log_evaluation(100)] instead.
            clf.fit(trn_x, trn_y,
                    eval_set=[(trn_x, trn_y), (val_x, val_y)],
                    eval_metric='auc', verbose=100, early_stopping_rounds=40  # 30
                    )

            oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
            sub_preds += clf.predict_proba(test_[feats], num_iteration=clf.best_iteration_)[:, 1] / folds_.n_splits

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = clf.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

            print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
            del clf, trn_x, trn_y, val_x, val_y
            gc.collect()

        print('Full AUC score %.6f' % roc_auc_score(y_, oof_preds))  # was roc_auc_score(y, ...), which relied on a global

        test_['isDefault'] = sub_preds
        return oof_preds, test_[['loan_id', 'isDefault']], feature_importance_df
    def display_importances(feature_importance_df_):
        # Plot the 50 features with the highest mean importance across folds
        cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(
            by="importance", ascending=False)[:50].index
        best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
        plt.figure(figsize=(8, 10))
        sns.barplot(x="importance", y="feature",
                    data=best_features.sort_values(by="importance", ascending=False))
        plt.title('LightGBM Features (avg over folds)')
        plt.tight_layout()
        plt.savefig('lgbm_importances.png')
    def workYearDIc(x):
        # Map raw work_year strings to integers: NaN becomes -1,
        # '< 1' becomes 0, otherwise keep the first number in the string.
        if str(x) == 'nan':
            return -1
        x = x.replace('< 1', '0')
        return int(re.search(r'(\d+)', x).group())

    def findDig(val):
        # Normalize earlies_credit_mon strings so pd.to_datetime can parse them:
        # prefix '1-' when there is no leading 'digits-' group, otherwise append '-01'.
        fd = re.search(r'(\d+-)', val)
        if fd is None:
            return '1-' + val
        return val + '-01'
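
A quick sanity check of the two helpers above (a minimal sketch; the example strings merely follow the formats the regexes expect, they are not values taken from the actual dataset):

    # Hypothetical inputs, for illustration only
    print(workYearDIc('< 1 year'))    # -> 0
    print(workYearDIc('10+ years'))   # -> 10
    print(workYearDIc(float('nan')))  # -> -1
    print(findDig('Aug-95'))          # -> '1-Aug-95' (day prefix added)
    print(findDig('5-Aug'))           # -> '5-Aug-01' ('-01' appended)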
    class_dict = {
        'A': 1,
        'B': 2,
        'C': 3,
        'D': 4,
        'E': 5,
        'F': 6,
        'G': 7,
    }

    timeMax = pd.to_datetime('1-Dec-21')

    train_data['work_year'] = train_data['work_year'].map(workYearDIc)
    test_public['work_year'] = test_public['work_year'].map(workYearDIc)
    train_data['class'] = train_data['class'].map(class_dict)
    test_public['class'] = test_public['class'].map(class_dict)

    train_data['earlies_credit_mon'] = pd.to_datetime(train_data['earlies_credit_mon'].map(findDig))
    test_public['earlies_credit_mon'] = pd.to_datetime(test_public['earlies_credit_mon'].map(findDig))

    # Two-digit years can be parsed into the future (e.g. '30' -> 2030), so any
    # date later than Dec 2021 is really a 20th-century date: shift it back 100 years.
    mask = train_data['earlies_credit_mon'] > timeMax
    train_data.loc[mask, 'earlies_credit_mon'] = train_data.loc[mask, 'earlies_credit_mon'] + pd.offsets.DateOffset(years=-100)
    mask = test_public['earlies_credit_mon'] > timeMax
    test_public.loc[mask, 'earlies_credit_mon'] = test_public.loc[mask, 'earlies_credit_mon'] + pd.offsets.DateOffset(years=-100)

    train_data['issue_date'] = pd.to_datetime(train_data['issue_date'])
    test_public['issue_date'] = pd.to_datetime(test_public['issue_date'])
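
To see why that 100-year offset is needed, a minimal sketch, assuming the default pandas/dateutil two-digit-year pivot (which maps years under 100 into a window around the current year):

    # Hypothetical example: '30' is pivoted into the future by the parser
    d = pd.to_datetime('1-Aug-30')                 # Timestamp('2030-08-01 00:00:00')
    print(d + pd.offsets.DateOffset(years=-100))   # 1930-08-01, the intended date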
    # Internet data processing
    train_inte['work_year'] = train_inte['work_year'].map(workYearDIc)
    train_inte['class'] = train_inte['class'].map(class_dict)
    train_inte['earlies_credit_mon'] = pd.to_datetime(train_inte['earlies_credit_mon'])
    train_inte['issue_date'] = pd.to_datetime(train_inte['issue_date'])

    train_data['issue_date_month'] = train_data['issue_date'].dt.month
    test_public['issue_date_month'] = test_public['issue_date'].dt.month
    train_data['issue_date_dayofweek'] = train_data['issue_date'].dt.dayofweek
    test_public['issue_date_dayofweek'] = test_public['issue_date'].dt.dayofweek
    train_data['earliesCreditMon'] = train_data['earlies_credit_mon'].dt.month
    test_public['earliesCreditMon'] = test_public['earlies_credit_mon'].dt.month
    train_data['earliesCreditYear'] = train_data['earlies_credit_mon'].dt.year
    test_public['earliesCreditYear'] = test_public['earlies_credit_mon'].dt.year

    ### Internet data
    train_inte['issue_date_month'] = train_inte['issue_date'].dt.month
    train_inte['issue_date_dayofweek'] = train_inte['issue_date'].dt.dayofweek
    train_inte['earliesCreditMon'] = train_inte['earlies_credit_mon'].dt.month
    train_inte['earliesCreditYear'] = train_inte['earlies_credit_mon'].dt.year
    cat_cols = ['employer_type', 'industry']
    for col in cat_cols:
        # Fit on the public training data only; transform will raise on labels
        # unseen there, so this assumes all three tables share the same categories.
        lbl = LabelEncoder().fit(train_data[col])
        train_data[col] = lbl.transform(train_data[col])
        test_public[col] = lbl.transform(test_public[col])
        # Internet processing
        train_inte[col] = lbl.transform(train_inte[col])
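
If the test or internet tables ever contain a category missing from the public training set, `transform` above raises a ValueError. A minimal defensive variant (a sketch, not what the original author ran) fits each encoder on the union of all three tables and would replace the loop above:

    for col in cat_cols:
        # Fit on every value the column can take across all three tables
        all_values = pd.concat([train_data[col], test_public[col], train_inte[col]])
        lbl = LabelEncoder().fit(all_values)
        train_data[col] = lbl.transform(train_data[col])
        test_public[col] = lbl.transform(test_public[col])
        train_inte[col] = lbl.transform(train_inte[col])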
    # Also dropping 'f1', 'policy_code' and 'app_type' gave 881
    # ,'f1','policy_code','app_type'
    col_to_drop = ['issue_date', 'earlies_credit_mon']
    train_data = train_data.drop(col_to_drop, axis=1)
    test_public = test_public.drop(col_to_drop, axis=1)
    ## Internet processing
    train_inte = train_inte.drop(col_to_drop, axis=1)

    # Unchanged for now
    # train_inte = train_inte.rename(columns={'is_default':'isDefault'})
    # data = pd.concat( [train_data,test_public] )
    # Align the internet table to the public schema: keep the shared columns
    # and fill the public-only columns with NaN.
    tr_cols = set(train_data.columns)
    same_col = list(tr_cols.intersection(set(train_inte.columns)))
    train_inteSame = train_inte[same_col].copy()
    Inte_add_cos = list(tr_cols.difference(set(same_col)))
    for col in Inte_add_cos:
        train_inteSame[col] = np.nan

    # Added after 81
    # for col in cat_cols:
    #     dum = pd.get_dummies(data[col], prefix='OneHot_'+col+'_')
    #     data = pd.concat([data, dum], axis=1)
    #     # del data[col]
    #     del dum
    y = train_data['isDefault']
    folds = KFold(n_splits=5, shuffle=True, random_state=546789)
    # Train on the public data and score the internet table, treating it as the "test" set
    oof_preds, IntePre, importances = train_model(train_data, train_inteSame, y, folds)
    IntePre['isDef'] = train_inte['is_default']
    print('AUC on train_internet: %.6f' % roc_auc_score(IntePre['isDef'], IntePre.isDefault))
    ## Threshold of 0.05: keep the internet samples whose predicted default
    ## probability falls below it, and tag every sample with its data source.
    InteId = IntePre.loc[IntePre.isDefault < 0.05, 'loan_id'].tolist()

    train_data['dataSourse'] = 1
    test_public['dataSourse'] = 1
    train_inteSame['dataSourse'] = 0
    train_inteSame['isDefault'] = train_inte['is_default']
    use_te = train_inteSame[train_inteSame.loan_id.isin(InteId)].copy()
    data = pd.concat([train_data, test_public, use_te]).reset_index(drop=True)

    # Earlier variant that used train_inte directly:
    # InteId = IntePre.loc[IntePre.isDefault<0.05, 'loan_id'].tolist()
    # train_inte = train_inte.rename(columns={'is_default':'isDefault'})
    # train_data['dataSourse'] = 1
    # test_public['dataSourse'] = 1
    # train_inte['dataSourse'] = 0
    # use_te = train_inte[train_inte.loan_id.isin( InteId )].copy()
    # data = pd.concat([ train_data,test_public,use_te]).reset_index(drop=True)
    # IntePre.isDefault
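
A quick sanity check on the merged frame (a sketch, using only the columns defined above): public rows carry labels, test rows are NaN, and only the low-risk internet rows were added.

    # Rows per source: 1 = public train/test, 0 = filtered internet samples
    print(data['dataSourse'].value_counts())
    # The only unlabeled rows should be the public test set
    print(data['isDefault'].isna().sum(), 'unlabeled rows (the public test set)')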
    plt.figure(figsize=(16, 6))
    plt.title("Distribution of Default values IntePre")
    # sns.distplot is deprecated in recent seaborn; sns.histplot(..., kde=True) is the replacement
    sns.distplot(IntePre['isDefault'], color="black", kde=True, bins=120, label='IntePre')
    # sns.distplot(train_inte[col], color="red", kde=True, bins=120, label='train_inte')
    plt.legend()
    plt.show()
    train = data[data['isDefault'].notna()]
    test = data[data['isDefault'].isna()]
    # for col in ['sub_class', 'work_type']:
    #     del train[col]
    #     del test[col]

    del data
    del train_data, test_public

    y = train['isDefault']
    folds = KFold(n_splits=5, shuffle=True, random_state=546789)
    oof_preds, test_preds, importances = train_model(train, test, y, folds)
    test_preds.rename({'loan_id': 'id'}, axis=1)[['id', 'isDefault']].to_csv('nn2.csv', index=False)
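
The `display_importances` helper defined earlier is never actually called in the script; after the final run it can be pointed at the returned importance frame:

    # Saves lgbm_importances.png with the 50 strongest features, averaged over folds
    display_importances(importances)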