1. import matplotlib.pyplot as plt
    2. import seaborn as sns
    3. import gc
    4. import re
    5. import pandas as pd
    6. import lightgbm as lgb
    7. import numpy as np
    8. from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
    9. from sklearn.model_selection import KFold
    10. from lightgbm import LGBMClassifier
    11. import matplotlib.pyplot as plt
    12. import seaborn as sns
    13. import gc
    14. from sklearn.model_selection import StratifiedKFold
    15. from dateutil.relativedelta import relativedelta

    读取

    1. train_data = pd.read_csv('raw_data/train_public.csv')
    2. submit_example = pd.read_csv('raw_data/submit_example.csv')
    3. test_public = pd.read_csv('raw_data/test_public.csv')
    4. train_inte = pd.read_csv('raw_data/train_internet.csv')
    5. pd.set_option('max_columns', None)
    6. pd.set_option('max_rows', 200)
    7. pd.set_option('float_format', lambda x: '%.3f' % x)

    模型模块

    1. def train_model(data_, test_, y_, folds_):
    2. oof_preds = np.zeros(data_.shape[0]) # 获取训练集的行数?列数?
    3. sub_preds = np.zeros(test_.shape[0])
    4. feature_importance_df = pd.DataFrame()
    5. feats = [f for f in data_.columns if f not in ['loan_id', 'user_id', 'isDefault'] ]
    6. for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_)):
    7. trn_x, trn_y = data_[feats].iloc[trn_idx], y_.iloc[trn_idx]
    8. val_x, val_y = data_[feats].iloc[val_idx], y_.iloc[val_idx]
    9. clf = LGBMClassifier(
    10. n_estimators=4000,
    11. learning_rate=0.08,
    12. num_leaves=2**5,
    13. colsample_bytree=.65,
    14. subsample=.9,
    15. max_depth=5,
    16. # max_bin=250,
    17. reg_alpha=.3,
    18. reg_lambda=.3,
    19. min_split_gain=.01,
    20. min_child_weight=2,
    21. silent=-1,
    22. verbose=-1,
    23. )#相对优雅的训练方式
    24. clf.fit(trn_x, trn_y,
    25. eval_set= [(trn_x, trn_y), (val_x, val_y)],
    26. eval_metric='auc', verbose=100, early_stopping_rounds=40 #30
    27. )
    28. oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1] #搞清楚oof_preds是什么,以及下问的sub_preds是什么
    29. sub_preds += clf.predict_proba(test_[feats], num_iteration=clf.best_iteration_)[:, 1] / folds_.n_splits
    30. fold_importance_df = pd.DataFrame()
    31. fold_importance_df["feature"] = feats
    32. fold_importance_df["importance"] = clf.feature_importances_
    33. fold_importance_df["fold"] = n_fold + 1
    34. feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    35. print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    36. del clf, trn_x, trn_y, val_x, val_y
    37. gc.collect()
    38. print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))
    39. test_['isDefault'] = sub_preds
    40. return oof_preds, test_[['loan_id', 'isDefault']], feature_importance_df
    41. def display_importances(feature_importance_df_):
    42. # Plot feature importances
    43. cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(
    44. by="importance", ascending=False)[:50].index
    45. best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    46. plt.figure(figsize=(8,10))
    47. sns.barplot(x="importance", y="feature",
    48. data=best_features.sort_values(by="importance", ascending=False))
    49. plt.title('LightGBM Features (avg over folds)')
    50. plt.tight_layout()
    51. plt.savefig('lgbm_importances.png')

特征预处理辅助函数 (feature-preprocessing helpers)

    1. def workYearDIc(x):
    2. if str(x)=='nan':
    3. return -1
    4. x = x.replace('< 1','0')
    5. return int(re.search('(\d+)', x).group())
    6. def findDig(val):
    7. fd = re.search('(\d+-)', val)
    8. if fd is None:
    9. return '1-'+val
    10. return val + '-01'
    11. class_dict = {
    12. 'A': 1,
    13. 'B': 2,
    14. 'C': 3,
    15. 'D': 4,
    16. 'E': 5,
    17. 'F': 6,
    18. 'G': 7,
    19. }
    20. timeMax = pd.to_datetime('1-Dec-21')

    数据预处理

    1. train_data['work_year'] = train_data['work_year'].map(workYearDIc)
    2. test_public['work_year'] = test_public['work_year'].map(workYearDIc)
    3. train_data['class'] = train_data['class'].map(class_dict)
    4. test_public['class'] = test_public['class'].map(class_dict)
    5. train_data['earlies_credit_mon'] = pd.to_datetime(train_data['earlies_credit_mon'].map(findDig))
    6. test_public['earlies_credit_mon'] = pd.to_datetime(test_public['earlies_credit_mon'].map(findDig))
    7. train_data.loc[ train_data['earlies_credit_mon']>timeMax,'earlies_credit_mon' ] = train_data.loc[ train_data['earlies_credit_mon']>timeMax,'earlies_credit_mon' ]+ pd.offsets.DateOffset(years=-100)
    8. test_public.loc[ test_public['earlies_credit_mon']>timeMax,'earlies_credit_mon' ] = test_public.loc[ test_public['earlies_credit_mon']>timeMax,'earlies_credit_mon' ]+ pd.offsets.DateOffset(years=-100)
    9. train_data['issue_date'] = pd.to_datetime(train_data['issue_date'])
    10. test_public['issue_date'] = pd.to_datetime(test_public['issue_date'])
    11. #Internet数据处理
    12. train_inte['work_year'] = train_inte['work_year'].map(workYearDIc)
    13. train_inte['class'] = train_inte['class'].map(class_dict)
    14. train_inte['earlies_credit_mon'] = pd.to_datetime(train_inte['earlies_credit_mon'])
    15. train_inte['issue_date'] = pd.to_datetime(train_inte['issue_date'])

    时间特征

    1. train_data['issue_date_month'] = train_data['issue_date'].dt.month
    2. test_public['issue_date_month'] = test_public['issue_date'].dt.month
    3. train_data['issue_date_dayofweek'] = train_data['issue_date'].dt.dayofweek
    4. test_public['issue_date_dayofweek'] = test_public['issue_date'].dt.dayofweek
    5. train_data['earliesCreditMon'] = train_data['earlies_credit_mon'].dt.month
    6. test_public['earliesCreditMon'] = test_public['earlies_credit_mon'].dt.month
    7. train_data['earliesCreditYear'] = train_data['earlies_credit_mon'].dt.year
    8. test_public['earliesCreditYear'] = test_public['earlies_credit_mon'].dt.year
    9. ###internet数据
    10. train_inte['issue_date_month'] = train_inte['issue_date'].dt.month
    11. train_inte['issue_date_dayofweek'] = train_inte['issue_date'].dt.dayofweek
    12. train_inte['earliesCreditMon'] = train_inte['earlies_credit_mon'].dt.month
    13. train_inte['earliesCreditYear'] = train_inte['earlies_credit_mon'].dt.year

    自然编码

    1. cat_cols = ['employer_type', 'industry']
    2. from sklearn.preprocessing import LabelEncoder
    3. for col in cat_cols:
    4. lbl = LabelEncoder().fit(train_data[col])
    5. train_data[col] = lbl.transform(train_data[col])
    6. test_public[col] = lbl.transform(test_public[col])
    7. #Internet处理
    8. train_inte[col] = lbl.transform(train_inte[col])
    9. # 'f1','policy_code','app_type' 这三个去掉是881
    10. # ,'f1','policy_code','app_type'
    11. col_to_drop = ['issue_date', 'earlies_credit_mon']
    12. train_data = train_data.drop(col_to_drop, axis=1)
    13. test_public = test_public.drop(col_to_drop, axis=1 )
    14. ##internet处理
    15. train_inte = train_inte.drop(col_to_drop, axis=1 )
    16. # 暂时不变
    17. # train_inte = train_inte.rename(columns={'is_default':'isDefault'})
    18. # data = pd.concat( [train_data,test_public] )
    19. tr_cols = set(train_data.columns)
    20. same_col = list(tr_cols.intersection(set(train_inte.columns)))
    21. train_inteSame = train_inte[same_col].copy()
    22. Inte_add_cos = list(tr_cols.difference(set(same_col)))
    23. for col in Inte_add_cos:
    24. train_inteSame[col] = np.nan
    25. #81后加
    26. # for col in cat_cols:
    27. # dum = pd.get_dummies(data[col], prefix='OneHot_'+col +'_')
    28. # data = pd.concat([data, dum], axis=1)
    29. # # del data[col]
    30. # del dum

    train 预测 Internet

    y = train_data['isDefault']
    folds = KFold(n_splits=5, shuffle=True, random_state=546789)
    oof_preds, IntePre, importances = train_model(train_data, train_inteSame, y, folds)
    
    IntePre['isDef'] = train_inte['is_default']
    
    from sklearn.metrics import roc_auc_score
    roc_auc_score(IntePre['isDef'],IntePre.isDefault)
    
    ## 选择阈值0.05,从internet表中提取预测小于该概率的样本,并对不同来源的样本赋予来源值
    InteId = IntePre.loc[IntePre.isDefault<0.05, 'loan_id'].tolist()
    
    train_data['dataSourse'] = 1
    test_public['dataSourse'] = 1
    train_inteSame['dataSourse'] = 0
    train_inteSame['isDefault'] = train_inte['is_default']
    use_te = train_inteSame[train_inteSame.loan_id.isin( InteId )].copy()
    data = pd.concat([ train_data,test_public,use_te]).reset_index(drop=True)
    
    
    
    
    # InteId = IntePre.loc[IntePre.isDefault<0.05, 'loan_id'].tolist()
    # train_inte = train_inte.rename(columns={'is_default':'isDefault'})
    
    # train_data['dataSourse'] = 1
    # test_public['dataSourse'] = 1
    # train_inte['dataSourse'] = 0
    
    
    
    # use_te = train_inte[train_inte.loan_id.isin( InteId )].copy()
    # data = pd.concat([ train_data,test_public,use_te]).reset_index(drop=True)
    
    # IntePre.isDefault
    plt.figure(figsize=(16,6))
    plt.title("Distribution of Default values IntePre")
    sns.distplot(IntePre['isDefault'],color="black", kde=True,bins=120, label='train_data')
    # sns.distplot(train_inte[col],color="red", kde=True,bins=120, label='train_inte')
    plt.legend();plt.show()
    

    模型输出

    train = data[data['isDefault'].notna()]
    test  = data[data['isDefault'].isna()]
    # for col in ['sub_class', 'work_type']:
    #     del train[col]
    #     del test[col]
    
    
    del data
    del train_data,test_public
    
    
    y = train['isDefault']
    folds = KFold(n_splits=5, shuffle=True, random_state=546789)
    oof_preds, test_preds, importances = train_model(train, test, y, folds)
    
    test_preds.rename({'loan_id': 'id'}, axis=1)[['id', 'isDefault']].to_csv('submit/nn2.csv', index=False)
    
    display_importances(importances)
    
    # 处理一些取值精度后没有大的变化 依然是状态 / 得分
    # 0.88142741347