import pandas as pdimport numpy as npfrom sklearn.datasets import load_boston# Suppress warnings for nowimport warningswarnings.simplefilter(action='ignore', category=UserWarning)warnings.simplefilter(action='ignore', category=FutureWarning)
# Load the prepared train/test splits, then stack them into a single frame so
# shared preprocessing can be applied to both at once.
train = pd.read_csv('../data/train_final.csv', engine='python')
test = pd.read_csv('../data/test_final.csv', engine='python')

# Positional boundaries for slicing the combined frame back apart later.
TRAIN_IDX = len(train)
TEST_IDX = TRAIN_IDX + len(test)

data = pd.concat([train, test])
## Baseline Predictions
from sklearn.model_selection import train_test_split

# Slice the combined frame back into the original train/test partitions.
train = data.iloc[:TRAIN_IDX]
test = data.iloc[TRAIN_IDX:TEST_IDX]

# Target variable.
y = train['loan_status']

# Features: everything except the target, with missing values zero-filled.
X = train.drop(columns=['loan_status']).fillna(0)

# Dividing our input and output into training and validation sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data,
                       model, param_grid, cv=10, scoring_fit='accuracy',
                       scoring_test=accuracy_score, do_probabilities=False):
    """Grid-search `model` over `param_grid` and score it on the test split.

    Returns a 3-element list ``[best_model, pred, score]`` where ``pred``
    holds class labels, or class probabilities if ``do_probabilities``.
    """
    searcher = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        n_jobs=4,
        scoring=scoring_fit,
        verbose=2,
    )
    searcher.fit(X_train_data, y_train_data)

    # Probabilities are only requested when the downstream metric needs them.
    predict_fn = searcher.predict_proba if do_probabilities else searcher.predict
    pred = predict_fn(X_test_data)

    return [searcher.best_estimator_, pred, scoring_test(y_test_data, pred)]
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Estimators to tune, paired positionally with the grids below.
models_to_train = [XGBClassifier(), LGBMClassifier(), RandomForestClassifier()]

# Hyperparameter search spaces: one dict per estimator, in the same order.
grid_parameters = [
    # XGBoost
    {
        'n_estimators': [400, 700, 1000],
        'colsample_bytree': [0.7, 0.8],
        'max_depth': [15, 20, 25],
        'reg_alpha': [1.1, 1.2, 1.3],
        'reg_lambda': [1.1, 1.2, 1.3],
        'subsample': [0.7, 0.8, 0.9],
    },
    # LightGBM
    {
        'n_estimators': [400, 700, 1000],
        'learning_rate': [0.12],
        'colsample_bytree': [0.7, 0.8],
        'max_depth': [4],
        'num_leaves': [10, 20],
        'reg_alpha': [1.1, 1.2],
        'reg_lambda': [1.1, 1.2],
        'min_split_gain': [0.3, 0.4],
        'subsample': [0.8, 0.9],
        'subsample_freq': [10, 20],
    },
    # Random Forest
    {
        'max_depth': [3, 5, 10, 13],
        'n_estimators': [100, 200, 400, 600, 900],
        'max_features': [2, 4, 6, 8, 10],
    },
]
%%timemodels_preds_scores = []for i, model in enumerate(models_to_train): params = grid_parameters[i] result = algorithm_pipeline(X_train, X_test, y_train, y_test, model, params, cv=3) models_preds_scores.append(result)
# Report the held-out accuracy achieved by each tuned model.
for best_model, _pred, score in models_preds_scores:
    print(f'Model: {type(best_model).__name__}, Score: {score}')
# Model: XGBClassifier, Score: 0.9094545454545454
# Model: LGBMClassifier, Score: 0.9153333333333333
# Model: RandomForestClassifier, Score: 0.9166666666666666
## Improving baseline with stacking
%%time
from mlxtend.classifier import StackingCVClassifier
from sklearn.linear_model import LogisticRegression

# Seed for the stacking classifier's internal CV shuffling.
RANDOM_SEED = 42

# Base learner 1: XGBoost. The long argument list pins every hyperparameter
# explicitly (n_estimators=400 is the only clearly tuned value; the rest
# look like library defaults spelled out — confirm before changing).
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
                    importance_type='gain', interaction_constraints='',
                    learning_rate=0.300000012, max_delta_step=0, max_depth=6,
                    min_child_weight=1, monotone_constraints='()',
                    n_estimators=400, n_jobs=8, num_parallel_tree=1,
                    random_state=0, reg_alpha=0, reg_lambda=1,
                    scale_pos_weight=1, subsample=1, tree_method='exact',
                    validate_parameters=1, verbosity=None)

# Base learners 2-4: three LightGBM variants that differ only in the
# n_estimators / learning_rate / num_leaves trade-off.
lgbm = LGBMClassifier(boosting_type='gbdt', objective='binary',
                      metric='binary_error', verbose=-1, n_estimators=2000,
                      learning_rate=0.001, num_leaves=11, max_depth=5,
                      reg_alpha=1e-5, reg_lambda=1e-3)
lgbm1 = LGBMClassifier(boosting_type='gbdt', objective='binary',
                       metric='binary_error', verbose=-1, n_estimators=2000,
                       learning_rate=0.003, num_leaves=12, max_depth=5,
                       reg_alpha=1e-5, reg_lambda=1e-3)
lgbm2 = LGBMClassifier(boosting_type='gbdt', objective='binary',
                       metric='binary_error', verbose=-1, n_estimators=5000,
                       learning_rate=0.0005, num_leaves=12, max_depth=5,
                       reg_alpha=1e-5, reg_lambda=1e-3)

# Base learner 5: random forest — presumably the best settings from the
# earlier grid search; verify against those results.
rf = RandomForestClassifier(max_depth=13, max_features=10)

# Meta-learner that combines the base models' out-of-fold predictions.
lr = LogisticRegression()

# Stacked ensemble: 5-fold CV stacking; the meta-learner also sees the raw
# features (use_features_in_secondary=True), not just base predictions.
stack = StackingCVClassifier(classifiers=(rf, lgbm, lgbm1, lgbm2, xgb),
                             meta_classifier=lr, cv=5,
                             use_features_in_secondary=True,
                             store_train_meta_features=True, shuffle=True,
                             random_state=RANDOM_SEED)

# Fit on the training split; fillna(0) mirrors how X was preprocessed above.
stack.fit(X_train.fillna(0), y_train)

# Accuracy on the validation split.
pred = stack.predict(X_test.fillna(0))
score = accuracy_score(y_test, pred)
print(score)

# Accuracy on the held-out test frame (which still carries 'loan_status').
print(accuracy_score(test['loan_status'],
                     stack.predict(test.drop(['loan_status'], axis=1).fillna(0))))
# Validation accuracy_score:0.9115151515151515
# Test accuracy_score:0.91132