import pandas as pdimport numpy as npfrom sklearn.datasets import load_boston# Suppress warnings for nowimport warningswarnings.simplefilter(action='ignore', category=UserWarning)warnings.simplefilter(action='ignore', category=FutureWarning)
# Load the prepared train/test splits, then stack them into a single frame so
# shared preprocessing can be applied to both at once.
train = pd.read_csv('../data/train_final.csv', engine='python')
test = pd.read_csv('../data/test_final.csv', engine='python')

# Positional boundaries for slicing the combined frame back apart later.
TRAIN_IDX = len(train)
TEST_IDX = TRAIN_IDX + len(test)

data = pd.concat([train, test])
## Baseline Predictions
from sklearn.model_selection import train_test_split

# Slice the combined frame back into the original train/test partitions.
train = data.iloc[:TRAIN_IDX]
test = data.iloc[TRAIN_IDX:TEST_IDX]

# Target variable.
y = train['loan_status']

# Features: everything except the target, with missing values zero-filled.
X = train.drop(columns=['loan_status']).fillna(0)

# Dividing our input and output into training and validation sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data,
                       model, param_grid, cv=10, scoring_fit='accuracy',
                       scoring_test=accuracy_score, do_probabilities=False):
    """Grid-search `model` over `param_grid` and score it on the test split.

    Returns a 3-element list ``[best_model, pred, score]`` where ``pred``
    holds class labels, or class probabilities if ``do_probabilities``.
    """
    searcher = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        n_jobs=4,
        scoring=scoring_fit,
        verbose=2,
    )
    searcher.fit(X_train_data, y_train_data)

    # Probabilities are only requested when the downstream metric needs them.
    predict_fn = searcher.predict_proba if do_probabilities else searcher.predict
    pred = predict_fn(X_test_data)

    return [searcher.best_estimator_, pred, scoring_test(y_test_data, pred)]
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Estimators to tune, paired positionally with the grids below.
models_to_train = [XGBClassifier(), LGBMClassifier(), RandomForestClassifier()]

# Hyperparameter search spaces: one dict per estimator, in the same order.
grid_parameters = [
    # XGBoost
    {
        'n_estimators': [400, 700, 1000],
        'colsample_bytree': [0.7, 0.8],
        'max_depth': [15, 20, 25],
        'reg_alpha': [1.1, 1.2, 1.3],
        'reg_lambda': [1.1, 1.2, 1.3],
        'subsample': [0.7, 0.8, 0.9],
    },
    # LightGBM
    {
        'n_estimators': [400, 700, 1000],
        'learning_rate': [0.12],
        'colsample_bytree': [0.7, 0.8],
        'max_depth': [4],
        'num_leaves': [10, 20],
        'reg_alpha': [1.1, 1.2],
        'reg_lambda': [1.1, 1.2],
        'min_split_gain': [0.3, 0.4],
        'subsample': [0.8, 0.9],
        'subsample_freq': [10, 20],
    },
    # Random Forest
    {
        'max_depth': [3, 5, 10, 13],
        'n_estimators': [100, 200, 400, 600, 900],
        'max_features': [2, 4, 6, 8, 10],
    },
]
%%timemodels_preds_scores = []for i, model in enumerate(models_to_train): params = grid_parameters[i] result = algorithm_pipeline(X_train, X_test, y_train, y_test, model, params, cv=3) models_preds_scores.append(result)
# Report the held-out accuracy achieved by each tuned model.
for best_model, _pred, score in models_preds_scores:
    print(f'Model: {type(best_model).__name__}, Score: {score}')
# Model: XGBClassifier, Score: 0.9094545454545454
# Model: LGBMClassifier, Score: 0.9153333333333333
# Model: RandomForestClassifier, Score: 0.9166666666666666
## Improving baseline with stacking
%%time
from mlxtend.classifier import StackingCVClassifier
from sklearn.linear_model import LogisticRegression

# Seed for the stacking classifier's internal CV shuffling.
RANDOM_SEED = 42

# Base learner 1: XGBoost. The long argument list pins every hyperparameter
# explicitly (n_estimators=400 is the only clearly tuned value; the rest
# look like library defaults spelled out — confirm before changing).
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
                    importance_type='gain', interaction_constraints='',
                    learning_rate=0.300000012, max_delta_step=0, max_depth=6,
                    min_child_weight=1, monotone_constraints='()',
                    n_estimators=400, n_jobs=8, num_parallel_tree=1,
                    random_state=0, reg_alpha=0, reg_lambda=1,
                    scale_pos_weight=1, subsample=1, tree_method='exact',
                    validate_parameters=1, verbosity=None)

# Base learners 2-4: three LightGBM variants that differ only in the
# n_estimators / learning_rate / num_leaves trade-off.
lgbm = LGBMClassifier(boosting_type='gbdt', objective='binary',
                      metric='binary_error', verbose=-1, n_estimators=2000,
                      learning_rate=0.001, num_leaves=11, max_depth=5,
                      reg_alpha=1e-5, reg_lambda=1e-3)
lgbm1 = LGBMClassifier(boosting_type='gbdt', objective='binary',
                       metric='binary_error', verbose=-1, n_estimators=2000,
                       learning_rate=0.003, num_leaves=12, max_depth=5,
                       reg_alpha=1e-5, reg_lambda=1e-3)
lgbm2 = LGBMClassifier(boosting_type='gbdt', objective='binary',
                       metric='binary_error', verbose=-1, n_estimators=5000,
                       learning_rate=0.0005, num_leaves=12, max_depth=5,
                       reg_alpha=1e-5, reg_lambda=1e-3)

# Base learner 5: random forest — presumably the best settings from the
# earlier grid search; verify against those results.
rf = RandomForestClassifier(max_depth=13, max_features=10)

# Meta-learner that combines the base models' out-of-fold predictions.
lr = LogisticRegression()

# Stacked ensemble: 5-fold CV stacking; the meta-learner also sees the raw
# features (use_features_in_secondary=True), not just base predictions.
stack = StackingCVClassifier(classifiers=(rf, lgbm, lgbm1, lgbm2, xgb),
                             meta_classifier=lr, cv=5,
                             use_features_in_secondary=True,
                             store_train_meta_features=True, shuffle=True,
                             random_state=RANDOM_SEED)

# Fit on the training split; fillna(0) mirrors how X was preprocessed above.
stack.fit(X_train.fillna(0), y_train)

# Accuracy on the validation split.
pred = stack.predict(X_test.fillna(0))
score = accuracy_score(y_test, pred)
print(score)

# Accuracy on the held-out test frame (which still carries 'loan_status').
print(accuracy_score(test['loan_status'],
                     stack.predict(test.drop(['loan_status'], axis=1).fillna(0))))
# Validation accuracy_score:0.9115151515151515
# Test accuracy_score:0.91132