1. import pandas as pd
  2. import numpy as np
  3. from sklearn.datasets import load_boston
  4. # Suppress warnings for now
  5. import warnings
  6. warnings.simplefilter(action='ignore', category=UserWarning)
  7. warnings.simplefilter(action='ignore', category=FutureWarning)
  # Read the train and test files, then stack them row-wise into one frame.
  train = pd.read_csv('../data/train_final.csv', engine='python')
  test = pd.read_csv('../data/test_final.csv', engine='python')
  data = pd.concat([train, test], axis=0)

  # Positional row boundaries of each split inside `data`:
  # rows [0, TRAIN_IDX) come from train, rows [TRAIN_IDX, TEST_IDX) from test.
  TRAIN_IDX = len(train)
  TEST_IDX = TRAIN_IDX + len(test)

Baseline Predictions

  from sklearn.model_selection import train_test_split

  # Recover the two original splits from the combined frame by row position.
  train, test = data.iloc[:TRAIN_IDX, :], data.iloc[TRAIN_IDX:TEST_IDX, :]

  # Output variable.
  y = train['loan_status']
  # Input variables: every other column, with missing values replaced by 0.
  X = train.drop(['loan_status'], axis=1).fillna(0)

  # Dividing inputs and outputs into training and validation sets
  # (33% held out, fixed seed so the split is repeatable).
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.33, random_state=42)
  from sklearn.model_selection import GridSearchCV
  from sklearn.metrics import accuracy_score


  def algorithm_pipeline(X_train_data,
                         X_test_data,
                         y_train_data,
                         y_test_data,
                         model,
                         param_grid,
                         cv=10,
                         scoring_fit='accuracy',
                         scoring_test=accuracy_score,
                         do_probabilities=False,
                         n_jobs=4,
                         verbose=2):
      """Grid-search ``model`` over ``param_grid`` and score it on held-out data.

      Parameters
      ----------
      X_train_data, y_train_data
          Features and targets the grid search is fitted on.
      X_test_data, y_test_data
          Held-out features and targets used for the final score.
      model
          Estimator handed to ``GridSearchCV`` as ``estimator``.
      param_grid
          Hyperparameter grid handed to ``GridSearchCV``.
      cv
          Cross-validation setting forwarded to ``GridSearchCV`` (default 10).
      scoring_fit
          ``scoring`` used by the grid search to rank candidates.
      scoring_test
          Callable invoked as ``scoring_test(y_test_data, pred)`` to produce
          the returned score.
      do_probabilities
          When true, predictions come from ``predict_proba`` instead of
          ``predict``. ``scoring_test`` must then accept probability arrays;
          the default ``accuracy_score`` expects class labels.
      n_jobs
          Number of parallel jobs for the grid search (default 4, the value
          that used to be hard-coded).
      verbose
          Verbosity of the grid search (default 2, the value that used to be
          hard-coded).

      Returns
      -------
      list
          ``[best_model, pred, score]``: the best estimator found by the
          search, its predictions for ``X_test_data``, and the value returned
          by ``scoring_test``.
      """
      gs = GridSearchCV(estimator=model,
                        param_grid=param_grid,
                        cv=cv,
                        n_jobs=n_jobs,
                        scoring=scoring_fit,
                        verbose=verbose)
      fitted_model = gs.fit(X_train_data, y_train_data)
      best_model = fitted_model.best_estimator_

      # Predictions go through the fitted search object.
      if do_probabilities:
          pred = fitted_model.predict_proba(X_test_data)
      else:
          pred = fitted_model.predict(X_test_data)

      score = scoring_test(y_test_data, pred)
      return [best_model, pred, score]
  from sklearn.ensemble import RandomForestClassifier
  from lightgbm import LGBMClassifier
  from xgboost import XGBClassifier

  # Estimators whose hyperparameters will be optimized.
  models_to_train = [
      XGBClassifier(),
      LGBMClassifier(),
      RandomForestClassifier(),
  ]

  # Hyperparameter search spaces, listed in the same order as models_to_train.
  grid_parameters = [
      # XGBoost
      {
          'n_estimators': [400, 700, 1000],
          'colsample_bytree': [0.7, 0.8],
          'max_depth': [15, 20, 25],
          'reg_alpha': [1.1, 1.2, 1.3],
          'reg_lambda': [1.1, 1.2, 1.3],
          'subsample': [0.7, 0.8, 0.9],
      },
      # LightGBM
      {
          'n_estimators': [400, 700, 1000],
          'learning_rate': [0.12],
          'colsample_bytree': [0.7, 0.8],
          'max_depth': [4],
          'num_leaves': [10, 20],
          'reg_alpha': [1.1, 1.2],
          'reg_lambda': [1.1, 1.2],
          'min_split_gain': [0.3, 0.4],
          'subsample': [0.8, 0.9],
          'subsample_freq': [10, 20],
      },
      # Random Forest
      {
          'max_depth': [3, 5, 10, 13],
          'n_estimators': [100, 200, 400, 600, 900],
          'max_features': [2, 4, 6, 8, 10],
      },
  ]
  1. %%time
  2. models_preds_scores = []
  3. for i, model in enumerate(models_to_train):
  4. params = grid_parameters[i]
  5. result = algorithm_pipeline(X_train, X_test, y_train, y_test,
  6. model, params, cv=3)
  7. models_preds_scores.append(result)
  # Print the class name of each tuned model next to its hold-out score.
  for best_model, _preds, model_score in models_preds_scores:
      print('Model: {0}, Score: {1}'.format(type(best_model).__name__, model_score))
  # Model: XGBClassifier, Score: 0.9094545454545454
  # Model: LGBMClassifier, Score: 0.9153333333333333
  # Model: RandomForestClassifier, Score: 0.9166666666666666

Improving baseline with stacking

  1. %%time
  2. from mlxtend.classifier import StackingCVClassifier
  3. from sklearn.linear_model import LogisticRegression
  4. RANDOM_SEED = 42
  5. xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
  6. colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
  7. importance_type='gain', interaction_constraints='',
  8. learning_rate=0.300000012, max_delta_step=0, max_depth=6,
  9. min_child_weight=1, monotone_constraints='()',
  10. n_estimators=400, n_jobs=8, num_parallel_tree=1, random_state=0,
  11. reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
  12. tree_method='exact', validate_parameters=1, verbosity=None)
  13. lgbm = LGBMClassifier(boosting_type='gbdt', objective='binary', metric='binary_error',verbose=-1,
  14. n_estimators=2000,learning_rate=0.001, num_leaves=11, max_depth=5,reg_alpha=1e-5,reg_lambda=1e-3)
  15. lgbm1 = LGBMClassifier(boosting_type='gbdt', objective='binary', metric='binary_error',verbose=-1,
  16. n_estimators=2000,learning_rate=0.003, num_leaves=12, max_depth=5,reg_alpha=1e-5,reg_lambda=1e-3)
  17. lgbm2 = LGBMClassifier(boosting_type='gbdt', objective='binary', metric='binary_error',verbose=-1,
  18. n_estimators=5000,learning_rate=0.0005, num_leaves=12, max_depth=5,reg_alpha=1e-5,reg_lambda=1e-3)
  19. rf = RandomForestClassifier(max_depth=13, max_features=10)
  20. lr = LogisticRegression()
  21. stack = StackingCVClassifier(classifiers=(rf, lgbm, lgbm1, lgbm2, xgb),
  22. meta_classifier=lr, cv=5,
  23. use_features_in_secondary=True,
  24. store_train_meta_features=True,
  25. shuffle=True,
  26. random_state=RANDOM_SEED)
  27. stack.fit(X_train.fillna(0), y_train)
  28. pred = stack.predict(X_test.fillna(0))
  29. score = accuracy_score(y_test, pred)
  30. print(score)
  31. print(accuracy_score(test['loan_status'],
  32. stack.predict(test.drop(['loan_status'], axis=1).fillna(0))))
  33. # Validation accuracy_score:0.9115151515151515
  34. # Test accuracy_score:0.91132