"""Ensemble-learning demo: compare blending, stacking, and the mlens
SuperLearner on a party-affiliation classification dataset."""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC

# Single global seed so splits and models are reproducible.
SEED = 222
np.random.seed(SEED)

# Raw data; target column is `cand_pty_affiliation` (see get_train_test).
df = pd.read_csv('input.csv')
  16. def get_train_test(): # 数据处理
  17. y = 1 * (df.cand_pty_affiliation == "REP")
  18. x = df.drop(['cand_pty_affiliation'],axis=1)
  19. x = pd.get_dummies(x,sparse=True)
  20. x.drop(x.columns[x.std()==0],axis=1,inplace=True)
  21. return train_test_split(x,y,test_size=0.95,random_state=SEED)
  22. def get_models(): # 模型定义
  23. nb = GaussianNB()
  24. svc = SVC(C=100,probability=True)
  25. knn = KNeighborsClassifier(n_neighbors=3)
  26. lr = LogisticRegression(C=100,random_state=SEED)
  27. nn = MLPClassifier((80, 10), early_stopping=False, random_state=SEED)
  28. gb = GradientBoostingClassifier(n_estimators =100, random_state = SEED)
  29. rf = RandomForestClassifier(n_estimators=1,max_depth=3,random_state=SEED)
  30. models = {'svm':svc,
  31. 'knn':knn,
  32. 'naive bayes':nb,
  33. 'mlp-nn':nn,
  34. 'random forest':rf,
  35. 'gbm':gb,
  36. 'logistic':lr,
  37. }
  38. return models
  39. def train_base_learnres(base_learners,inp,out,verbose=True): # 训练基本模型
  40. if verbose:print("fitting models.")
  41. for i,(name,m) in enumerate(base_learners.items()):
  42. if verbose:print("%s..." % name,end=" ",flush=False)
  43. m.fit(inp,out)
  44. if verbose:print("done")
  45. def predict_base_learners(pred_base_learners,inp,verbose=True): # 把基本学习器的输出作为融合学习的特征,这里计算特征
  46. p = np.zeros((inp.shape[0],len(pred_base_learners)))
  47. if verbose:print("Generating base learner predictions.")
  48. for i,(name,m) in enumerate(pred_base_learners.items()):
  49. if verbose:print("%s..." % name,end=" ",flush=False)
  50. p_ = m.predict_proba(inp)
  51. p[:,i] = p_[:,1]
  52. if verbose:print("done")
  53. return p
  54. def ensemble_predict(base_learners,meta_learner,inp,verbose=True): # 融合学习进行预测
  55. p_pred = predict_base_learners(base_learners,inp,verbose=verbose) # 测试数据必须先经过基本学习器计算特征
  56. return p_pred,meta_learner.predict_proba(p_pred)[:,1]
  57. def ensenmble_by_blend(): # blend融合
  58. xtrain_base, xpred_base, ytrain_base, ypred_base = train_test_split(
  59. xtrain, ytrain, test_size=0.5, random_state=SEED
  60. ) # 把数据切分成两部分
  61. train_base_learnres(base_learners, xtrain_base, ytrain_base) # 训练基本模型
  62. p_base = predict_base_learners(base_learners, xpred_base) # 把基本学习器的输出作为融合学习的特征,这里计算特征
  63. meta_learner.fit(p_base, ypred_base) # 融合学习器的训练
  64. p_pred, p = ensemble_predict(base_learners, meta_learner, xtest) # 融合学习进行预测
  65. print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
  66. from sklearn.base import clone
  67. def stacking(base_learners,meta_learner,X,y,generator): # stacking进行融合
  68. print("Fitting final base learners...",end="")
  69. train_base_learnres(base_learners,X,y,verbose=False)
  70. print("done")
  71. print("Generating cross-validated predictions...")
  72. cv_preds,cv_y = [],[]
  73. for i,(train_inx,test_idx) in enumerate(generator.split(X)):
  74. fold_xtrain,fold_ytrain = X[train_inx,:],y[train_inx]
  75. fold_xtest,fold_ytest = X[test_idx,:],y[test_idx]
  76. fold_base_learners = {name:clone(model)
  77. for name,model in base_learners.items()}
  78. train_base_learnres(fold_base_learners,fold_xtrain,fold_ytrain,verbose=False)
  79. fold_P_base = predict_base_learners(fold_base_learners,fold_xtest,verbose=False)
  80. cv_preds.append(fold_P_base)
  81. cv_y.append(fold_ytest)
  82. print("Fold %i done" %(i+1))
  83. print("CV-predictions done")
  84. cv_preds = np.vstack(cv_preds)
  85. cv_y = np.hstack(cv_y)
  86. print("Fitting meta learner...",end="")
  87. meta_learner.fit(cv_preds,cv_y)
  88. print("done")
  89. return base_learners,meta_learner
  90. def ensemble_by_stack():
  91. from sklearn.model_selection import KFold
  92. cv_base_learners,cv_meta_learner = stacking(
  93. get_models(),clone(meta_learner),xtrain.values,ytrain.values,KFold(2))
  94. P_pred,p = ensemble_predict(cv_base_learners,cv_meta_learner,xtest,verbose=False)
  95. print("\nEnsemble ROC-AUC score: %.3f" %roc_auc_score(ytest,p))
  96. def plot_roc_curve(ytest,p_base_learners,p_ensemble,labels,ens_label):
  97. plt.figure(figsize=(10,8))
  98. plt.plot([0,1],[0,1],'k--')
  99. cm = [plt.cm.rainbow(i)
  100. for i in np.linspace(0,1.0, p_base_learners.shape[1] +1)]
  101. for i in range(p_base_learners.shape[1]):
  102. p = p_base_learners[:,i]
  103. fpr,tpr,_ = roc_curve(ytest,p)
  104. plt.plot(fpr,tpr,label = labels[i],c=cm[i+1])
  105. fpr, tpr, _ = roc_curve(ytest, p_ensemble)
  106. plt.plot(fpr, tpr, label=ens_label, c=cm[0])
  107. plt.xlabel('False positive rate')
  108. plt.ylabel('True positive rate')
  109. plt.title('ROC curve')
  110. plt.legend(frameon=False)
  111. plt.show()
  112. from mlens.ensemble import SuperLearner
  113. def use_pack():
  114. sl =SuperLearner(
  115. folds=10,random_state=SEED,verbose=2,
  116. # backend="multiprocessing"
  117. )
  118. # Add the base learners and the meta learner
  119. sl.add(list(base_learners.values()),proba=True)
  120. sl.add_meta(meta_learner,proba=True)
  121. # Train the ensemble
  122. sl.fit(xtrain,ytrain)
  123. # Predict the test set
  124. p_sl=sl.predict_proba(xtest)
  125. print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest,p_sl[:,1]))
  126. if __name__ == "__main__":
  127. xtrain, xtest, ytrain, ytest = get_train_test()
  128. base_learners = get_models()
  129. meta_learner = GradientBoostingClassifier(
  130. n_estimators=1000,
  131. loss="exponential",
  132. max_depth=4,
  133. subsample=0.5,
  134. learning_rate=0.005,
  135. random_state=SEED
  136. )
  137. # ensenmble_by_blend() # blend进行融合
  138. # ensemble_by_stack() # stack进行融合
  139. use_pack() # 调用包进行融合

# References:
# [Stacking / Blend baseline] https://www.cnblogs.com/demo-deng/p/10557267.html
# [Ensemble learning guide with code] https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/