1 TF-IDF Parameter Tuning

Extract features and tune the TF-IDF parameters: ngram_range=(1, 3), max_features=3000.

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score

# `data` is the competition DataFrame with 'text' and 'label' columns (loaded elsewhere)
list_ngram = [1, 2, 3, 4]
list_feature = [1000, 3000, 5000]
# dictionary for recording scores
score_dict = {"list_n": [], "list_f": [], "score": []}
# assumed baseline model (undefined in the original notes)
Ridge_clf = RidgeClassifier()

# validate each (ngram_range, max_features) combination
def para_Tdf(data_x):
    for n in list_ngram:
        for fea in list_feature:
            Tdf = TfidfVectorizer(ngram_range=(1, n), max_features=fea)
            tdf_data = Tdf.fit_transform(data_x)
            Ridge_clf2 = Ridge_clf.fit(tdf_data[:10000], data['label'][:10000])
            pred_y = Ridge_clf2.predict(tdf_data[10000:20000])
            score2 = f1_score(data['label'][10000:20000], pred_y, average="macro")
            score_dict["list_n"].append(n)
            score_dict["list_f"].append(fea)
            score_dict["score"].append(score2)

# run the search
para_Tdf(data['text'][:20000])
# show the scores as a DataFrame
score_df = pd.DataFrame(score_dict)
```
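
After the sweep, the best combination can be read straight off the score table; a one-line usage note:

```python
# best (ngram, max_features) pair by macro-F1
print(score_df.sort_values("score", ascending=False).head(1))
```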

2 Extracting the textlen Numeric Feature
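
No code survives for this step; a minimal sketch of what the textlen feature might look like, assuming the same `data` DataFrame with a space-separated `text` column used in Section 1:

```python
import numpy as np

# token count per document (texts are space-separated tokens)
data['textlen'] = data['text'].apply(lambda s: len(s.split()))
# optional log-scaled variant to tame the long tail of document lengths
data['textlen_log'] = np.log1p(data['textlen'])
```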

3 Model Tuning

3.1 XGB

```python
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier

label = np.array(pd.read_csv('./data/label.csv'))
train = pd.read_csv('./temp/train.csv', header=None, names=['id', 'text', 'label'])

def adjust_model():
    Tdf = TfidfVectorizer(ngram_range=(1, 2), max_features=500)
    tdf_data = Tdf.fit_transform(train['text'])
    X_train, X_test, y_train, y_test = train_test_split(tdf_data, label, test_size=0.3)
    # grid for the current tuning round
    param_test1 = {'estimator__max_depth': range(9, 13, 2)}
    model = OneVsRestClassifier(XGBClassifier(eval_metric='mlogloss',
                                              max_depth=11,
                                              min_child_weight=1,
                                              use_label_encoder=False,
                                              learning_rate=0.1,
                                              n_estimators=100,
                                              gamma=0,
                                              subsample=0.8,
                                              colsample_bytree=0.8,
                                              nthread=4,
                                              scale_pos_weight=1,
                                              seed=27))
    gsearch1 = GridSearchCV(model, param_grid=param_test1, scoring='roc_auc',
                            n_jobs=20, cv=5, verbose=2)
    gsearch1.fit(X_train, y_train)
    print("Best parameters\n", gsearch1.best_params_)
    print("Best score", gsearch1.best_score_)

adjust_model()

clf1 = OneVsRestClassifier(XGBClassifier(eval_metric='mlogloss',
                                         max_depth=11,
                                         min_child_weight=1,
                                         n_estimators=150,
                                         use_label_encoder=False,
                                         learning_rate=0.01,
                                         gamma=0,
                                         subsample=0.8,
                                         colsample_bytree=0.8,
                                         nthread=10,
                                         scale_pos_weight=1,
                                         seed=27))
```
Tuning notes (grids searched and values settled on):

  • max_depth = 11, searched over range(3, 10, 2)
  • min_child_weight = 1, searched over range(2, 13, 2), then range(1, 6, 2)
  • Reference parameter set from these notes: learning_rate=0.01, n_estimators=5000, max_depth=4, min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.005, objective='binary:logistic', nthread=4, scale_pos_weight=1
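
To round out the section, a hedged sketch of fitting `clf1` and scoring it with macro-F1. The TF-IDF features were local to `adjust_model` above, so they are rebuilt here; the split and the `.ravel()` (assuming `label` is a single-column array of class ids) are illustrative:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# rebuild the TF-IDF features (they were local to adjust_model above)
Tdf = TfidfVectorizer(ngram_range=(1, 2), max_features=500)
tdf_data = Tdf.fit_transform(train['text'])
X_train, X_test, y_train, y_test = train_test_split(
    tdf_data, label.ravel(), test_size=0.3)  # .ravel(): flatten to 1D class ids

clf1.fit(X_train, y_train)
pred_y = clf1.predict(X_test)
print("macro-F1:", f1_score(y_test, pred_y, average="macro"))
```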

4 Trying Deep Learning Models

FastText, TextCNN
Doc2Vec + classifier (binary classification; look for a baseline): http://linanqiu.github.io/2015/10/07/word2vec-sentiment/
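The linked post uses gensim; a minimal sketch of the Doc2Vec + classifier idea under the same assumptions as Section 1 (space-separated `text` column, integer labels, gensim 4.x), not the post's exact code:

```python
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression

# one TaggedDocument per text, tagged by row index
docs = [TaggedDocument(words=t.split(), tags=[i]) for i, t in enumerate(data['text'])]
d2v = Doc2Vec(docs, vector_size=100, window=5, min_count=2, epochs=20, workers=4)

X = [d2v.dv[i] for i in range(len(docs))]   # one dense vector per document (gensim 4.x API)
clf = LogisticRegression(max_iter=1000).fit(X[:10000], data['label'][:10000])
print(clf.score(X[10000:20000], data['label'][10000:20000]))
```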

5 Deep Learning

Rank 1 open-source code for the news text classification competition:
https://github.com/kangyishuai/NEWS-TEXT-CLASSIFICATION
Rank 1 open-source code for the DataGrand text classification competition:
https://github.com/ShawnyXiao/2018-DC-DataGrand-TextIntelProcess
Top-10 open-source code for the DataGrand competition (deep learning + traditional model fusion):
https://github.com/moneyDboat/data_grand

5.1 TextCNN

(1) Summary

  • GloVe + word2vec embeddings combined improve on word2vec alone by 0.1.
  • SGD is the current optimizer of choice; with Adam the model falls apart (online score only 0.40+).
  • The word-vector dimension still needs tuning: 64 beats 100 by 0.1; glove_word_150epoch => val_loss = 0.21 (a model sketch follows below).
  • Cross-validation + word2vc_glove_128_sgd: check how it performs online.
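
A minimal Keras sketch of the TextCNN discussed above, using the choices from these notes (64-dim embeddings, SGD optimizer); the vocabulary size, sequence length, class count, and filter settings are placeholders, not values from the original experiments:

```python
from tensorflow.keras import layers, models, optimizers

VOCAB, MAXLEN, N_CLASSES = 7000, 1024, 14       # placeholders for this dataset

inp = layers.Input(shape=(MAXLEN,))
emb = layers.Embedding(VOCAB, 64)(inp)          # 64-dim embeddings (beat 100-dim here)
convs = []
for k in (3, 4, 5):                             # parallel conv branches over k-grams
    c = layers.Conv1D(128, k, activation='relu')(emb)
    convs.append(layers.GlobalMaxPooling1D()(c))
x = layers.Concatenate()(convs)
x = layers.Dropout(0.5)(x)
out = layers.Dense(N_CLASSES, activation='softmax')(x)

model = models.Model(inp, out)
model.compile(optimizer=optimizers.SGD(learning_rate=0.01, momentum=0.9),  # SGD, per the notes
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])
```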

(2) Next steps