1 TF-IDF parameter tuning
Extract features and tune the TF-IDF parameters; the setting used is ngram_range=(1,3), max_features=3000.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score

# assumes a DataFrame `data` with 'text' and 'label' columns
list_ngram = [1, 2, 3, 4]
list_feature = [1000, 3000, 5000]
# dictionary for recording scores
score_dict = {"list_n": [], "list_f": [], "score": []}

# validate each (ngram_range, max_features) pair: fit TF-IDF, train a ridge
# classifier on the first 10,000 rows, score macro-F1 on the next 10,000
def para_Tdf(data_x):
    for n in list_ngram:
        for fea in list_feature:
            Tdf = TfidfVectorizer(ngram_range=(1, n), max_features=fea)
            tdf_data = Tdf.fit_transform(data_x)
            Ridge_clf2 = RidgeClassifier().fit(tdf_data[:10000], data['label'][:10000])
            pred_y = Ridge_clf2.predict(tdf_data[10000:20000])
            score2 = f1_score(data['label'][10000:20000], pred_y, average="macro")
            score_dict["list_n"].append(n)
            score_dict['list_f'].append(fea)
            score_dict['score'].append(score2)

# run the sweep on the first 20,000 documents
para_Tdf(data['text'][:20000])
# show the scores as a DataFrame
score_df = pd.DataFrame(score_dict)
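To read the best combination off the sweep, sorting `score_df` is enough (a small usage sketch on top of the code above):

# sort the recorded scores to find the best (ngram, max_features) pair
print(score_df.sort_values('score', ascending=False).head())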
2 Extract the textlen numeric feature
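The heading is the only note kept for this step; below is a minimal sketch of what it could look like, assuming the space-separated token format and the `data` frame from section 1 (the `textlen` column name and the scipy hstack are my additions):

from scipy.sparse import hstack

# token count per document as an extra numeric feature
data['textlen'] = data['text'].apply(lambda s: len(s.split()))
# append it to a TF-IDF matrix built as in section 1
features = hstack([tdf_data, data[['textlen']].values])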
3 Model parameter tuning
3.1 XGB
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier

label = np.array(pd.read_csv('./data/label.csv'))
train = pd.read_csv('./temp/train.csv', header=None, names=['id', 'text', 'label'])

def adjust_model():
    Tdf = TfidfVectorizer(ngram_range=(1, 2), max_features=500)
    tdf_data = Tdf.fit_transform(train['text'])
    X_train, X_test, y_train, y_test = train_test_split(tdf_data, label, test_size=0.3)
    # grid over max_depth of the wrapped XGBoost estimator
    param_test1 = {'estimator__max_depth': range(9, 13, 2)}
    model = OneVsRestClassifier(XGBClassifier(eval_metric='mlogloss',
                                              max_depth=11,
                                              min_child_weight=1,
                                              use_label_encoder=False,
                                              learning_rate=0.1,
                                              n_estimators=100,
                                              gamma=0,
                                              subsample=0.8,
                                              colsample_bytree=0.8,
                                              nthread=4,
                                              scale_pos_weight=1,
                                              seed=27))
    gsearch1 = GridSearchCV(model, param_grid=param_test1, scoring='roc_auc', n_jobs=20, cv=5, verbose=2)
    gsearch1.fit(X_train, y_train)
    print("Best parameters\n", gsearch1.best_params_)
    print("Best score", gsearch1.best_score_)

adjust_model()
clf1 = OneVsRestClassifier(XGBClassifier(eval_metric='mlogloss',
                                         max_depth=11,
                                         min_child_weight=1,
                                         n_estimators=150,
                                         use_label_encoder=False,
                                         learning_rate=0.01,
                                         gamma=0,
                                         subsample=0.8,
                                         colsample_bytree=0.8,
                                         nthread=10,
                                         scale_pos_weight=1,
                                         seed=27))
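A short usage sketch for clf1, assuming a train/test split like the one built inside adjust_model; the macro-F1 check mirrors section 1:

# fit the tuned model and check macro-F1 on the held-out split
clf1.fit(X_train, y_train)
pred_y = clf1.predict(X_test)
print(f1_score(y_test, pred_y, average="macro"))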
Tuning notes:
- max_depth: searched range(3,10,2) and range(9,13,2); settled on 11
- min_child_weight: searched range(2,13,2) and range(1,6,2); settled on 1

Reference parameter set (kept here for comparison):
learning_rate=0.01,
n_estimators=5000,
max_depth=4,
min_child_weight=6,
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
reg_alpha=0.005,
objective='binary:logistic',
nthread=4,
scale_pos_weight=1,
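The searched ranges above drop straight into the GridSearchCV pattern from adjust_model; a hypothetical joint grid (the estimator__ prefix again targets the XGBClassifier wrapped by OneVsRestClassifier):

# joint search over the two hyper-parameters noted above,
# reusing the OneVsRestClassifier `model` and split from adjust_model
param_test2 = {'estimator__max_depth': range(3, 10, 2),
               'estimator__min_child_weight': range(1, 6, 2)}
gsearch2 = GridSearchCV(model, param_grid=param_test2, scoring='roc_auc', n_jobs=20, cv=5, verbose=2)
gsearch2.fit(X_train, y_train)
print(gsearch2.best_params_, gsearch2.best_score_)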
4 Trying deep learning models
- FastText, TextCNN
- Doc2Vec + classifier (binary classification; a place to look for a baseline, sketched below): http://linanqiu.github.io/2015/10/07/word2vec-sentiment/
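A minimal sketch of the Doc2Vec-plus-classifier idea from the link above, assuming gensim 4.x and the `train` DataFrame from section 3 (the LogisticRegression choice and all hyper-parameters are my placeholders):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression

# tag each document with its row index so its vector can be looked up later
docs = [TaggedDocument(words=text.split(), tags=[i])
        for i, text in enumerate(train['text'])]
d2v = Doc2Vec(docs, vector_size=100, window=5, min_count=2, epochs=20)
# document vectors become features for any downstream classifier
X = [d2v.dv[i] for i in range(len(docs))]
clf = LogisticRegression(max_iter=1000).fit(X, train['label'])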
5 Deep learning
Rank-1 open-source solution for the news text classification competition:
https://github.com/kangyishuai/NEWS-TEXT-CLASSIFICATION
Rank-1 open-source solution for the DataGrand text classification competition:
https://github.com/ShawnyXiao/2018-DC-DataGrand-TextIntelProcess
DataGrand top-10 open-source solution (fuses deep learning with traditional models):
https://github.com/moneyDboat/data_grand
5.1 TextCNN
(1) Summary
- GloVe + word2vec embeddings improve over word2vec alone by 0.1
- SGD is the current optimizer choice; with Adam the model falls apart (online score drops to 0.40+)
- Embedding dimension still needs tuning: 64 beats 100 by 0.1; glove_word_150epoch => val_loss = 0.21 (see the sketch after this list)
- Cross-validation + word2vc_glove_128_sgd: waiting to see the online score
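To make the notes above concrete, here is a minimal TextCNN sketch in Keras; the framework choice and the vocab_size, seq_len and num_classes values are all my placeholders, while embed_dim=64 and the SGD optimizer follow the summary above:

import tensorflow as tf
from tensorflow.keras import layers, Model

def build_textcnn(vocab_size=7000, seq_len=1000, embed_dim=64, num_classes=14):
    inp = layers.Input(shape=(seq_len,))
    emb = layers.Embedding(vocab_size, embed_dim)(inp)
    # parallel convolutions with different kernel sizes, each max-pooled over time
    pools = []
    for k in (3, 4, 5):
        conv = layers.Conv1D(128, k, activation='relu')(emb)
        pools.append(layers.GlobalMaxPooling1D()(conv))
    x = layers.Concatenate()(pools)
    x = layers.Dropout(0.5)(x)
    out = layers.Dense(num_classes, activation='softmax')(x)
    model = Model(inp, out)
    # SGD rather than Adam, per the summary note above
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model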
(2) Next steps
- Get a few of the DataGrand top-10 models running
- Model ensembling
- TextCNN + Attention:
https://github.com/tcxdgit/cnn_multilabel_classification/blob/master/cnn_attention_model.py