数据格式
0 涬 芸 飛 铤新规画带洄䒸不凭感觉不靠運氣数据支撑全天⑤⑥⑦⑧码精準人工計划 导師Q企鹅480504288
1 项目名称《设计项目》,项目编号 务复评提醒ii,您好! 购买标书,并至少提前4周申请投标保函开立“,自评已完成,请及时复评,谢谢! 登录地址:http:// 温馨提示:本邮件由系统自动生成,请勿直接回复。
2 项目名称。。。。。
3 项目名称。。。。。
4 项目名称。。。。。
5 项目名称。。。。。
第一列为label,第二列为空格,剩下的为文本。
6个类别影响了fasttext的label和textcnn的输入输出神经元数量
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
'''
@File : model.py
@Time : 2020/09/23 17:42:25
@Author : 陈培杞
@Version : 1.0
'''
import re
import time
import jieba
import joblib
import functools
import subprocess
import numpy as np
import pandas as pd
from memory_profiler import profile
import fasttext
import xgboost as xgb
import lightgbm as lgbm
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.metrics import classification_report
from tensorflow import keras
from tensorflow.keras.models import Model as Model_
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, concatenate, Flatten, Dropout, Input, Dense
import argparse
parser = argparse.ArgumentParser()
# NOTE(review): no type= or action= is given, so any value supplied on the
# command line arrives as a (truthy) string — even "--dataprocess False"
# triggers reprocessing inside dataProcess(); only the default (bool False)
# skips it. Confirm whether action='store_true' was intended.
parser.add_argument('--dataprocess', default=False, help='数据重新处理')
args = parser.parse_args()

# Raw input files and the intermediate files produced by dataProcess().
CONTENTTRAIN = './data/content.train'
CONTENTTEST = './data/content.test'
CACTRANSTRAIN = './data/cactrans.train'
CACTRANSTEST = './data/cactrans.test'
# Segmented text / label files for the sklearn-style models.
TRAINDATA = './data/train.data'
TRAINLABEL = './data/train.label'
TESTDATA = './data/test.data'
TESTLABEL = './data/test.label'
# "__label__X <tokens>" formatted files for fasttext.
FTTRAINDATA = './data/fasttext.train'
FTTESTDATA = './data/fasttext.test'
# Persisted model artefacts.
TFIDFMODEL = './model/tfidf.pkl'
SVMMODEL = './model/svm.pkl'
FTMODEL = './model/fasttext.ftz'
XGBOOSTMODEL = './model/xgboost.pkl'
LIGHTGBMMODEL = './model/lightgbm.pkl'
def timeCost(func):
    """Decorator that prints the wall-clock time each call to *func* takes.

    Fix: the original wrapper discarded *func*'s return value; it is now
    propagated to the caller.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        time_start = time.time()
        result = func(*args, **kwargs)
        # ANSI magenta so the timing line stands out in the training log.
        print('\033[1;35m time cost \033[0m', time.time() - time_start, 's', end='\n\n')
        return result
    return wrapper
class Model(object):
    """Base class shared by the sklearn-style classifiers in this file.

    Subclasses assign ``self.model`` (an estimator exposing fit/predict)
    and ``self.modelPath`` (the joblib persistence path).
    """

    def __init__(self):
        self.model = None
        self.modelPath = './model.pkl'

    def save(self):
        # Nothing to persist until a model has been assigned.
        if not self.model:
            return
        joblib.dump(self.model, self.modelPath)

    def load(self):
        self.model = joblib.load(self.modelPath)

    @timeCost
    def fit_transform(self, X_train, X_test, y_train, y_test):
        """Fit on the training split, persist the estimator, then print a
        classification report for the held-out test split."""
        self.model.fit(X_train, y_train)
        self.save()
        predictions = self.model.predict(X_test)
        self.report(y_test, predictions)

    def report(self, y_true, y_pre):
        # Precision/recall/F1 per class, printed to stdout.
        print(classification_report(y_true, y_pre))
class SvmModel(Model):
    """Support-vector classifier persisted to SVMMODEL.

    Uses SVC() with library defaults; only the persistence path differs
    from the base class.
    """

    def __init__(self):
        self.modelPath = SVMMODEL
        self.model = SVC()
class FasttextModel(Model):
    # Uses an already-trained fasttext model for prediction only; no new
    # model is trained here.
    def __init__(self):
        self.model = fasttext.load_model(FTMODEL)

    @timeCost
    def fit_transform(self):
        """Evaluate the pretrained fasttext model on FTTESTDATA and print
        a classification report.

        Each line of FTTESTDATA is ``__label__X <tokens...>``: '__label__'
        is 9 characters, so index 9 is the label character and the text
        starts at index 11.

        NOTE(review): the ``labels`` dict maps '__label__a'..'__label__f'
        to 0..5, while dataProcess() writes digit labels ('__label__0'..).
        Confirm which label set the shipped .ftz model actually emits —
        a digit-labelled model would raise KeyError here, and int(line[9])
        would fail on letter-labelled test files.
        """
        y_test = []
        y_pre = []
        labels = {
            '__label__a':0,
            '__label__b':1,
            '__label__c':2,
            '__label__d':3,
            '__label__e':4,
            '__label__f':5,
        }
        with open(FTTESTDATA, 'r') as f:
            for line in f:
                # Fixed-offset parse of "__label__X <text>".
                y_test.append(int(line[9]))
                content = line[11:].replace('\n', '')
                # predict() returns (labels, probabilities); take the top label.
                y_pre.append(labels[self.model.predict(content)[0][0]])
        self.report(y_test, y_pre)
class TextCNNModel(Model):
    """TextCNN classifier: three parallel Conv1D branches (kernel sizes
    3, 4 and 5) over a word-embedding layer, concatenated and fed to a
    6-way softmax (one output unit per label class)."""

    def __init__(self):
        pass

    def TextCNN_model_1(self, vocab, x_train_padded_seqs, y_train, x_test_padded_seqs, y_test):
        """Build, train and evaluate the TextCNN.

        vocab: Tokenizer.word_index mapping; its size fixes the embedding
        input dimension. Sequences are expected padded to length 50.
        """
        main_input = Input(shape=(50,), dtype='float64')
        # NOTE(review): trainable=False without a pretrained weight matrix
        # freezes a randomly initialised embedding — confirm intended.
        embedder = Embedding(len(vocab) + 1, 300, input_length=50, trainable=False)
        embed = embedder(main_input)
        # Branch 1: kernel size 3; 'same' padding keeps length 50, then the
        # large pool collapses each filter to (almost) a single value.
        cnn1 = Conv1D(256, 3, padding='same', strides=1, activation='relu')(embed)
        cnn1 = MaxPooling1D(pool_size=48)(cnn1)
        # Branch 2: kernel size 4.
        cnn2 = Conv1D(256, 4, padding='same', strides=1, activation='relu')(embed)
        cnn2 = MaxPooling1D(pool_size=47)(cnn2)
        # Branch 3: kernel size 5.
        cnn3 = Conv1D(256, 5, padding='same', strides=1, activation='relu')(embed)
        cnn3 = MaxPooling1D(pool_size=46)(cnn3)
        cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
        flat = Flatten()(cnn)
        drop = Dropout(0.2)(flat)
        # 6 units — one per label class (see the data-format notes at the
        # top of the file).
        main_output = Dense(6, activation='softmax')(drop)
        model = Model_(inputs=main_input, outputs=main_output)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        # y_train holds digit strings ('0'..'5'); to_categorical coerces
        # them to ints before one-hot encoding.
        one_hot_labels = keras.utils.to_categorical(y_train, num_classes=6)
        model.fit(x_train_padded_seqs, one_hot_labels, batch_size=800, epochs=20, verbose=1)
        result = model.predict(x_test_padded_seqs)
        result_labels = np.argmax(result, axis=1)
        # Back to strings so predictions compare against the string labels
        # in y_test inside classification_report.
        y_predict = list(map(str, result_labels))
        self.report(y_test, y_predict)

    @timeCost
    def fit_transform(self, X_train, X_test, y_train, y_test):
        """Tokenize the raw corpora, pad ids to length 50 and delegate to
        TextCNN_model_1. The tokenizer is fit on the training split only."""
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X_train)
        vocab = tokenizer.word_index
        x_train_word_ids = tokenizer.texts_to_sequences(X_train)
        x_test_word_ids = tokenizer.texts_to_sequences(X_test)
        x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=50)
        x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=50)
        self.TextCNN_model_1(vocab, x_train_padded_seqs, y_train, x_test_padded_seqs, y_test)
class LightgbmModel(Model):
    """Multiclass LightGBM classifier persisted to LIGHTGBMMODEL."""

    def __init__(self):
        # One hyper-parameter per line for readability; values unchanged.
        self.model = lgbm.LGBMClassifier(
            objective='multiclass',
            verbose=-1,
            learning_rate=0.5,
            max_depth=20,
            num_leaves=50,
            n_estimators=120,
            max_bin=2000,
        )
        self.modelPath = LIGHTGBMMODEL
class XgboostModel(Model):
    """Gradient-boosted tree classifier (XGBoost) persisted to XGBOOSTMODEL."""

    def __init__(self):
        # One hyper-parameter per line for readability; values unchanged.
        self.model = xgb.XGBClassifier(
            max_depth=7,
            n_estimators=200,
            colsample_bytree=0.8,
            subsample=0.8,
            nthread=10,
            learning_rate=0.1,
        )
        self.modelPath = XGBOOSTMODEL
def TFIDF(corpus):
    """Fit a TF-IDF vectorizer on *corpus*, persist it to TFIDFMODEL and
    return it.

    token_pattern keeps single-character tokens (relevant for Chinese text
    after jieba segmentation); max_df=0.5 drops very common tokens.

    Improvement: the fitted vectorizer is now returned (the original
    returned None), so callers can use it directly instead of reloading
    it from disk; existing callers that ignore the return value are
    unaffected.
    """
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", max_df=0.5)
    tfidfModel = vectorizer.fit(corpus)
    joblib.dump(tfidfModel, TFIDFMODEL)
    return tfidfModel
def dataProcess(reProcess=True):
    """Build and return (X_train, X_test, y_train, y_test) token corpora.

    With reProcess equal to bool False the previously generated files are
    re-read; otherwise the raw content files are cleaned, segmented with
    jieba and written out in every format the downstream models need
    (fasttext text files plus separate data/label files).

    Fix: every file is now opened via ``with`` so handles are closed even
    on error — the original leaked the four read handles in the cached
    branch and closed the six writer handles without try/finally.
    """
    def process(contentPath, cactransPath, XHandle, yHandle, ftHandle):
        # Strip everything after the '[TID]' marker. Kept as a shell
        # pipeline for parity with the original; paths are trusted module
        # constants, not user input.
        subprocess.call("cat %s | awk -F'\\[TID\\]' '{print $1}' > %s" % (contentPath, cactransPath), shell=True)
        labels = []
        corpus = []
        with open(cactransPath, 'r') as f:
            for line in f:
                label = line[0]  # first character is the class id
                # Collapse non-word runs to single spaces, drop underscores.
                content = re.sub(r'\W+', ' ', line[1:]).replace("_", '')
                tokenList = [token for token in jieba.cut(content) if token != ' ']
                tmpcorpus = " ".join(tokenList)
                # fasttext / textcnn format
                print(f"__label__{label} {tmpcorpus}", file=ftHandle)
                # svm / lightgbm / xgboost format
                print(f'{label}', file=yHandle)
                print(f'{tmpcorpus}', file=XHandle)
                labels.append(label)
                corpus.append(tmpcorpus)
        return corpus, labels

    # Literal ``== False`` comparison preserved on purpose: a string value
    # coming from argparse (even "False") must still trigger reprocessing,
    # matching the original behavior.
    if reProcess == False:
        with open(TRAINDATA, 'r') as f:
            X_train = f.readlines()
        with open(TESTDATA, 'r') as f:
            X_test = f.readlines()
        with open(TRAINLABEL, 'r') as f:
            y_train = f.read().split('\n')[:-1]
        with open(TESTLABEL, 'r') as f:
            y_test = f.read().split('\n')[:-1]
    else:
        with open(FTTRAINDATA, 'w') as ft_trainHandle, \
             open(TRAINDATA, 'w') as X_trainHandle, \
             open(TRAINLABEL, 'w') as y_trainHandle:
            X_train, y_train = process(CONTENTTRAIN, CACTRANSTRAIN, X_trainHandle, y_trainHandle, ft_trainHandle)
        # Fit (and persist) the TF-IDF vocabulary on the training corpus only.
        TFIDF(X_train)
        with open(FTTESTDATA, 'w') as ft_testHandle, \
             open(TESTDATA, 'w') as X_testHandle, \
             open(TESTLABEL, 'w') as y_testHandle:
            X_test, y_test = process(CONTENTTEST, CACTRANSTEST, X_testHandle, y_testHandle, ft_testHandle)
    return X_train, X_test, y_train, y_test
@profile
def main(X_train, X_test, y_train, y_test):
    """Train/evaluate every model on the prepared splits.

    fasttext and TextCNN consume the raw token strings; the remaining
    models consume TF-IDF features.

    Fix: the TF-IDF model is now loaded here instead of relying on a
    module-level ``tfidfModel`` global that only exists when the file is
    run as a script (calling main() from an import previously raised
    NameError).
    """
    FasttextModel().fit_transform()
    TextCNNModel().fit_transform(X_train, X_test, y_train, y_test)
    # Vectorize once for all TF-IDF based models.
    tfidfModel = joblib.load(TFIDFMODEL)
    X_train = tfidfModel.transform(X_train)
    X_test = tfidfModel.transform(X_test)
    LightgbmModel().fit_transform(X_train, X_test, y_train, y_test)
    SvmModel().fit_transform(X_train, X_test, y_train, y_test)
    XgboostModel().fit_transform(X_train, X_test, y_train, y_test)
if __name__=='__main__':
    # NOTE(review): args.dataprocess is a string whenever the flag is
    # passed on the command line, and dataProcess compares it with
    # ``== False`` — so any supplied value (even "False") triggers
    # reprocessing; only the default skips it. Confirm intended.
    X_train, X_test, y_train, y_test = dataProcess(args.dataprocess)
    # Loaded for use inside main(); requires TFIDF() to have been run at
    # least once so the pickle exists.
    tfidfModel = joblib.load(TFIDFMODEL)
    main(X_train, X_test, y_train, y_test)