数据格式

    1. 0 铤新规画带洄䒸不凭感觉不靠運氣数据支撑全天⑤⑥⑦⑧码精準人工計划 导師Q企鹅480504288
    2. 1 项目名称《设计项目》,项目编号 务复评提醒ii,您好! 购买标书,并至少提前4周申请投标保函开立“,自评已完成,请及时复评,谢谢! 登录地址:http:// 温馨提示:本邮件由系统自动生成,请勿直接回复。
    3. 2 项目名称。。。。。
    4. 3 项目名称。。。。。
    5. 4 项目名称。。。。。
    6. 5 项目名称。。。。。

    第一列为label,第二列为空格,剩下的为文本。
    6个类别影响了fasttext的label和textcnn的输入输出神经元数量

    1. #!/usr/bin python3
    2. # -*- encoding: utf-8 -*-
    3. '''
    4. @File : model.py
    5. @Time : 2020/09/23 17:42:25
    6. @Author : 陈培杞
    7. @Version : 1.0
    8. '''
    9. import re
    10. import time
    11. import jieba
    12. import joblib
    13. import functools
    14. import subprocess
    15. import numpy as np
    16. import pandas as pd
    17. from memory_profiler import profile
    18. import fasttext
    19. import xgboost as xgb
    20. import lightgbm as lgbm
    21. from sklearn.svm import SVC
    22. from sklearn.metrics import classification_report
    23. from sklearn.linear_model import LogisticRegression
    24. from sklearn.model_selection import train_test_split
    25. from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    26. from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
    27. from sklearn.metrics import classification_report
    28. from tensorflow import keras
    29. from tensorflow.keras.models import Model as Model_
    30. from tensorflow.keras.preprocessing.text import Tokenizer
    31. from tensorflow.keras.preprocessing.sequence import pad_sequences
    32. from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, concatenate, Flatten, Dropout, Input, Dense
    33. import argparse
# CLI flag controlling whether the derived data files are rebuilt.
# NOTE(review): the value arrives from argparse as a *string*, so any
# non-empty value — even '--dataprocess False' — is truthy and triggers
# reprocessing in dataProcess(); confirm this is the intended behavior.
parser = argparse.ArgumentParser()
parser.add_argument('--dataprocess', default=False, help='数据重新处理')
args = parser.parse_args()
# Raw input files: one '<label> <text>[TID]...' record per line (see the
# format notes at the top of this file).
CONTENTTRAIN = './data/content.train'
CONTENTTEST = './data/content.test'
# Intermediate files with the '[TID]...' tail stripped by awk in dataProcess.
CACTRANSTRAIN = './data/cactrans.train'
CACTRANSTEST = './data/cactrans.test'
# Tokenized corpora and parallel label files for the sklearn-style models.
TRAINDATA = './data/train.data'
TRAINLABEL = './data/train.label'
TESTDATA = './data/test.data'
TESTLABEL = './data/test.label'
# fastText-format files: '__label__<label> <space-joined tokens>' per line.
FTTRAINDATA = './data/fasttext.train'
FTTESTDATA = './data/fasttext.test'
# Persisted model artifacts.
TFIDFMODEL = './model/tfidf.pkl'
SVMMODEL = './model/svm.pkl'
FTMODEL = './model/fasttext.ftz'
XGBOOSTMODEL = './model/xgboost.pkl'
LIGHTGBMMODEL = './model/lightgbm.pkl'
    52. def timeCost(func):
    53. @functools.wraps(func)
    54. def wrapper(*args, **kwargs):
    55. time_start = time.time()
    56. func(*args, **kwargs)
    57. print('\033[1;35m time cost \033[0m',time.time()-time_start,'s',end='\n\n')
    58. return wrapper
    59. class Model(object):
    60. def __init__(self):
    61. self.model = None
    62. self.modelPath = './model.pkl'
    63. def save(self):
    64. if self.model:
    65. joblib.dump(self.model, self.modelPath)
    66. def load(self):
    67. self.model = joblib.load(self.modelPath)
    68. @timeCost
    69. def fit_transform(self, X_train, X_test, y_train, y_test):
    70. self.model.fit(X_train, y_train)
    71. self.save()
    72. y_pre = self.model.predict(X_test)
    73. self.report(y_test, y_pre)
    74. def report(self,y_true, y_pre):
    75. print(classification_report(y_true, y_pre))
    76. class SvmModel(Model):
    77. def __init__(self):
    78. self.model = SVC()
    79. self.modelPath = SVMMODEL
    80. class FasttextModel(Model):
    81. # 直接使用已经训练好的fasttext进行预测,不再单独训练新模型
    82. def __init__(self):
    83. self.model = fasttext.load_model(FTMODEL)
    84. @timeCost
    85. def fit_transform(self):
    86. y_test = []
    87. y_pre = []
    88. labels = {
    89. '__label__a':0,
    90. '__label__b':1,
    91. '__label__c':2,
    92. '__label__d':3,
    93. '__label__e':4,
    94. '__label__f':5,
    95. }
    96. with open(FTTESTDATA, 'r') as f:
    97. for line in f:
    98. y_test.append(int(line[9]))
    99. content = line[11:].replace('\n', '')
    100. y_pre.append(labels[self.model.predict(content)[0][0]])
    101. self.report(y_test, y_pre)
class TextCNNModel(Model):
    """TextCNN classifier: three parallel conv branches over 50-token padded sequences."""
    def __init__(self):
        # No persisted estimator here: the Keras model is built fresh in
        # TextCNN_model_1 and never saved.
        pass
    def TextCNN_model_1(self, vocab,x_train_padded_seqs,y_train,x_test_padded_seqs,y_test):
        """Build, train and evaluate the TextCNN.

        vocab: word->index mapping from the Tokenizer (sizes the embedding).
        x_*_padded_seqs: integer id sequences padded/truncated to length 50.
        y_train is one-hot encoded below; predictions are stringified before
        reporting — presumably because y_test holds string labels read from
        disk (see dataProcess) — TODO confirm.
        """
        main_input = Input(shape=(50,), dtype='float64')
        # Randomly initialized, frozen 300-d embeddings; +1 row for pad index 0.
        embedder = Embedding(len(vocab) + 1, 300, input_length=50, trainable=False)
        embed = embedder(main_input)
        # Three kernel widths (3/4/5 tokens); 'same' padding keeps length 50.
        cnn1 = Conv1D(256, 3, padding='same', strides=1, activation='relu')(embed)
        cnn1 = MaxPooling1D(pool_size=48)(cnn1)
        cnn2 = Conv1D(256, 4, padding='same', strides=1, activation='relu')(embed)
        cnn2 = MaxPooling1D(pool_size=47)(cnn2)
        cnn3 = Conv1D(256, 5, padding='same', strides=1, activation='relu')(embed)
        cnn3 = MaxPooling1D(pool_size=46)(cnn3)
        # Merge the three pooled branch outputs along the channel axis.
        cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
        flat = Flatten()(cnn)
        drop = Dropout(0.2)(flat)
        # 6 output units — one per class (see the dataset notes in the header).
        main_output = Dense(6, activation='softmax')(drop)
        model = Model_(inputs=main_input, outputs=main_output)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        one_hot_labels = keras.utils.to_categorical(y_train, num_classes=6)
        model.fit(x_train_padded_seqs, one_hot_labels, batch_size=800, epochs=20, verbose=1)
        result = model.predict(x_test_padded_seqs)
        result_labels = np.argmax(result, axis=1)
        # Stringify so predicted labels compare like-for-like with y_test.
        y_predict = list(map(str, result_labels))
        self.report(y_test, y_predict)
    @timeCost
    def fit_transform(self, X_train, X_test, y_train, y_test):
        """Tokenize raw text, pad every sequence to 50 ids, then train/evaluate."""
        tokenizer=Tokenizer()
        # Vocabulary is built from the training split only.
        tokenizer.fit_on_texts(X_train)
        vocab=tokenizer.word_index
        x_train_word_ids=tokenizer.texts_to_sequences(X_train)
        x_test_word_ids = tokenizer.texts_to_sequences(X_test)
        x_train_padded_seqs = pad_sequences(x_train_word_ids,maxlen=50)
        x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=50)
        self.TextCNN_model_1(vocab,x_train_padded_seqs,y_train,x_test_padded_seqs,y_test)
    137. class LightgbmModel(Model):
    138. def __init__(self):
    139. self.model = lgbm.LGBMClassifier(objective='multiclass', verbose=-1, learning_rate=0.5, max_depth=20, num_leaves=50, n_estimators=120, max_bin=2000,)
    140. self.modelPath =LIGHTGBMMODEL
    141. class XgboostModel(Model):
    142. def __init__(self):
    143. self.model = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
    144. subsample=0.8, nthread=10, learning_rate=0.1)
    145. self.modelPath =XGBOOSTMODEL
    146. def TFIDF(corpus):
    147. vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", max_df=0.5)
    148. tfidfModel = vectorizer.fit(corpus)
    149. joblib.dump(tfidfModel, TFIDFMODEL)
    150. def dataProcess(reProcess=True):
    151. def process(contentHandle ,cactransHandle, XHandle, yHandle, ftHandle):
    152. subprocess.call("cat %s | awk -F'\\[TID\\]' '{print $1}' > %s"%(contentHandle ,cactransHandle),shell=True)
    153. with open(cactransHandle , 'r') as f:
    154. labels = []
    155. corpus = []
    156. #vocabulary = set()
    157. for line in f:
    158. label = line[0]
    159. content = re.sub('\W+', ' ', line[1:]).replace("_", '')
    160. tokenList = list(jieba.cut(content))
    161. tokenList = [ token for token in tokenList if token!=' ']
    162. tmpcorpus = " ".join(tokenList)
    163. # fasttext, textcnn
    164. print(f"__label__{label} {tmpcorpus}", file=ftHandle)
    165. # svm, libgbm, xgboost
    166. print(f'{label}', file=yHandle)
    167. print(f'{tmpcorpus}', file=XHandle)
    168. labels.append(label)
    169. corpus.append(tmpcorpus)
    170. #vocabulary = vocabulary | set(tokenList)
    171. return corpus, labels
    172. if reProcess==False:
    173. X_train = open(TRAINDATA, 'r').readlines()
    174. X_test = open(TESTDATA, 'r').readlines()
    175. y_train = open(TRAINLABEL, 'r').read().split('\n')[:-1]
    176. y_test = open(TESTLABEL, 'r').read().split('\n')[:-1]
    177. else:
    178. ft_trainHandle, X_trainHandle, y_trainHandle = open(FTTRAINDATA,'w'), open(TRAINDATA, 'w'), open(TRAINLABEL, 'w')
    179. X_train, y_train = process(CONTENTTRAIN, CACTRANSTRAIN, X_trainHandle, y_trainHandle, ft_trainHandle)
    180. TFIDF(X_train)
    181. ft_testHandle, X_testHandle, y_testHandle = open(FTTESTDATA,'w'), open(TESTDATA, 'w'), open(TESTLABEL, 'w')
    182. X_test, y_test = process(CONTENTTEST, CACTRANSTEST, X_testHandle, y_testHandle, ft_testHandle)
    183. ft_trainHandle.close()
    184. ft_testHandle.close()
    185. X_trainHandle.close()
    186. X_testHandle.close()
    187. y_trainHandle.close()
    188. y_testHandle.close()
    189. return X_train, X_test, y_train, y_test
@profile  # memory_profiler: prints line-by-line memory usage of this function
def main(X_train, X_test, y_train, y_test):
    """Train/evaluate every model family on the prepared splits.

    fastText and TextCNN consume the raw token strings; the sklearn-style
    models consume TF-IDF features produced by the module-level `tfidfModel`
    loaded in the __main__ block of this file.
    """
    FasttextModel().fit_transform()
    TextCNNModel().fit_transform(X_train, X_test, y_train, y_test)
    # Name reuse: from here on X_train/X_test are TF-IDF feature matrices,
    # no longer lists of strings.
    X_train = tfidfModel.transform(X_train)
    X_test = tfidfModel.transform(X_test)
    LightgbmModel().fit_transform(X_train, X_test, y_train, y_test)
    SvmModel().fit_transform(X_train, X_test, y_train, y_test)
    XgboostModel().fit_transform(X_train, X_test, y_train, y_test)
if __name__=='__main__':
    # Default (no flag) passes False → dataProcess reuses the cached files;
    # any CLI value (a non-empty string) forces a full rebuild.
    X_train, X_test, y_train, y_test = dataProcess(args.dataprocess)
    # The TF-IDF model was persisted by TFIDF() during a (possibly earlier)
    # rebuild; main() reads it as a module-level global.
    tfidfModel = joblib.load(TFIDFMODEL)
    main(X_train, X_test, y_train, y_test)