中文垃圾邮件分类识别 - 《自然语言处理(NLP)》

需求：
实现流程：
代码实现：

需求：

基于已有的邮件构建模型，对新收到的邮件进行分类，识别出该邮件是否为垃圾邮件。

实现流程：

1、收集历史邮件
2、对中文邮件内容进行分词
3、矩阵化文本
4、训练SVM模型
5、使用构建好的模型对新收到的邮件进行分类识别。

代码实现：

# coding=utf-8
import joblib
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

# 打开文件，加载数据集
def loadData():
    """Load the ham and spam corpora and build matching class labels.

    Reads ``ham_data.txt`` and ``spam_data.txt`` from the current working
    directory, one e-mail per line (line terminators are kept).

    Returns:
        tuple: ``(train, label)`` where ``train`` is the list of ham lines
        followed by the spam lines, and ``label`` is a list of floats of the
        same length: ``1.0`` for every ham line, ``0.0`` for every spam line.
    """
    with open("ham_data.txt", "r", encoding="utf-8") as hf, \
            open("spam_data.txt", "r", encoding="utf-8") as spf:
        hfdata = hf.readlines()
        spdata = spf.readlines()
    # Label the e-mails: 1.0 = normal mail (ham), 0.0 = spam.
    hf_label = [1.0] * len(hfdata)
    sp_label = [0.0] * len(spdata)
    # Features and classes are concatenated in the same order (ham, then
    # spam) so that train[i] always corresponds to label[i].
    train = hfdata + spdata
    label = hf_label + sp_label
    return train, label

# 分词
def participle():
    """Segment every training e-mail into words and remove stop words.

    Loads the stop-word list from ``../关键词提取/baidu_stopwords.txt``,
    obtains the corpus from ``loadData()``, cuts each line with
    ``jieba.lcut`` and drops tokens found in the stop-word list.

    Returns:
        tuple: ``(rs, label)`` where ``rs`` holds one string per e-mail with
        the remaining tokens separated by single spaces, and ``label`` is the
        label list returned by ``loadData()`` unchanged.
    """
    # Load the stop-word table; a set keeps the per-token membership test
    # cheap.
    with open("../关键词提取/baidu_stopwords.txt", "r", encoding="utf-8") as sf:
        stopwords = {word.strip("\r\t\n") for word in sf}
    train, label = loadData()
    rs = []
    for line in train:
        stripped = (c.strip() for c in jieba.lcut(line) if c not in stopwords)
        # Join with a space (not ""), so the vectorizer sees the jieba tokens
        # as separate words -- the same format processing() feeds it at
        # prediction time. Tokens that were pure whitespace are dropped.
        rs.append(" ".join(token for token in stripped if token))
    return rs, label

# 向量化
def vectorization():
    """Vectorize the segmented corpus, train an SVM and persist both.

    Fits a ``TfidfVectorizer`` on the output of ``participle()``, trains an
    RBF-kernel ``SVC`` on the resulting matrix, then writes the fitted
    vectorizer to ``tfidf_ec.m`` and the model to ``svm_ec.m`` with joblib.
    Prints the training matrix and a completion message; returns nothing.
    """
    train, label = participle()
    # Instantiate and fit TF-IDF on the training texts.
    # NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens
    # of two or more word characters, so single-character Chinese words are
    # ignored -- confirm this is intended.
    tfidf = TfidfVectorizer()
    tfidf.fit(train, label)
    # Turn the texts into a TF-IDF matrix.
    train_data = tfidf.transform(train)
    print(train_data)
    # Build the model: support vector machine.
    model = SVC(kernel="rbf")
    model.fit(train_data, label)

    # Persist the fitted vectorizer so new mail is mapped with the same
    # vocabulary that was used for training.
    joblib.dump(filename="tfidf_ec.m", value=tfidf)
    # Persist the trained model.
    joblib.dump(filename="svm_ec.m", value=model)
    print("模型训练完成")

# 通过训练好的模型对测试邮件进行分类识别
def processing(newEmail="快来下载吧"):
    """Classify one e-mail with the persisted vectorizer and SVM model.

    Args:
        newEmail: Text of the e-mail to classify. Defaults to the sample
            message that was previously hard-coded in this function.

    Returns:
        str: ``"正常邮件"`` when the model predicts class 1, otherwise
        ``"垃圾邮件"``. The same label is also printed.
    """
    # Load the fitted vectorizer and the trained model.
    # NOTE: joblib.load is pickle-based; only load files you created or
    # otherwise trust.
    tfidf = joblib.load("tfidf_ec.m")
    model = joblib.load("svm_ec.m")
    # Load the stop-word table.
    with open("../关键词提取/baidu_stopwords.txt", "r", encoding="utf-8") as sf:
        stopwords = {word.strip("\r\t\n") for word in sf}
    # Segment the mail, drop stop words, and rebuild a space-separated string
    # for the vectorizer.
    words = [word for word in jieba.lcut(newEmail) if word not in stopwords]
    newEmail_mat = tfidf.transform([" ".join(words)])
    # Predict the class; predict() returns an array with one entry per
    # input row, and exactly one row is passed in.
    last = model.predict(newEmail_mat)
    last_label = "正常邮件" if last[0] == 1 else "垃圾邮件"
    print("该邮件属于：", last_label)
    return last_label

if __name__ == '__main__':
    # Run vectorization() once first to train the model and write
    # tfidf_ec.m / svm_ec.m; afterwards only processing() is needed.
    # vectorization()
    processing()

数据集：

ham_data.txt

spam_data.txt

baidu_stopwords.txt