定义树结构
/**
* 树的节点
* 保存了:当前字符c和子树next
*/
public class Word implements Comparable<Word> {
public char c;
public List next = null;
public Word(char c) {
this.c = c;
}
@Override
public int compareTo(Word word) {
return c - word.c;
}
public String toString() {
return c + "(" + (next == null ? null : next.size()) + ")";
}
}
多叉树某一层所有树节点的集合
/**
* 多叉树某一层所有树节点的集合
*/
public class List extends ArrayList<Word> {
public Word get(char c) {
for (Word w : this) {
if (w.c == c) return w;
}
return null;
}
/**
* 二分查找,必须先升序排序
*
* @param c 需要查找的字符
* @return Word对象:如果找到 null:如果没找到
*/
public Word binaryGet(char c) {
int left, right, key;
Word word;
left = 0;
right = this.size() - 1;
while (left <= right) {
key = (left + right) / 2;
word = get(key);
if (word.c == c) {
return word;
} else if (word.c > c) {
right = key - 1;
} else {
left = key + 1;
}
}
return null;
}
public Word add(char c) {
Word word = new Word(c);
super.add(word);
return word;
}
}
DFA算法核心
import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
/**
* DFA算法核心
*/
public final class SensitiveWordFilter {
//加载完后的敏感词多叉树
public static List wordList;
private final static char replace = '*'; // 替代字符
private final static char[] skip = new char[]{ // 遇到这些字符就会跳过,例如,如果"AB"是敏感词,那么"A B","A=B"也会被屏蔽
'!', '*', '-', '+', '_', '=', ',', '.', '@'
};
/**
* 敏感词替换
*
* @param text 待替换文本
* @return 替换后的文本
*/
public static String Filter(String text) {
if (wordList == null || wordList.size() == 0) return text;
char[] __char__ = text.toCharArray(); // 把String转化成char数组,便于遍历
int i, j;
Word word;
boolean flag; // 是否需要替换
for (i = 0; i < __char__.length; i++) { // 遍历所有字符
char c = __char__[i];
word = wordList.binaryGet(c); // 使用二分查找来寻找字符,提高效率
if (word != null) { // word != null说明找到了
flag = false;
j = i + 1;
while (j < __char__.length) { // 开始逐个比较后面的字符
if (skip(__char__[j])) { // 跳过空格之类的无关字符
j++;
continue;
}
if (word.next != null) { // 字符串尚未结束,不确定是否存在敏感词
/*
以下代码并没有使用二分查找,因为以同一个字符开头的敏感词较少
例如,wordList中记录了所有敏感词的开头第一个字,它的数量通常会有上千个
假如现在锁定了字符“T”开头的敏感词,而“T”开头的敏感词只有10个,这时使用二分查找的效率反而低于顺序查找
*/
word = word.next.get(__char__[j]);
if (word == null) {
break;
}
j++;
} else { // 字符串已结束,存在敏感词汇
flag = true;
break;
}
}
if (word != null && word.next == null) {
flag = true;
}
if (flag) { // 如果flag==true,说明检测出敏感粗,需要替换
while (i < j) {
if (skip(__char__[i])) { // 跳过空格之类的无关字符,如果要把空格也替换成'*',则删除这个if语句
i++;
continue;
}
__char__[i] = replace;
i++;
}
i--;
}
}
}
return new String(__char__);
}
/**
* 加载敏感词列表
*
* @param words 敏感词数组
*/
public static void loadWord(ArrayList<String> words) {
if (words == null) return;
char[] chars;
List now;
Word word;
wordList = new List();
for (String __word__ : words) {
if (__word__ == null) {
continue;
}
chars = __word__.toCharArray();
now = wordList;
word = null;
for (char c : chars) {
if (word != null) {
if (word.next == null) {
word.next = new List();
}
now = word.next;
}
word = now.get(c);
if (word == null) {
word = now.add(c);
}
}
}
sort(wordList);
}
/**
* 加载敏感词txt文件,每个敏感词独占一行,不可出现空格,空行,逗号等非文字内容,必须使用UTF-8编码
*
* @param path txt文件的绝对地址
*/
public static void loadWordFromFile(String path) {
String encoding = "UTF-8";
File file = new File(path);
try {
if (file.isFile() && file.exists()) {
InputStreamReader inputStreamReader = new InputStreamReader(
new FileInputStream(file), encoding
);
BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
String line;
ArrayList<String> list = new ArrayList<>();
while ((line = bufferedReader.readLine()) != null) {
list.add(line);
}
bufferedReader.close();
inputStreamReader.close();
loadWord(list);
}
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* 对敏感词多叉树递增排序
*
* @param list 待排序List
*/
private static void sort(List list) {
if (list == null) return;
Collections.sort(list); // 递增排序
for (Word word : list) {
sort(word.next);
}
}
/**
* 判断是否跳过当前字符
*
* @param c 待检测字符
* @return true:需要跳过 false:不需要跳过
*/
private static boolean skip(char c) {
for (char c1 : skip) {
if (c1 == c) return true;
}
return false;
}
}
测试代码
public class Main {
public static void main(String[] args) throws Exception {
SensitiveWordFilter.loadWordFromFile("C:\\code\\test\\SensitiveWord\\SensitiveWordList.txt");
Scanner sc = new Scanner(System.in);
while (sc.hasNextLine()){
String line = sc.nextLine();
if (line.equals("bye")){
break;
}
String filter = SensitiveWordFilter.Filter(line);
System.out.println("结果:"+filter);
}
}
}
测试结果示例
敏感词文件示例
机吧
TMD
参考
原代码地址,本文只做总结搬运