定义树结构
/*** 树的节点* 保存了:当前字符c和子树next*/public class Word implements Comparable<Word> {    public char c;    public List next = null;    public Word(char c) {        this.c = c;    }    @Override    public int compareTo(Word word) {        return c - word.c;    }    public String toString() {        return c + "(" + (next == null ? null : next.size()) + ")";    }}
多叉树某一层所有树节点的集合
/** * 多叉树某一层所有树节点的集合 */public class List extends ArrayList<Word> {    public Word get(char c) {        for (Word w : this) {            if (w.c == c) return w;        }        return null;    }    /**     * 二分查找,必须先升序排序     *     * @param c 需要查找的字符     * @return Word对象:如果找到   null:如果没找到     */    public Word binaryGet(char c) {        int left, right, key;        Word word;        left = 0;        right = this.size() - 1;        while (left <= right) {            key = (left + right) / 2;            word = get(key);            if (word.c == c) {                return word;            } else if (word.c > c) {                right = key - 1;            } else {                left = key + 1;            }        }        return null;    }    public Word add(char c) {        Word word = new Word(c);        super.add(word);        return word;    }}
DFA算法核心
import java.io.*;import java.util.ArrayList;import java.util.Collections;/** * DFA算法核心 */public final class SensitiveWordFilter {    //加载完后的敏感词多叉树    public static List wordList;    private final static char replace = '*'; // 替代字符    private final static char[] skip = new char[]{ // 遇到这些字符就会跳过,例如,如果"AB"是敏感词,那么"A B","A=B"也会被屏蔽            '!', '*', '-', '+', '_', '=', ',', '.', '@'    };    /**     * 敏感词替换     *     * @param text 待替换文本     * @return 替换后的文本     */    public static String Filter(String text) {        if (wordList == null || wordList.size() == 0) return text;        char[] __char__ = text.toCharArray(); // 把String转化成char数组,便于遍历        int i, j;        Word word;        boolean flag; // 是否需要替换        for (i = 0; i < __char__.length; i++) { // 遍历所有字符            char c = __char__[i];            word = wordList.binaryGet(c); // 使用二分查找来寻找字符,提高效率            if (word != null) { // word != null说明找到了                flag = false;                j = i + 1;                while (j < __char__.length) { // 开始逐个比较后面的字符                    if (skip(__char__[j])) { // 跳过空格之类的无关字符                        j++;                        continue;                    }                    if (word.next != null) { // 字符串尚未结束,不确定是否存在敏感词                        /*                        以下代码并没有使用二分查找,因为以同一个字符开头的敏感词较少                        例如,wordList中记录了所有敏感词的开头第一个字,它的数量通常会有上千个                        假如现在锁定了字符“T”开头的敏感词,而“T”开头的敏感词只有10个,这时使用二分查找的效率反而低于顺序查找                         */                        word = word.next.get(__char__[j]);                        if (word == null) {                            break;                        }                        j++;                    } else { // 字符串已结束,存在敏感词汇                        flag = true;                        break;                    }                }                if (word != null && word.next == null) {                    flag = true;                }                if (flag) { // 如果flag==true,说明检测出敏感粗,需要替换                    while (i < j) {                        if (skip(__char__[i])) { // 跳过空格之类的无关字符,如果要把空格也替换成'*',则删除这个if语句                            i++;                            continue;                        }                        __char__[i] = replace;                        i++;                    }                    i--;                }            }        }        return new String(__char__);    }    /**     * 加载敏感词列表     *     * @param words 敏感词数组     */    public static void loadWord(ArrayList<String> words) {        if (words == null) return;        char[] chars;        List now;        Word word;        wordList = new List();        for (String __word__ : words) {            if (__word__ == null) {                continue;            }            chars = __word__.toCharArray();            now = wordList;            word = null;            for (char c : chars) {                if (word != null) {                    if (word.next == null) {                        word.next = new List();                    }                    now = word.next;                }                word = now.get(c);                if (word == null) {                    word = now.add(c);                }            }        }        sort(wordList);    }    /**     * 加载敏感词txt文件,每个敏感词独占一行,不可出现空格,空行,逗号等非文字内容,必须使用UTF-8编码     *     * @param path txt文件的绝对地址     */    public static void loadWordFromFile(String path) {        String encoding = "UTF-8";        File file = new File(path);        try {            if (file.isFile() && file.exists()) {                InputStreamReader inputStreamReader = new InputStreamReader(                        new FileInputStream(file), encoding                );                BufferedReader bufferedReader = new BufferedReader(inputStreamReader);                String line;                ArrayList<String> list = new ArrayList<>();                while ((line = bufferedReader.readLine()) != null) {                    list.add(line);                }                bufferedReader.close();                inputStreamReader.close();                loadWord(list);            }        } catch (IOException e) {            e.printStackTrace();        }    }    /**     * 对敏感词多叉树递增排序     *     * @param list 待排序List     */    private static void sort(List list) {        if (list == null) return;        Collections.sort(list); // 递增排序        for (Word word : list) {            sort(word.next);        }    }    /**     * 判断是否跳过当前字符     *     * @param c 待检测字符     * @return true:需要跳过   false:不需要跳过     */    private static boolean skip(char c) {        for (char c1 : skip) {            if (c1 == c) return true;        }        return false;    }}
测试代码
public class Main {    public static void main(String[] args) throws Exception {        SensitiveWordFilter.loadWordFromFile("C:\\code\\test\\SensitiveWord\\SensitiveWordList.txt");        Scanner sc = new Scanner(System.in);        while (sc.hasNextLine()){            String line = sc.nextLine();            if (line.equals("bye")){                break;            }            String filter = SensitiveWordFilter.Filter(line);            System.out.println("结果:"+filter);        }    }}
测试结果示例
敏感词文件示例
机吧TMD
参考
原代码地址,本文只做总结搬运