定义树结构

  1. /**
  2. * 树的节点
  3. * 保存了:当前字符c和子树next
  4. */
  5. public class Word implements Comparable<Word> {
  6. public char c;
  7. public List next = null;
  8. public Word(char c) {
  9. this.c = c;
  10. }
  11. @Override
  12. public int compareTo(Word word) {
  13. return c - word.c;
  14. }
  15. public String toString() {
  16. return c + "(" + (next == null ? null : next.size()) + ")";
  17. }
  18. }

多叉树某一层所有树节点的集合

  1. /**
  2. * 多叉树某一层所有树节点的集合
  3. */
  4. public class List extends ArrayList<Word> {
  5. public Word get(char c) {
  6. for (Word w : this) {
  7. if (w.c == c) return w;
  8. }
  9. return null;
  10. }
  11. /**
  12. * 二分查找,必须先升序排序
  13. *
  14. * @param c 需要查找的字符
  15. * @return Word对象:如果找到 null:如果没找到
  16. */
  17. public Word binaryGet(char c) {
  18. int left, right, key;
  19. Word word;
  20. left = 0;
  21. right = this.size() - 1;
  22. while (left <= right) {
  23. key = (left + right) / 2;
  24. word = get(key);
  25. if (word.c == c) {
  26. return word;
  27. } else if (word.c > c) {
  28. right = key - 1;
  29. } else {
  30. left = key + 1;
  31. }
  32. }
  33. return null;
  34. }
  35. public Word add(char c) {
  36. Word word = new Word(c);
  37. super.add(word);
  38. return word;
  39. }
  40. }

DFA算法核心

  1. import java.io.*;
  2. import java.util.ArrayList;
  3. import java.util.Collections;
  4. /**
  5. * DFA算法核心
  6. */
  7. public final class SensitiveWordFilter {
  8. //加载完后的敏感词多叉树
  9. public static List wordList;
  10. private final static char replace = '*'; // 替代字符
  11. private final static char[] skip = new char[]{ // 遇到这些字符就会跳过,例如,如果"AB"是敏感词,那么"A B","A=B"也会被屏蔽
  12. '!', '*', '-', '+', '_', '=', ',', '.', '@'
  13. };
  14. /**
  15. * 敏感词替换
  16. *
  17. * @param text 待替换文本
  18. * @return 替换后的文本
  19. */
  20. public static String Filter(String text) {
  21. if (wordList == null || wordList.size() == 0) return text;
  22. char[] __char__ = text.toCharArray(); // 把String转化成char数组,便于遍历
  23. int i, j;
  24. Word word;
  25. boolean flag; // 是否需要替换
  26. for (i = 0; i < __char__.length; i++) { // 遍历所有字符
  27. char c = __char__[i];
  28. word = wordList.binaryGet(c); // 使用二分查找来寻找字符,提高效率
  29. if (word != null) { // word != null说明找到了
  30. flag = false;
  31. j = i + 1;
  32. while (j < __char__.length) { // 开始逐个比较后面的字符
  33. if (skip(__char__[j])) { // 跳过空格之类的无关字符
  34. j++;
  35. continue;
  36. }
  37. if (word.next != null) { // 字符串尚未结束,不确定是否存在敏感词
  38. /*
  39. 以下代码并没有使用二分查找,因为以同一个字符开头的敏感词较少
  40. 例如,wordList中记录了所有敏感词的开头第一个字,它的数量通常会有上千个
  41. 假如现在锁定了字符“T”开头的敏感词,而“T”开头的敏感词只有10个,这时使用二分查找的效率反而低于顺序查找
  42. */
  43. word = word.next.get(__char__[j]);
  44. if (word == null) {
  45. break;
  46. }
  47. j++;
  48. } else { // 字符串已结束,存在敏感词汇
  49. flag = true;
  50. break;
  51. }
  52. }
  53. if (word != null && word.next == null) {
  54. flag = true;
  55. }
  56. if (flag) { // 如果flag==true,说明检测出敏感粗,需要替换
  57. while (i < j) {
  58. if (skip(__char__[i])) { // 跳过空格之类的无关字符,如果要把空格也替换成'*',则删除这个if语句
  59. i++;
  60. continue;
  61. }
  62. __char__[i] = replace;
  63. i++;
  64. }
  65. i--;
  66. }
  67. }
  68. }
  69. return new String(__char__);
  70. }
  71. /**
  72. * 加载敏感词列表
  73. *
  74. * @param words 敏感词数组
  75. */
  76. public static void loadWord(ArrayList<String> words) {
  77. if (words == null) return;
  78. char[] chars;
  79. List now;
  80. Word word;
  81. wordList = new List();
  82. for (String __word__ : words) {
  83. if (__word__ == null) {
  84. continue;
  85. }
  86. chars = __word__.toCharArray();
  87. now = wordList;
  88. word = null;
  89. for (char c : chars) {
  90. if (word != null) {
  91. if (word.next == null) {
  92. word.next = new List();
  93. }
  94. now = word.next;
  95. }
  96. word = now.get(c);
  97. if (word == null) {
  98. word = now.add(c);
  99. }
  100. }
  101. }
  102. sort(wordList);
  103. }
  104. /**
  105. * 加载敏感词txt文件,每个敏感词独占一行,不可出现空格,空行,逗号等非文字内容,必须使用UTF-8编码
  106. *
  107. * @param path txt文件的绝对地址
  108. */
  109. public static void loadWordFromFile(String path) {
  110. String encoding = "UTF-8";
  111. File file = new File(path);
  112. try {
  113. if (file.isFile() && file.exists()) {
  114. InputStreamReader inputStreamReader = new InputStreamReader(
  115. new FileInputStream(file), encoding
  116. );
  117. BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
  118. String line;
  119. ArrayList<String> list = new ArrayList<>();
  120. while ((line = bufferedReader.readLine()) != null) {
  121. list.add(line);
  122. }
  123. bufferedReader.close();
  124. inputStreamReader.close();
  125. loadWord(list);
  126. }
  127. } catch (IOException e) {
  128. e.printStackTrace();
  129. }
  130. }
  131. /**
  132. * 对敏感词多叉树递增排序
  133. *
  134. * @param list 待排序List
  135. */
  136. private static void sort(List list) {
  137. if (list == null) return;
  138. Collections.sort(list); // 递增排序
  139. for (Word word : list) {
  140. sort(word.next);
  141. }
  142. }
  143. /**
  144. * 判断是否跳过当前字符
  145. *
  146. * @param c 待检测字符
  147. * @return true:需要跳过 false:不需要跳过
  148. */
  149. private static boolean skip(char c) {
  150. for (char c1 : skip) {
  151. if (c1 == c) return true;
  152. }
  153. return false;
  154. }
  155. }

测试代码

  1. public class Main {
  2. public static void main(String[] args) throws Exception {
  3. SensitiveWordFilter.loadWordFromFile("C:\\code\\test\\SensitiveWord\\SensitiveWordList.txt");
  4. Scanner sc = new Scanner(System.in);
  5. while (sc.hasNextLine()){
  6. String line = sc.nextLine();
  7. if (line.equals("bye")){
  8. break;
  9. }
  10. String filter = SensitiveWordFilter.Filter(line);
  11. System.out.println("结果:"+filter);
  12. }
  13. }
  14. }

测试结果示例

image.png

敏感词文件示例

  1. 机吧
  2. TMD

参考

原代码地址,本文只做总结搬运