java敏感词全文替换
在CSDN上看到一篇用DFA算法做敏感词全文替换的文章,同时也看到网上不少文章说,对于这种敏感词替换功能,DFA无疑是一种效率不错的实现。下面是基本的DFA实现的Java代码:
/**
 * Sensitive-word filter backed by a character trie.
 *
 * <p>Typical use: call {@link #createTree(String...)} with the words to
 * filter, then call {@link #searchWord(String)} for each text to mask.
 * Matching is case-sensitive.
 */
public class DFA {

    /** Sentinel root of the trie; its own character is never matched. */
    private final Node rootNode = new Node('R');

    /**
     * Returns a copy of {@code content} in which every sensitive word is
     * replaced by one {@code '*'} per character.
     *
     * <p>At each position the longest word starting there is masked; if no
     * word starts there the character is copied unchanged. All scan state is
     * local, so the same instance can be used for any number of calls.
     *
     * @param content text to filter; {@code null} or empty is returned as-is
     * @return the masked text
     */
    public String searchWord(String content) {
        if (content == null || content.length() == 0) {
            return content;
        }
        char[] chars = content.toCharArray();
        StringBuilder masked = new StringBuilder(chars.length);
        int pos = 0;
        while (pos < chars.length) {
            int matched = longestMatch(chars, pos);
            if (matched == 0) {
                masked.append(chars[pos]);
                pos++;
            } else {
                for (int k = 0; k < matched; k++) {
                    masked.append('*');
                }
                pos += matched;
            }
        }
        return masked.toString();
    }

    /**
     * Adds the given words to the trie.
     *
     * @param arr sensitive words; a {@code null} array and {@code null} or
     *            empty entries are ignored
     */
    public void createTree(String... arr) {
        if (arr == null) {
            return;
        }
        for (String str : arr) {
            if (str == null) {
                continue;
            }
            char[] chars = str.toCharArray();
            if (chars.length > 0) {
                insertNode(rootNode, chars, 0);
            }
        }
    }

    /** Returns the length of the longest word starting at {@code start}, or 0 if none does. */
    private int longestMatch(char[] chars, int start) {
        Node node = rootNode;
        int longest = 0;
        for (int i = start; i < chars.length; i++) {
            node = findNode(node, chars[i]);
            if (node == null) {
                break;
            }
            // Only a terminal node marks a complete word; a bare prefix must not be masked.
            if (node.flag == 1) {
                longest = i - start + 1;
            }
        }
        return longest;
    }

    /** Inserts {@code cs[index..]} below {@code node}, marking the last character's node as terminal. */
    private void insertNode(Node node, char[] cs, int index) {
        Node current = node;
        for (int i = index; i < cs.length; i++) {
            Node child = findNode(current, cs[i]);
            if (child == null) {
                child = new Node(cs[i]);
                current.nodes.add(child);
            }
            if (i == cs.length - 1) {
                child.flag = 1;
            }
            current = child;
        }
    }

    /** Returns the child of {@code node} labelled {@code c}, or {@code null} if there is none. */
    private Node findNode(Node node, char c) {
        for (Node n : node.nodes) {
            if (n.c == c) {
                return n;
            }
        }
        return null;
    }

    /** One trie node: a character, a terminal marker and its children. */
    private static class Node {
        public char c;
        /** 1: a word ends at this node; 0: the node is only a prefix. */
        public int flag;
        public List<Node> nodes = new ArrayList<Node>();

        public Node(char c) {
            this.c = c;
            this.flag = 0;
        }

        public Node(char c, int flag) {
            this.c = c;
            this.flag = flag;
        }
    }
}
下面是自己写的用Map实现的一段替换代码
public String replaceAllWord(String[] arr,String content){ char conCharArry[] = content.toCharArray(); //这里key为每个敏感词的第一个字符,里面放着第一个字符相同的敏感词list集合 Map<Character, List<String>> word = new HashMap<Character, List<String>>(); //遍历数组生成敏感词map对象 for(String str : arr){ char key = str.charAt(0); List<String> list = word.get(key); if(list == null){ list = new ArrayList<String>(); list.add(str); word.put(key, list); }else{ list.add(str); } } //对内容每一个字符进行遍历,如果当前字符为敏感词的首字符则进行下面行为否则continue本次操作 for(int i = 0 ; i < conCharArry.length; i++){ List<String> list = word.get(conCharArry); if(list == null){ continue; } for(String str : list){ char words[] = str.toCharArray(); //对是否匹配一个完整的敏感词进行标志,如果匹配敏感词过程中有一个字符不符则标注为false boolean mark = true; for(int j = 0; j < words.length; j++){ if(words != conCharArry){ mark = false; break; } } //把敏感词逐个替换成* if(mark){ for(int j = 0; j < words.length; j++){ conCharArry = '*'; } } } } return new String(conCharArry); } 当我们在main函数中测试代码
public static void main(String[] args) { String[] arr = {"tmd", "小姐", "DA"}; String content = "tmd ITeye文章版权属于作者,受法律保护 Da 小姐" long start = System.currentTimeMillis(); for(int i = 0; i < 10000 ; i++){ DFA dfa = new DFA(); dfa.createTree(arr ); dfa.searchWord(); //dfa.replaseAllword(arr,content); } long end =System.currentTimeMillis(); System.out.println(end - start); } 测试结果
DFA实现:45毫秒
Map实现:16毫秒
当把替换类型加至11220字、敏感字词数组增至十来个时
DFA实现:16903毫秒
Map实现:4758毫秒
DFA的效率为什么反而不如上面基于HashMap的实现,这一点我很疑惑,还请各位指点。
ps:
加一段备用的修改后的代码(使用properties配置)
@SuppressWarnings("unchecked")public String replaceWordStr(String content) {char conCharArry[] = content.toCharArray();StringBuffer sb = new StringBuffer();// 这里key为每个敏感词的第一个字符,里面放着第一个字符相同的敏感词list集合Map<Character, List<String>> word = new HashMap<Character, List<String>>();// 遍历数组生成敏感词map对象for (Entry entry : properties.entrySet()) {String keyWrod = entry.getKey().toString();char key = "".equals(keyWrod) ? ' ' : keyWrod.charAt(0);List<String> list = word.get(key);if (list == null) {list = new ArrayList<String>();list.add(keyWrod);word.put(key, list);} else {list.add(keyWrod);}}// 对内容每一个字符进行遍历,如果当前字符为敏感词的首字符则进行下面行为否则continue本次操作for (int i = 0; i < conCharArry.length; i++) {List<String> list = word.get(conCharArry);if (list == null) {sb.append(conCharArry);continue;}for (String str : list) {char words[] = str.toCharArray();// 对是否匹配一个完整的敏感词进行标志,如果匹配敏感词过程中有一个字符不符则标注为falseboolean mark = true;for (int j = 0; j < words.length; j++) {if (words != conCharArry) {mark = false;break;}}// 把敏感词逐个替换if (mark) {sb.append(properties.get(str));for (int j = 1; j < words.length; j++) {i++;}} else {sb.append(conCharArry);}}}return sb.toString();}
页:
[1]