DFA全称为:Deterministic Finite Automaton,即确定有穷自动机。其特征为:有一个有限状态集合和一些从一个状态通向另一个状态的边,每条边上标记有一个符号,其中一个状态是初态,某些状态是终态。但不同于不确定的有限自动机,DFA中不会有从同一状态出发的两条边标志有相同的符号。常常应用于敏感词过滤,具体的细节原理可以在网上搜到很多,这里就不具体说了。

下面给出简单的实现。

简单实现代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;


public class DFA {

public static Map<String,String> sensitiveWordsMap = new HashMap<String, String>();

public void addSensitiveWords(Set<String> wordSet){
for (String word : wordSet) {
Map nowMap = sensitiveWordsMap;
for (int i = 0; i < word.length(); i++) {
char keyChar = word.charAt(i);
Object nextMap = nowMap.get(keyChar);
if (nextMap != null) {
nowMap = (Map<String,String>) nextMap;
}else {
Map<String, String> newMap = new HashMap<String, String>();
newMap.put("isEnd", "0");
nowMap.put(keyChar, newMap);
nowMap = newMap;
}

if (i == word.length() - 1) {
nowMap.put("isEnd", "1");
}
}
}
}

public boolean isSensitiveWord(String word) {
Map nowMap = sensitiveWordsMap;

for(int i=0;i<word.length();i++){
char keyChar = word.charAt(i);
nowMap = (Map) nowMap.get(keyChar);
if(nowMap == null)
return false;
if(nowMap.get("isEnd").equals("1")){
return true;
}
}
return false;
}

public boolean isContainSensitiveWord(String txt){
for(int i=0; i < txt.length(); i++){
String subTxt = txt.substring(i);
if(isSensitiveWord(subTxt))
return true;
}
return false;
}

public static void main(String[] args){
Set<String> wordSet = new HashSet<String>();
wordSet.add("赌博");
wordSet.add("赌博赌博");
wordSet.add("嫖娼");
wordSet.add("毒品");
wordSet.add("枪支");

DFA dfa = new DFA();
//添加敏感词
dfa.addSensitiveWords(wordSet);
System.out.println(sensitiveWordsMap.toString());

//查询测试
System.out.println(dfa.isContainSensitiveWord("赌博"));
System.out.println(dfa.isContainSensitiveWord("赌博赌"));
System.out.println(dfa.isContainSensitiveWord("博赌"));
System.out.println(dfa.isContainSensitiveWord("赌&博"));
System.out.println(dfa.isContainSensitiveWord("赌博不许"));
System.out.println(dfa.isContainSensitiveWord("不许赌博"));
System.out.println(dfa.isContainSensitiveWord("&赌博&嫖娼&毒品&枪支&"));
}
}
/*
{枪={支={isEnd=1}, isEnd=0}, 赌={isEnd=0, 博={isEnd=1, 赌={isEnd=0, 博={isEnd=1}}}},
嫖={isEnd=0, 娼={isEnd=1}}, 毒={品={isEnd=1}, isEnd=0}}
true
true
false
false
true
true
true
*/