golang实现DFA算法的敏感词查找、过滤替换、敏感词匹配检测
New 支持传入分词方式,默认使用 TokenizeByRune(即 sensitivewords.New() 等价于 sensitivewords.New(sensitivewords.TokenizeByRune))。

- sensitivewords.TokenizeByRune:按 []rune(input) 逐字符分词,适合中文关键词、符号关键词,以及其他需要逐字符匹配的场景。
- sensitivewords.TokenizeBySpace:按 strings.Fields(input) 分词,适合英文单词、英文短语关键词。例如关键词 test 不会命中 titestosterone;关键词 bad word 会按两个英文 token 匹配。

默认按字分词,适合中文关键词和符号关键词。
package main
import (
"fmt"
"github.com/yangyin5127/sensitivewords"
)
// main demonstrates the default rune-based tokenizer: keywords are loaded
// from a file and added programmatically, then one input string is
// searched, checked, and filtered.
func main() {
	det := sensitivewords.New()
	// keywords.txt contains:
	//   尼玛
	//   哈哈
	det.LoadFromFile("./keywords.txt")
	det.AddWord("测试")
	det.AddWords("+q", "+v")

	const input = "测试啊+q/+v,尼玛,哈哈"

	hit, word := det.Find(input)
	fmt.Printf("Find:%v, %v\n", hit, word) // true, 测试

	hit, words := det.FindAll(input)
	fmt.Printf("FindAll:%v, %v\n", hit, words) // true, [测试 +q +v 尼玛 哈哈]

	hit, words = det.FindAny(input, 3)
	fmt.Printf("FindAny:%v, %v\n", hit, words) // true, [测试 +q +v]

	hit = det.Check(input)
	fmt.Printf("Check: %v\n", hit) // true

	fmt.Printf("Filter:%v\n", det.Filter(input)) // **啊**/**,**,**
}
Use TokenizeBySpace when keywords should match English words or phrases separated by whitespace.
package main
import (
"fmt"
"github.com/yangyin5127/sensitivewords"
)
// main demonstrates whitespace tokenization: keywords match whole
// space-separated tokens, so "test" does not hit inside "titestosterone"
// and "bad word" matches as two consecutive English tokens.
func main() {
	det := sensitivewords.New(sensitivewords.TokenizeBySpace)
	det.AddWords("test", "bad word")

	found, first := det.Find("this is a test")
	fmt.Printf("Find:%v, %v\n", found, first) // true,test

	found, matches := det.FindAll("this is a bad word and test")
	fmt.Printf("FindAll:%v, %v\n", found, matches) // true,[bad word test]

	found = det.Check("titestosterone")
	fmt.Printf("Check: %v\n", found) // false

	masked := det.Filter("test titestosterone bad word")
	fmt.Printf("Filter:%v\n", masked) // **** titestosterone *** ****
}