-
Notifications
You must be signed in to change notification settings - Fork 7
/
maximum_matching.go
91 lines (68 loc) · 1.77 KB
/
maximum_matching.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
package gotokenizer
import (
"strings"
)
// MaxMatch is a tokenizer whose Get method segments text by forward maximum
// matching against a dictionary. It records the dictionary, the path the
// dictionary is loaded from, and the optional filters applied while
// segmenting.
type MaxMatch struct {
	// dict is the dictionary consulted by Get; LoadDict assigns it.
	dict *Dict
	// dictPath is handed to NewDict when LoadDict runs.
	dictPath string

	// WordFilter is asked about every shortened candidate inside Get; a true
	// answer makes Get accept that candidate as a token.
	WordFilter WordFilter
	// EnabledFilterStopToken, when true, makes Get leave out every token for
	// which StopTokens.IsStopToken reports true.
	EnabledFilterStopToken bool
	// StopTokens is consulted by Get only while EnabledFilterStopToken is true.
	StopTokens *StopTokens
}
// NewMaxMatch returns a pointer to a fresh MaxMatch that remembers dictPath
// and uses a NumAndLetterWordFilter as its WordFilter. The dictionary itself
// is not created here; LoadDict does that.
func NewMaxMatch(dictPath string) *MaxMatch {
	return &MaxMatch{
		dictPath:   dictPath,
		WordFilter: &NumAndLetterWordFilter{},
	}
}
// LoadDict creates a Dict for the receiver's dictPath, stores it on the
// receiver and returns whatever error that Dict's Load reports. It implements
// the Tokenizer interface.
//
// The new Dict is stored before Load runs, so it stays attached to the
// receiver even when Load returns an error.
func (mm *MaxMatch) LoadDict() error {
	loaded := NewDict(mm.dictPath)
	mm.dict = loaded
	return loaded.Load()
}
// Get splits text into tokens using forward maximum matching and implements
// the Tokenizer interface.
//
// CheckDictIsLoaded is called with the receiver's dictionary first. Leading
// and trailing spaces are then trimmed and the text is consumed left to
// right. At each position the candidate starts as the next dict.maxLen runes
// (fewer near the end of the text, and never fewer than one) and is shortened
// one rune at a time until it is a key of dict.Records, the WordFilter accepts
// the shortened candidate, or a single rune is left. A nil WordFilter accepts
// nothing.
//
// When EnabledFilterStopToken is true, tokens for which
// StopTokens.IsStopToken reports true are consumed but left out of the
// result. The returned error is always nil; the slice is nil when no token
// was produced.
func (mm *MaxMatch) Get(text string) ([]string, error) {
	CheckDictIsLoaded(mm.dict)

	// Decode once and walk by index: re-slicing a string by rune count for
	// every token would re-decode the whole remainder each time.
	runes := []rune(strings.Trim(text, " "))

	// With maxLen <= 0 the candidate would be empty, which can neither be
	// shortened nor advance the scan, so always look at one rune at least.
	window := mm.dict.maxLen
	if window < 1 {
		window = 1
	}

	var result []string
	for pos := 0; pos < len(runes); {
		end := len(runes)
		if window < end-pos {
			end = pos + window
		}
		word := string(runes[pos:end])

		// Shrink from the right until something accepts the candidate.
		for end-pos > 1 {
			if _, ok := mm.dict.Records[word]; ok {
				break
			}
			end--
			word = string(runes[pos:end])
			// The filter is consulted on shortened candidates only; the
			// full-length candidate is checked against the dictionary alone.
			if mm.WordFilter != nil && mm.WordFilter.Filter(word) {
				break
			}
		}

		if !mm.EnabledFilterStopToken || !mm.StopTokens.IsStopToken(word) {
			result = append(result, word)
		}
		pos = end
	}
	return result, nil
}
// GetFrequency segments text with Get and hands the resulting tokens to the
// package-level GetFrequency helper, returning that helper's token-to-int map.
// It implements the Tokenizer interface.
//
// If Get reports an error, the map is nil and that error is returned as is.
func (mm *MaxMatch) GetFrequency(text string) (map[string]int, error) {
	var (
		tokens []string
		err    error
	)
	if tokens, err = mm.Get(text); err != nil {
		return nil, err
	}
	return GetFrequency(tokens), nil
}