-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #16 from alldroll/next_word_suggestion
Next word suggestion
- Loading branch information
Showing
76 changed files
with
1,928 additions
and
1,167 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
package analysis | ||
|
||
import "strings" | ||
|
||
// filterTokenizer performs tokenize and filter operations:
// it delegates splitting to the wrapped Tokenizer and then passes the
// produced tokens through the wrapped TokenFilter.
type filterTokenizer struct {
	tokenizer Tokenizer   // produces the raw token stream
	filter    TokenFilter // post-processes the raw tokens
}
|
||
// NewFilterTokenizer creates a new instance of filter tokenizer | ||
func NewFilterTokenizer(tokenizer Tokenizer, filter TokenFilter) Tokenizer { | ||
return &filterTokenizer{ | ||
tokenizer: tokenizer, | ||
filter: filter, | ||
} | ||
} | ||
|
||
// Tokenize splits the given text on a sequence of tokens | ||
func (t *filterTokenizer) Tokenize(text string) []Token { | ||
text = strings.ToLower(text) | ||
text = strings.Trim(text, " ") | ||
|
||
tokens := t.tokenizer.Tokenize(text) | ||
|
||
return t.filter.Filter(tokens) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
package analysis | ||
|
||
const maxN = 8 | ||
|
||
// NewNGramTokenizer creates a new instance of Tokenizer | ||
func NewNGramTokenizer(nGramSize int) Tokenizer { | ||
return &nGramTokenizer{ | ||
nGramSize: nGramSize, | ||
} | ||
} | ||
|
||
type nGramTokenizer struct { | ||
nGramSize int | ||
} | ||
|
||
// Tokenize splits the given text on a sequence of tokens | ||
func (t *nGramTokenizer) Tokenize(text string) []Token { | ||
if len(text) < t.nGramSize { | ||
return []Token{} | ||
} | ||
|
||
result := make([]Token, 0, len(text)-t.nGramSize+1) | ||
prevIndexes := [maxN]int{} | ||
i := 0 | ||
|
||
for index := range text { | ||
i++ | ||
|
||
if i > t.nGramSize { | ||
top := prevIndexes[(i-t.nGramSize)%t.nGramSize] | ||
nGram := text[top:index] | ||
result = appendUnique(result, nGram) | ||
} | ||
|
||
prevIndexes[i%t.nGramSize] = index | ||
} | ||
|
||
top := prevIndexes[(i+1)%t.nGramSize] | ||
nGram := text[top:] | ||
result = appendUnique(result, nGram) | ||
|
||
return result | ||
} | ||
|
||
// https://blog.golang.org/profiling-go-programs | ||
func appendUnique(a []Token, x Token) []Token { | ||
for _, y := range a { | ||
if x == y { | ||
return a | ||
} | ||
} | ||
|
||
return append(a, x) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package analysis | ||
|
||
import ( | ||
"reflect" | ||
"testing" | ||
) | ||
|
||
func TestTokenizeNGrams(t *testing.T) { | ||
cases := []struct { | ||
word string | ||
k int | ||
ngrams []Token | ||
}{ | ||
{ | ||
"tet", | ||
2, | ||
[]Token{"te", "et"}, | ||
}, | ||
{ | ||
"te", | ||
2, | ||
[]Token{"te"}, | ||
}, | ||
{ | ||
"testing", | ||
3, | ||
[]Token{"tes", "est", "sti", "tin", "ing"}, | ||
}, | ||
{ | ||
"жигули", | ||
2, | ||
[]Token{"жи", "иг", "гу", "ул", "ли"}, | ||
}, | ||
{ | ||
"", | ||
2, | ||
[]Token{}, | ||
}, | ||
{ | ||
"lalala", | ||
2, | ||
[]Token{"la", "al"}, | ||
}, | ||
} | ||
|
||
for _, c := range cases { | ||
tokenizer := NewNGramTokenizer(c.k) | ||
actual := tokenizer.Tokenize(c.word) | ||
|
||
if !reflect.DeepEqual(actual, c.ngrams) { | ||
t.Errorf( | ||
"Test Fail, expected %v, got %v", | ||
c.ngrams, | ||
actual, | ||
) | ||
} | ||
} | ||
} | ||
|
||
func BenchmarkNGramTokenizer(b *testing.B) { | ||
tokenizer := NewNGramTokenizer(3) | ||
|
||
for i := 0; i < b.N; i++ { | ||
tokenizer.Tokenize("abcdefghkl123456йцукен") | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package analysis | ||
|
||
import (
	"strings"

	"github.com/alldroll/suggest/pkg/alphabet"
)
|
||
// normalizeFilter rewrites each token so that every rune outside the
// configured alphabet is replaced with the pad string.
type normalizeFilter struct {
	chars alphabet.Alphabet // runes that pass through unchanged
	pad   string            // replacement for runes outside the alphabet
}
|
||
// NewNormalizerFilter returns tokens filter | ||
func NewNormalizerFilter(chars alphabet.Alphabet, pad string) TokenFilter { | ||
return &normalizeFilter{ | ||
chars: chars, | ||
pad: pad, | ||
} | ||
} | ||
|
||
// Filter filters the given list with described behaviour | ||
func (f *normalizeFilter) Filter(list []Token) []Token { | ||
for i, token := range list { | ||
res := "" | ||
|
||
for _, r := range token { | ||
if f.chars.Has(r) { | ||
res += string(r) | ||
} else { | ||
res += f.pad | ||
} | ||
} | ||
|
||
list[i] = res | ||
} | ||
|
||
return list | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
package analysis | ||
|
||
// Token is a string with an assigned and thus identified meaning.
// It is an alias (not a defined type), so plain strings convert freely.
type Token = string

// Tokenizer performs splitting the given text on a sequence of tokens.
type Tokenizer interface {
	// Tokenize splits the given text on a sequence of tokens.
	Tokenize(text string) []Token
}

// TokenFilter is responsible for removing, modifying and altering the
// given token flow.
type TokenFilter interface {
	// Filter filters the given list with described behaviour.
	Filter(list []Token) []Token
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
package analysis | ||
|
||
import ( | ||
"unicode/utf8" | ||
|
||
"github.com/alldroll/suggest/pkg/alphabet" | ||
) | ||
|
||
// NewWordTokenizer creates a new instance of Tokenizer | ||
func NewWordTokenizer(alphabet alphabet.Alphabet) Tokenizer { | ||
return &wordTokenizer{ | ||
alphabet: alphabet, | ||
} | ||
} | ||
|
||
// wordTokenizer implements the Tokenizer interface; it extracts words as
// maximal runs of runes that belong to the configured alphabet.
type wordTokenizer struct {
	alphabet alphabet.Alphabet // runes considered part of a word
}
|
||
// Tokenize splits the given text on a sequence of tokens | ||
func (t *wordTokenizer) Tokenize(text string) []Token { | ||
words := []Token{} | ||
wordStart, wordLen := -1, 0 | ||
|
||
for i, char := range text { | ||
if t.alphabet.Has(char) { | ||
if wordStart == -1 { | ||
wordStart = i | ||
} | ||
|
||
wordLen += utf8.RuneLen(char) | ||
} else { | ||
if wordStart != -1 { | ||
words = append(words, text[wordStart:wordStart+wordLen]) | ||
} | ||
|
||
wordStart, wordLen = -1, 0 | ||
} | ||
} | ||
|
||
if wordStart != -1 { | ||
words = append(words, text[wordStart:wordStart+wordLen]) | ||
} | ||
|
||
return words | ||
} |
Oops, something went wrong.