-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #16 from alldroll/next_word_suggestion
Next word suggestion
- Loading branch information
Showing
76 changed files
with
1,928 additions
and
1,167 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
package analysis | ||
|
||
import "strings" | ||
|
||
// filterTokenizer performs tokenize and filter operations:
// it delegates splitting to the wrapped Tokenizer and then passes the
// produced tokens through the wrapped TokenFilter.
type filterTokenizer struct {
	tokenizer Tokenizer   // produces the raw token stream
	filter    TokenFilter // post-processes the raw tokens
}
|
||
// NewFilterTokenizer creates a new instance of filter tokenizer | ||
func NewFilterTokenizer(tokenizer Tokenizer, filter TokenFilter) Tokenizer { | ||
return &filterTokenizer{ | ||
tokenizer: tokenizer, | ||
filter: filter, | ||
} | ||
} | ||
|
||
// Tokenize splits the given text on a sequence of tokens | ||
func (t *filterTokenizer) Tokenize(text string) []Token { | ||
text = strings.ToLower(text) | ||
text = strings.Trim(text, " ") | ||
|
||
tokens := t.tokenizer.Tokenize(text) | ||
|
||
return t.filter.Filter(tokens) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
package analysis | ||
|
||
const maxN = 8 | ||
|
||
// NewNGramTokenizer creates a new instance of Tokenizer | ||
func NewNGramTokenizer(nGramSize int) Tokenizer { | ||
return &nGramTokenizer{ | ||
nGramSize: nGramSize, | ||
} | ||
} | ||
|
||
type nGramTokenizer struct { | ||
nGramSize int | ||
} | ||
|
||
// Tokenize splits the given text on a sequence of tokens | ||
func (t *nGramTokenizer) Tokenize(text string) []Token { | ||
if len(text) < t.nGramSize { | ||
return []Token{} | ||
} | ||
|
||
result := make([]Token, 0, len(text)-t.nGramSize+1) | ||
prevIndexes := [maxN]int{} | ||
i := 0 | ||
|
||
for index := range text { | ||
i++ | ||
|
||
if i > t.nGramSize { | ||
top := prevIndexes[(i-t.nGramSize)%t.nGramSize] | ||
nGram := text[top:index] | ||
result = appendUnique(result, nGram) | ||
} | ||
|
||
prevIndexes[i%t.nGramSize] = index | ||
} | ||
|
||
top := prevIndexes[(i+1)%t.nGramSize] | ||
nGram := text[top:] | ||
result = appendUnique(result, nGram) | ||
|
||
return result | ||
} | ||
|
||
// https://blog.golang.org/profiling-go-programs | ||
func appendUnique(a []Token, x Token) []Token { | ||
for _, y := range a { | ||
if x == y { | ||
return a | ||
} | ||
} | ||
|
||
return append(a, x) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package analysis | ||
|
||
import ( | ||
"reflect" | ||
"testing" | ||
) | ||
|
||
func TestTokenizeNGrams(t *testing.T) { | ||
cases := []struct { | ||
word string | ||
k int | ||
ngrams []Token | ||
}{ | ||
{ | ||
"tet", | ||
2, | ||
[]Token{"te", "et"}, | ||
}, | ||
{ | ||
"te", | ||
2, | ||
[]Token{"te"}, | ||
}, | ||
{ | ||
"testing", | ||
3, | ||
[]Token{"tes", "est", "sti", "tin", "ing"}, | ||
}, | ||
{ | ||
"жигули", | ||
2, | ||
[]Token{"жи", "иг", "гу", "ул", "ли"}, | ||
}, | ||
{ | ||
"", | ||
2, | ||
[]Token{}, | ||
}, | ||
{ | ||
"lalala", | ||
2, | ||
[]Token{"la", "al"}, | ||
}, | ||
} | ||
|
||
for _, c := range cases { | ||
tokenizer := NewNGramTokenizer(c.k) | ||
actual := tokenizer.Tokenize(c.word) | ||
|
||
if !reflect.DeepEqual(actual, c.ngrams) { | ||
t.Errorf( | ||
"Test Fail, expected %v, got %v", | ||
c.ngrams, | ||
actual, | ||
) | ||
} | ||
} | ||
} | ||
|
||
func BenchmarkNGramTokenizer(b *testing.B) { | ||
tokenizer := NewNGramTokenizer(3) | ||
|
||
for i := 0; i < b.N; i++ { | ||
tokenizer.Tokenize("abcdefghkl123456йцукен") | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package analysis | ||
|
||
import (
	"strings"

	"github.com/alldroll/suggest/pkg/alphabet"
)
|
||
// normalizeFilter rewrites each token so that every rune outside the
// configured alphabet is replaced with the pad string.
type normalizeFilter struct {
	chars alphabet.Alphabet // runes that pass through unchanged
	pad   string            // replacement for runes outside the alphabet
}
|
||
// NewNormalizerFilter returns tokens filter | ||
func NewNormalizerFilter(chars alphabet.Alphabet, pad string) TokenFilter { | ||
return &normalizeFilter{ | ||
chars: chars, | ||
pad: pad, | ||
} | ||
} | ||
|
||
// Filter filters the given list with described behaviour | ||
func (f *normalizeFilter) Filter(list []Token) []Token { | ||
for i, token := range list { | ||
res := "" | ||
|
||
for _, r := range token { | ||
if f.chars.Has(r) { | ||
res += string(r) | ||
} else { | ||
res += f.pad | ||
} | ||
} | ||
|
||
list[i] = res | ||
} | ||
|
||
return list | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
package analysis | ||
|
||
// Token is a string with an assigned and thus identified meaning.
// It is an alias (not a defined type), so plain strings convert freely.
type Token = string

// Tokenizer performs splitting the given text on a sequence of tokens.
type Tokenizer interface {
	// Tokenize splits the given text on a sequence of tokens.
	Tokenize(text string) []Token
}

// TokenFilter is responsible for removing, modifying and altering the
// given token flow.
type TokenFilter interface {
	// Filter filters the given list with described behaviour.
	Filter(list []Token) []Token
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
package analysis | ||
|
||
import ( | ||
"unicode/utf8" | ||
|
||
"github.com/alldroll/suggest/pkg/alphabet" | ||
) | ||
|
||
// NewWordTokenizer creates a new instance of Tokenizer | ||
func NewWordTokenizer(alphabet alphabet.Alphabet) Tokenizer { | ||
return &wordTokenizer{ | ||
alphabet: alphabet, | ||
} | ||
} | ||
|
||
// wordTokenizer implements the Tokenizer interface; it extracts words as
// maximal runs of runes that belong to the configured alphabet.
type wordTokenizer struct {
	alphabet alphabet.Alphabet // runes considered part of a word
}
|
||
// Tokenize splits the given text on a sequence of tokens | ||
func (t *wordTokenizer) Tokenize(text string) []Token { | ||
words := []Token{} | ||
wordStart, wordLen := -1, 0 | ||
|
||
for i, char := range text { | ||
if t.alphabet.Has(char) { | ||
if wordStart == -1 { | ||
wordStart = i | ||
} | ||
|
||
wordLen += utf8.RuneLen(char) | ||
} else { | ||
if wordStart != -1 { | ||
words = append(words, text[wordStart:wordStart+wordLen]) | ||
} | ||
|
||
wordStart, wordLen = -1, 0 | ||
} | ||
} | ||
|
||
if wordStart != -1 { | ||
words = append(words, text[wordStart:wordStart+wordLen]) | ||
} | ||
|
||
return words | ||
} |
Oops, something went wrong.