Fuzzy match support (hypermodeinc#2916)
* added "match" to list of valid funcs

* added "match" to list of valid func names

* added handleMatchFunction and MatchFn type

* the match func needs a fulltext tokenizer

* added ngram bleve analyzer

* added ngram tokenizer

* verify match uses ngram tokenizer

* get string tokens for match

* added func to build match tokens

* changed handleMatchFunction to use index

* cherry-pick schema.HasTokenizer

* configure bigram filter before the analyzer to update cache

* added EncodeTokens convenience func to encapsulate encodeToken

* we don't need to pre-get tokens, we do that when the task is running

* handleMatchFunction updated for fuzzy match, filter optimizations, and code cleanups

* matchFuzzy func using index and bigram (ngram) index

* adding Bleve ngram support

* added func comments

* all fuzzy tokens must match

* cp: test cases might not be mutually exclusive in the future, revert the order.

* cp: don't try to match all posting values against match

* cp: added comment

* removed extra branch

* switch to trigram for fuzzy match indexing

* fixed typo

* renamed ngram to match

* fixed typo

* added full posting value to search terms, minor cleanups

* added fuzzy matching test

* remove Bleve ngram pkg

* revert this change

* fixed needsIntersect

* switched to a new fuzzy matching pkg

* tweaked test with misspelling

* added func GetTokenizerByID to search registered tokenizers by id

* added tok.GetTokens for generating tokens by id

* changed to use tok.GetTokens

* fixed grammar in comment

* replaced underused switch with if block

* small test change

* Pick up and modify Levenshtein distance to introduce a max distance factor, which would cause early termination of the algo to save CPU resources. Remove the fuzzysearch lib.

* using threshold for lev distance max

* worker/task.go: added match() argument for specifying max distance.

This change allows match() to take a second integer argument that sets the
max Levenshtein distance threshold. If no value is set, the default value
of 8 is used.
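
For example, an illustrative query mirroring the systest cases added below (`term` is assumed to be a trigram-indexed string predicate):

{
  q(func: match(term, "plano", 2)) {
    term
  }
}

Here "plano" is within Levenshtein distance 2 of the stored value "lane" (drop the leading "p", substitute the trailing "o" with "e"), so "lane" is returned; with a threshold of 1 the query returns nothing.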

* systest/queries_test.go: updated match query tests

Updated test for new matchFuzzy using threshold.

* wiki/content/query-language/index.md: added section for match function

* vendor/vendor.json: removed old fuzzy pkg

* worker/task.go: match func enforces 2 args, max distance must be greater than zero

* wiki/content/query-language/index.md: updated syntax, example and fixed typos

* systest/queries_test.go: updated syntax in tests

* wiki/content/query-language/index.md: minor doc fixes
srfrog authored and dna2github committed Jul 19, 2019
1 parent 0eb2b2d commit 5419b71
Showing 10 changed files with 479 additions and 17 deletions.
2 changes: 1 addition & 1 deletion gql/parser.go
@@ -1349,7 +1349,7 @@ func validFuncName(name string) bool {

switch name {
case "regexp", "anyofterms", "allofterms", "alloftext", "anyoftext",
"has", "uid", "uid_in", "anyof", "allof", "type":
"has", "uid", "uid_in", "anyof", "allof", "type", "match":
return true
}
return false
2 changes: 1 addition & 1 deletion query/query.go
@@ -2423,7 +2423,7 @@ func isValidArg(a string) bool {
func isValidFuncName(f string) bool {
switch f {
case "anyofterms", "allofterms", "val", "regexp", "anyoftext", "alloftext",
"has", "uid", "uid_in", "anyof", "allof", "type":
"has", "uid", "uid_in", "anyof", "allof", "type", "match":
return true
}
return isInequalityFn(f) || types.IsGeoFunc(f)
142 changes: 142 additions & 0 deletions systest/queries_test.go
@@ -50,6 +50,7 @@ func TestQuery(t *testing.T) {
t.Run("multiple block eval", wrap(MultipleBlockEval))
t.Run("unmatched var assignment eval", wrap(UnmatchedVarEval))
t.Run("hash index queries", wrap(QueryHashIndex))
t.Run("fuzzy matching", wrap(FuzzyMatch))
t.Run("regexp with toggled trigram index", wrap(RegexpToggleTrigramIndex))
t.Run("groupby uid that works", wrap(GroupByUidWorks))
t.Run("cleanup", wrap(SchemaQueryCleanup))
@@ -539,6 +540,147 @@ func SchemaQueryTestHTTP(t *testing.T, c *dgo.Dgraph) {
CompareJSON(t, js, string(m["data"]))
}

func FuzzyMatch(t *testing.T, c *dgo.Dgraph) {
ctx := context.Background()

require.NoError(t, c.Alter(ctx, &api.Operation{
Schema: `
term: string @index(trigram) .
name: string .
`,
}))

txn := c.NewTxn()
_, err := txn.Mutate(ctx, &api.Mutation{
SetNquads: []byte(`
_:t0 <term> "" .
_:t1 <term> "road" .
_:t2 <term> "avenue" .
_:t3 <term> "street" .
_:t4 <term> "boulevard" .
_:t5 <term> "drive" .
_:t6 <term> "route" .
_:t7 <term> "pass" .
_:t8 <term> "pathway" .
_:t9 <term> "lane" .
_:ta <term> "highway" .
_:tb <term> "parkway" .
_:tc <term> "motorway" .
_:td <term> "high road" .
_:te <term> "side street" .
_:tf <term> "dual carriageway" .
_:n0 <name> "srfrog" .
`),
})
require.NoError(t, err)
require.NoError(t, txn.Commit(ctx))

tests := []struct {
in, out, failure string
}{
{
in: `{q(func:match(term, drive, 8)) {term}}`,
out: `{"q":[{"term":"drive"}]}`,
},
{
in: `{q(func:match(term, "plano", 1)) {term}}`,
out: `{"q":[]}`,
},
{
in: `{q(func:match(term, "plano", 2)) {term}}`,
out: `{"q":[{"term":"lane"}]}`,
},
{
in: `{q(func:match(term, "plano", 8)) {term}}`,
out: `{"q":[{"term":"lane"}]}`,
},
{
in: `{q(func:match(term, way, 8)) {term}}`,
out: `{"q":[
{"term": "highway"},
{"term": "pathway"},
{"term": "parkway"},
{"term": "motorway"}
]}`,
},
{
in: `{q(func:match(term, pway, 8)) {term}}`,
out: `{"q":[
{"term": "highway"},
{"term": "pathway"},
{"term": "parkway"},
{"term": "motorway"}
]}`,
},
{
in: `{q(func:match(term, high, 8)) {term}}`,
out: `{"q":[
{"term": "highway"},
{"term": "high road"}
]}`,
},
{
in: `{q(func:match(term, str, 8)) {term}}`,
out: `{"q":[
{"term": "street"},
{"term": "side street"}
]}`,
},
{
in: `{q(func:match(term, strip, 8)) {term}}`,
out: `{"q":[
{"term": "street"},
{"term": "side street"}
]}`,
},
{
in: `{q(func:match(term, strip, 3)) {term}}`,
out: `{"q":[{"term": "street"}]}`,
},
{
in: `{q(func:match(term, "carigeway", 8)) {term}}`,
out: `{"q":[
{"term": "dual carriageway"}
]}`,
},
{
in: `{q(func:match(term, "carigeway", 4)) {term}}`,
out: `{"q":[]}`,
},
{
in: `{q(func:match(term, "dualway", 8)) {term}}`,
out: `{"q":[
{"term": "highway"},
{"term": "pathway"},
{"term": "parkway"},
{"term": "motorway"}
]}`,
},
{
in: `{q(func:match(term, "dualway", 2)) {term}}`,
out: `{"q":[]}`,
},
{
in: `{q(func:match(term, "", 8)) {term}}`,
failure: `Empty argument received`,
},
{
in: `{q(func:match(name, "someone", 8)) {name}}`,
failure: `Attribute name is not indexed with type trigram`,
},
}
for _, tc := range tests {
resp, err := c.NewTxn().Query(ctx, tc.in)
if tc.failure != "" {
require.Error(t, err)
require.Contains(t, err.Error(), tc.failure)
continue
}
require.NoError(t, err)
CompareJSON(t, tc.out, string(resp.Json))
}
}

func QueryHashIndex(t *testing.T, c *dgo.Dgraph) {
ctx := context.Background()

17 changes: 17 additions & 0 deletions tok/tok.go
@@ -130,6 +130,17 @@ func LoadCustomTokenizer(soFile string) {
registerTokenizer(CustomTokenizer{PluginTokenizer: tokenizer})
}

// GetTokenizerByID tries to find a tokenizer by id in the registered list.
// Returns the tokenizer and true if found, otherwise nil and false.
func GetTokenizerByID(id byte) (Tokenizer, bool) {
for _, t := range tokenizers {
if id == t.Identifier() {
return t, true
}
}
return nil, false
}

// GetTokenizer returns tokenizer given unique name.
func GetTokenizer(name string) (Tokenizer, bool) {
t, found := tokenizers[name]
@@ -332,6 +343,12 @@ func EncodeRegexTokens(tokens []string) {
}
}

// EncodeTokens encodes the given list of tokens in place, using the
// tokenizer identified by id.
func EncodeTokens(id byte, tokens []string) {
	for i := 0; i < len(tokens); i++ {
		tokens[i] = encodeToken(tokens[i], id)
	}
}

type BoolTokenizer struct{}

func (t BoolTokenizer) Name() string { return "bool" }
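The two additions above compose; a minimal sketch (assuming `tok.IdentTrigram` is the exported trigram identifier used in worker/match.go below, and that the trigram tokenizer registers under the name "trigram"):

package main

import (
	"fmt"

	"github.com/dgraph-io/dgraph/tok"
)

func main() {
	// Resolve the tokenizer registered under the trigram identifier byte.
	t, ok := tok.GetTokenizerByID(tok.IdentTrigram)
	if !ok {
		fmt.Println("no tokenizer registered with this id")
		return
	}
	fmt.Println(t.Name()) // assumed: "trigram"

	// Stamp raw tokens in place with the same identifier so they can be
	// used to build keys against the trigram index.
	tokens := []string{"str", "tre", "ree", "eet"}
	tok.EncodeTokens(tok.IdentTrigram, tokens)
}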
12 changes: 10 additions & 2 deletions tok/tokens.go
@@ -33,11 +33,19 @@ func GetLangTokenizer(t Tokenizer, lang string) Tokenizer {
return t
}

func GetTermTokens(funcArgs []string) ([]string, error) {
func GetTokens(id byte, funcArgs ...string) ([]string, error) {
if l := len(funcArgs); l != 1 {
return nil, x.Errorf("Function requires 1 arguments, but got %d", l)
}
return BuildTokens(funcArgs[0], TermTokenizer{})
tokenizer, ok := GetTokenizerByID(id)
if !ok {
return nil, x.Errorf("No tokenizer was found with id %v", id)
}
return BuildTokens(funcArgs[0], tokenizer)
}

func GetTermTokens(funcArgs []string) ([]string, error) {
return GetTokens(IdentTerm, funcArgs...)
}

func GetFullTextTokens(funcArgs []string, lang string) ([]string, error) {
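GetTokens generalizes the earlier term-only helper: the caller selects a tokenizer by id and passes exactly one function argument. A usage sketch (again assuming `tok.IdentTrigram`):

package main

import (
	"fmt"
	"log"

	"github.com/dgraph-io/dgraph/tok"
)

func main() {
	// Build tokens for a single match() argument with the trigram tokenizer.
	tokens, err := tok.GetTokens(tok.IdentTrigram, "street")
	if err != nil {
		log.Fatal(err) // e.g. wrong argument count or unknown tokenizer id
	}
	fmt.Println(tokens)
}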
33 changes: 33 additions & 0 deletions wiki/content/query-language/index.md
@@ -353,6 +353,39 @@ Keep the following in mind when designing regular expression queries.
- If the partial result (for subset of trigrams) exceeds 1000000 uids during index scan, the query is stopped to prohibit expensive queries.


### Fuzzy matching


Syntax: `match(predicate, string, distance)`

Schema Types: `string`

Index Required: `trigram`

Matches predicate values by calculating the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) to the string,
a technique also known as _fuzzy matching_. The distance parameter must be greater than zero (0). A larger distance value can yield more results, but they may be less accurate.

Query Example: At root, fuzzy match nodes similar to `Stephen`, with a distance value of 8.

{{< runnable >}}
{
directors(func: match(name@en, Stephen, 8)) {
name@en
}
}
{{< /runnable >}}

Same query with a Levenshtein distance of 3.

{{< runnable >}}
{
directors(func: match(name@en, Stephen, 3)) {
name@en
}
}
{{< /runnable >}}


### Full Text Search

Syntax Examples: `alloftext(predicate, "space-separated text")` and `anyoftext(predicate, "space-separated text")`
114 changes: 114 additions & 0 deletions worker/match.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/*
* Copyright 2019 Dgraph Labs, Inc. and Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package worker

import (
"github.com/dgraph-io/dgraph/algo"
"github.com/dgraph-io/dgraph/posting"
"github.com/dgraph-io/dgraph/protos/pb"
"github.com/dgraph-io/dgraph/tok"
"github.com/dgraph-io/dgraph/x"
)

// LevenshteinDistance measures the difference between two strings.
// The Levenshtein distance between two words is the minimum number of
// single-character edits (i.e. insertions, deletions or substitutions)
// required to change one word into the other.
//
// This implementation is optimized to use O(min(m,n)) space and is based on the
// optimized C version found here:
// http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#C
func levenshteinDistance(s, t string, max int) int {
if len(s) > len(t) {
s, t = t, s
}
r1, r2 := []rune(s), []rune(t) // len(s) <= len(t) => len(r1) <= len(r2)
column := make([]int, len(r1)+1)

for y := 1; y <= len(r1); y++ {
column[y] = y
}

var minIdx int
for x := 1; x <= len(r2); x++ {
column[0] = x

for y, lastDiag := 1, x-1; y <= len(r1); y++ {
oldDiag := column[y]
cost := 0
if r1[y-1] != r2[x-1] {
cost = 1
}
column[y] = min(column[y]+1, column[y-1]+1, lastDiag+cost)
lastDiag = oldDiag
}
if minIdx < len(r1) && column[minIdx] > column[minIdx+1] {
minIdx++
}
if column[minIdx] > max {
return column[minIdx]
}
}
return column[len(r1)]
}

func min(a, b, c int) int {
if a < b && a < c {
return a
} else if b < c {
return b
}
return c
}

// matchFuzzy compares a query string against a posting value. Returns true
// if the value is within the given max Levenshtein distance of the query,
// false otherwise.
func matchFuzzy(query, val string, max int) bool {
if val == "" {
return false
}
return levenshteinDistance(val, query, max) <= max
}

// uidsForMatch collects a list of uids that "might" match a fuzzy term based on the trigram
// index. matchFuzzy does the actual fuzzy match.
// Returns the list of uids even if empty, or an error otherwise.
func uidsForMatch(attr string, arg funcArgs) (*pb.List, error) {
opts := posting.ListOptions{ReadTs: arg.q.ReadTs}
uidsForNgram := func(ngram string) (*pb.List, error) {
key := x.IndexKey(attr, ngram)
pl, err := posting.GetNoStore(key)
if err != nil {
return nil, err
}
return pl.Uids(opts)
}

tokens, err := tok.GetTokens(tok.IdentTrigram, arg.srcFn.tokens...)
if err != nil {
return nil, err
}

uidMatrix := make([]*pb.List, len(tokens))
for i, t := range tokens {
uidMatrix[i], err = uidsForNgram(t)
if err != nil {
return nil, err
}
}
return algo.MergeSorted(uidMatrix), nil
}
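
The max parameter makes the distance computation short-circuit. A small illustration of the contract (a sketch, assuming it runs inside package worker, since both functions are unexported):

package worker

import "fmt"

func exampleLevenshtein() {
	// Within the threshold the exact distance is returned:
	// turning "kitten" into "sitting" takes 3 edits.
	fmt.Println(levenshteinDistance("kitten", "sitting", 8)) // 3

	// Past the threshold the loop may terminate early, so the return
	// value is only guaranteed to be greater than max...
	fmt.Println(levenshteinDistance("kitten", "sitting", 2) > 2) // true

	// ...which is exactly the comparison matchFuzzy performs.
	fmt.Println(matchFuzzy("kitten", "sitting", 2)) // false
}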