Fuzzy match support (hypermodeinc#2916)

* added "match" to list of valid funcs * added "match" to list of valid func names * added handleMatchFunction and MatchFn type * the match func needs a fulltext tokenizer * added ngram bleve analizer * added ngram tokenizer * verify match uses ngram tokenizer * get string tokens for match * added func to build match tokens * changed handleMatchFunction to use index * cherry-pick schema.HasTokenizer * configure bigram filter before the analyzer to update cache * added EncodeTokens convenience func to encapsulate encodeToken * we dont need to pre-get tokens, we do that when the task is running * handleMatchFunction updated for fuzzy match, filter optimizations, and code cleanups * matchFuzzy func using index and bigram (ngram) index * adding Bleve ngram support * added func comments * all fuzzy tokens must match * cp: test cases might not be mutex in the future, revert the order. * cp: dont try to match all posting values against match * cp: added comment * removed extra branch * switch to trigram for fuzzy match indexing * fixed typo * renamed ngram to match * fixed typo * added full posting value to search terms, minor cleanups * added fuzzy matching test * remove Bleve ngram pkg * revert this change * fixed needsIntersect * switched to a new fuzzy matching pkg * tweaked test with misspelling * added func GetTokenizerByID to search registered tokenizers by id * added tok.GetTokens for generating tokens by id * changed to use tok.GetTokens * fixed grammar in comment * replaced underused switch with if block * small test change * Pick up and modify Levenshtein distance to introduce a max distance factor, which would cause early termination of the algo to save CPU resources. Remove the fuzzysearch lib. * using threshold for lev distance max * worker/task.go: added match() argument for specifying max distance. This change allows setting a second integer argument in match() to set the max Levenshtein distance threshold. 
If no value is set, the default value of 8 is used. * systest/queries_test.go: updated match query tests Updated test for new matchFuzzy using threshold. * wiki/content/query-language/index.md: added section for match function * vendor/vendor.json: removed old fuzzy pkg * worker/task.go: match func enforce 2 args, max distance must be gt zero * wiki/content/query-language/index.md: updated syntax, example and fixed typos * systest/queries_test.go: updated syntax in tests * wiki/content/query-language/index.md: minor doc fixes
dna2fork · Jul 19, 2019 · 5419b71 · 5419b71
1 parent 0eb2b2d
commit 5419b71
Show file tree

Hide file tree

Showing 10 changed files with 479 additions and 17 deletions.
diff --git a/gql/parser.go b/gql/parser.go
@@ -1349,7 +1349,7 @@ func validFuncName(name string) bool {
 
 	switch name {
 	case "regexp", "anyofterms", "allofterms", "alloftext", "anyoftext",
-		"has", "uid", "uid_in", "anyof", "allof", "type":
+		"has", "uid", "uid_in", "anyof", "allof", "type", "match":
 		return true
 	}
 	return false

diff --git a/query/query.go b/query/query.go
@@ -2423,7 +2423,7 @@ func isValidArg(a string) bool {
 func isValidFuncName(f string) bool {
 	switch f {
 	case "anyofterms", "allofterms", "val", "regexp", "anyoftext", "alloftext",
-		"has", "uid", "uid_in", "anyof", "allof", "type":
+		"has", "uid", "uid_in", "anyof", "allof", "type", "match":
 		return true
 	}
 	return isInequalityFn(f) || types.IsGeoFunc(f)

diff --git a/systest/queries_test.go b/systest/queries_test.go
@@ -50,6 +50,7 @@ func TestQuery(t *testing.T) {
 	t.Run("multiple block eval", wrap(MultipleBlockEval))
 	t.Run("unmatched var assignment eval", wrap(UnmatchedVarEval))
 	t.Run("hash index queries", wrap(QueryHashIndex))
+	t.Run("fuzzy matching", wrap(FuzzyMatch))
 	t.Run("regexp with toggled trigram index", wrap(RegexpToggleTrigramIndex))
 	t.Run("groupby uid that works", wrap(GroupByUidWorks))
 	t.Run("cleanup", wrap(SchemaQueryCleanup))
@@ -539,6 +540,147 @@ func SchemaQueryTestHTTP(t *testing.T, c *dgo.Dgraph) {
 	CompareJSON(t, js, string(m["data"]))
 }
 
+// FuzzyMatch is a system test for the match() query function. It declares a
+// trigram-indexed "term" predicate and an unindexed "name" predicate, inserts
+// a fixed set of values, and then runs a table of queries, checking either the
+// JSON result or the text of the returned error.
+func FuzzyMatch(t *testing.T, c *dgo.Dgraph) {
+	ctx := context.Background()
+
+	// Only "term" gets the trigram index; "name" is left unindexed so the last
+	// table entry can exercise the missing-index error.
+	require.NoError(t, c.Alter(ctx, &api.Operation{
+		Schema: `
+      term: string @index(trigram) .
+      name: string .
+    `,
+	}))
+
+	txn := c.NewTxn()
+	_, err := txn.Mutate(ctx, &api.Mutation{
+		SetNquads: []byte(`
+      _:t0 <term> "" .
+      _:t1 <term> "road" .
+      _:t2 <term> "avenue" .
+      _:t3 <term> "street" .
+      _:t4 <term> "boulevard" .
+      _:t5 <term> "drive" .
+      _:t6 <term> "route" .
+      _:t7 <term> "pass" .
+      _:t8 <term> "pathway" .
+      _:t9 <term> "lane" .
+      _:ta <term> "highway" .
+      _:tb <term> "parkway" .
+      _:tc <term> "motorway" .
+      _:td <term> "high road" .
+      _:te <term> "side street" .
+      _:tf <term> "dual carriageway" .
+      _:n0 <name> "srfrog" .
+    `),
+	})
+	require.NoError(t, err)
+	require.NoError(t, txn.Commit(ctx))
+
+	// Each case sets either out (the expected JSON) or failure (a substring
+	// that must appear in the query error).
+	tests := []struct {
+		in, out, failure string
+	}{
+		{
+			in:  `{q(func:match(term, drive, 8)) {term}}`,
+			out: `{"q":[{"term":"drive"}]}`,
+		},
+		{
+			in:  `{q(func:match(term, "plano", 1)) {term}}`,
+			out: `{"q":[]}`,
+		},
+		{
+			in:  `{q(func:match(term, "plano", 2)) {term}}`,
+			out: `{"q":[{"term":"lane"}]}`,
+		},
+		{
+			in:  `{q(func:match(term, "plano", 8)) {term}}`,
+			out: `{"q":[{"term":"lane"}]}`,
+		},
+		{
+			in: `{q(func:match(term, way, 8)) {term}}`,
+			out: `{"q":[
+        {"term": "highway"},
+        {"term": "pathway"},
+        {"term": "parkway"},
+        {"term": "motorway"}
+      ]}`,
+		},
+		{
+			in: `{q(func:match(term, pway, 8)) {term}}`,
+			out: `{"q":[
+        {"term": "highway"},
+        {"term": "pathway"},
+        {"term": "parkway"},
+        {"term": "motorway"}
+      ]}`,
+		},
+		{
+			in: `{q(func:match(term, high, 8)) {term}}`,
+			out: `{"q":[
+        {"term": "highway"},
+        {"term": "high road"}
+      ]}`,
+		},
+		{
+			in: `{q(func:match(term, str, 8)) {term}}`,
+			out: `{"q":[
+        {"term": "street"},
+        {"term": "side street"}
+      ]}`,
+		},
+		{
+			in: `{q(func:match(term, strip, 8)) {term}}`,
+			out: `{"q":[
+        {"term": "street"},
+        {"term": "side street"}
+      ]}`,
+		},
+		{
+			in:  `{q(func:match(term, strip, 3)) {term}}`,
+			out: `{"q":[{"term": "street"}]}`,
+		},
+		{
+			in: `{q(func:match(term, "carigeway", 8)) {term}}`,
+			out: `{"q":[
+        {"term": "dual carriageway"}
+      ]}`,
+		},
+		{
+			in:  `{q(func:match(term, "carigeway", 4)) {term}}`,
+			out: `{"q":[]}`,
+		},
+		{
+			in: `{q(func:match(term, "dualway", 8)) {term}}`,
+			out: `{"q":[
+        {"term": "highway"},
+        {"term": "pathway"},
+        {"term": "parkway"},
+        {"term": "motorway"}
+      ]}`,
+		},
+		{
+			in:  `{q(func:match(term, "dualway", 2)) {term}}`,
+			out: `{"q":[]}`,
+		},
+		{
+			in:      `{q(func:match(term, "", 8)) {term}}`,
+			failure: `Empty argument received`,
+		},
+		{
+			// "name" was declared without a trigram index in the schema above.
+			in:      `{q(func:match(name, "someone", 8)) {name}}`,
+			failure: `Attribute name is not indexed with type trigram`,
+		},
+	}
+	for _, tc := range tests {
+		resp, err := c.NewTxn().Query(ctx, tc.in)
+		// Failure cases only check the error text; no JSON is compared.
+		if tc.failure != "" {
+			require.Error(t, err)
+			require.Contains(t, err.Error(), tc.failure)
+			continue
+		}
+		require.NoError(t, err)
+		CompareJSON(t, tc.out, string(resp.Json))
+	}
+}
+
 func QueryHashIndex(t *testing.T, c *dgo.Dgraph) {
 	ctx := context.Background()
 

diff --git a/tok/tok.go b/tok/tok.go
@@ -130,6 +130,17 @@ func LoadCustomTokenizer(soFile string) {
 	registerTokenizer(CustomTokenizer{PluginTokenizer: tokenizer})
 }
 
+// GetTokenizerByID scans the registered tokenizers for one whose Identifier()
+// equals id. It returns that tokenizer and true on a hit, or nil and false
+// when no registered tokenizer reports the given id.
+func GetTokenizerByID(id byte) (Tokenizer, bool) {
+	for _, candidate := range tokenizers {
+		if candidate.Identifier() != id {
+			continue
+		}
+		return candidate, true
+	}
+	return nil, false
+}
+
 // GetTokenizer returns tokenizer given unique name.
 func GetTokenizer(name string) (Tokenizer, bool) {
 	t, found := tokenizers[name]
@@ -332,6 +343,12 @@ func EncodeRegexTokens(tokens []string) {
 	}
 }
 
+// EncodeTokens rewrites tokens in place, replacing every element with the
+// result of encodeToken for that element and the tokenizer identifier id.
+func EncodeTokens(id byte, tokens []string) {
+	for idx := range tokens {
+		tokens[idx] = encodeToken(tokens[idx], id)
+	}
+}
+
 type BoolTokenizer struct{}
 
 func (t BoolTokenizer) Name() string { return "bool" }

diff --git a/tok/tokens.go b/tok/tokens.go
@@ -33,11 +33,19 @@ func GetLangTokenizer(t Tokenizer, lang string) Tokenizer {
 	return t
 }
 
-func GetTermTokens(funcArgs []string) ([]string, error) {
+func GetTokens(id byte, funcArgs ...string) ([]string, error) {
 	if l := len(funcArgs); l != 1 {
 		return nil, x.Errorf("Function requires 1 arguments, but got %d", l)
 	}
-	return BuildTokens(funcArgs[0], TermTokenizer{})
+	tokenizer, ok := GetTokenizerByID(id)
+	if !ok {
+		return nil, x.Errorf("No tokenizer was found with id %v", id)
+	}
+	return BuildTokens(funcArgs[0], tokenizer)
+}
+
+func GetTermTokens(funcArgs []string) ([]string, error) {
+	return GetTokens(IdentTerm, funcArgs...)
 }
 
 func GetFullTextTokens(funcArgs []string, lang string) ([]string, error) {

diff --git a/wiki/content/query-language/index.md b/wiki/content/query-language/index.md
@@ -353,6 +353,39 @@ Keep the following in mind when designing regular expression queries.
 - If the partial result (for subset of trigrams) exceeds 1000000 uids during index scan, the query is stopped to prohibit expensive queries.
 
 
+### Fuzzy matching
+
+
+Syntax: `match(predicate, string, distance)`
+
+Schema Types: `string`
+
+Index Required: `trigram`
+
+Matches predicate values whose [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) to the given string is within `distance`,
+also known as _fuzzy matching_. The distance parameter must be greater than zero (0). A larger distance value yields more results, but the additional matches are less similar to the string.
+
+Query Example: At root, fuzzy match nodes similar to `Stephen`, with a distance value of 8.
+
+{{< runnable >}}
+{
+  directors(func: match(name@en, Stephen, 8)) {
+    name@en
+  }
+}
+{{< /runnable >}}
+
+Same query with a Levenshtein distance of 3.
+
+{{< runnable >}}
+{
+  directors(func: match(name@en, Stephen, 3)) {
+    name@en
+  }
+}
+{{< /runnable >}}
+
+
 ### Full Text Search
 
 Syntax Examples: `alloftext(predicate, "space-separated text")` and `anyoftext(predicate, "space-separated text")`

diff --git a/worker/match.go b/worker/match.go
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2019 Dgraph Labs, Inc. and Contributors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package worker
+
+import (
+	"github.com/dgraph-io/dgraph/algo"
+	"github.com/dgraph-io/dgraph/posting"
+	"github.com/dgraph-io/dgraph/protos/pb"
+	"github.com/dgraph-io/dgraph/tok"
+	"github.com/dgraph-io/dgraph/x"
+)
+
+// levenshteinDistance measures the difference between two strings.
+// The Levenshtein distance between two words is the minimum number of
+// single-character edits (i.e. insertions, deletions or substitutions)
+// required to change one word into the other. Characters are compared as
+// runes, so multi-byte UTF-8 characters count as a single edit.
+//
+// max is an early-termination threshold. The smallest value in a row of the
+// edit matrix never decreases from one row to the next, so as soon as every
+// entry of the current row is greater than max the final distance must be
+// greater than max as well. In that case the function stops and returns that
+// row minimum, which is greater than max but not necessarily the exact
+// distance. Otherwise the exact distance is returned.
+//
+// This implementation is optimized to use O(min(m,n)) space and is based on the
+// optimized C version found here:
+// http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#C
+func levenshteinDistance(s, t string, max int) int {
+	r1, r2 := []rune(s), []rune(t)
+	// Keep the shorter rune slice in r1 so the working column stays as small
+	// as possible. Lengths are compared in runes, not bytes.
+	if len(r1) > len(r2) {
+		r1, r2 = r2, r1
+	}
+	column := make([]int, len(r1)+1)
+
+	for y := 1; y <= len(r1); y++ {
+		column[y] = y
+	}
+
+	for x := 1; x <= len(r2); x++ {
+		column[0] = x
+		// rowMin is the true minimum of the row being computed; it starts at
+		// column[0] and is lowered by any smaller cell below.
+		rowMin := x
+
+		for y, lastDiag := 1, x-1; y <= len(r1); y++ {
+			oldDiag := column[y]
+			cost := 0
+			if r1[y-1] != r2[x-1] {
+				cost = 1
+			}
+			column[y] = min(column[y]+1, column[y-1]+1, lastDiag+cost)
+			lastDiag = oldDiag
+			if column[y] < rowMin {
+				rowMin = column[y]
+			}
+		}
+		// No cell of this row is within max, so no later cell can be either.
+		if rowMin > max {
+			return rowMin
+		}
+	}
+	return column[len(r1)]
+}
+
+// min returns the smallest of the three integers a, b and c.
+func min(a, b, c int) int {
+	smallest := c
+	if b < smallest {
+		smallest = b
+	}
+	if a < smallest {
+		smallest = a
+	}
+	return smallest
+}
+
+// matchFuzzy reports whether val (a value read from a posting) is within max
+// edits of query, as measured by levenshteinDistance. An empty val never
+// matches.
+func matchFuzzy(query, val string, max int) bool {
+	if len(val) == 0 {
+		return false
+	}
+	distance := levenshteinDistance(val, query, max)
+	return distance <= max
+}
+
+// uidsForMatch gathers candidate uids for a fuzzy term from the trigram index.
+// It tokenizes arg.srcFn.tokens with the trigram tokenizer, reads the posting
+// list stored under each token's index key for attr at arg.q.ReadTs, and
+// combines the per-token uid lists with algo.MergeSorted. The result only
+// holds candidates that "might" match; matchFuzzy does the actual comparison.
+// Returns the combined list (possibly empty), or the first error encountered.
+func uidsForMatch(attr string, arg funcArgs) (*pb.List, error) {
+	readOpts := posting.ListOptions{ReadTs: arg.q.ReadTs}
+
+	ngrams, err := tok.GetTokens(tok.IdentTrigram, arg.srcFn.tokens...)
+	if err != nil {
+		return nil, err
+	}
+
+	lists := make([]*pb.List, 0, len(ngrams))
+	for _, ngram := range ngrams {
+		pl, err := posting.GetNoStore(x.IndexKey(attr, ngram))
+		if err != nil {
+			return nil, err
+		}
+		uids, err := pl.Uids(readOpts)
+		if err != nil {
+			return nil, err
+		}
+		lists = append(lists, uids)
+	}
+	return algo.MergeSorted(lists), nil
+}