forked from hypermodeinc/dgraph
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fuzzy match support (hypermodeinc#2916)
* added "match" to list of valid funcs * added "match" to list of valid func names * added handleMatchFunction and MatchFn type * the match func needs a fulltext tokenizer * added ngram bleve analizer * added ngram tokenizer * verify match uses ngram tokenizer * get string tokens for match * added func to build match tokens * changed handleMatchFunction to use index * cherry-pick schema.HasTokenizer * configure bigram filter before the analyzer to update cache * added EncodeTokens convenience func to encapsulate encodeToken * we dont need to pre-get tokens, we do that when the task is running * handleMatchFunction updated for fuzzy match, filter optimizations, and code cleanups * matchFuzzy func using index and bigram (ngram) index * adding Bleve ngram support * added func comments * all fuzzy tokens must match * cp: test cases might not be mutex in the future, revert the order. * cp: dont try to match all posting values against match * cp: added comment * removed extra branch * switch to trigram for fuzzy match indexing * fixed typo * renamed ngram to match * fixed typo * added full posting value to search terms, minor cleanups * added fuzzy matching test * remove Bleve ngram pkg * revert this change * fixed needsIntersect * switched to a new fuzzy matching pkg * tweaked test with misspelling * added func GetTokenizerByID to search registered tokenizers by id * added tok.GetTokens for generating tokens by id * changed to use tok.GetTokens * fixed grammar in comment * replaced underused switch with if block * small test change * Pick up and modify Levenshtein distance to introduce a max distance factor, which would cause early termination of the algo to save CPU resources. Remove the fuzzysearch lib. * using threshold for lev distance max * worker/task.go: added match() argument for specifying max distance. This change allows setting a second integer argument in match() to set the max Levenshtein distance threshold. 
If no value is set, the default value of 8 is used. * systest/queries_test.go: updated match query tests Updated test for new matchFuzzy using threshold. * wiki/content/query-language/index.md: added section for match function * vendor/vendor.json: removed old fuzzy pkg * worker/task.go: match func enforce 2 args, max distance must be gt zero * wiki/content/query-language/index.md: updated syntax, example and fixed typos * systest/queries_test.go: updated syntax in tests * wiki/content/query-language/index.md: minor doc fixes
- Loading branch information
1 parent
0eb2b2d
commit 5419b71
Showing
10 changed files
with
479 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
/* | ||
* Copyright 2019 Dgraph Labs, Inc. and Contributors | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package worker | ||
|
||
import ( | ||
"github.com/dgraph-io/dgraph/algo" | ||
"github.com/dgraph-io/dgraph/posting" | ||
"github.com/dgraph-io/dgraph/protos/pb" | ||
"github.com/dgraph-io/dgraph/tok" | ||
"github.com/dgraph-io/dgraph/x" | ||
) | ||
|
||
// levenshteinDistance measures the difference between two strings.
// The Levenshtein distance between two words is the minimum number of
// single-character edits (i.e. insertions, deletions or substitutions)
// required to change one word into the other. Characters are compared as
// runes, so a multi-byte UTF-8 sequence counts as a single character.
//
// max is a cut-off threshold. When the distance is at most max, the exact
// distance is returned. As soon as it is certain that the distance exceeds
// max, the computation stops and a value greater than max is returned; that
// value is a lower bound and not necessarily the exact distance, so callers
// should only compare the result against max.
//
// This implementation is optimized to use O(min(m,n)) space and is based on the
// optimized C version found here:
// http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#C
func levenshteinDistance(s, t string, max int) int {
	r1, r2 := []rune(s), []rune(t)
	// Keep the shorter input in r1 so the working column is as small as
	// possible. The comparison is done on rune counts, not byte lengths,
	// because the two can disagree for non-ASCII text.
	if len(r1) > len(r2) {
		r1, r2 = r2, r1
	}

	// The distance can never be smaller than the difference in length, so
	// there is nothing to compute when that alone already exceeds max.
	if diff := len(r2) - len(r1); diff > max {
		return diff
	}

	column := make([]int, len(r1)+1)
	for y := range column {
		column[y] = y
	}

	for x := 1; x <= len(r2); x++ {
		column[0] = x

		// colMin is the smallest value in the column being computed. Every
		// cell of a later column is derived from this one by adding a
		// non-negative cost, so the final distance is never below colMin.
		colMin := x
		lastDiag := x - 1
		for y := 1; y <= len(r1); y++ {
			oldDiag := column[y]

			// Substitution (or a free match on equal runes).
			best := lastDiag
			if r1[y-1] != r2[x-1] {
				best++
			}
			// Edit that extends the previous column's cell in this row.
			if v := oldDiag + 1; v < best {
				best = v
			}
			// Edit that extends this column's cell in the previous row.
			if v := column[y-1] + 1; v < best {
				best = v
			}

			column[y] = best
			lastDiag = oldDiag
			if best < colMin {
				colMin = best
			}
		}

		if colMin > max {
			return colMin
		}
	}
	return column[len(r1)]
}
|
||
// min returns the smallest of its three integer arguments.
func min(a, b, c int) int {
	smallest := c
	if b < smallest {
		smallest = b
	}
	if a < smallest {
		smallest = a
	}
	return smallest
}
|
||
// matchFuzzy takes in a value (from posting) and compares it to our list of ngram tokens. | ||
// Returns true if value matches fuzzy tokens, false otherwise. | ||
func matchFuzzy(query, val string, max int) bool { | ||
if val == "" { | ||
return false | ||
} | ||
return levenshteinDistance(val, query, max) <= max | ||
} | ||
|
||
// uidsForMatch collects a list of uids that "might" match a fuzzy term based on the ngram | ||
// index. matchFuzzy does the actual fuzzy match. | ||
// Returns the list of uids even if empty, or an error otherwise. | ||
func uidsForMatch(attr string, arg funcArgs) (*pb.List, error) { | ||
opts := posting.ListOptions{ReadTs: arg.q.ReadTs} | ||
uidsForNgram := func(ngram string) (*pb.List, error) { | ||
key := x.IndexKey(attr, ngram) | ||
pl, err := posting.GetNoStore(key) | ||
if err != nil { | ||
return nil, err | ||
} | ||
return pl.Uids(opts) | ||
} | ||
|
||
tokens, err := tok.GetTokens(tok.IdentTrigram, arg.srcFn.tokens...) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
uidMatrix := make([]*pb.List, len(tokens)) | ||
for i, t := range tokens { | ||
uidMatrix[i], err = uidsForNgram(t) | ||
if err != nil { | ||
return nil, err | ||
} | ||
} | ||
return algo.MergeSorted(uidMatrix), nil | ||
} |
Oops, something went wrong.