From 795bd92a852062ec18114c3e79b03510cf6e2291 Mon Sep 17 00:00:00 2001 From: srfrog Date: Fri, 18 Jan 2019 10:28:14 -0700 Subject: [PATCH 01/50] added "match" to list of valid funcs --- gql/parser.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gql/parser.go b/gql/parser.go index 8819094dbec..fde32f29c8c 100644 --- a/gql/parser.go +++ b/gql/parser.go @@ -1348,7 +1348,7 @@ func validFuncName(name string) bool { switch name { case "regexp", "anyofterms", "allofterms", "alloftext", "anyoftext", - "has", "uid", "uid_in", "anyof", "allof": + "has", "uid", "uid_in", "anyof", "allof", "match": return true } return false From a00006566b78e3330b56d39f13e5609af75f0bf4 Mon Sep 17 00:00:00 2001 From: srfrog Date: Fri, 18 Jan 2019 10:28:44 -0700 Subject: [PATCH 02/50] added "match" to list of valid func names --- query/query.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/query/query.go b/query/query.go index db5dc07f5f3..eeb88ab25a8 100644 --- a/query/query.go +++ b/query/query.go @@ -2349,7 +2349,7 @@ func isValidArg(a string) bool { func isValidFuncName(f string) bool { switch f { case "anyofterms", "allofterms", "val", "regexp", "anyoftext", "alloftext", - "has", "uid", "uid_in", "anyof", "allof": + "has", "uid", "uid_in", "anyof", "allof", "match": return true } return isInequalityFn(f) || types.IsGeoFunc(f) From 4a2643a2e08f95f0e5d9c8f034fd3cb0468b7234 Mon Sep 17 00:00:00 2001 From: srfrog Date: Fri, 18 Jan 2019 10:29:29 -0700 Subject: [PATCH 03/50] added handleMatchFunction and MatchFn type --- worker/task.go | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/worker/task.go b/worker/task.go index a587b188c86..62245aa29ce 100644 --- a/worker/task.go +++ b/worker/task.go @@ -220,6 +220,7 @@ const ( HasFn UidInFn CustomIndexFn + MatchFn StandardFn = 100 ) @@ -260,6 +261,8 @@ func parseFuncTypeHelper(name string) (FuncType, string) { return UidInFn, f case "anyof", "allof": return CustomIndexFn, f + case "match": + return MatchFn, f default: if types.IsGeoFunc(f) { return GeoFn, f @@ -270,7 +273,7 @@ func parseFuncTypeHelper(name string) (FuncType, string) { func needsIndex(fnType FuncType) bool { switch fnType { - case CompareAttrFn, GeoFn, RegexFn, FullTextSearchFn, StandardFn: + case CompareAttrFn, GeoFn, RegexFn, FullTextSearchFn, StandardFn, MatchFn: return true default: return false @@ -294,7 +297,7 @@ func (srcFn *functionContext) needsValuePostings(typ types.TypeID) (bool, error) return false, nil } return true, nil - case GeoFn, RegexFn, FullTextSearchFn, StandardFn, HasFn, CustomIndexFn: + case GeoFn, RegexFn, FullTextSearchFn, StandardFn, HasFn, CustomIndexFn, MatchFn: // All of these require index, hence would require fetching uid postings. 
return false, nil case UidInFn, CompareScalarFn: @@ -558,7 +561,7 @@ func (qs *queryState) handleUidPostings( } else { key = x.DataKey(q.Attr, q.UidList.Uids[i]) } - case GeoFn, RegexFn, FullTextSearchFn, StandardFn, CustomIndexFn: + case GeoFn, RegexFn, FullTextSearchFn, StandardFn, CustomIndexFn, MatchFn: key = x.IndexKey(q.Attr, srcFn.tokens[i]) case CompareAttrFn: key = x.IndexKey(q.Attr, srcFn.tokens[i]) @@ -816,6 +819,13 @@ func (qs *queryState) helpProcessTask( } } + if srcFn.fnType == MatchFn { + span.Annotate(nil, "handleMatchFunction") + if err := qs.handleMatchFunction(ctx, funcArgs{q, gid, srcFn, out}); err != nil { + return nil, err + } + } + // We fetch the actual value for the uids, compare them to the value in the // request and filter the uids only if the tokenizer IsLossy. if srcFn.fnType == CompareAttrFn && len(srcFn.tokens) > 0 { @@ -1086,6 +1096,27 @@ func (qs *queryState) handleCompareFunction(ctx context.Context, arg funcArgs) e return nil } +func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) error { + attr := arg.q.Attr + typ, err := schema.State().TypeOf(attr) + if err != nil || !typ.IsScalar() { + return x.Errorf("Attribute not scalar: %s %v", attr, typ) + } + if typ != types.StringID { + return x.Errorf("Got non-string type. Fuzzy match is allowed only on string type.") + } + var found bool + for _, t := range schema.State().Tokenizer(attr) { + if t.Identifier() == tok.IdentFullText { + found = true + } + } + if !found { + return x.Errorf("Attribute %v does not have fulltext index for fuzzy matching.", attr) + } + return nil +} + func (qs *queryState) filterGeoFunction(arg funcArgs) error { attr := arg.q.Attr uids := algo.MergeSorted(arg.out.UidMatrix) @@ -1365,7 +1396,7 @@ func parseSrcFn(q *pb.Query) (*functionContext, error) { return nil, err } fc.n = len(q.UidList.Uids) - case StandardFn, FullTextSearchFn: + case StandardFn, FullTextSearchFn, MatchFn: // srcfunc 0th val is func name and and [2:] are args. // we tokenize the arguments of the query. 
if err = ensureArgsCount(q.SrcFunc, 1); err != nil { From 4081c8f2b462fb1e45f130f942245961ca6291a2 Mon Sep 17 00:00:00 2001 From: srfrog Date: Fri, 18 Jan 2019 10:30:12 -0700 Subject: [PATCH 04/50] the match func needs a fulltext tokenizer --- worker/tokens.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/worker/tokens.go b/worker/tokens.go index c787503384e..3d9ef998aa7 100644 --- a/worker/tokens.go +++ b/worker/tokens.go @@ -29,9 +29,10 @@ import ( func verifyStringIndex(attr string, funcType FuncType) (string, bool) { var requiredTokenizer tok.Tokenizer - if funcType == FullTextSearchFn { + switch funcType { + case FullTextSearchFn, MatchFn: requiredTokenizer = tok.FullTextTokenizer{} - } else { + default: requiredTokenizer = tok.TermTokenizer{} } From ef4a60ff9b50bb546deb790c2ba68d7ec342664b Mon Sep 17 00:00:00 2001 From: srfrog Date: Fri, 18 Jan 2019 19:36:09 -0700 Subject: [PATCH 05/50] added ngram bleve analizer --- tok/bleve.go | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tok/bleve.go b/tok/bleve.go index dbea9e0ec82..258e20ae140 100644 --- a/tok/bleve.go +++ b/tok/bleve.go @@ -22,6 +22,7 @@ import ( "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis/analyzer/custom" "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/token/ngram" "github.com/blevesearch/bleve/analysis/token/unicodenorm" "github.com/blevesearch/bleve/analysis/tokenizer/unicode" "github.com/blevesearch/bleve/registry" @@ -29,9 +30,13 @@ import ( const unicodenormName = "unicodenorm_nfkc" +var _ = ngram.Name + var ( - bleveCache = registry.NewCache() - termAnalyzer, fulltextAnalyzer *analysis.Analyzer + bleveCache = registry.NewCache() + termAnalyzer, + fulltextAnalyzer, + ngramAnalyzer *analysis.Analyzer ) // setupBleve creates bleve filters and analyzers that we use for term and fulltext tokenizers. @@ -68,6 +73,21 @@ func setupBleve() { }, }) x.Check(err) + + // ngram analyzer - splits on word boundaries, lowercase, normalize tokens, split into ngrams + ngramAnalyzer, err = bleveCache.DefineAnalyzer("ngram_nfkc", + map[string]interface{}{ + "type": custom.Name, + "tokenizer": unicode.Name, + "token_filters": []string{ + lowercase.Name, + unicodenormName, + ngram.Name, + }, + "min": 2, + "max": 3, + }) + x.Check(err) } // uniqueTerms takes a token stream and returns a string slice of unique terms. 
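For intuition, the analyzer above expands each normalized term into its 2- and
3-grams. A minimal standalone sketch of that expansion (plain Go, illustrative
only; the real pipeline also lowercases and NFKC-normalizes through bleve):

package main

import "fmt"

// ngrams mirrors an ngram filter with min=2, max=3: from every rune position
// it emits each n-gram of length min..max that still fits in the string.
func ngrams(s string, min, max int) []string {
	runes := []rune(s)
	var out []string
	for i := range runes {
		for n := min; n <= max && i+n <= len(runes); n++ {
			out = append(out, string(runes[i:i+n]))
		}
	}
	return out
}

func main() {
	fmt.Println(ngrams("road", 2, 3)) // [ro roa oa oad ad]
}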
From 506f5e821648b39a0041c0e791adb4e909f979b3 Mon Sep 17 00:00:00 2001 From: srfrog Date: Fri, 18 Jan 2019 19:36:38 -0700 Subject: [PATCH 06/50] added ngram tokenizer --- tok/tok.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tok/tok.go b/tok/tok.go index d0c5b02c2d9..69a8b6669aa 100644 --- a/tok/tok.go +++ b/tok/tok.go @@ -38,6 +38,7 @@ const ( IdentNone = 0x0 IdentTerm = 0x1 IdentExact = 0x2 + IdentNgram = 0x3 IdentYear = 0x4 IdentMonth = 0x41 IdentDay = 0x42 @@ -95,6 +96,7 @@ func init() { registerTokenizer(HashTokenizer{}) registerTokenizer(TermTokenizer{}) registerTokenizer(FullTextTokenizer{}) + registerTokenizer(NgramTokenizer{}) setupBleve() } @@ -305,6 +307,22 @@ func (t FullTextTokenizer) Identifier() byte { return IdentFullText } func (t FullTextTokenizer) IsSortable() bool { return false } func (t FullTextTokenizer) IsLossy() bool { return true } +type NgramTokenizer struct{} + +func (t NgramTokenizer) Name() string { return "ngram" } +func (t NgramTokenizer) Type() string { return "string" } +func (t NgramTokenizer) Tokens(v interface{}) ([]string, error) { + str, ok := v.(string) + if !ok || str == "" { + return []string{str}, nil + } + tokens := ngramAnalyzer.Analyze([]byte(str)) + return uniqueTerms(tokens), nil +} +func (t NgramTokenizer) Identifier() byte { return IdentNgram } +func (t NgramTokenizer) IsSortable() bool { return false } +func (t NgramTokenizer) IsLossy() bool { return true } + func encodeInt(val int64) string { buf := make([]byte, 9) binary.BigEndian.PutUint64(buf[1:], uint64(val)) From 7c76ae2fc7c0b2303f3cbbec9c8634e4d3fe5200 Mon Sep 17 00:00:00 2001 From: srfrog Date: Fri, 18 Jan 2019 19:49:10 -0700 Subject: [PATCH 07/50] verify match uses ngram tokenizer --- worker/tokens.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/worker/tokens.go b/worker/tokens.go index 3d9ef998aa7..f6a25e446dc 100644 --- a/worker/tokens.go +++ b/worker/tokens.go @@ -30,8 +30,10 @@ import ( func verifyStringIndex(attr string, funcType FuncType) (string, bool) { var requiredTokenizer tok.Tokenizer switch funcType { - case FullTextSearchFn, MatchFn: + case FullTextSearchFn: requiredTokenizer = tok.FullTextTokenizer{} + case MatchFn: + requiredTokenizer = tok.NgramTokenizer{} default: requiredTokenizer = tok.TermTokenizer{} } From 8a0ffeb0370414443d136c6f39f229fb67fe0e75 Mon Sep 17 00:00:00 2001 From: srfrog Date: Fri, 18 Jan 2019 20:01:16 -0700 Subject: [PATCH 08/50] get string tokens for match --- worker/tokens.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/worker/tokens.go b/worker/tokens.go index f6a25e446dc..1aa78f72267 100644 --- a/worker/tokens.go +++ b/worker/tokens.go @@ -66,11 +66,11 @@ func verifyCustomIndex(attr string, tokenizerName string) bool { // Return string tokens from function arguments. It maps function type to correct tokenizer. // Note: regexp functions require regexp compilation of argument, not tokenization. func getStringTokens(funcArgs []string, lang string, funcType FuncType) ([]string, error) { - if lang == "." 
{ - lang = "en" - } - if funcType == FullTextSearchFn { + switch funcType { + case FullTextSearchFn: return tok.GetFullTextTokens(funcArgs, lang) + case MatchFn: + return tok.GetMatchTokens(funcArgs) } return tok.GetTermTokens(funcArgs) } From 6909395c20b1b0c3a653ce87b031875de1be2330 Mon Sep 17 00:00:00 2001 From: srfrog Date: Fri, 18 Jan 2019 20:01:38 -0700 Subject: [PATCH 09/50] added func to build match tokens --- tok/tokens.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tok/tokens.go b/tok/tokens.go index 14f41e82a42..26c831cc434 100644 --- a/tok/tokens.go +++ b/tok/tokens.go @@ -46,3 +46,10 @@ func GetFullTextTokens(funcArgs []string, lang string) ([]string, error) { } return BuildTokens(funcArgs[0], FullTextTokenizer{lang: lang}) } + +func GetMatchTokens(funcArgs []string) ([]string, error) { + if l := len(funcArgs); l != 1 { + return nil, x.Errorf("Function requires 1 arguments, but got %d", l) + } + return BuildTokens(funcArgs[0], NgramTokenizer{}) +} From f0ce9e06d23015056f7319978c7d4a1ba01be0ae Mon Sep 17 00:00:00 2001 From: srfrog Date: Fri, 18 Jan 2019 20:02:22 -0700 Subject: [PATCH 10/50] changed handleMatchFunction to use index --- worker/task.go | 92 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 12 deletions(-) diff --git a/worker/task.go b/worker/task.go index 62245aa29ce..d81415f29a2 100644 --- a/worker/task.go +++ b/worker/task.go @@ -275,9 +275,15 @@ func needsIndex(fnType FuncType) bool { switch fnType { case CompareAttrFn, GeoFn, RegexFn, FullTextSearchFn, StandardFn, MatchFn: return true - default: - return false } + return false +} + +func needsIntersect(fnName string) bool { + fnName = strings.ToLower(fnName) + return strings.HasPrefix(fnName, "allof") || + strings.HasPrefix(fnName, "anyof") || + fnName == "match" } type funcArgs struct { @@ -1097,23 +1103,87 @@ func (qs *queryState) handleCompareFunction(ctx context.Context, arg funcArgs) e } func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) error { + span := otrace.FromContext(ctx) + stop := x.SpanTimer(span, "handleMatchFunction") + defer stop() + if span != nil { + span.Annotatef(nil, "Number of uids: %d. args.srcFn: %+v", arg.srcFn.n, arg.srcFn) + } + attr := arg.q.Attr typ, err := schema.State().TypeOf(attr) + span.Annotatef(nil, "Attr: %s. Type: %s", attr, typ.Name()) if err != nil || !typ.IsScalar() { return x.Errorf("Attribute not scalar: %s %v", attr, typ) } if typ != types.StringID { return x.Errorf("Got non-string type. 
Fuzzy match is allowed only on string type.") } - var found bool + var useIndex bool for _, t := range schema.State().Tokenizer(attr) { - if t.Identifier() == tok.IdentFullText { - found = true + if t.Identifier() == tok.IdentNgram { + useIndex = true } } - if !found { - return x.Errorf("Attribute %v does not have fulltext index for fuzzy matching.", attr) + if !useIndex { + return x.Errorf("Attribute %v does not have ngram index for fuzzy matching.", attr) } + + uids, err := uidsForMatch(attr, arg) + if err != nil { + return err + } + + isList := schema.State().IsList(attr) + lang := langForFunc(arg.q.Langs) + span.Annotatef(nil, "Total uids: %d, list: %t lang: %v", len(uids.Uids), isList, lang) + arg.out.UidMatrix = append(arg.out.UidMatrix, uids) + + filtered := &pb.List{} + for _, uid := range uids.Uids { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + pl, err := qs.cache.Get(x.DataKey(attr, uid)) + if err != nil { + return err + } + + vals := make([]types.Val, 1) + switch { + case isList: + vals, err = pl.AllUntaggedValues(arg.q.ReadTs) + + case lang != "": + vals[0], err = pl.ValueForTag(arg.q.ReadTs, lang) + + default: + vals[0], err = pl.Value(arg.q.ReadTs) + } + if err != nil { + if err == posting.ErrNoValue { + continue + } else if err != nil { + return err + } + return err + } + + for _, val := range vals { + // convert data from binary to appropriate format + strVal, err := types.Convert(val, types.StringID) + if err == nil && matchFuzzy(strVal, arg.srcFn) { + filtered.Uids = append(filtered.Uids, uid) + } + } + } + + for i := 0; i < len(arg.out.UidMatrix); i++ { + algo.IntersectWith(arg.out.UidMatrix[i], filtered, arg.out.UidMatrix[i]) + } + return nil } @@ -1409,8 +1479,7 @@ func parseSrcFn(q *pb.Query) (*functionContext, error) { if fc.tokens, err = getStringTokens(q.SrcFunc.Args, langForFunc(q.Langs), fnType); err != nil { return nil, err } - fnName := strings.ToLower(q.SrcFunc.Name) - fc.intersectDest = strings.HasPrefix(fnName, "allof") // allofterms and alloftext + fc.intersectDest = needsIntersect(q.SrcFunc.Name) fc.n = len(fc.tokens) case CustomIndexFn: if err = ensureArgsCount(q.SrcFunc, 2); err != nil { @@ -1431,9 +1500,8 @@ func parseSrcFn(q *pb.Query) (*functionContext, error) { } fc.tokens, _ = tok.BuildTokens(valToTok.Value, tok.GetLangTokenizer(tokenizer, langForFunc(q.Langs))) - fnName := strings.ToLower(q.SrcFunc.Name) - x.AssertTrue(fnName == "allof" || fnName == "anyof") - fc.intersectDest = strings.HasSuffix(fnName, "allof") + fc.intersectDest = needsIntersect(q.SrcFunc.Name) + x.AssertTrue(fc.intersectDest) fc.n = len(fc.tokens) case RegexFn: if err = ensureArgsCount(q.SrcFunc, 2); err != nil { From dca21cc3c6ab7c603ce130735d1e6652c01a8cfa Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 17:47:31 -0700 Subject: [PATCH 11/50] cherry-pick schema.HasTokenizer --- schema/schema.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/schema/schema.go b/schema/schema.go index 584eb95d591..4ae1aa2499c 100644 --- a/schema/schema.go +++ b/schema/schema.go @@ -180,6 +180,17 @@ func (s *state) TokenizerNames(pred string) []string { return names } +// HasTokenizer is a convenience func that checks if a given tokenizer is found in pred. +// Returns true if found, else false. 
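+// For example (illustrative), the fuzzy-match path can guard on the ngram
+// index with: schema.State().HasTokenizer(tok.IdentNgram, attr)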
+func (s *state) HasTokenizer(id byte, pred string) bool { + for _, t := range s.Tokenizer(pred) { + if t.Identifier() == id { + return true + } + } + return false +} + // IsReversed returns whether the predicate has reverse edge or not func (s *state) IsReversed(pred string) bool { s.RLock() From c962cfb014afbb21808501fc52005f98f75de8fe Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 17:50:37 -0700 Subject: [PATCH 12/50] configure bigram filter before the analyzer to update cache --- tok/bleve.go | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tok/bleve.go b/tok/bleve.go index 258e20ae140..bc9037928dd 100644 --- a/tok/bleve.go +++ b/tok/bleve.go @@ -30,8 +30,6 @@ import ( const unicodenormName = "unicodenorm_nfkc" -var _ = ngram.Name - var ( bleveCache = registry.NewCache() termAnalyzer, @@ -50,6 +48,15 @@ func setupBleve() { }) x.Check(err) + // bigram filter - breaks up terms into bigram, suitable for fuzzy lookups. + _, err = bleveCache.DefineTokenFilter(ngram.Name, + map[string]interface{}{ + "type": ngram.Name, + "min": int(2), + "max": int(2), + }) + x.Check(err) + // term analyzer - splits on word boundaries, lowercase and normalize tokens. termAnalyzer, err = bleveCache.DefineAnalyzer("term", map[string]interface{}{ @@ -74,7 +81,7 @@ func setupBleve() { }) x.Check(err) - // ngram analyzer - splits on word boundaries, lowercase, normalize tokens, split into ngrams + // ngram analyzer - splits on word boundaries, lowercase, normalize tokens, split into bigrams ngramAnalyzer, err = bleveCache.DefineAnalyzer("ngram_nfkc", map[string]interface{}{ "type": custom.Name, @@ -84,9 +91,8 @@ func setupBleve() { unicodenormName, ngram.Name, }, - "min": 2, - "max": 3, - }) + }, + ) x.Check(err) } From 301f8be249adc446d9f0d22ae4e63e3d5b293036 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 17:51:11 -0700 Subject: [PATCH 13/50] added EncodeTokens convenience func to encapsulate encodeToken --- tok/tok.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tok/tok.go b/tok/tok.go index 69a8b6669aa..fcc5cd83860 100644 --- a/tok/tok.go +++ b/tok/tok.go @@ -350,6 +350,12 @@ func EncodeRegexTokens(tokens []string) { } } +func EncodeTokens(id byte, tokens []string) { + for i := 0; i < len(tokens); i++ { + tokens[i] = encodeToken(tokens[i], id) + } +} + type BoolTokenizer struct{} func (t BoolTokenizer) Name() string { return "bool" } From 95483fb109f829c7503bb6aead8ae828c15628ae Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 17:52:08 -0700 Subject: [PATCH 14/50] we dont need to pre-get tokens, we do that when the task is running --- worker/tokens.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/worker/tokens.go b/worker/tokens.go index 1aa78f72267..e6a0e016d8f 100644 --- a/worker/tokens.go +++ b/worker/tokens.go @@ -69,8 +69,6 @@ func getStringTokens(funcArgs []string, lang string, funcType FuncType) ([]strin switch funcType { case FullTextSearchFn: return tok.GetFullTextTokens(funcArgs, lang) - case MatchFn: - return tok.GetMatchTokens(funcArgs) } return tok.GetTermTokens(funcArgs) } From 38d6280a057aad5fb3e7c790654473bf1923bd9c Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 17:53:11 -0700 Subject: [PATCH 15/50] handleMatchFunction updated for fuzzy match, filter optimizations, and code cleanups --- worker/task.go | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/worker/task.go b/worker/task.go index d81415f29a2..dccbcfc8983 100644 --- 
a/worker/task.go +++ b/worker/task.go @@ -281,9 +281,7 @@ func needsIndex(fnType FuncType) bool { func needsIntersect(fnName string) bool { fnName = strings.ToLower(fnName) - return strings.HasPrefix(fnName, "allof") || - strings.HasPrefix(fnName, "anyof") || - fnName == "match" + return strings.HasPrefix(fnName, "allof") || strings.HasPrefix(fnName, "anyof") } type funcArgs struct { @@ -1111,27 +1109,28 @@ func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) err } attr := arg.q.Attr - typ, err := schema.State().TypeOf(attr) + typ := arg.srcFn.atype span.Annotatef(nil, "Attr: %s. Type: %s", attr, typ.Name()) - if err != nil || !typ.IsScalar() { + uids := &pb.List{} + switch { + case !typ.IsScalar(): return x.Errorf("Attribute not scalar: %s %v", attr, typ) - } - if typ != types.StringID { + + case typ != types.StringID: return x.Errorf("Got non-string type. Fuzzy match is allowed only on string type.") - } - var useIndex bool - for _, t := range schema.State().Tokenizer(attr) { - if t.Identifier() == tok.IdentNgram { - useIndex = true - } - } - if !useIndex { + + case !schema.State().HasTokenizer(tok.IdentNgram, attr): return x.Errorf("Attribute %v does not have ngram index for fuzzy matching.", attr) - } - uids, err := uidsForMatch(attr, arg) - if err != nil { - return err + case arg.q.UidList != nil && len(arg.q.UidList.Uids) != 0: + uids = arg.q.UidList + + default: + var err error + uids, err = uidsForMatch(attr, arg) + if err != nil { + return err + } } isList := schema.State().IsList(attr) @@ -1174,7 +1173,7 @@ func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) err for _, val := range vals { // convert data from binary to appropriate format strVal, err := types.Convert(val, types.StringID) - if err == nil && matchFuzzy(strVal, arg.srcFn) { + if err == nil && matchFuzzy(arg.srcFn, strVal.Value.(string)) { filtered.Uids = append(filtered.Uids, uid) } } From a1554203ecc00d8b6ce31c1ba2d9b7eede638bb6 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 17:56:53 -0700 Subject: [PATCH 16/50] matchFuzzy func using index and bigram (ngram) index --- worker/ngram.go | 74 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 worker/ngram.go diff --git a/worker/ngram.go b/worker/ngram.go new file mode 100644 index 00000000000..650dd219a7a --- /dev/null +++ b/worker/ngram.go @@ -0,0 +1,74 @@ +/* + * Copyright 2018 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package worker + +import ( + "github.com/dgraph-io/dgraph/algo" + "github.com/dgraph-io/dgraph/posting" + "github.com/dgraph-io/dgraph/protos/pb" + "github.com/dgraph-io/dgraph/tok" + "github.com/dgraph-io/dgraph/x" + fuzzstr "github.com/dgryski/go-fuzzstr" +) + +func matchFuzzy(srcFn *functionContext, val string) bool { + if val == "" { + return false + } + terms, err := tok.GetTermTokens([]string{val}) + if err != nil { + return false + } + idx := fuzzstr.NewIndex(terms) + cnt := 0 + for i := range srcFn.tokens { + p := idx.Query(srcFn.tokens[i]) + cnt += len(p) + } + return cnt > 0 +} + +func uidsForMatch(attr string, arg funcArgs) (*pb.List, error) { + var results *pb.List + + opts := posting.ListOptions{ReadTs: arg.q.ReadTs} + + uidsForNgram := func(ngram string) (*pb.List, error) { + key := x.IndexKey(attr, ngram) + pl, err := posting.GetNoStore(key) + if err != nil { + return nil, err + } + return pl.Uids(opts) + } + + tokens, err := tok.GetMatchTokens(arg.srcFn.tokens) + if err != nil { + return nil, err + } + uidMatrix := make([]*pb.List, len(tokens)) + + for i, t := range tokens { + uidMatrix[i], err = uidsForNgram(t) + if err != nil { + return nil, err + } + } + results = algo.MergeSorted(uidMatrix) + + return results, nil +} From 8d9e59273da25792fc8c1303cec1bcaff2a176a6 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 17:58:07 -0700 Subject: [PATCH 17/50] adding Bleve ngram support --- .../bleve/analysis/token/ngram/ngram.go | 113 ++++++++++++++++++ vendor/vendor.json | 6 + 2 files changed, 119 insertions(+) create mode 100644 vendor/github.com/blevesearch/bleve/analysis/token/ngram/ngram.go diff --git a/vendor/github.com/blevesearch/bleve/analysis/token/ngram/ngram.go b/vendor/github.com/blevesearch/bleve/analysis/token/ngram/ngram.go new file mode 100644 index 00000000000..abab0aefc0a --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/analysis/token/ngram/ngram.go @@ -0,0 +1,113 @@ +// Copyright (c) 2014 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package ngram + +import ( + "bytes" + "fmt" + "unicode/utf8" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const Name = "ngram" + +type NgramFilter struct { + minLength int + maxLength int +} + +func NewNgramFilter(minLength, maxLength int) *NgramFilter { + return &NgramFilter{ + minLength: minLength, + maxLength: maxLength, + } +} + +func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + rv := make(analysis.TokenStream, 0, len(input)) + + for _, token := range input { + runeCount := utf8.RuneCount(token.Term) + runes := bytes.Runes(token.Term) + for i := 0; i < runeCount; i++ { + // index of the starting rune for this token + for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ { + // build an ngram of this size starting at i + if i+ngramSize <= runeCount { + ngramTerm := analysis.BuildTermFromRunes(runes[i : i+ngramSize]) + token := analysis.Token{ + Position: token.Position, + Start: token.Start, + End: token.End, + Type: token.Type, + Term: ngramTerm, + } + rv = append(rv, &token) + } + } + } + } + + return rv +} + +func NgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + minVal, ok := config["min"] + if !ok { + return nil, fmt.Errorf("must specify min") + } + + min, err := convertToInt(minVal) + if err != nil { + return nil, err + } + + maxVal, ok := config["max"] + if !ok { + return nil, fmt.Errorf("must specify max") + } + + max, err := convertToInt(maxVal) + if err != nil { + return nil, err + } + + return NewNgramFilter(min, max), nil +} + +func init() { + registry.RegisterTokenFilter(Name, NgramFilterConstructor) +} + +// Expects either an int or a flaot64 value +func convertToInt(val interface{}) (int, error) { + var intVal int + var floatVal float64 + var ok bool + + intVal, ok = val.(int) + if ok { + return intVal, nil + } + + floatVal, ok = val.(float64) + if ok { + return int(floatVal), nil + } + + return 0, fmt.Errorf("failed to convert to int value") +} diff --git a/vendor/vendor.json b/vendor/vendor.json index b595de2f851..1b898de4702 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -242,6 +242,12 @@ "revision": "e1f5e6cdcd76e92b209576b5d9111ccab67bd2fb", "revisionTime": "2018-11-14T23:20:33Z" }, + { + "checksumSHA1": "sP0qvqWwZgUMRjK8GU9oobghGFM=", + "path": "github.com/blevesearch/bleve/analysis/token/ngram", + "revision": "d72521093e14a38cb92e22042aed18ca7a1e8fd4", + "revisionTime": "2019-01-16T19:08:17Z" + }, { "checksumSHA1": "QOw3ypU4VTmFT8XYS/52P3RILZw=", "path": "github.com/blevesearch/bleve/analysis/token/porter", From ffe2c6557fc3c058622108e363b9a910526f263b Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 18:06:12 -0700 Subject: [PATCH 18/50] added func comments --- worker/ngram.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/worker/ngram.go b/worker/ngram.go index 650dd219a7a..a4347c4bc9b 100644 --- a/worker/ngram.go +++ b/worker/ngram.go @@ -25,6 +25,9 @@ import ( fuzzstr "github.com/dgryski/go-fuzzstr" ) +// matchFuzzy takes in a value (from posting) and compares it our list of bigram tokens. +// All token values must match to be considered a fuzzy match. +// Returns true if value matches fuzzy tokens, false otherwise. 
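+// For example (illustrative): the search term "pway" matches a stored value
+// "pathway", since its characters appear in order within one of val's terms.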
func matchFuzzy(srcFn *functionContext, val string) bool { if val == "" { return false @@ -42,6 +45,9 @@ func matchFuzzy(srcFn *functionContext, val string) bool { return cnt > 0 } +// uidsForMatch collects a list of uids that "might" match a fuzzy term based on the bigram +// index. matchFuzzy does the actual fuzzy match. +// Returns the list of uids even if empty, or an error otherwise. func uidsForMatch(attr string, arg funcArgs) (*pb.List, error) { var results *pb.List From ad65a6cd43e86a5b90b59fde64c4c40b4c573d80 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 18:24:18 -0700 Subject: [PATCH 19/50] all fuzzy tokens must match --- worker/ngram.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/worker/ngram.go b/worker/ngram.go index a4347c4bc9b..e9cff39ac98 100644 --- a/worker/ngram.go +++ b/worker/ngram.go @@ -25,8 +25,8 @@ import ( fuzzstr "github.com/dgryski/go-fuzzstr" ) -// matchFuzzy takes in a value (from posting) and compares it our list of bigram tokens. -// All token values must match to be considered a fuzzy match. +// matchFuzzy takes in a value (from posting) and compares it to our list of bigram tokens. +// All search tokens must match to be considered a fuzzy match. // Returns true if value matches fuzzy tokens, false otherwise. func matchFuzzy(srcFn *functionContext, val string) bool { if val == "" { @@ -40,7 +40,11 @@ func matchFuzzy(srcFn *functionContext, val string) bool { cnt := 0 for i := range srcFn.tokens { p := idx.Query(srcFn.tokens[i]) - cnt += len(p) + l := len(p) + if l == 0 { + return false + } + cnt++ } return cnt > 0 } From b98c0d24186f19dc5545234a3a0c49afeeb14df5 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 23:14:46 -0700 Subject: [PATCH 20/50] cp: test cases might not be mutex in the future, revert the order. 
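
When a predicate is list-typed and also queried with a language tag, the
switch must prefer ValueForTag over AllUntaggedValues, so evaluate the lang
case first instead of relying on the cases staying mutually exclusive.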
--- worker/task.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/worker/task.go b/worker/task.go index dccbcfc8983..592d66cd56c 100644 --- a/worker/task.go +++ b/worker/task.go @@ -1152,12 +1152,12 @@ func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) err vals := make([]types.Val, 1) switch { - case isList: - vals, err = pl.AllUntaggedValues(arg.q.ReadTs) - case lang != "": vals[0], err = pl.ValueForTag(arg.q.ReadTs, lang) + case isList: + vals, err = pl.AllUntaggedValues(arg.q.ReadTs) + default: vals[0], err = pl.Value(arg.q.ReadTs) } From 6194d076c4de2fdea2c3b98eb3420fcde7133085 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 23:16:12 -0700 Subject: [PATCH 21/50] cp: dont try to match all posting values against match --- worker/task.go | 1 + 1 file changed, 1 insertion(+) diff --git a/worker/task.go b/worker/task.go index 592d66cd56c..6f8fe95ae5f 100644 --- a/worker/task.go +++ b/worker/task.go @@ -1175,6 +1175,7 @@ func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) err strVal, err := types.Convert(val, types.StringID) if err == nil && matchFuzzy(arg.srcFn, strVal.Value.(string)) { filtered.Uids = append(filtered.Uids, uid) + break } } } From 949525fac01f6b20d95b17049f5b8ebe86521d83 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 23:16:50 -0700 Subject: [PATCH 22/50] cp: added comment --- worker/task.go | 1 + 1 file changed, 1 insertion(+) diff --git a/worker/task.go b/worker/task.go index 6f8fe95ae5f..ad511851bf8 100644 --- a/worker/task.go +++ b/worker/task.go @@ -1175,6 +1175,7 @@ func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) err strVal, err := types.Convert(val, types.StringID) if err == nil && matchFuzzy(arg.srcFn, strVal.Value.(string)) { filtered.Uids = append(filtered.Uids, uid) + // NOTE: We only add the uid once. 
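+ // Otherwise a list-valued predicate with several matching
+ // values would append the same uid more than once.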
break } } From 77099912fe622590b77ad0ae967c19c179e18b47 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 21 Jan 2019 23:17:15 -0700 Subject: [PATCH 23/50] removed extra branch --- worker/task.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/worker/task.go b/worker/task.go index ad511851bf8..935f5cacdc4 100644 --- a/worker/task.go +++ b/worker/task.go @@ -1164,8 +1164,6 @@ func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) err if err != nil { if err == posting.ErrNoValue { continue - } else if err != nil { - return err } return err } From 6038ed329154a575668724dbc9f4b5cf755b00a9 Mon Sep 17 00:00:00 2001 From: srfrog Date: Tue, 22 Jan 2019 20:32:41 -0700 Subject: [PATCH 24/50] switch to trigram for fuzzy match indexing --- tok/bleve.go | 29 +---------------------------- tok/tok.go | 18 ------------------ tok/tokens.go | 2 +- worker/ngram.go | 6 ++++-- worker/task.go | 11 +++++++---- worker/tokens.go | 2 +- 6 files changed, 14 insertions(+), 54 deletions(-) diff --git a/tok/bleve.go b/tok/bleve.go index bc9037928dd..b9fe4895fb6 100644 --- a/tok/bleve.go +++ b/tok/bleve.go @@ -22,7 +22,6 @@ import ( "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis/analyzer/custom" "github.com/blevesearch/bleve/analysis/token/lowercase" - "github.com/blevesearch/bleve/analysis/token/ngram" "github.com/blevesearch/bleve/analysis/token/unicodenorm" "github.com/blevesearch/bleve/analysis/tokenizer/unicode" "github.com/blevesearch/bleve/registry" @@ -32,9 +31,7 @@ const unicodenormName = "unicodenorm_nfkc" var ( bleveCache = registry.NewCache() - termAnalyzer, - fulltextAnalyzer, - ngramAnalyzer *analysis.Analyzer + termAnalyzer, fulltextAnalyzer *analysis.Analyzer ) // setupBleve creates bleve filters and analyzers that we use for term and fulltext tokenizers. @@ -48,15 +45,6 @@ func setupBleve() { }) x.Check(err) - // bigram filter - breaks up terms into bigram, suitable for fuzzy lookups. - _, err = bleveCache.DefineTokenFilter(ngram.Name, - map[string]interface{}{ - "type": ngram.Name, - "min": int(2), - "max": int(2), - }) - x.Check(err) - // term analyzer - splits on word boundaries, lowercase and normalize tokens. termAnalyzer, err = bleveCache.DefineAnalyzer("term", map[string]interface{}{ @@ -81,21 +69,6 @@ func setupBleve() { }) x.Check(err) - // ngram analyzer - splits on word boundaries, lowercase, normalize tokens, split into bigrams - ngramAnalyzer, err = bleveCache.DefineAnalyzer("ngram_nfkc", - map[string]interface{}{ - "type": custom.Name, - "tokenizer": unicode.Name, - "token_filters": []string{ - lowercase.Name, - unicodenormName, - ngram.Name, - }, - }, - ) - x.Check(err) -} - // uniqueTerms takes a token stream and returns a string slice of unique terms. 
func uniqueTerms(tokens analysis.TokenStream) []string { var terms []string diff --git a/tok/tok.go b/tok/tok.go index fcc5cd83860..69bfb320daa 100644 --- a/tok/tok.go +++ b/tok/tok.go @@ -38,7 +38,6 @@ const ( IdentNone = 0x0 IdentTerm = 0x1 IdentExact = 0x2 - IdentNgram = 0x3 IdentYear = 0x4 IdentMonth = 0x41 IdentDay = 0x42 @@ -96,7 +95,6 @@ func init() { registerTokenizer(HashTokenizer{}) registerTokenizer(TermTokenizer{}) registerTokenizer(FullTextTokenizer{}) - registerTokenizer(NgramTokenizer{}) setupBleve() } @@ -307,22 +305,6 @@ func (t FullTextTokenizer) Identifier() byte { return IdentFullText } func (t FullTextTokenizer) IsSortable() bool { return false } func (t FullTextTokenizer) IsLossy() bool { return true } -type NgramTokenizer struct{} - -func (t NgramTokenizer) Name() string { return "ngram" } -func (t NgramTokenizer) Type() string { return "string" } -func (t NgramTokenizer) Tokens(v interface{}) ([]string, error) { - str, ok := v.(string) - if !ok || str == "" { - return []string{str}, nil - } - tokens := ngramAnalyzer.Analyze([]byte(str)) - return uniqueTerms(tokens), nil -} -func (t NgramTokenizer) Identifier() byte { return IdentNgram } -func (t NgramTokenizer) IsSortable() bool { return false } -func (t NgramTokenizer) IsLossy() bool { return true } - func encodeInt(val int64) string { buf := make([]byte, 9) binary.BigEndian.PutUint64(buf[1:], uint64(val)) diff --git a/tok/tokens.go b/tok/tokens.go index 26c831cc434..5238c58b229 100644 --- a/tok/tokens.go +++ b/tok/tokens.go @@ -51,5 +51,5 @@ func GetMatchTokens(funcArgs []string) ([]string, error) { if l := len(funcArgs); l != 1 { return nil, x.Errorf("Function requires 1 arguments, but got %d", l) } - return BuildTokens(funcArgs[0], NgramTokenizer{}) + return BuildTokens(funcArgs[0], TrigramTokenizer{}) } diff --git a/worker/ngram.go b/worker/ngram.go index e9cff39ac98..052574d53bb 100644 --- a/worker/ngram.go +++ b/worker/ngram.go @@ -23,9 +23,10 @@ import ( "github.com/dgraph-io/dgraph/tok" "github.com/dgraph-io/dgraph/x" fuzzstr "github.com/dgryski/go-fuzzstr" + "github.com/golang/glog" ) -// matchFuzzy takes in a value (from posting) and compares it to our list of bigram tokens. +// matchFuzzy takes in a value (from posting) and compares it to our list of ngram tokens. // All search tokens must match to be considered a fuzzy match. // Returns true if value matches fuzzy tokens, false otherwise. func matchFuzzy(srcFn *functionContext, val string) bool { @@ -49,7 +50,7 @@ func matchFuzzy(srcFn *functionContext, val string) bool { return cnt > 0 } -// uidsForMatch collects a list of uids that "might" match a fuzzy term based on the bigram +// uidsForMatch collects a list of uids that "might" match a fuzzy term based on the ngram // index. matchFuzzy does the actual fuzzy match. // Returns the list of uids even if empty, or an error otherwise. func uidsForMatch(attr string, arg funcArgs) (*pb.List, error) { @@ -70,6 +71,7 @@ func uidsForMatch(attr string, arg funcArgs) (*pb.List, error) { if err != nil { return nil, err } + glog.Infof("uidsForMatch: tokens: %v", tokens) uidMatrix := make([]*pb.List, len(tokens)) for i, t := range tokens { diff --git a/worker/task.go b/worker/task.go index 935f5cacdc4..080bf6ff9c9 100644 --- a/worker/task.go +++ b/worker/task.go @@ -1119,18 +1119,21 @@ func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) err case typ != types.StringID: return x.Errorf("Got non-string type. 
Fuzzy match is allowed only on string type.") - case !schema.State().HasTokenizer(tok.IdentNgram, attr): - return x.Errorf("Attribute %v does not have ngram index for fuzzy matching.", attr) - case arg.q.UidList != nil && len(arg.q.UidList.Uids) != 0: uids = arg.q.UidList - default: + case schema.State().HasTokenizer(tok.IdentTrigram, attr): var err error uids, err = uidsForMatch(attr, arg) if err != nil { return err } + + default: + return x.Errorf( + "Attribute %v does not have trigram index for fuzzy matching. "+ + "Please add a trigram index or use has/uid function with match() as filter.", + attr) } isList := schema.State().IsList(attr) diff --git a/worker/tokens.go b/worker/tokens.go index e6a0e016d8f..9b2da4dde89 100644 --- a/worker/tokens.go +++ b/worker/tokens.go @@ -33,7 +33,7 @@ func verifyStringIndex(attr string, funcType FuncType) (string, bool) { case FullTextSearchFn: requiredTokenizer = tok.FullTextTokenizer{} case MatchFn: - requiredTokenizer = tok.NgramTokenizer{} + requiredTokenizer = tok.TrigramTokenizer{} default: requiredTokenizer = tok.TermTokenizer{} } From 31cd1ff79d9eca12bdd0e13cbfc057eaf6803438 Mon Sep 17 00:00:00 2001 From: srfrog Date: Tue, 22 Jan 2019 20:33:49 -0700 Subject: [PATCH 25/50] fixed typo --- tok/bleve.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tok/bleve.go b/tok/bleve.go index b9fe4895fb6..dbea9e0ec82 100644 --- a/tok/bleve.go +++ b/tok/bleve.go @@ -30,7 +30,7 @@ import ( const unicodenormName = "unicodenorm_nfkc" var ( - bleveCache = registry.NewCache() + bleveCache = registry.NewCache() termAnalyzer, fulltextAnalyzer *analysis.Analyzer ) @@ -68,6 +68,7 @@ func setupBleve() { }, }) x.Check(err) +} // uniqueTerms takes a token stream and returns a string slice of unique terms. func uniqueTerms(tokens analysis.TokenStream) []string { From 1bac79f5c767881bf1b3a2033e06b25f95160cac Mon Sep 17 00:00:00 2001 From: srfrog Date: Tue, 22 Jan 2019 20:35:25 -0700 Subject: [PATCH 26/50] renamed ngram to match --- worker/{ngram.go => match.go} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename worker/{ngram.go => match.go} (100%) diff --git a/worker/ngram.go b/worker/match.go similarity index 100% rename from worker/ngram.go rename to worker/match.go From 4ad60ad2e3c77c559367bb84e98c7e89b423fccf Mon Sep 17 00:00:00 2001 From: srfrog Date: Tue, 22 Jan 2019 20:36:05 -0700 Subject: [PATCH 27/50] fixed typo --- worker/match.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/match.go b/worker/match.go index 052574d53bb..1781db0a3c3 100644 --- a/worker/match.go +++ b/worker/match.go @@ -1,5 +1,5 @@ /* - * Copyright 2018 Dgraph Labs, Inc. and Contributors + * Copyright 2019 Dgraph Labs, Inc. and Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 2d914d5c1e2845079fe42052cea29e8441f5bee7 Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 23 Jan 2019 01:43:04 -0700 Subject: [PATCH 28/50] added full posting value to search terms, minor cleanups --- worker/match.go | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/worker/match.go b/worker/match.go index 1781db0a3c3..c84aa729efa 100644 --- a/worker/match.go +++ b/worker/match.go @@ -17,47 +17,45 @@ package worker import ( + "strings" + "github.com/dgraph-io/dgraph/algo" "github.com/dgraph-io/dgraph/posting" "github.com/dgraph-io/dgraph/protos/pb" "github.com/dgraph-io/dgraph/tok" "github.com/dgraph-io/dgraph/x" fuzzstr "github.com/dgryski/go-fuzzstr" - "github.com/golang/glog" ) // matchFuzzy takes in a value (from posting) and compares it to our list of ngram tokens. -// All search tokens must match to be considered a fuzzy match. // Returns true if value matches fuzzy tokens, false otherwise. func matchFuzzy(srcFn *functionContext, val string) bool { if val == "" { return false } + terms, err := tok.GetTermTokens([]string{val}) if err != nil { return false } + + // match the entire string. + terms = append(terms, strings.ToLower(val)) + idx := fuzzstr.NewIndex(terms) - cnt := 0 for i := range srcFn.tokens { - p := idx.Query(srcFn.tokens[i]) - l := len(p) - if l == 0 { - return false + if len(idx.Query(srcFn.tokens[i])) != 0 { + return true } - cnt++ } - return cnt > 0 + return false } // uidsForMatch collects a list of uids that "might" match a fuzzy term based on the ngram // index. matchFuzzy does the actual fuzzy match. // Returns the list of uids even if empty, or an error otherwise. func uidsForMatch(attr string, arg funcArgs) (*pb.List, error) { - var results *pb.List - opts := posting.ListOptions{ReadTs: arg.q.ReadTs} - uidsForNgram := func(ngram string) (*pb.List, error) { key := x.IndexKey(attr, ngram) pl, err := posting.GetNoStore(key) @@ -71,16 +69,13 @@ func uidsForMatch(attr string, arg funcArgs) (*pb.List, error) { if err != nil { return nil, err } - glog.Infof("uidsForMatch: tokens: %v", tokens) - uidMatrix := make([]*pb.List, len(tokens)) + uidMatrix := make([]*pb.List, len(tokens)) for i, t := range tokens { uidMatrix[i], err = uidsForNgram(t) if err != nil { return nil, err } } - results = algo.MergeSorted(uidMatrix) - - return results, nil + return algo.MergeSorted(uidMatrix), nil } From b0128700e38fb5d7cae6d135f1e8561fcfad2084 Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 23 Jan 2019 01:44:29 -0700 Subject: [PATCH 29/50] added fuzzy matching test --- systest/queries_test.go | 109 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/systest/queries_test.go b/systest/queries_test.go index 0ab7dbef319..580f2b15405 100644 --- a/systest/queries_test.go +++ b/systest/queries_test.go @@ -50,6 +50,7 @@ func TestQuery(t *testing.T) { t.Run("multiple block eval", wrap(MultipleBlockEval)) t.Run("unmatched var assignment eval", wrap(UnmatchedVarEval)) t.Run("hash index queries", wrap(QueryHashIndex)) + t.Run("fuzzy matching", wrap(FuzzyMatch)) t.Run("cleanup", wrap(SchemaQueryCleanup)) } @@ -534,6 +535,114 @@ func SchemaQueryTestHTTP(t *testing.T, c *dgo.Dgraph) { CompareJSON(t, js, string(m["data"])) } +func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { + ctx := context.Background() + + require.NoError(t, c.Alter(ctx, &api.Operation{ + Schema: ` + term: string @index(trigram) . + name: string . 
+ `, + })) + + txn := c.NewTxn() + _, err := txn.Mutate(ctx, &api.Mutation{ + SetNquads: []byte(` + _:t0 "" . + _:t1 "road" . + _:t2 "avenue" . + _:t3 "street" . + _:t4 "boulevard" . + _:t5 "drive" . + _:t6 "route" . + _:t7 "pass" . + _:t8 "pathway" . + _:t9 "lane" . + _:ta "highway" . + _:tb "parkway" . + _:tc "motorway" . + _:td "high road" . + _:te "side street" . + _:tf "dual carriageway" . + _:n0 "srfrog" . + `), + }) + require.NoError(t, err) + require.NoError(t, txn.Commit(ctx)) + + tests := []struct { + in, out, failure string + }{ + { + in: `{q(func:match(term, "plane")) {term}}`, + out: `{"q":[]}`, + }, + { + in: `{q(func:match(term, drive)) {term}}`, + out: `{"q":[{"term":"drive"}]}`, + }, + { + in: `{q(func:match(term, way)) {term}}`, + out: `{"q":[ + {"term": "highway"}, + {"term": "pathway"}, + {"term": "parkway"}, + {"term": "dual carriageway"}, + {"term": "motorway"} + ]}`, + }, + { + in: `{q(func:match(term, pway)) {term}}`, + out: `{"q":[ + {"term": "pathway"}, + {"term": "parkway"} + ]}`, + }, + { + in: `{q(func:match(term, high)) {term}}`, + out: `{"q":[ + {"term": "highway"}, + {"term": "high road"} + ]}`, + }, + { + in: `{q(func:match(term, str)) {term}}`, + out: `{"q":[ + {"term": "street"}, + {"term": "side street"} + ]}`, + }, + { + in: `{q(func:match(term, "carr")) {term}}`, + out: `{"q":[ + {"term": "dual carriageway"} + ]}`, + }, + { + in: `{q(func:match(term, "dualway")) {term}}`, + out: `{"q":[]}`, + }, + { + in: `{q(func:match(term, "")) {term}}`, + failure: `Empty argument received`, + }, + { + in: `{q(func:match(name, "someone")) {name}}`, + failure: `Attribute name is not indexed with type trigram`, + }, + } + for _, tc := range tests { + resp, err := c.NewTxn().Query(ctx, tc.in) + if tc.failure != "" { + require.Error(t, err) + require.Contains(t, err.Error(), tc.failure) + continue + } + require.NoError(t, err) + CompareJSON(t, tc.out, string(resp.Json)) + } +} + func QueryHashIndex(t *testing.T, c *dgo.Dgraph) { ctx := context.Background() From 7a3a36344530119033c0e8c20cac9201471f9053 Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 23 Jan 2019 12:16:09 -0700 Subject: [PATCH 30/50] remove Bleve ngram pkg --- .../bleve/analysis/token/ngram/ngram.go | 113 ------------------ vendor/vendor.json | 6 - 2 files changed, 119 deletions(-) delete mode 100644 vendor/github.com/blevesearch/bleve/analysis/token/ngram/ngram.go diff --git a/vendor/github.com/blevesearch/bleve/analysis/token/ngram/ngram.go b/vendor/github.com/blevesearch/bleve/analysis/token/ngram/ngram.go deleted file mode 100644 index abab0aefc0a..00000000000 --- a/vendor/github.com/blevesearch/bleve/analysis/token/ngram/ngram.go +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (c) 2014 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package ngram - -import ( - "bytes" - "fmt" - "unicode/utf8" - - "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/registry" -) - -const Name = "ngram" - -type NgramFilter struct { - minLength int - maxLength int -} - -func NewNgramFilter(minLength, maxLength int) *NgramFilter { - return &NgramFilter{ - minLength: minLength, - maxLength: maxLength, - } -} - -func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream { - rv := make(analysis.TokenStream, 0, len(input)) - - for _, token := range input { - runeCount := utf8.RuneCount(token.Term) - runes := bytes.Runes(token.Term) - for i := 0; i < runeCount; i++ { - // index of the starting rune for this token - for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ { - // build an ngram of this size starting at i - if i+ngramSize <= runeCount { - ngramTerm := analysis.BuildTermFromRunes(runes[i : i+ngramSize]) - token := analysis.Token{ - Position: token.Position, - Start: token.Start, - End: token.End, - Type: token.Type, - Term: ngramTerm, - } - rv = append(rv, &token) - } - } - } - } - - return rv -} - -func NgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { - minVal, ok := config["min"] - if !ok { - return nil, fmt.Errorf("must specify min") - } - - min, err := convertToInt(minVal) - if err != nil { - return nil, err - } - - maxVal, ok := config["max"] - if !ok { - return nil, fmt.Errorf("must specify max") - } - - max, err := convertToInt(maxVal) - if err != nil { - return nil, err - } - - return NewNgramFilter(min, max), nil -} - -func init() { - registry.RegisterTokenFilter(Name, NgramFilterConstructor) -} - -// Expects either an int or a flaot64 value -func convertToInt(val interface{}) (int, error) { - var intVal int - var floatVal float64 - var ok bool - - intVal, ok = val.(int) - if ok { - return intVal, nil - } - - floatVal, ok = val.(float64) - if ok { - return int(floatVal), nil - } - - return 0, fmt.Errorf("failed to convert to int value") -} diff --git a/vendor/vendor.json b/vendor/vendor.json index 1b898de4702..b595de2f851 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -242,12 +242,6 @@ "revision": "e1f5e6cdcd76e92b209576b5d9111ccab67bd2fb", "revisionTime": "2018-11-14T23:20:33Z" }, - { - "checksumSHA1": "sP0qvqWwZgUMRjK8GU9oobghGFM=", - "path": "github.com/blevesearch/bleve/analysis/token/ngram", - "revision": "d72521093e14a38cb92e22042aed18ca7a1e8fd4", - "revisionTime": "2019-01-16T19:08:17Z" - }, { "checksumSHA1": "QOw3ypU4VTmFT8XYS/52P3RILZw=", "path": "github.com/blevesearch/bleve/analysis/token/porter", From fcc05faf08f23b592e280ab95bf6b264c1a8bd98 Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 23 Jan 2019 14:57:21 -0700 Subject: [PATCH 31/50] revert this change --- worker/tokens.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/worker/tokens.go b/worker/tokens.go index 9b2da4dde89..03e08df30c0 100644 --- a/worker/tokens.go +++ b/worker/tokens.go @@ -66,6 +66,9 @@ func verifyCustomIndex(attr string, tokenizerName string) bool { // Return string tokens from function arguments. It maps function type to correct tokenizer. // Note: regexp functions require regexp compilation of argument, not tokenization. func getStringTokens(funcArgs []string, lang string, funcType FuncType) ([]string, error) { + if lang == "." 
{ + lang = "en" + } switch funcType { case FullTextSearchFn: return tok.GetFullTextTokens(funcArgs, lang) From a1a4d196df39700af1a3f71e09e16574b88b738f Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 23 Jan 2019 14:58:12 -0700 Subject: [PATCH 32/50] fixed needsIntersect --- worker/task.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/worker/task.go b/worker/task.go index 080bf6ff9c9..9734a8a4ae7 100644 --- a/worker/task.go +++ b/worker/task.go @@ -279,9 +279,12 @@ func needsIndex(fnType FuncType) bool { return false } +// needsIntersect checks if the function type needs algo.IntersectSorted() after the results +// are collected. This is needed for functions that require all values to match, like +// "allofterms", "alloftext", and custom functions with "allof". +// Returns true if function results need intersect, false otherwise. func needsIntersect(fnName string) bool { - fnName = strings.ToLower(fnName) - return strings.HasPrefix(fnName, "allof") || strings.HasPrefix(fnName, "anyof") + return strings.HasPrefix(fnName, "allof") || strings.HasSuffix(fnName, "allof") } type funcArgs struct { @@ -1481,7 +1484,7 @@ func parseSrcFn(q *pb.Query) (*functionContext, error) { if fc.tokens, err = getStringTokens(q.SrcFunc.Args, langForFunc(q.Langs), fnType); err != nil { return nil, err } - fc.intersectDest = needsIntersect(q.SrcFunc.Name) + fc.intersectDest = needsIntersect(f) fc.n = len(fc.tokens) case CustomIndexFn: if err = ensureArgsCount(q.SrcFunc, 2); err != nil { @@ -1502,8 +1505,7 @@ func parseSrcFn(q *pb.Query) (*functionContext, error) { } fc.tokens, _ = tok.BuildTokens(valToTok.Value, tok.GetLangTokenizer(tokenizer, langForFunc(q.Langs))) - fc.intersectDest = needsIntersect(q.SrcFunc.Name) - x.AssertTrue(fc.intersectDest) + fc.intersectDest = needsIntersect(f) fc.n = len(fc.tokens) case RegexFn: if err = ensureArgsCount(q.SrcFunc, 2); err != nil { From 35ebe6972759cb1e7f9835aaf0f3dfefa24ec82d Mon Sep 17 00:00:00 2001 From: srfrog Date: Fri, 25 Jan 2019 20:35:09 -0700 Subject: [PATCH 33/50] switched to a new fuzzy matching pkg --- .../github.com/lithammer/fuzzysearch/LICENSE | 21 +++ .../lithammer/fuzzysearch/fuzzy/fuzzy.go | 173 ++++++++++++++++++ .../fuzzysearch/fuzzy/levenshtein.go | 43 +++++ vendor/vendor.json | 6 + worker/match.go | 11 +- 5 files changed, 246 insertions(+), 8 deletions(-) create mode 100644 vendor/github.com/lithammer/fuzzysearch/LICENSE create mode 100644 vendor/github.com/lithammer/fuzzysearch/fuzzy/fuzzy.go create mode 100644 vendor/github.com/lithammer/fuzzysearch/fuzzy/levenshtein.go diff --git a/vendor/github.com/lithammer/fuzzysearch/LICENSE b/vendor/github.com/lithammer/fuzzysearch/LICENSE new file mode 100644 index 00000000000..dee3d1de25e --- /dev/null +++ b/vendor/github.com/lithammer/fuzzysearch/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2018 Peter Lithammer + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/lithammer/fuzzysearch/fuzzy/fuzzy.go b/vendor/github.com/lithammer/fuzzysearch/fuzzy/fuzzy.go new file mode 100644 index 00000000000..33d48987935 --- /dev/null +++ b/vendor/github.com/lithammer/fuzzysearch/fuzzy/fuzzy.go @@ -0,0 +1,173 @@ +// Fuzzy searching allows for flexibly matching a string with partial input, +// useful for filtering data very quickly based on lightweight user input. +package fuzzy + +import ( + "unicode" + "unicode/utf8" +) + +var noop = func(r rune) rune { return r } + +// Match returns true if source matches target using a fuzzy-searching +// algorithm. Note that it doesn't implement Levenshtein distance (see +// RankMatch instead), but rather a simplified version where there's no +// approximation. The method will return true only if each character in the +// source can be found in the target and occurs after the preceding matches. +func Match(source, target string) bool { + return match(source, target, noop) +} + +// MatchFold is a case-insensitive version of Match. +func MatchFold(source, target string) bool { + return match(source, target, unicode.ToLower) +} + +func match(source, target string, fn func(rune) rune) bool { + lenDiff := len(target) - len(source) + + if lenDiff < 0 { + return false + } + + if lenDiff == 0 && source == target { + return true + } + +Outer: + for _, r1 := range source { + for i, r2 := range target { + if fn(r1) == fn(r2) { + target = target[i+utf8.RuneLen(r2):] + continue Outer + } + } + return false + } + + return true +} + +// Find will return a list of strings in targets that fuzzy matches source. +func Find(source string, targets []string) []string { + return find(source, targets, noop) +} + +// FindFold is a case-insensitive version of Find. +func FindFold(source string, targets []string) []string { + return find(source, targets, unicode.ToLower) +} + +func find(source string, targets []string, fn func(rune) rune) []string { + var matches []string + + for _, target := range targets { + if match(source, target, fn) { + matches = append(matches, target) + } + } + + return matches +} + +// RankMatch is similar to Match except it will measure the Levenshtein +// distance between the source and the target and return its result. If there +// was no match, it will return -1. +// Given the requirements of match, RankMatch only needs to perform a subset of +// the Levenshtein calculation, only deletions need be considered, required +// additions and substitutions would fail the match test. +func RankMatch(source, target string) int { + return rank(source, target, noop) +} + +// RankMatchFold is a case-insensitive version of RankMatch. 
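+// For example (illustrative): RankMatchFold("soda", "Soda") == 0, while a
+// non-match such as RankMatchFold("beer", "Soda") returns -1.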
+func RankMatchFold(source, target string) int { + return rank(source, target, unicode.ToLower) +} + +func rank(source, target string, fn func(rune) rune) int { + lenDiff := len(target) - len(source) + + if lenDiff < 0 { + return -1 + } + + if lenDiff == 0 && source == target { + return 0 + } + + runeDiff := 0 + +Outer: + for _, r1 := range source { + for i, r2 := range target { + if fn(r1) == fn(r2) { + target = target[i+utf8.RuneLen(r2):] + continue Outer + } else { + runeDiff++ + } + } + return -1 + } + + // Count up remaining char + runeDiff += utf8.RuneCountInString(target) + + return runeDiff +} + +// RankFind is similar to Find, except it will also rank all matches using +// Levenshtein distance. +func RankFind(source string, targets []string) Ranks { + var r Ranks + + for index, target := range targets { + if match(source, target, noop) { + distance := LevenshteinDistance(source, target) + r = append(r, Rank{source, target, distance, index}) + } + } + return r +} + +// RankFindFold is a case-insensitive version of RankFind. +func RankFindFold(source string, targets []string) Ranks { + var r Ranks + + for index, target := range targets { + if match(source, target, unicode.ToLower) { + distance := LevenshteinDistance(source, target) + r = append(r, Rank{source, target, distance, index}) + } + } + return r +} + +type Rank struct { + // Source is used as the source for matching. + Source string + + // Target is the word matched against. + Target string + + // Distance is the Levenshtein distance between Source and Target. + Distance int + + // Location of Target in original list + OriginalIndex int +} + +type Ranks []Rank + +func (r Ranks) Len() int { + return len(r) +} + +func (r Ranks) Swap(i, j int) { + r[i], r[j] = r[j], r[i] +} + +func (r Ranks) Less(i, j int) bool { + return r[i].Distance < r[j].Distance +} diff --git a/vendor/github.com/lithammer/fuzzysearch/fuzzy/levenshtein.go b/vendor/github.com/lithammer/fuzzysearch/fuzzy/levenshtein.go new file mode 100644 index 00000000000..237923d3450 --- /dev/null +++ b/vendor/github.com/lithammer/fuzzysearch/fuzzy/levenshtein.go @@ -0,0 +1,43 @@ +package fuzzy + +// LevenshteinDistance measures the difference between two strings. +// The Levenshtein distance between two words is the minimum number of +// single-character edits (i.e. insertions, deletions or substitutions) +// required to change one word into the other. 
+// +// This implemention is optimized to use O(min(m,n)) space and is based on the +// optimized C version found here: +// http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#C +func LevenshteinDistance(s, t string) int { + r1, r2 := []rune(s), []rune(t) + column := make([]int, len(r1)+1) + + for y := 1; y <= len(r1); y++ { + column[y] = y + } + + for x := 1; x <= len(r2); x++ { + column[0] = x + + for y, lastDiag := 1, x-1; y <= len(r1); y++ { + oldDiag := column[y] + cost := 0 + if r1[y-1] != r2[x-1] { + cost = 1 + } + column[y] = min(column[y]+1, column[y-1]+1, lastDiag+cost) + lastDiag = oldDiag + } + } + + return column[len(r1)] +} + +func min(a, b, c int) int { + if a < b && a < c { + return a + } else if b < c { + return b + } + return c +} diff --git a/vendor/vendor.json b/vendor/vendor.json index b595de2f851..fe05bc8f829 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -650,6 +650,12 @@ "revision": "23c074d0eceb2b8a5bfdbb271ab780cde70f05a8", "revisionTime": "2017-10-17T18:19:29Z" }, + { + "checksumSHA1": "9fTIdD63nJT3Y4QvHtw9dCBhzzE=", + "path": "github.com/lithammer/fuzzysearch/fuzzy", + "revision": "9704abcb5729ef17e4316e31439d7fcece8dac5f", + "revisionTime": "2019-01-06T22:13:24Z" + }, { "checksumSHA1": "8ae1DyNE/yY9NvY3PmvtQdLBJnc=", "path": "github.com/magiconair/properties", diff --git a/worker/match.go b/worker/match.go index c84aa729efa..5eac5565777 100644 --- a/worker/match.go +++ b/worker/match.go @@ -17,14 +17,12 @@ package worker import ( - "strings" - "github.com/dgraph-io/dgraph/algo" "github.com/dgraph-io/dgraph/posting" "github.com/dgraph-io/dgraph/protos/pb" "github.com/dgraph-io/dgraph/tok" "github.com/dgraph-io/dgraph/x" - fuzzstr "github.com/dgryski/go-fuzzstr" + "github.com/lithammer/fuzzysearch/fuzzy" ) // matchFuzzy takes in a value (from posting) and compares it to our list of ngram tokens. @@ -39,12 +37,9 @@ func matchFuzzy(srcFn *functionContext, val string) bool { return false } - // match the entire string. - terms = append(terms, strings.ToLower(val)) - - idx := fuzzstr.NewIndex(terms) for i := range srcFn.tokens { - if len(idx.Query(srcFn.tokens[i])) != 0 { + r := fuzzy.RankFind(srcFn.tokens[i], terms) + if len(r) != 0 { return true } } From 2151a99bf0b5ca4e43c921a718e133cfac03165f Mon Sep 17 00:00:00 2001 From: srfrog Date: Fri, 25 Jan 2019 20:35:35 -0700 Subject: [PATCH 34/50] tweaked test with misspelling --- systest/queries_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systest/queries_test.go b/systest/queries_test.go index 7d7ce4007ab..16ea9cb1e13 100644 --- a/systest/queries_test.go +++ b/systest/queries_test.go @@ -600,7 +600,7 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { ]}`, }, { - in: `{q(func:match(term, "carr")) {term}}`, + in: `{q(func:match(term, "carigeway")) {term}}`, out: `{"q":[ {"term": "dual carriageway"} ]}`, From 38ca29b83cbf6172ecc3658d52e39e4f2961ae31 Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 30 Jan 2019 16:50:19 -0700 Subject: [PATCH 35/50] added func GetTokenizerByID to search registered tokenizers by id --- tok/tok.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tok/tok.go b/tok/tok.go index 69bfb320daa..62e62c004ad 100644 --- a/tok/tok.go +++ b/tok/tok.go @@ -130,6 +130,17 @@ func LoadCustomTokenizer(soFile string) { registerTokenizer(CustomTokenizer{PluginTokenizer: tokenizer}) } +// GetTokenizerByID tries to find a tokenizer by id in the registered list. +// Returns the tokenizer and true if found, otherwise nil and false. 
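+//
+// Illustrative use from a caller's point of view (editor's sketch, not
+// part of this patch; BuildTokens and IdentFullText are existing names
+// in this package):
+//
+//	if t, ok := tok.GetTokenizerByID(tok.IdentFullText); ok {
+//		tokens, err := tok.BuildTokens("dual carriageway", t)
+//		...
+//	}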
+func GetTokenizerByID(id byte) (Tokenizer, bool) { + for _, t := range tokenizers { + if id == t.Identifier() { + return t, true + } + } + return nil, false +} + // GetTokenizer returns tokenizer given unique name. func GetTokenizer(name string) (Tokenizer, bool) { t, found := tokenizers[name] From fc2e993d77fbcde51b0200014f1430110c4d3586 Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 30 Jan 2019 16:51:01 -0700 Subject: [PATCH 36/50] added tok.GetTokens for generating tokens by id --- tok/tokens.go | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tok/tokens.go b/tok/tokens.go index 5238c58b229..9a16c731a7d 100644 --- a/tok/tokens.go +++ b/tok/tokens.go @@ -33,23 +33,24 @@ func GetLangTokenizer(t Tokenizer, lang string) Tokenizer { return t } -func GetTermTokens(funcArgs []string) ([]string, error) { +func GetTokens(id byte, funcArgs ...string) ([]string, error) { if l := len(funcArgs); l != 1 { return nil, x.Errorf("Function requires 1 arguments, but got %d", l) } - return BuildTokens(funcArgs[0], TermTokenizer{}) + tokenizer, ok := GetTokenizerByID(id) + if !ok { + return nil, x.Errorf("No tokenizer was found with id %v", id) + } + return BuildTokens(funcArgs[0], tokenizer) } -func GetFullTextTokens(funcArgs []string, lang string) ([]string, error) { - if l := len(funcArgs); l != 1 { - return nil, x.Errorf("Function requires 1 arguments, but got %d", l) - } - return BuildTokens(funcArgs[0], FullTextTokenizer{lang: lang}) +func GetTermTokens(funcArgs []string) ([]string, error) { + return GetTokens(IdentTerm, funcArgs...) } -func GetMatchTokens(funcArgs []string) ([]string, error) { +func GetFullTextTokens(funcArgs []string, lang string) ([]string, error) { if l := len(funcArgs); l != 1 { return nil, x.Errorf("Function requires 1 arguments, but got %d", l) } - return BuildTokens(funcArgs[0], TrigramTokenizer{}) + return BuildTokens(funcArgs[0], FullTextTokenizer{lang: lang}) } From 0cee4975a127dfbc75b02f04c785295802e32107 Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 30 Jan 2019 16:51:54 -0700 Subject: [PATCH 37/50] changed to use tok.GetTokens --- worker/match.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/worker/match.go b/worker/match.go index 5eac5565777..74bf1892d59 100644 --- a/worker/match.go +++ b/worker/match.go @@ -32,7 +32,7 @@ func matchFuzzy(srcFn *functionContext, val string) bool { return false } - terms, err := tok.GetTermTokens([]string{val}) + terms, err := tok.GetTokens(tok.IdentTerm, val) if err != nil { return false } @@ -60,7 +60,7 @@ func uidsForMatch(attr string, arg funcArgs) (*pb.List, error) { return pl.Uids(opts) } - tokens, err := tok.GetMatchTokens(arg.srcFn.tokens) + tokens, err := tok.GetTokens(tok.IdentTrigram, arg.srcFn.tokens...) if err != nil { return nil, err } From cf6a931c42fc75fd39f8d45e04eec302c7ea8773 Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 30 Jan 2019 16:52:16 -0700 Subject: [PATCH 38/50] fixed grammar in comment --- worker/task.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker/task.go b/worker/task.go index a0a2b166934..674a0a281bd 100644 --- a/worker/task.go +++ b/worker/task.go @@ -305,7 +305,7 @@ func (srcFn *functionContext) needsValuePostings(typ types.TypeID) (bool, error) } return true, nil case GeoFn, RegexFn, FullTextSearchFn, StandardFn, HasFn, CustomIndexFn, MatchFn: - // All of these require index, hence would require fetching uid postings. + // All of these require an index, hence would require fetching uid postings. 
return false, nil case UidInFn, CompareScalarFn: // Operate on uid postings From 8c39950971aa7bbb378f478a97631a1a0062ac6a Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 30 Jan 2019 16:52:41 -0700 Subject: [PATCH 39/50] replaced underused switch with if block --- worker/tokens.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/worker/tokens.go b/worker/tokens.go index 03e08df30c0..4c7852a8759 100644 --- a/worker/tokens.go +++ b/worker/tokens.go @@ -69,8 +69,7 @@ func getStringTokens(funcArgs []string, lang string, funcType FuncType) ([]strin if lang == "." { lang = "en" } - switch funcType { - case FullTextSearchFn: + if funcType == FullTextSearchFn { return tok.GetFullTextTokens(funcArgs, lang) } return tok.GetTermTokens(funcArgs) From 4a889377ab7225beb0bd4c0fdf348cedf80635cb Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 30 Jan 2019 16:53:25 -0700 Subject: [PATCH 40/50] small test change --- systest/queries_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/systest/queries_test.go b/systest/queries_test.go index 049a57cebe6..3f48c5e4937 100644 --- a/systest/queries_test.go +++ b/systest/queries_test.go @@ -577,14 +577,14 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { tests := []struct { in, out, failure string }{ - { - in: `{q(func:match(term, "plane")) {term}}`, - out: `{"q":[]}`, - }, { in: `{q(func:match(term, drive)) {term}}`, out: `{"q":[{"term":"drive"}]}`, }, + { + in: `{q(func:match(term, "plane")) {term}}`, + out: `{"q":[]}`, + }, { in: `{q(func:match(term, way)) {term}}`, out: `{"q":[ From 0c7293344cc1954cd5879a619182442369ca6224 Mon Sep 17 00:00:00 2001 From: Manish R Jain Date: Tue, 5 Feb 2019 15:54:48 -0800 Subject: [PATCH 41/50] Pick up and modify Levenshtein distance to introduce a max distance factor, which would cause early termination of the algo to save CPU resources. Remove the fuzzysearch lib. --- .../github.com/lithammer/fuzzysearch/LICENSE | 21 --- .../lithammer/fuzzysearch/fuzzy/fuzzy.go | 173 ------------------ .../fuzzysearch/fuzzy/levenshtein.go | 43 ----- worker/match.go | 69 +++++-- worker/match_test.go | 18 ++ worker/task.go | 11 +- 6 files changed, 81 insertions(+), 254 deletions(-) delete mode 100644 vendor/github.com/lithammer/fuzzysearch/LICENSE delete mode 100644 vendor/github.com/lithammer/fuzzysearch/fuzzy/fuzzy.go delete mode 100644 vendor/github.com/lithammer/fuzzysearch/fuzzy/levenshtein.go create mode 100644 worker/match_test.go diff --git a/vendor/github.com/lithammer/fuzzysearch/LICENSE b/vendor/github.com/lithammer/fuzzysearch/LICENSE deleted file mode 100644 index dee3d1de25e..00000000000 --- a/vendor/github.com/lithammer/fuzzysearch/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2018 Peter Lithammer - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/vendor/github.com/lithammer/fuzzysearch/fuzzy/fuzzy.go b/vendor/github.com/lithammer/fuzzysearch/fuzzy/fuzzy.go deleted file mode 100644 index 33d48987935..00000000000 --- a/vendor/github.com/lithammer/fuzzysearch/fuzzy/fuzzy.go +++ /dev/null @@ -1,173 +0,0 @@ -// Fuzzy searching allows for flexibly matching a string with partial input, -// useful for filtering data very quickly based on lightweight user input. -package fuzzy - -import ( - "unicode" - "unicode/utf8" -) - -var noop = func(r rune) rune { return r } - -// Match returns true if source matches target using a fuzzy-searching -// algorithm. Note that it doesn't implement Levenshtein distance (see -// RankMatch instead), but rather a simplified version where there's no -// approximation. The method will return true only if each character in the -// source can be found in the target and occurs after the preceding matches. -func Match(source, target string) bool { - return match(source, target, noop) -} - -// MatchFold is a case-insensitive version of Match. -func MatchFold(source, target string) bool { - return match(source, target, unicode.ToLower) -} - -func match(source, target string, fn func(rune) rune) bool { - lenDiff := len(target) - len(source) - - if lenDiff < 0 { - return false - } - - if lenDiff == 0 && source == target { - return true - } - -Outer: - for _, r1 := range source { - for i, r2 := range target { - if fn(r1) == fn(r2) { - target = target[i+utf8.RuneLen(r2):] - continue Outer - } - } - return false - } - - return true -} - -// Find will return a list of strings in targets that fuzzy matches source. -func Find(source string, targets []string) []string { - return find(source, targets, noop) -} - -// FindFold is a case-insensitive version of Find. -func FindFold(source string, targets []string) []string { - return find(source, targets, unicode.ToLower) -} - -func find(source string, targets []string, fn func(rune) rune) []string { - var matches []string - - for _, target := range targets { - if match(source, target, fn) { - matches = append(matches, target) - } - } - - return matches -} - -// RankMatch is similar to Match except it will measure the Levenshtein -// distance between the source and the target and return its result. If there -// was no match, it will return -1. -// Given the requirements of match, RankMatch only needs to perform a subset of -// the Levenshtein calculation, only deletions need be considered, required -// additions and substitutions would fail the match test. -func RankMatch(source, target string) int { - return rank(source, target, noop) -} - -// RankMatchFold is a case-insensitive version of RankMatch. 
-func RankMatchFold(source, target string) int { - return rank(source, target, unicode.ToLower) -} - -func rank(source, target string, fn func(rune) rune) int { - lenDiff := len(target) - len(source) - - if lenDiff < 0 { - return -1 - } - - if lenDiff == 0 && source == target { - return 0 - } - - runeDiff := 0 - -Outer: - for _, r1 := range source { - for i, r2 := range target { - if fn(r1) == fn(r2) { - target = target[i+utf8.RuneLen(r2):] - continue Outer - } else { - runeDiff++ - } - } - return -1 - } - - // Count up remaining char - runeDiff += utf8.RuneCountInString(target) - - return runeDiff -} - -// RankFind is similar to Find, except it will also rank all matches using -// Levenshtein distance. -func RankFind(source string, targets []string) Ranks { - var r Ranks - - for index, target := range targets { - if match(source, target, noop) { - distance := LevenshteinDistance(source, target) - r = append(r, Rank{source, target, distance, index}) - } - } - return r -} - -// RankFindFold is a case-insensitive version of RankFind. -func RankFindFold(source string, targets []string) Ranks { - var r Ranks - - for index, target := range targets { - if match(source, target, unicode.ToLower) { - distance := LevenshteinDistance(source, target) - r = append(r, Rank{source, target, distance, index}) - } - } - return r -} - -type Rank struct { - // Source is used as the source for matching. - Source string - - // Target is the word matched against. - Target string - - // Distance is the Levenshtein distance between Source and Target. - Distance int - - // Location of Target in original list - OriginalIndex int -} - -type Ranks []Rank - -func (r Ranks) Len() int { - return len(r) -} - -func (r Ranks) Swap(i, j int) { - r[i], r[j] = r[j], r[i] -} - -func (r Ranks) Less(i, j int) bool { - return r[i].Distance < r[j].Distance -} diff --git a/vendor/github.com/lithammer/fuzzysearch/fuzzy/levenshtein.go b/vendor/github.com/lithammer/fuzzysearch/fuzzy/levenshtein.go deleted file mode 100644 index 237923d3450..00000000000 --- a/vendor/github.com/lithammer/fuzzysearch/fuzzy/levenshtein.go +++ /dev/null @@ -1,43 +0,0 @@ -package fuzzy - -// LevenshteinDistance measures the difference between two strings. -// The Levenshtein distance between two words is the minimum number of -// single-character edits (i.e. insertions, deletions or substitutions) -// required to change one word into the other. 
-// -// This implemention is optimized to use O(min(m,n)) space and is based on the -// optimized C version found here: -// http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#C -func LevenshteinDistance(s, t string) int { - r1, r2 := []rune(s), []rune(t) - column := make([]int, len(r1)+1) - - for y := 1; y <= len(r1); y++ { - column[y] = y - } - - for x := 1; x <= len(r2); x++ { - column[0] = x - - for y, lastDiag := 1, x-1; y <= len(r1); y++ { - oldDiag := column[y] - cost := 0 - if r1[y-1] != r2[x-1] { - cost = 1 - } - column[y] = min(column[y]+1, column[y-1]+1, lastDiag+cost) - lastDiag = oldDiag - } - } - - return column[len(r1)] -} - -func min(a, b, c int) int { - if a < b && a < c { - return a - } else if b < c { - return b - } - return c -} diff --git a/worker/match.go b/worker/match.go index 74bf1892d59..407361e5aa7 100644 --- a/worker/match.go +++ b/worker/match.go @@ -22,28 +22,69 @@ import ( "github.com/dgraph-io/dgraph/protos/pb" "github.com/dgraph-io/dgraph/tok" "github.com/dgraph-io/dgraph/x" - "github.com/lithammer/fuzzysearch/fuzzy" ) -// matchFuzzy takes in a value (from posting) and compares it to our list of ngram tokens. -// Returns true if value matches fuzzy tokens, false otherwise. -func matchFuzzy(srcFn *functionContext, val string) bool { - if val == "" { - return false +// LevenshteinDistance measures the difference between two strings. +// The Levenshtein distance between two words is the minimum number of +// single-character edits (i.e. insertions, deletions or substitutions) +// required to change one word into the other. +// +// This implemention is optimized to use O(min(m,n)) space and is based on the +// optimized C version found here: +// http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#C +func levenshteinDistance(s, t string, max int) int { + if len(s) > len(t) { + s, t = t, s } + r1, r2 := []rune(s), []rune(t) // len(s) <= len(t) => len(r1) <= len(r2) + column := make([]int, len(r1)+1) - terms, err := tok.GetTokens(tok.IdentTerm, val) - if err != nil { - return false + for y := 1; y <= len(r1); y++ { + column[y] = y } - for i := range srcFn.tokens { - r := fuzzy.RankFind(srcFn.tokens[i], terms) - if len(r) != 0 { - return true + var minIdx int + for x := 1; x <= len(r2); x++ { + column[0] = x + + for y, lastDiag := 1, x-1; y <= len(r1); y++ { + oldDiag := column[y] + cost := 0 + if r1[y-1] != r2[x-1] { + cost = 1 + } + column[y] = min(column[y]+1, column[y-1]+1, lastDiag+cost) + lastDiag = oldDiag + } + if minIdx < len(r1) && column[minIdx] > column[minIdx+1] { + minIdx++ } + if column[minIdx] > max { + return column[minIdx] + } + } + return column[len(r1)] +} + +func min(a, b, c int) int { + if a < b && a < c { + return a + } else if b < c { + return b + } + return c +} + +// matchFuzzy takes in a value (from posting) and compares it to our list of ngram tokens. +// Returns true if value matches fuzzy tokens, false otherwise. +func matchFuzzy(query, val string) bool { + if val == "" { + return false } - return false + + // TODO: Fill in the max edit distance based parsing by match. 
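+	// Editor's note: patches 42 and 43 later in this series replace this
+	// fixed bound with a caller-supplied threshold parsed from match()'s
+	// second argument.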
+ max := 8 + return levenshteinDistance(val, query, max) <= max } // uidsForMatch collects a list of uids that "might" match a fuzzy term based on the ngram diff --git a/worker/match_test.go b/worker/match_test.go new file mode 100644 index 00000000000..e97b41a672c --- /dev/null +++ b/worker/match_test.go @@ -0,0 +1,18 @@ +package worker + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestDistance(t *testing.T) { + require.Equal(t, 0, levenshteinDistance("detour", "detour", 2)) + require.Equal(t, 1, levenshteinDistance("detour", "det.our", 2)) + require.Equal(t, 2, levenshteinDistance("detour", "det..our", 2)) + require.Equal(t, 3, levenshteinDistance("detour", "..det..our", 2)) + require.Equal(t, 2, levenshteinDistance("detour", "detour..", 2)) + require.Equal(t, 3, levenshteinDistance("detour", "detour...", 2)) + require.Equal(t, 3, levenshteinDistance("detour", "...detour", 2)) + require.Equal(t, 3, levenshteinDistance("detour", "..detour.", 2)) +} diff --git a/worker/task.go b/worker/task.go index 674a0a281bd..cb2fdaf4728 100644 --- a/worker/task.go +++ b/worker/task.go @@ -1158,6 +1158,7 @@ func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) err span.Annotatef(nil, "Total uids: %d, list: %t lang: %v", len(uids.Uids), isList, lang) arg.out.UidMatrix = append(arg.out.UidMatrix, uids) + matchQuery := strings.Join(arg.srcFn.tokens, "") filtered := &pb.List{} for _, uid := range uids.Uids { select { @@ -1191,7 +1192,7 @@ func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) err for _, val := range vals { // convert data from binary to appropriate format strVal, err := types.Convert(val, types.StringID) - if err == nil && matchFuzzy(arg.srcFn, strVal.Value.(string)) { + if err == nil && matchFuzzy(matchQuery, strVal.Value.(string)) { filtered.Uids = append(filtered.Uids, uid) // NOTE: We only add the uid once. break @@ -1495,8 +1496,12 @@ func parseSrcFn(q *pb.Query) (*functionContext, error) { if !found { return nil, x.Errorf("Attribute %s is not indexed with type %s", attr, required) } - if fc.tokens, err = getStringTokens(q.SrcFunc.Args, langForFunc(q.Langs), fnType); err != nil { - return nil, err + if fnType == MatchFn { + fc.tokens = q.SrcFunc.Args + } else { + if fc.tokens, err = getStringTokens(q.SrcFunc.Args, langForFunc(q.Langs), fnType); err != nil { + return nil, err + } } fc.intersectDest = needsIntersect(f) fc.n = len(fc.tokens) From 4e301e25b8145c176bbb340f3cceb1d581af62ac Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 6 Feb 2019 20:04:10 -0700 Subject: [PATCH 42/50] using threshold for lev distance max --- worker/match.go | 5 +---- worker/task.go | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/worker/match.go b/worker/match.go index 407361e5aa7..30cf16c2664 100644 --- a/worker/match.go +++ b/worker/match.go @@ -77,13 +77,10 @@ func min(a, b, c int) int { // matchFuzzy takes in a value (from posting) and compares it to our list of ngram tokens. // Returns true if value matches fuzzy tokens, false otherwise. -func matchFuzzy(query, val string) bool { +func matchFuzzy(query, val string, max int) bool { if val == "" { return false } - - // TODO: Fill in the max edit distance based parsing by match. 
- max := 8 return levenshteinDistance(val, query, max) <= max } diff --git a/worker/task.go b/worker/task.go index cb2fdaf4728..76b3a30190c 100644 --- a/worker/task.go +++ b/worker/task.go @@ -1192,7 +1192,7 @@ func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) err for _, val := range vals { // convert data from binary to appropriate format strVal, err := types.Convert(val, types.StringID) - if err == nil && matchFuzzy(matchQuery, strVal.Value.(string)) { + if err == nil && matchFuzzy(matchQuery, strVal.Value.(string), int(arg.srcFn.threshold)) { filtered.Uids = append(filtered.Uids, uid) // NOTE: We only add the uid once. break From 513bccbef60e1926ff1fcbec93460ee4b81a1c8f Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 6 Feb 2019 20:49:09 -0700 Subject: [PATCH 43/50] worker/task.go: added match() argument for specifying max distance. This change allows setting a second integer argument in match() to set the max Levenshtein distance threshold. If no value is set, the default value of 8 is used. --- worker/task.go | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/worker/task.go b/worker/task.go index 76b3a30190c..26ea532608b 100644 --- a/worker/task.go +++ b/worker/task.go @@ -1486,7 +1486,7 @@ func parseSrcFn(q *pb.Query) (*functionContext, error) { return nil, err } fc.n = len(q.UidList.Uids) - case StandardFn, FullTextSearchFn, MatchFn: + case StandardFn, FullTextSearchFn: // srcfunc 0th val is func name and and [2:] are args. // we tokenize the arguments of the query. if err = ensureArgsCount(q.SrcFunc, 1); err != nil { @@ -1496,15 +1496,37 @@ func parseSrcFn(q *pb.Query) (*functionContext, error) { if !found { return nil, x.Errorf("Attribute %s is not indexed with type %s", attr, required) } - if fnType == MatchFn { - fc.tokens = q.SrcFunc.Args - } else { - if fc.tokens, err = getStringTokens(q.SrcFunc.Args, langForFunc(q.Langs), fnType); err != nil { - return nil, err - } + if fc.tokens, err = getStringTokens(q.SrcFunc.Args, langForFunc(q.Langs), fnType); err != nil { + return nil, err } fc.intersectDest = needsIntersect(f) fc.n = len(fc.tokens) + case MatchFn: + l := len(q.SrcFunc.Args) + if l == 0 || l > 2 { + return nil, x.Errorf("Function '%s' requires at most 2 arguments, but got %d (%v)", + f, l, q.SrcFunc.Args) + } + required, found := verifyStringIndex(attr, fnType) + if !found { + return nil, x.Errorf("Attribute %s is not indexed with type %s", attr, required) + } + fc.intersectDest = needsIntersect(f) + // Max Levenshtein distance + fc.threshold = 8 + if l == 2 { + var s string + s, q.SrcFunc.Args = q.SrcFunc.Args[1], q.SrcFunc.Args[:1] + max, err := strconv.ParseInt(s, 10, 32) + if err != nil { + return nil, x.Errorf("Levenshtein distance value must be an int, got %v", s) + } + if max > 0 && max < 8 { + fc.threshold = int64(max) + } + } + fc.tokens = q.SrcFunc.Args + fc.n = len(fc.tokens) case CustomIndexFn: if err = ensureArgsCount(q.SrcFunc, 2); err != nil { return nil, err From 6a3244fb2769a4a64b9b702388c61c45e0cce6da Mon Sep 17 00:00:00 2001 From: srfrog Date: Wed, 6 Feb 2019 20:53:03 -0700 Subject: [PATCH 44/50] systest/queries_test.go: updated match query tests Updated test for new matchFuzzy using threshold. 
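
For reference, the thresholds exercised below reduce to a plain distance
comparison. The following is an editor's sketch rather than part of this
commit; it would sit alongside worker/match_test.go (which already imports
"testing" and require) and relies on the unexported matchFuzzy helper as
defined by this series:

	func TestMatchFuzzyThreshold(t *testing.T) {
		// "plano" -> "lane" takes two edits: drop the leading 'p' and
		// substitute the trailing 'o' with 'e', so the distance is 2.
		require.True(t, matchFuzzy("plano", "lane", 2))
		// With max = 1 the same pair is rejected; levenshteinDistance
		// can bail out early once the minimum achievable distance
		// already exceeds max.
		require.False(t, matchFuzzy("plano", "lane", 1))
	}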
--- systest/queries_test.go | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/systest/queries_test.go b/systest/queries_test.go index 3f48c5e4937..bb9726522c7 100644 --- a/systest/queries_test.go +++ b/systest/queries_test.go @@ -582,24 +582,33 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { out: `{"q":[{"term":"drive"}]}`, }, { - in: `{q(func:match(term, "plane")) {term}}`, + in: `{q(func:match(term, "plano", 1)) {term}}`, out: `{"q":[]}`, }, + { + in: `{q(func:match(term, "plano", 2)) {term}}`, + out: `{"q":[{"term":"lane"}]}`, + }, + { + in: `{q(func:match(term, "plano")) {term}}`, + out: `{"q":[{"term":"lane"}]}`, + }, { in: `{q(func:match(term, way)) {term}}`, out: `{"q":[ {"term": "highway"}, {"term": "pathway"}, {"term": "parkway"}, - {"term": "dual carriageway"}, {"term": "motorway"} ]}`, }, { in: `{q(func:match(term, pway)) {term}}`, out: `{"q":[ + {"term": "highway"}, {"term": "pathway"}, - {"term": "parkway"} + {"term": "parkway"}, + {"term": "motorway"} ]}`, }, { @@ -616,6 +625,17 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { {"term": "side street"} ]}`, }, + { + in: `{q(func:match(term, strip)) {term}}`, + out: `{"q":[ + {"term": "street"}, + {"term": "side street"} + ]}`, + }, + { + in: `{q(func:match(term, strip, 3)) {term}}`, + out: `{"q":[{"term": "street"}]}`, + }, { in: `{q(func:match(term, "carigeway")) {term}}`, out: `{"q":[ @@ -623,7 +643,20 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { ]}`, }, { - in: `{q(func:match(term, "dualway")) {term}}`, + in: `{q(func:match(term, "carigeway", 4)) {term}}`, + out: `{"q":[]}`, + }, + { + in: `{q(func:match(term, "dualway")) {term}}`, + out: `{"q":[ + {"term": "highway"}, + {"term": "pathway"}, + {"term": "parkway"}, + {"term": "motorway"} + ]}`, + }, + { + in: `{q(func:match(term, "dualway", 2)) {term}}`, out: `{"q":[]}`, }, { From d904afb75d1275af0b78a28024a0f44daae433be Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 11 Feb 2019 12:35:53 -0700 Subject: [PATCH 45/50] wiki/content/query-language/index.md: added section for match function --- wiki/content/query-language/index.md | 34 ++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/wiki/content/query-language/index.md b/wiki/content/query-language/index.md index 8aeab95e49d..5d7855fcf46 100644 --- a/wiki/content/query-language/index.md +++ b/wiki/content/query-language/index.md @@ -353,6 +353,40 @@ Keep the following in mind when designing regular expression queries. - If the partial result (for subset of trigrams) exceeds 1000000 uids during index scan, the query is stopped to prohibit expensive queries. +### Fuzzy matching + + +Syntax Examples: `match(predicate, string)` or `regexp(predicate, string, distance)` + +Schema Types: `string` + +Index Required: `trigram` + +Matches strings by calculating the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) predicate value to the string. +By default the distance is set to eight (8) but a lesser distance can be set when using the second form. The distance will never exceed the default. +Using greater distance value could yield more but less accurate results. + +Query Example: At root, fuzzy match nodes similar to `Stephen`, with a default distance value of 8. + +{{< runnable >}} +{ + directors(func: match(name@en, Stephen)) { + name@en + } +} +{{< /runnable >}} + +Same query with a Levenshtein distance of 3. 
+ +{{< runnable >}} +{ + directors(func: match(name@en, Stephen, 3)) { + name@en + } +} +{{< /runnable >}} + + ### Full Text Search Syntax Examples: `alloftext(predicate, "space-separated text")` and `anyoftext(predicate, "space-separated text")` From 5a4db74157f444ebcaceda3eef24fb2c64889087 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 11 Feb 2019 15:24:06 -0700 Subject: [PATCH 46/50] vendor/vendor.json: removed old fuzzy pkg --- vendor/vendor.json | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vendor/vendor.json b/vendor/vendor.json index fe05bc8f829..b595de2f851 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -650,12 +650,6 @@ "revision": "23c074d0eceb2b8a5bfdbb271ab780cde70f05a8", "revisionTime": "2017-10-17T18:19:29Z" }, - { - "checksumSHA1": "9fTIdD63nJT3Y4QvHtw9dCBhzzE=", - "path": "github.com/lithammer/fuzzysearch/fuzzy", - "revision": "9704abcb5729ef17e4316e31439d7fcece8dac5f", - "revisionTime": "2019-01-06T22:13:24Z" - }, { "checksumSHA1": "8ae1DyNE/yY9NvY3PmvtQdLBJnc=", "path": "github.com/magiconair/properties", From d8ff259d3675d92ae46efd9f7b7e02e2cb276a21 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 11 Feb 2019 15:25:22 -0700 Subject: [PATCH 47/50] worker/task.go: match func enforce 2 args, max distance must be gt zero --- worker/task.go | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/worker/task.go b/worker/task.go index 26ea532608b..6ffffb4a765 100644 --- a/worker/task.go +++ b/worker/task.go @@ -1189,10 +1189,11 @@ func (qs *queryState) handleMatchFunction(ctx context.Context, arg funcArgs) err return err } + max := int(arg.srcFn.threshold) for _, val := range vals { // convert data from binary to appropriate format strVal, err := types.Convert(val, types.StringID) - if err == nil && matchFuzzy(matchQuery, strVal.Value.(string), int(arg.srcFn.threshold)) { + if err == nil && matchFuzzy(matchQuery, strVal.Value.(string), max) { filtered.Uids = append(filtered.Uids, uid) // NOTE: We only add the uid once. 
break @@ -1502,10 +1503,8 @@ func parseSrcFn(q *pb.Query) (*functionContext, error) { fc.intersectDest = needsIntersect(f) fc.n = len(fc.tokens) case MatchFn: - l := len(q.SrcFunc.Args) - if l == 0 || l > 2 { - return nil, x.Errorf("Function '%s' requires at most 2 arguments, but got %d (%v)", - f, l, q.SrcFunc.Args) + if err = ensureArgsCount(q.SrcFunc, 2); err != nil { + return nil, err } required, found := verifyStringIndex(attr, fnType) if !found { @@ -1513,18 +1512,16 @@ func parseSrcFn(q *pb.Query) (*functionContext, error) { } fc.intersectDest = needsIntersect(f) // Max Levenshtein distance - fc.threshold = 8 - if l == 2 { - var s string - s, q.SrcFunc.Args = q.SrcFunc.Args[1], q.SrcFunc.Args[:1] - max, err := strconv.ParseInt(s, 10, 32) - if err != nil { - return nil, x.Errorf("Levenshtein distance value must be an int, got %v", s) - } - if max > 0 && max < 8 { - fc.threshold = int64(max) - } + var s string + s, q.SrcFunc.Args = q.SrcFunc.Args[1], q.SrcFunc.Args[:1] + max, err := strconv.ParseInt(s, 10, 32) + if err != nil { + return nil, x.Errorf("Levenshtein distance value must be an int, got %v", s) + } + if max < 0 { + return nil, x.Errorf("Levenshtein distance value must be greater than 0, got %v", s) } + fc.threshold = int64(max) fc.tokens = q.SrcFunc.Args fc.n = len(fc.tokens) case CustomIndexFn: From 50458e6d81db565570389cad0bb9ad13c22755a9 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 11 Feb 2019 17:15:31 -0700 Subject: [PATCH 48/50] wiki/content/query-language/index.md: updated syntax, example and fixed typos --- wiki/content/query-language/index.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/wiki/content/query-language/index.md b/wiki/content/query-language/index.md index 5d7855fcf46..6317ff580da 100644 --- a/wiki/content/query-language/index.md +++ b/wiki/content/query-language/index.md @@ -356,21 +356,20 @@ Keep the following in mind when designing regular expression queries. ### Fuzzy matching -Syntax Examples: `match(predicate, string)` or `regexp(predicate, string, distance)` +Syntax: ``match(predicate, string, distance)` Schema Types: `string` Index Required: `trigram` -Matches strings by calculating the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) predicate value to the string. -By default the distance is set to eight (8) but a lesser distance can be set when using the second form. The distance will never exceed the default. -Using greater distance value could yield more but less accurate results. +Matches strings by calculating the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) predicate value to the string, +also known as "fuzzy matching". The distance parameter must be greater than zero (0). Using a greater distance value could yield more but less accurate results. Query Example: At root, fuzzy match nodes similar to `Stephen`, with a default distance value of 8. 
{{< runnable >}} { - directors(func: match(name@en, Stephen)) { + directors(func: match(name@en, Stephen, 8)) { name@en } } From 2c49e8645abe4c084d6698a54a5af0acbfb386c0 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 11 Feb 2019 17:21:57 -0700 Subject: [PATCH 49/50] systest/queries_test.go: updated syntax in tests --- systest/queries_test.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/systest/queries_test.go b/systest/queries_test.go index bb9726522c7..260a8731663 100644 --- a/systest/queries_test.go +++ b/systest/queries_test.go @@ -578,7 +578,7 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { in, out, failure string }{ { - in: `{q(func:match(term, drive)) {term}}`, + in: `{q(func:match(term, drive, 8)) {term}}`, out: `{"q":[{"term":"drive"}]}`, }, { @@ -590,11 +590,11 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { out: `{"q":[{"term":"lane"}]}`, }, { - in: `{q(func:match(term, "plano")) {term}}`, + in: `{q(func:match(term, "plano", 8)) {term}}`, out: `{"q":[{"term":"lane"}]}`, }, { - in: `{q(func:match(term, way)) {term}}`, + in: `{q(func:match(term, way, 8)) {term}}`, out: `{"q":[ {"term": "highway"}, {"term": "pathway"}, @@ -603,7 +603,7 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { ]}`, }, { - in: `{q(func:match(term, pway)) {term}}`, + in: `{q(func:match(term, pway, 8)) {term}}`, out: `{"q":[ {"term": "highway"}, {"term": "pathway"}, @@ -612,21 +612,21 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { ]}`, }, { - in: `{q(func:match(term, high)) {term}}`, + in: `{q(func:match(term, high, 8)) {term}}`, out: `{"q":[ {"term": "highway"}, {"term": "high road"} ]}`, }, { - in: `{q(func:match(term, str)) {term}}`, + in: `{q(func:match(term, str, 8)) {term}}`, out: `{"q":[ {"term": "street"}, {"term": "side street"} ]}`, }, { - in: `{q(func:match(term, strip)) {term}}`, + in: `{q(func:match(term, strip, 8)) {term}}`, out: `{"q":[ {"term": "street"}, {"term": "side street"} @@ -637,7 +637,7 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { out: `{"q":[{"term": "street"}]}`, }, { - in: `{q(func:match(term, "carigeway")) {term}}`, + in: `{q(func:match(term, "carigeway", 8)) {term}}`, out: `{"q":[ {"term": "dual carriageway"} ]}`, @@ -647,7 +647,7 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { out: `{"q":[]}`, }, { - in: `{q(func:match(term, "dualway")) {term}}`, + in: `{q(func:match(term, "dualway", 8)) {term}}`, out: `{"q":[ {"term": "highway"}, {"term": "pathway"}, @@ -660,11 +660,11 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) { out: `{"q":[]}`, }, { - in: `{q(func:match(term, "")) {term}}`, + in: `{q(func:match(term, "", 8)) {term}}`, failure: `Empty argument received`, }, { - in: `{q(func:match(name, "someone")) {name}}`, + in: `{q(func:match(name, "someone", 8)) {name}}`, failure: `Attribute name is not indexed with type trigram`, }, } From be4f2e605e6e19773b25a4afc8c7a11a01e6f9f2 Mon Sep 17 00:00:00 2001 From: srfrog Date: Mon, 11 Feb 2019 17:26:13 -0700 Subject: [PATCH 50/50] wiki/content/query-language/index.md: minior doc fixes --- wiki/content/query-language/index.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/wiki/content/query-language/index.md b/wiki/content/query-language/index.md index 6317ff580da..ea8d371c7b7 100644 --- a/wiki/content/query-language/index.md +++ b/wiki/content/query-language/index.md @@ -356,16 +356,16 @@ Keep the following in mind when designing regular expression queries. 
### Fuzzy matching -Syntax: ``match(predicate, string, distance)` +Syntax: `match(predicate, string, distance)` Schema Types: `string` Index Required: `trigram` -Matches strings by calculating the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) predicate value to the string, -also known as "fuzzy matching". The distance parameter must be greater than zero (0). Using a greater distance value could yield more but less accurate results. +Matches predicate values by calculating the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) to the string, +also known as _fuzzy matching_. The distance parameter must be greater than zero (0). Using a greater distance value can yield more but less accurate results. -Query Example: At root, fuzzy match nodes similar to `Stephen`, with a default distance value of 8. +Query Example: At root, fuzzy match nodes similar to `Stephen`, with a distance value of 8. {{< runnable >}} {
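
With the full series applied, match() takes a mandatory maximum-distance
argument and relies on a trigram index, with candidate values verified by the
early-terminating Levenshtein check in worker/match.go. As a closing
illustration, here is a minimal end-to-end sketch using the dgo client. This
is an editor's addition, not part of any patch: it assumes an Alpha listening
on localhost:9080, loaded with the term predicate and trigram index from the
FuzzyMatch systest above.

	package main

	import (
		"context"
		"fmt"
		"log"

		"github.com/dgraph-io/dgo"
		"github.com/dgraph-io/dgo/protos/api"
		"google.golang.org/grpc"
	)

	func main() {
		conn, err := grpc.Dial("localhost:9080", grpc.WithInsecure())
		if err != nil {
			log.Fatal(err)
		}
		defer conn.Close()
		dg := dgo.NewDgraphClient(api.NewDgraphClient(conn))

		// "carigeway" is exactly 7 edits away from "dual carriageway",
		// within the bound of 8, so the misspelling still finds the term.
		const q = `{q(func: match(term, "carigeway", 8)) {term}}`
		resp, err := dg.NewTxn().Query(context.Background(), q)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println(string(resp.Json)) // {"q":[{"term": "dual carriageway"}]}
	}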