From 8e412fd37c42af776165b92cb398b9f20d52a0d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Zdyba=C5=82?= Date: Fri, 14 Jul 2017 05:54:57 +0200 Subject: [PATCH] Removing duplicates after tokenization This resolves issue #1183 --- gql/parser.go | 24 +++++------------------- gql/parser_test.go | 10 ---------- tok/tok.go | 2 ++ tok/tok_test.go | 24 +++++++++++++++++++++--- worker/worker_test.go | 10 +++++----- x/x.go | 15 +++++++++++++++ x/x_test.go | 32 ++++++++++++++++++++++++++++++++ 7 files changed, 80 insertions(+), 37 deletions(-) create mode 100644 x/x_test.go diff --git a/gql/parser.go b/gql/parser.go index ac036b0591f..56cb133e2ce 100644 --- a/gql/parser.go +++ b/gql/parser.go @@ -582,8 +582,7 @@ func Parse(r Request) (res Result, rerr error) { for _, v := range res.MutationVars { varNames = append(varNames, v) } - sort.Strings(varNames) - varNames = removeDuplicates(varNames) + varNames = x.RemoveDuplicates(varNames) allVars = append(allVars, &Vars{Needs: varNames}) } @@ -605,27 +604,14 @@ func flatten(vl []*Vars) (needs []string, defines []string) { return } -// removes duplicates from a sorted slice of strings. Changes underylying array. 
-func removeDuplicates(s []string) (out []string) { - out = s[:0] - for i := range s { - if i > 0 && s[i] == s[i-1] { - continue - } - out = append(out, s[i]) - } - return -} - func checkDependency(vl []*Vars) error { needs, defines := flatten(vl) - sort.Strings(needs) - sort.Strings(defines) - - needs = removeDuplicates(needs) + needs = x.RemoveDuplicates(needs) + lenBefore := len(defines) + defines = x.RemoveDuplicates(defines) - if len(defines) != len(removeDuplicates(defines)) { + if len(defines) != lenBefore { return x.Errorf("Some variables are declared multiple times.") } diff --git a/gql/parser_test.go b/gql/parser_test.go index a5fda44135f..197d8df2730 100644 --- a/gql/parser_test.go +++ b/gql/parser_test.go @@ -3589,16 +3589,6 @@ func TestFilterUid(t *testing.T) { require.Equal(t, []uint64{3, 7}, gql.Query[0].Filter.Func.UID) } -func TestRemoveDuplicates(t *testing.T) { - set := removeDuplicates([]string{"a", "a", "a", "b", "b", "c", "c"}) - require.EqualValues(t, []string{"a", "b", "c"}, set) -} - -func TestRemoveDuplicatesWithoutDuplicates(t *testing.T) { - set := removeDuplicates([]string{"a", "b", "c", "d"}) - require.EqualValues(t, []string{"a", "b", "c", "d"}, set) -} - func TestMultipleSetBlocks(t *testing.T) { query := ` mutation { diff --git a/tok/tok.go b/tok/tok.go index 699ff86d05f..3d01acc828b 100644 --- a/tok/tok.go +++ b/tok/tok.go @@ -272,6 +272,7 @@ func getBleveTokens(name string, identifier byte, sv types.Val) ([]string, error for i, token := range tokenStream { terms[i] = encodeToken(string(token.Term), identifier) } + terms = x.RemoveDuplicates(terms) return terms, nil } @@ -333,6 +334,7 @@ func (t TrigramTokenizer) Tokens(sv types.Val) ([]string, error) { trigram := value[i : i+3] tokens[i] = encodeToken(trigram, t.Identifier()) } + tokens = x.RemoveDuplicates(tokens) return tokens, nil } return nil, nil diff --git a/tok/tok_test.go b/tok/tok_test.go index 541655b8508..b091e5d6792 100644 --- a/tok/tok_test.go +++ b/tok/tok_test.go @@ 
-148,7 +148,8 @@ func TestFullTextTokenizerLang(t *testing.T) { require.NoError(t, err) require.Equal(t, 2, len(tokens)) id := tokenizer.Identifier() - require.Equal(t, []string{encodeToken("katz", id), encodeToken("auffass", id)}, tokens) + // tokens should be sorted and unique + require.Equal(t, []string{encodeToken("auffass", id), encodeToken("katz", id)}, tokens) } func TestTermTokenizer(t *testing.T) { @@ -175,7 +176,7 @@ func TestTrigramTokenizer(t *testing.T) { require.NoError(t, err) require.Equal(t, 11, len(tokens)) id := tokenizer.Identifier() - require.Equal(t, []string{ + expected := []string{ encodeToken("Dgr", id), encodeToken("gra", id), encodeToken("rap", id), @@ -187,5 +188,22 @@ func TestTrigramTokenizer(t *testing.T) { encodeToken("ock", id), encodeToken("cks", id), encodeToken("ks!", id), - }, tokens) + } + sort.Strings(expected) + require.Equal(t, expected, tokens) +} + +func TestGetBleveTokens(t *testing.T) { + val := types.ValueForType(types.StringID) + val.Value = "Our chief weapon is surprise...surprise and fear...fear and surprise...." + + "Our two weapons are fear and surprise...and ruthless efficiency.... " + + "Our three weapons are fear, surprise, and ruthless efficiency..." + tokens, err := getBleveTokens(FTSTokenizerName, 0x20, val) // use space as identifier + require.NoError(t, err) + + expected := []string{" chief", " weapon", " surpris", " fear", " ruthless", " effici"} + sort.Strings(expected) + + // ensure that tokens are sorted and unique + require.Equal(t, expected, tokens) } diff --git a/worker/worker_test.go b/worker/worker_test.go index cf25198752e..794240c914b 100644 --- a/worker/worker_test.go +++ b/worker/worker_test.go @@ -191,9 +191,9 @@ func TestProcessTaskIndexMLayer(t *testing.T) { require.EqualValues(t, [][]uint64{ nil, - {10}, {12}, nil, + {10}, }, algo.ToUintsListForTest(r.UidMatrix)) // Try deleting. 
@@ -221,9 +221,9 @@ func TestProcessTaskIndexMLayer(t *testing.T) { require.NoError(t, err) require.EqualValues(t, [][]uint64{ + {12}, nil, nil, - {12}, }, algo.ToUintsListForTest(r.UidMatrix)) // Final touch: Merge everything to RocksDB. @@ -235,9 +235,9 @@ func TestProcessTaskIndexMLayer(t *testing.T) { require.NoError(t, err) require.EqualValues(t, [][]uint64{ + {12}, nil, nil, - {12}, }, algo.ToUintsListForTest(r.UidMatrix)) } @@ -281,9 +281,9 @@ func TestProcessTaskIndex(t *testing.T) { require.EqualValues(t, [][]uint64{ nil, - {10}, {12}, nil, + {10}, }, algo.ToUintsListForTest(r.UidMatrix)) posting.CommitLists(10, 1) @@ -314,9 +314,9 @@ func TestProcessTaskIndex(t *testing.T) { require.NoError(t, err) require.EqualValues(t, [][]uint64{ + {12}, nil, nil, - {12}, }, algo.ToUintsListForTest(r.UidMatrix)) } diff --git a/x/x.go b/x/x.go index df8029e0bbf..d3159e0a46d 100644 --- a/x/x.go +++ b/x/x.go @@ -24,6 +24,7 @@ import ( "net" "net/http" "regexp" + "sort" "strconv" "strings" "time" @@ -193,3 +194,17 @@ func ValidateAddress(addr string) bool { } return regExpHostName.MatchString(host) } + +// RemoveDuplicates sorts the slice of strings in place and removes duplicates. +// It modifies the input slice; call it as: someSlice = x.RemoveDuplicates(someSlice) +func RemoveDuplicates(s []string) (out []string) { + sort.Strings(s) + out = s[:0] + for i := range s { + if i > 0 && s[i] == s[i-1] { + continue + } + out = append(out, s[i]) + } + return +} diff --git a/x/x_test.go b/x/x_test.go new file mode 100644 index 00000000000..4432f9aec2c --- /dev/null +++ b/x/x_test.go @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package x + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestRemoveDuplicates(t *testing.T) { + set := RemoveDuplicates([]string{"a", "a", "a", "b", "b", "c", "c"}) + require.EqualValues(t, []string{"a", "b", "c"}, set) +} + +func TestRemoveDuplicatesWithoutDuplicates(t *testing.T) { + set := RemoveDuplicates([]string{"a", "b", "c", "d"}) + require.EqualValues(t, []string{"a", "b", "c", "d"}, set) +}