Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removing duplicates after tokenization #1189

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 5 additions & 19 deletions gql/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -582,8 +582,7 @@ func Parse(r Request) (res Result, rerr error) {
for _, v := range res.MutationVars {
varNames = append(varNames, v)
}
sort.Strings(varNames)
varNames = removeDuplicates(varNames)
varNames = x.RemoveDuplicates(varNames)

allVars = append(allVars, &Vars{Needs: varNames})
}
Expand All @@ -605,27 +604,14 @@ func flatten(vl []*Vars) (needs []string, defines []string) {
return
}

// removeDuplicates collapses every run of equal neighbouring strings in s
// down to a single entry. On a sorted slice this removes all duplicates.
// The result is built inside the underlying array of s, so the caller's
// slice contents are overwritten.
func removeDuplicates(s []string) (out []string) {
	kept := 0
	for idx := range s {
		if idx > 0 && s[idx] == s[idx-1] {
			// Same as the previous element: skip it.
			continue
		}
		// Compact the surviving element towards the front of s.
		s[kept] = s[idx]
		kept++
	}
	return s[:kept]
}

func checkDependency(vl []*Vars) error {
needs, defines := flatten(vl)

sort.Strings(needs)
sort.Strings(defines)

needs = removeDuplicates(needs)
needs = x.RemoveDuplicates(needs)
lenBefore := len(defines)
defines = x.RemoveDuplicates(defines)

if len(defines) != len(removeDuplicates(defines)) {
if len(defines) != lenBefore {
return x.Errorf("Some variables are declared multiple times.")
}

Expand Down
10 changes: 0 additions & 10 deletions gql/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3589,16 +3589,6 @@ func TestFilterUid(t *testing.T) {
require.Equal(t, []uint64{3, 7}, gql.Query[0].Filter.Func.UID)
}

func TestRemoveDuplicates(t *testing.T) {
set := removeDuplicates([]string{"a", "a", "a", "b", "b", "c", "c"})
require.EqualValues(t, []string{"a", "b", "c"}, set)
}

func TestRemoveDuplicatesWithoutDuplicates(t *testing.T) {
set := removeDuplicates([]string{"a", "b", "c", "d"})
require.EqualValues(t, []string{"a", "b", "c", "d"}, set)
}

func TestMultipleSetBlocks(t *testing.T) {
query := `
mutation {
Expand Down
2 changes: 2 additions & 0 deletions tok/tok.go
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ func getBleveTokens(name string, identifier byte, sv types.Val) ([]string, error
for i, token := range tokenStream {
terms[i] = encodeToken(string(token.Term), identifier)
}
terms = x.RemoveDuplicates(terms)
return terms, nil
}

Expand Down Expand Up @@ -333,6 +334,7 @@ func (t TrigramTokenizer) Tokens(sv types.Val) ([]string, error) {
trigram := value[i : i+3]
tokens[i] = encodeToken(trigram, t.Identifier())
}
tokens = x.RemoveDuplicates(tokens)
return tokens, nil
}
return nil, nil
Expand Down
24 changes: 21 additions & 3 deletions tok/tok_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ func TestFullTextTokenizerLang(t *testing.T) {
require.NoError(t, err)
require.Equal(t, 2, len(tokens))
id := tokenizer.Identifier()
require.Equal(t, []string{encodeToken("katz", id), encodeToken("auffass", id)}, tokens)
// tokens should be sorted and unique
require.Equal(t, []string{encodeToken("auffass", id), encodeToken("katz", id)}, tokens)
}

func TestTermTokenizer(t *testing.T) {
Expand All @@ -175,7 +176,7 @@ func TestTrigramTokenizer(t *testing.T) {
require.NoError(t, err)
require.Equal(t, 11, len(tokens))
id := tokenizer.Identifier()
require.Equal(t, []string{
expected := []string{
encodeToken("Dgr", id),
encodeToken("gra", id),
encodeToken("rap", id),
Expand All @@ -187,5 +188,22 @@ func TestTrigramTokenizer(t *testing.T) {
encodeToken("ock", id),
encodeToken("cks", id),
encodeToken("ks!", id),
}, tokens)
}
sort.Strings(expected)
require.Equal(t, expected, tokens)
}

// TestGetBleveTokens runs a sentence with many repeated words through
// getBleveTokens and checks that the returned tokens are sorted and contain
// no duplicates.
func TestGetBleveTokens(t *testing.T) {
	input := types.ValueForType(types.StringID)
	input.Value = "Our chief weapon is surprise...surprise and fear...fear and surprise...." +
		"Our two weapons are fear and surprise...and ruthless efficiency.... " +
		"Our three weapons are fear, surprise, and ruthless efficiency..."

	// Each expected token carries the identifier byte (a space) as its prefix.
	want := []string{" chief", " weapon", " surpris", " fear", " ruthless", " effici"}
	sort.Strings(want)

	got, err := getBleveTokens(FTSTokenizerName, 0x20, input) // use space as identifier
	require.NoError(t, err)

	// ensure that tokens are sorted and unique
	require.Equal(t, want, got)
}
10 changes: 5 additions & 5 deletions worker/worker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,9 @@ func TestProcessTaskIndexMLayer(t *testing.T) {

require.EqualValues(t, [][]uint64{
nil,
{10},
{12},
nil,
{10},
}, algo.ToUintsListForTest(r.UidMatrix))

// Try deleting.
Expand Down Expand Up @@ -221,9 +221,9 @@ func TestProcessTaskIndexMLayer(t *testing.T) {
require.NoError(t, err)

require.EqualValues(t, [][]uint64{
{12},
nil,
nil,
{12},
}, algo.ToUintsListForTest(r.UidMatrix))

// Final touch: Merge everything to RocksDB.
Expand All @@ -235,9 +235,9 @@ func TestProcessTaskIndexMLayer(t *testing.T) {
require.NoError(t, err)

require.EqualValues(t, [][]uint64{
{12},
nil,
nil,
{12},
}, algo.ToUintsListForTest(r.UidMatrix))
}

Expand Down Expand Up @@ -281,9 +281,9 @@ func TestProcessTaskIndex(t *testing.T) {

require.EqualValues(t, [][]uint64{
nil,
{10},
{12},
nil,
{10},
}, algo.ToUintsListForTest(r.UidMatrix))

posting.CommitLists(10, 1)
Expand Down Expand Up @@ -314,9 +314,9 @@ func TestProcessTaskIndex(t *testing.T) {
require.NoError(t, err)

require.EqualValues(t, [][]uint64{
{12},
nil,
nil,
{12},
}, algo.ToUintsListForTest(r.UidMatrix))
}

Expand Down
15 changes: 15 additions & 0 deletions x/x.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"net"
"net/http"
"regexp"
"sort"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -193,3 +194,17 @@ func ValidateAddress(addr string) bool {
}
return regExpHostName.MatchString(host)
}

// RemoveDuplicates sorts s in place and returns it with repeated strings
// removed. The result shares the underlying array of s, so the input slice
// is modified; call it as:
//
//	someSlice = x.RemoveDuplicates(someSlice)
func RemoveDuplicates(s []string) (out []string) {
	sort.Strings(s)

	kept := 0
	for idx, cur := range s {
		// After sorting, equal strings are adjacent, so an element is new
		// exactly when it differs from its predecessor.
		if idx == 0 || cur != s[idx-1] {
			s[kept] = cur
			kept++
		}
	}
	return s[:kept]
}
32 changes: 32 additions & 0 deletions x/x_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Copyright (C) 2017 Dgraph Labs, Inc. and Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package x

import (
"testing"

"github.com/stretchr/testify/require"
)

func TestRemoveDuplicates(t *testing.T) {
set := RemoveDuplicates([]string{"a", "a", "a", "b", "b", "c", "c"})
require.EqualValues(t, []string{"a", "b", "c"}, set)
}

func TestRemoveDuplicatesWithoutDuplicates(t *testing.T) {
set := RemoveDuplicates([]string{"a", "b", "c", "d"})
require.EqualValues(t, []string{"a", "b", "c", "d"}, set)
}