Skip to content

Commit

Permalink
Removing duplicates after tokenization
Browse files Browse the repository at this point in the history
This resolves issue #1183
  • Loading branch information
Tomasz Zdybał committed Jul 14, 2017
1 parent a25947b commit 8e412fd
Show file tree
Hide file tree
Showing 7 changed files with 80 additions and 37 deletions.
24 changes: 5 additions & 19 deletions gql/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -582,8 +582,7 @@ func Parse(r Request) (res Result, rerr error) {
for _, v := range res.MutationVars {
varNames = append(varNames, v)
}
sort.Strings(varNames)
varNames = removeDuplicates(varNames)
varNames = x.RemoveDuplicates(varNames)

allVars = append(allVars, &Vars{Needs: varNames})
}
Expand All @@ -605,27 +604,14 @@ func flatten(vl []*Vars) (needs []string, defines []string) {
return
}

// removeDuplicates collapses runs of equal adjacent strings into a single
// entry. The input must already be sorted for all duplicates to be removed.
// The result reuses (and overwrites) the underlying array of s.
func removeDuplicates(s []string) (out []string) {
	out = s[:0]
	for idx, cur := range s {
		// Keep the first element and every element that differs from its
		// predecessor.
		if idx == 0 || cur != s[idx-1] {
			out = append(out, cur)
		}
	}
	return out
}

func checkDependency(vl []*Vars) error {
needs, defines := flatten(vl)

sort.Strings(needs)
sort.Strings(defines)

needs = removeDuplicates(needs)
needs = x.RemoveDuplicates(needs)
lenBefore := len(defines)
defines = x.RemoveDuplicates(defines)

if len(defines) != len(removeDuplicates(defines)) {
if len(defines) != lenBefore {
return x.Errorf("Some variables are declared multiple times.")
}

Expand Down
10 changes: 0 additions & 10 deletions gql/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3589,16 +3589,6 @@ func TestFilterUid(t *testing.T) {
require.Equal(t, []uint64{3, 7}, gql.Query[0].Filter.Func.UID)
}

func TestRemoveDuplicates(t *testing.T) {
set := removeDuplicates([]string{"a", "a", "a", "b", "b", "c", "c"})
require.EqualValues(t, []string{"a", "b", "c"}, set)
}

func TestRemoveDuplicatesWithoutDuplicates(t *testing.T) {
set := removeDuplicates([]string{"a", "b", "c", "d"})
require.EqualValues(t, []string{"a", "b", "c", "d"}, set)
}

func TestMultipleSetBlocks(t *testing.T) {
query := `
mutation {
Expand Down
2 changes: 2 additions & 0 deletions tok/tok.go
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ func getBleveTokens(name string, identifier byte, sv types.Val) ([]string, error
for i, token := range tokenStream {
terms[i] = encodeToken(string(token.Term), identifier)
}
terms = x.RemoveDuplicates(terms)
return terms, nil
}

Expand Down Expand Up @@ -333,6 +334,7 @@ func (t TrigramTokenizer) Tokens(sv types.Val) ([]string, error) {
trigram := value[i : i+3]
tokens[i] = encodeToken(trigram, t.Identifier())
}
tokens = x.RemoveDuplicates(tokens)
return tokens, nil
}
return nil, nil
Expand Down
24 changes: 21 additions & 3 deletions tok/tok_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ func TestFullTextTokenizerLang(t *testing.T) {
require.NoError(t, err)
require.Equal(t, 2, len(tokens))
id := tokenizer.Identifier()
require.Equal(t, []string{encodeToken("katz", id), encodeToken("auffass", id)}, tokens)
// tokens should be sorted and unique
require.Equal(t, []string{encodeToken("auffass", id), encodeToken("katz", id)}, tokens)
}

func TestTermTokenizer(t *testing.T) {
Expand All @@ -175,7 +176,7 @@ func TestTrigramTokenizer(t *testing.T) {
require.NoError(t, err)
require.Equal(t, 11, len(tokens))
id := tokenizer.Identifier()
require.Equal(t, []string{
expected := []string{
encodeToken("Dgr", id),
encodeToken("gra", id),
encodeToken("rap", id),
Expand All @@ -187,5 +188,22 @@ func TestTrigramTokenizer(t *testing.T) {
encodeToken("ock", id),
encodeToken("cks", id),
encodeToken("ks!", id),
}, tokens)
}
sort.Strings(expected)
require.Equal(t, expected, tokens)
}

// TestGetBleveTokens feeds a text with many repeated words through
// getBleveTokens and checks that the returned tokens are sorted and unique.
func TestGetBleveTokens(t *testing.T) {
	const text = "Our chief weapon is surprise...surprise and fear...fear and surprise...." +
		"Our two weapons are fear and surprise...and ruthless efficiency.... " +
		"Our three weapons are fear, surprise, and ruthless efficiency..."

	input := types.ValueForType(types.StringID)
	input.Value = text

	// 0x20 (a space) is used as the identifier, so every expected token
	// carries a leading space.
	got, err := getBleveTokens(FTSTokenizerName, 0x20, input)
	require.NoError(t, err)

	want := []string{" chief", " weapon", " surpris", " fear", " ruthless", " effici"}
	sort.Strings(want)

	// The result must match the sorted, duplicate-free expectation exactly.
	require.Equal(t, want, got)
}
10 changes: 5 additions & 5 deletions worker/worker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,9 @@ func TestProcessTaskIndexMLayer(t *testing.T) {

require.EqualValues(t, [][]uint64{
nil,
{10},
{12},
nil,
{10},
}, algo.ToUintsListForTest(r.UidMatrix))

// Try deleting.
Expand Down Expand Up @@ -221,9 +221,9 @@ func TestProcessTaskIndexMLayer(t *testing.T) {
require.NoError(t, err)

require.EqualValues(t, [][]uint64{
{12},
nil,
nil,
{12},
}, algo.ToUintsListForTest(r.UidMatrix))

// Final touch: Merge everything to RocksDB.
Expand All @@ -235,9 +235,9 @@ func TestProcessTaskIndexMLayer(t *testing.T) {
require.NoError(t, err)

require.EqualValues(t, [][]uint64{
{12},
nil,
nil,
{12},
}, algo.ToUintsListForTest(r.UidMatrix))
}

Expand Down Expand Up @@ -281,9 +281,9 @@ func TestProcessTaskIndex(t *testing.T) {

require.EqualValues(t, [][]uint64{
nil,
{10},
{12},
nil,
{10},
}, algo.ToUintsListForTest(r.UidMatrix))

posting.CommitLists(10, 1)
Expand Down Expand Up @@ -314,9 +314,9 @@ func TestProcessTaskIndex(t *testing.T) {
require.NoError(t, err)

require.EqualValues(t, [][]uint64{
{12},
nil,
nil,
{12},
}, algo.ToUintsListForTest(r.UidMatrix))
}

Expand Down
15 changes: 15 additions & 0 deletions x/x.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"net"
"net/http"
"regexp"
"sort"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -193,3 +194,17 @@ func ValidateAddress(addr string) bool {
}
return regExpHostName.MatchString(host)
}

// RemoveDuplicates sorts s in place and returns it with duplicate strings
// removed. The returned slice shares the underlying array of s, so the input
// is modified; callers should reassign the result:
//
//	someSlice = x.RemoveDuplicates(someSlice)
func RemoveDuplicates(s []string) (out []string) {
	sort.Strings(s)

	// kept is the number of unique strings compacted to the front of s so far.
	kept := 0
	for i := range s {
		// After sorting, equal strings are adjacent, so a string is a
		// duplicate exactly when it matches its predecessor.
		if i > 0 && s[i] == s[i-1] {
			continue
		}
		s[kept] = s[i]
		kept++
	}
	out = s[:kept]
	return out
}
32 changes: 32 additions & 0 deletions x/x_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Copyright (C) 2017 Dgraph Labs, Inc. and Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package x

import (
"testing"

"github.com/stretchr/testify/require"
)

func TestRemoveDuplicates(t *testing.T) {
set := RemoveDuplicates([]string{"a", "a", "a", "b", "b", "c", "c"})
require.EqualValues(t, []string{"a", "b", "c"}, set)
}

func TestRemoveDuplicatesWithoutDuplicates(t *testing.T) {
set := RemoveDuplicates([]string{"a", "b", "c", "d"})
require.EqualValues(t, []string{"a", "b", "c", "d"}, set)
}

0 comments on commit 8e412fd

Please sign in to comment.