
Commit f961af5

srfrog authored and dna2github committed
Improve hash index (hypermodeinc#2887)
* saving state
* added new fingerprint func using BLAKE2b
* renamed function to Hash256 for clarity
* replaced 64-bit fingerprint hash with Hash256
* pickTokenizer uses hash tokenizer when list is lossy
* added tokenizer identifier list for enforcing tokenizer
* compare func uses hash index if available, and eq won't compare values
* fixed minor comment glitches
* use tokenizer identifier consts, change hash to non-lossy
* using non-lossy hash, so no need for extra logic in handleCompareFunction
* simplify pickTokenizer
* using tokenizer id
* added id value for custom tokenizers, IdentCustom
* using tokenizer ids when possible; fixed bug in getInequalityTokens with fulltext indexes
* added hash index tests
* Manish's review. Fixed a new bug introduced by this PR during IdentCustom comparison. Simplified pickTokenizer. Added comments.
* removed the "Long term for exact index" warning
* fixed logic
* pickTokenizer returns an error when a comparison func doesn't have a non-lossy (eq) or sortable (le, ge, gt, lt) index
* added warning for eq comparison without non-lossy tokenizer
* re-fixed this slippery lil bug
* removed extra glog
1 parent ddcbda6 commit f961af5
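
Note: per the message above, the 64-bit farm fingerprint is replaced by a 256-bit BLAKE2b digest, and the commit calls the new helper Hash256. A minimal sketch of what such a helper could look like follows; the signature shown is an assumption for illustration, not the exact function in Dgraph's x package.

package main

import (
	"fmt"

	"golang.org/x/crypto/blake2b"
)

// Hash256 returns a 256-bit BLAKE2b fingerprint of data. Compared with the
// previous 64-bit farm hash, collisions at 256 bits are negligible in
// practice, which is what lets the hash index be treated as non-lossy.
// NOTE: illustrative sketch; the real helper lives in Dgraph's x package
// and may differ in signature.
func Hash256(data []byte) []byte {
	sum := blake2b.Sum256(data)
	return sum[:]
}

func main() {
	fmt.Printf("%x\n", Hash256([]byte("srfrog"))) // 64 hex chars = 32 bytes
}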

File tree

6 files changed: +224 -87 lines


posting/index.go

-6 lines

@@ -65,12 +65,6 @@ func indexTokens(info *indexMutationInfo) ([]string, error) {
 	var tokens []string
 	for _, it := range info.tokenizers {
-		if it.Name() == "exact" && schemaType == types.StringID && len(sv.Value.(string)) > 100 {
-			// Exact index can only be applied for strings so we can safely try to convert Value to
-			// string.
-			glog.Infof("Long term for exact index on predicate: [%s]. "+
-				"Consider switching to hash for better performance.\n", attr)
-		}
 		toks, err := tok.BuildTokens(sv.Value, tok.GetLangTokenizer(it, lang))
 		if err != nil {
 			return tokens, err

posting/lists.go

+1 -1

@@ -159,7 +159,7 @@ func Cleanup() {
 // to lru cache and returns it.
 //
 // plist := Get(key, group)
-// ... // Use plist
+// ... Use plist
 // TODO: This should take a node id and index. And just append all indices to a list.
 // When doing a commit, it should update all the sync index watermarks.
 // worker pkg would push the indices to the watermarks held by lists.

systest/queries_test.go

+114

@@ -46,6 +46,7 @@ func TestQuery(t *testing.T) {
 	t.Run("schema predicate names", wrap(SchemaQueryTestPredicate1))
 	t.Run("schema specific predicate fields", wrap(SchemaQueryTestPredicate2))
 	t.Run("schema specific predicate field", wrap(SchemaQueryTestPredicate3))
+	t.Run("hash index queries", wrap(QueryHashIndex))
 	t.Run("cleanup", wrap(SchemaQueryCleanup))
 }

@@ -318,3 +319,116 @@ func SchemaQueryTestHTTP(t *testing.T, c *dgo.Dgraph) {
 	}`
 	CompareJSON(t, js, string(m["data"]))
 }
+
+func QueryHashIndex(t *testing.T, c *dgo.Dgraph) {
+	ctx := context.Background()
+
+	require.NoError(t, c.Alter(ctx, &api.Operation{
+		Schema: `
+			name: string @index(hash) @lang .
+		`,
+	}))
+
+	txn := c.NewTxn()
+	_, err := txn.Mutate(ctx, &api.Mutation{
+		SetNquads: []byte(`
+			_:p0 <name> "" .
+			_:p1 <name> "0" .
+			_:p2 <name> "srfrog" .
+			_:p3 <name> "Lorem ipsum" .
+			_:p4 <name> "Lorem ipsum dolor sit amet" .
+			_:p5 <name> "Lorem ipsum dolor sit amet, consectetur adipiscing elit" .
+			_:p6 <name> "Lorem ipsum"@en .
+			_:p7 <name> "Lorem ipsum dolor sit amet"@en .
+			_:p8 <name> "Lorem ipsum dolor sit amet, consectetur adipiscing elit"@en .
+			_:p9 <name> "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed varius tellus ut sem bibendum, eu tristique augue congue. Praesent eget odio tincidunt, pellentesque ante sit amet, tempus sem. Donec et tellus et diam facilisis egestas ut ac risus. Proin feugiat risus tristique erat condimentum placerat. Nulla eget ligula tempus, blandit leo vel, accumsan tortor. Phasellus et felis in diam ultricies porta nec in ipsum. Phasellus id leo sagittis, bibendum enim ut, pretium lectus. Quisque ac ex viverra, suscipit turpis sed, scelerisque metus. Sed non dui facilisis, viverra leo eget, vulputate erat. Etiam nec enim sed nisi imperdiet cursus. Suspendisse sed ligula non nisi pharetra varius." .
+			_:pa <name> ""@fr .
+		`),
+	})
+	require.NoError(t, err)
+	require.NoError(t, txn.Commit(ctx))
+
+	tests := []struct {
+		in, out string
+	}{
+		{
+			in: `schema(pred: [name]) {}`,
+			out: `
+			{
+				"schema": [
+					{
+						"index": true,
+						"lang": true,
+						"predicate": "name",
+						"tokenizer": [
+							"hash"
+						],
+						"type": "string"
+					}
+				]
+			}`,
+		},
+		{
+			in:  `{q(func:eq(name,"")){name}}`,
+			out: `{"q": [{"name":""}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,"0")){name}}`,
+			out: `{"q": [{"name":"0"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,"srfrog")){name}}`,
+			out: `{"q": [{"name":"srfrog"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,"Lorem ipsum")){name}}`,
+			out: `{"q": [{"name":"Lorem ipsum"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,"Lorem ipsum dolor sit amet")){name}}`,
+			out: `{"q": [{"name":"Lorem ipsum dolor sit amet"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@en,"Lorem ipsum")){name@en}}`,
+			out: `{"q": [{"name@en":"Lorem ipsum"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@.,"Lorem ipsum dolor sit amet")){name@en}}`,
+			out: `{"q": [{"name@en":"Lorem ipsum dolor sit amet"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,["srfrog"])){name}}`,
+			out: `{"q": [{"name":"srfrog"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,["srfrog","srf","srfrogg","sr","s"])){name}}`,
+			out: `{"q": [{"name":"srfrog"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,["Lorem ipsum","Lorem ipsum dolor sit amet, consectetur adipiscing elit",""])){name}}`,
+			out: `{"q": [{"name":""},{"name":"Lorem ipsum"},{"name":"Lorem ipsum dolor sit amet, consectetur adipiscing elit"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,["Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum"])){name}}`,
+			out: `{"q": [{"name":"Lorem ipsum"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@en,["Lorem ipsum","Lorem ipsum dolor sit amet, consectetur adipiscing elit",""])){name@en}}`,
+			out: `{"q": [{"name@en":"Lorem ipsum"},{"name@en":"Lorem ipsum dolor sit amet, consectetur adipiscing elit"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@en,["Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum"])){name@en}}`,
+			out: `{"q": [{"name@en":"Lorem ipsum"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@.,"")){name@fr}}`,
+			out: `{"q": [{"name@fr":""}]}`,
+		},
+	}

+	for _, tc := range tests {
+		resp, err := c.NewTxn().Query(ctx, tc.in)
+		require.NoError(t, err)
+		CompareJSON(t, tc.out, string(resp.Json))
+	}
+}
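
Note: the duplicate-argument cases above collapse to a single result because each eq value is reduced to its hash token, and equal values produce equal tokens, so they address the same index key. A standalone sketch of that reduction (uniqueTokens is a hypothetical helper for illustration, not Dgraph's API):

package main

import (
	"fmt"

	"golang.org/x/crypto/blake2b"
)

// uniqueTokens maps eq arguments to their 256-bit hash tokens, dropping
// duplicates: identical values always hash to the same token, so they
// address the same index key.
func uniqueTokens(vals []string) map[[32]byte]struct{} {
	toks := make(map[[32]byte]struct{})
	for _, v := range vals {
		toks[blake2b.Sum256([]byte(v))] = struct{}{}
	}
	return toks
}

func main() {
	vals := []string{"Lorem ipsum", "Lorem ipsum", "Lorem ipsum"}
	fmt.Println(len(uniqueTokens(vals))) // 1: duplicates collapse to one token
}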

tok/tok.go

+49 -18

@@ -22,14 +22,36 @@ import (
 	"plugin"
 	"time"

-	farm "github.com/dgryski/go-farm"
 	"github.com/golang/glog"
 	geom "github.com/twpayne/go-geom"
+	"golang.org/x/crypto/blake2b"

 	"github.com/dgraph-io/dgraph/types"
 	"github.com/dgraph-io/dgraph/x"
 )

+// Tokenizer identifiers are unique and can't be reused.
+// The range 0x00 - 0x7f is system reserved.
+// The range 0x80 - 0xff is for custom tokenizers.
+// TODO: use these everywhere where we must ensure a system tokenizer.
+const (
+	IdentNone     = 0x0
+	IdentTerm     = 0x1
+	IdentExact    = 0x2
+	IdentYear     = 0x4
+	IdentMonth    = 0x41
+	IdentDay      = 0x42
+	IdentHour     = 0x43
+	IdentGeo      = 0x5
+	IdentInt      = 0x6
+	IdentFloat    = 0x7
+	IdentFullText = 0x8
+	IdentBool     = 0x9
+	IdentTrigram  = 0xA
+	IdentHash     = 0xB
+	IdentCustom   = 0x80
+)
+
 // Tokenizer defines what a tokenizer must provide.
 type Tokenizer interface {

@@ -103,7 +125,7 @@ func LoadCustomTokenizer(soFile string) {
 	tokenizer := symb.(func() interface{})().(PluginTokenizer)

 	id := tokenizer.Identifier()
-	x.AssertTruef(id >= 0x80,
+	x.AssertTruef(id >= IdentCustom,
 		"custom tokenizer identifier byte must be >= 0x80, but was %#x", id)
 	registerTokenizer(CustomTokenizer{PluginTokenizer: tokenizer})
 }
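
Note: LoadCustomTokenizer above enforces that plugin identifiers start at IdentCustom (0x80). A minimal sketch of a plugin satisfying that contract, modeled on Dgraph's documented custom-tokenizer pattern; names here are illustrative:

// Build with: go build -buildmode=plugin -o anagram.so anagram.go
// Load via the alpha's custom-tokenizer flag (exact flag name varies by version).
package main

import "sort"

// Tokenizer is the symbol LoadCustomTokenizer looks up in the shared object;
// it must return a value implementing the PluginTokenizer methods.
func Tokenizer() interface{} { return AnagramTokenizer{} }

type AnagramTokenizer struct{}

func (AnagramTokenizer) Name() string { return "anagram" }
func (AnagramTokenizer) Type() string { return "string" }

// Tokens emits a single token: the input's bytes in sorted order, so all
// anagrams of a word share one index key.
func (AnagramTokenizer) Tokens(value interface{}) ([]string, error) {
	b := []byte(value.(string))
	sort.Slice(b, func(i, j int) bool { return b[i] < b[j] })
	return []string{string(b)}, nil
}

// Identifier must be >= IdentCustom (0x80); LoadCustomTokenizer asserts this.
func (AnagramTokenizer) Identifier() byte { return 0xfc }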
@@ -142,7 +164,7 @@ func (t GeoTokenizer) Type() string { return "geo" }
 func (t GeoTokenizer) Tokens(v interface{}) ([]string, error) {
 	return types.IndexGeoTokens(v.(geom.T))
 }
-func (t GeoTokenizer) Identifier() byte { return 0x5 }
+func (t GeoTokenizer) Identifier() byte { return IdentGeo }
 func (t GeoTokenizer) IsSortable() bool { return false }
 func (t GeoTokenizer) IsLossy() bool { return true }

@@ -153,7 +175,7 @@ func (t IntTokenizer) Type() string { return "int" }
 func (t IntTokenizer) Tokens(v interface{}) ([]string, error) {
 	return []string{encodeInt(v.(int64))}, nil
 }
-func (t IntTokenizer) Identifier() byte { return 0x6 }
+func (t IntTokenizer) Identifier() byte { return IdentInt }
 func (t IntTokenizer) IsSortable() bool { return true }
 func (t IntTokenizer) IsLossy() bool { return false }

@@ -164,7 +186,7 @@ func (t FloatTokenizer) Type() string { return "float" }
 func (t FloatTokenizer) Tokens(v interface{}) ([]string, error) {
 	return []string{encodeInt(int64(v.(float64)))}, nil
 }
-func (t FloatTokenizer) Identifier() byte { return 0x7 }
+func (t FloatTokenizer) Identifier() byte { return IdentFloat }
 func (t FloatTokenizer) IsSortable() bool { return true }
 func (t FloatTokenizer) IsLossy() bool { return true }

@@ -178,7 +200,7 @@ func (t YearTokenizer) Tokens(v interface{}) ([]string, error) {
 	binary.BigEndian.PutUint16(buf[0:2], uint16(tval.Year()))
 	return []string{string(buf)}, nil
 }
-func (t YearTokenizer) Identifier() byte { return 0x4 }
+func (t YearTokenizer) Identifier() byte { return IdentYear }
 func (t YearTokenizer) IsSortable() bool { return true }
 func (t YearTokenizer) IsLossy() bool { return true }

@@ -193,7 +215,7 @@ func (t MonthTokenizer) Tokens(v interface{}) ([]string, error) {
 	binary.BigEndian.PutUint16(buf[2:4], uint16(tval.Month()))
 	return []string{string(buf)}, nil
 }
-func (t MonthTokenizer) Identifier() byte { return 0x41 }
+func (t MonthTokenizer) Identifier() byte { return IdentMonth }
 func (t MonthTokenizer) IsSortable() bool { return true }
 func (t MonthTokenizer) IsLossy() bool { return true }

@@ -209,7 +231,7 @@ func (t DayTokenizer) Tokens(v interface{}) ([]string, error) {
 	binary.BigEndian.PutUint16(buf[4:6], uint16(tval.Day()))
 	return []string{string(buf)}, nil
 }
-func (t DayTokenizer) Identifier() byte { return 0x42 }
+func (t DayTokenizer) Identifier() byte { return IdentDay }
 func (t DayTokenizer) IsSortable() bool { return true }
 func (t DayTokenizer) IsLossy() bool { return true }

@@ -226,7 +248,7 @@ func (t HourTokenizer) Tokens(v interface{}) ([]string, error) {
 	binary.BigEndian.PutUint16(buf[6:8], uint16(tval.Hour()))
 	return []string{string(buf)}, nil
 }
-func (t HourTokenizer) Identifier() byte { return 0x43 }
+func (t HourTokenizer) Identifier() byte { return IdentHour }
 func (t HourTokenizer) IsSortable() bool { return true }
 func (t HourTokenizer) IsLossy() bool { return true }

@@ -242,7 +264,7 @@ func (t TermTokenizer) Tokens(v interface{}) ([]string, error) {
 	tokens := termAnalyzer.Analyze([]byte(str))
 	return uniqueTerms(tokens), nil
 }
-func (t TermTokenizer) Identifier() byte { return 0x1 }
+func (t TermTokenizer) Identifier() byte { return IdentTerm }
 func (t TermTokenizer) IsSortable() bool { return false }
 func (t TermTokenizer) IsLossy() bool { return true }

@@ -256,7 +278,7 @@ func (t ExactTokenizer) Tokens(v interface{}) ([]string, error) {
 	}
 	return nil, x.Errorf("Exact indices only supported for string types")
 }
-func (t ExactTokenizer) Identifier() byte { return 0x2 }
+func (t ExactTokenizer) Identifier() byte { return IdentExact }
 func (t ExactTokenizer) IsSortable() bool { return true }
 func (t ExactTokenizer) IsLossy() bool { return false }

@@ -279,7 +301,7 @@ func (t FullTextTokenizer) Tokens(v interface{}) ([]string, error) {
 	// finally, return the terms.
 	return uniqueTerms(tokens), nil
 }
-func (t FullTextTokenizer) Identifier() byte { return 0x8 }
+func (t FullTextTokenizer) Identifier() byte { return IdentFullText }
 func (t FullTextTokenizer) IsSortable() bool { return false }
 func (t FullTextTokenizer) IsLossy() bool { return true }

@@ -321,7 +343,7 @@ func (t BoolTokenizer) Tokens(v interface{}) ([]string, error) {
 	}
 	return []string{encodeInt(b)}, nil
 }
-func (t BoolTokenizer) Identifier() byte { return 0x9 }
+func (t BoolTokenizer) Identifier() byte { return IdentBool }
 func (t BoolTokenizer) IsSortable() bool { return false }
 func (t BoolTokenizer) IsLossy() bool { return false }

@@ -345,7 +367,7 @@ func (t TrigramTokenizer) Tokens(v interface{}) ([]string, error) {
 	}
 	return nil, nil
 }
-func (t TrigramTokenizer) Identifier() byte { return 0xA }
+func (t TrigramTokenizer) Identifier() byte { return IdentTrigram }
 func (t TrigramTokenizer) IsSortable() bool { return false }
 func (t TrigramTokenizer) IsLossy() bool { return true }
@@ -358,13 +380,22 @@ func (t HashTokenizer) Tokens(v interface{}) ([]string, error) {
 	if !ok {
 		return nil, x.Errorf("Hash tokenizer only supported for string types")
 	}
-	var hash [8]byte
-	binary.BigEndian.PutUint64(hash[:], farm.Hash64([]byte(term)))
+	// Blake2 is a hash function equivalent of SHA series, but faster. SHA is the best hash function
+	// for doing checksum of content, because they have low collision ratios. See issue #2776.
+	hash := blake2b.Sum256([]byte(term))
+	if len(hash) == 0 {
+		return nil, x.Errorf("Hash tokenizer failed to create hash")
+	}
 	return []string{string(hash[:])}, nil
 }
-func (t HashTokenizer) Identifier() byte { return 0xB }
+func (t HashTokenizer) Identifier() byte { return IdentHash }
 func (t HashTokenizer) IsSortable() bool { return false }
-func (t HashTokenizer) IsLossy() bool { return true }
+
+// We have switched HashTokenizer to be non-lossy. This allows us to avoid having to retrieve values
+// for the returned results, and compare them against the value in the query, which is slow. There
+// is very low probability of collisions with a 256-bit hash. We use that fact to speed up equality
+// query operations using the hash index.
+func (t HashTokenizer) IsLossy() bool { return false }

 // PluginTokenizer is implemented by external plugins loaded dynamically via
 // *.so files. It follows the implementation semantics of the Tokenizer
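
Note: a standalone sketch (not Dgraph code) showing the change in token shape, from an 8-byte farm fingerprint to a 32-byte BLAKE2b digest:

package main

import (
	"encoding/binary"
	"fmt"

	farm "github.com/dgryski/go-farm"
	"golang.org/x/crypto/blake2b"
)

func main() {
	term := []byte("srfrog")

	// Old scheme: 64-bit farm fingerprint, an 8-byte token. Collisions are
	// plausible at scale, so the index had to stay lossy and eq had to
	// re-check the stored values.
	var old [8]byte
	binary.BigEndian.PutUint64(old[:], farm.Hash64(term))

	// New scheme: 256-bit BLAKE2b digest, a 32-byte token. Collisions are
	// negligible, so the index is treated as non-lossy and eq can trust it.
	sum := blake2b.Sum256(term)

	fmt.Println(len(old), len(sum)) // 8 32
}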

worker/task.go

+9

@@ -962,12 +962,21 @@ func (qs *queryState) handleRegexFunction(ctx context.Context, arg funcArgs) error {
 }

 func (qs *queryState) handleCompareFunction(ctx context.Context, arg funcArgs) error {
+	span := otrace.FromContext(ctx)
+	stop := x.SpanTimer(span, "handleCompareFunction")
+	defer stop()
+	if span != nil {
+		span.Annotatef(nil, "Number of uids: %d. args.srcFn: %+v", arg.srcFn.n, arg.srcFn)
+	}
+
 	attr := arg.q.Attr
+	span.Annotatef(nil, "Attr: %s. Fname: %s", attr, arg.srcFn.fname)
 	tokenizer, err := pickTokenizer(attr, arg.srcFn.fname)
 	// We should already have checked this in getInequalityTokens.
 	x.Check(err)
 	// Only if the tokenizer that we used IsLossy, then we need to fetch
 	// and compare the actual values.
+	span.Annotatef(nil, "Tokenizer: %s, Lossy: %t", tokenizer.Name(), tokenizer.IsLossy())
 	if tokenizer.IsLossy() {
 		// Need to evaluate inequality for entries in the first bucket.
 		typ, err := schema.State().TypeOf(attr)
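
Note: handleCompareFunction relies on pickTokenizer having already selected a usable index. Per the commit message, eq needs a non-lossy tokenizer while le/ge/lt/gt need a sortable one, otherwise an error is returned. A rough, self-contained sketch of that rule (illustrative only, not the exact body of pickTokenizer):

package main

import "fmt"

// Tokenizer captures just the methods this sketch needs from tok.Tokenizer.
type Tokenizer interface {
	Name() string
	IsSortable() bool
	IsLossy() bool
}

// pickTokenizerSketch mirrors the selection rule described in the commit
// message: eq prefers a non-lossy tokenizer (e.g. hash); inequalities need
// a sortable one; otherwise fail, or fall back for eq with a warning.
func pickTokenizerSketch(fname string, tokenizers []Tokenizer) (Tokenizer, error) {
	if len(tokenizers) == 0 {
		return nil, fmt.Errorf("attribute is not indexed")
	}
	for _, t := range tokenizers {
		switch fname {
		case "eq":
			if !t.IsLossy() {
				return t, nil // non-lossy: eq needs no value comparison
			}
		default: // le, ge, lt, gt
			if t.IsSortable() {
				return t, nil
			}
		}
	}
	if fname != "eq" {
		return nil, fmt.Errorf("no sortable index for %q", fname)
	}
	// eq with only lossy tokenizers: usable, but values must be compared.
	return tokenizers[0], nil
}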
