
Commit f961af5

srfrog authored and dna2github committed
Improve hash index (hypermodeinc#2887)
* saving state
* added new fingerprint func using BLAKE2b
* renamed function to Hash256 for clarity
* replaced 64-bit fingerprint hash with Hash256
* pickTokenizer uses hash tokenizer when list is lossy
* added tokenizer identifier list for enforcing tokenizer
* compare func uses hash index if available, and eq won't compare values
* fixed minor comment glitches
* use tokenizer identifier consts, change hash to non-lossy
* using non-lossy hash, so no need for extra logic in handleCompareFunction
* simplify pickTokenizer
* using tokenizer id
* added id value for custom tokenizers, IdentCustom
* using tokenizer ids when possible; fixed bug in getInequalityTokens with fulltext indexes
* added hash index tests
* Manish's review. Fixed a new bug introduced by this PR during IdentCustom comparison. Simplified pickTokenizer. Added comments.
* removed the "Long term for exact index" warning
* fixed logic
* pickTokenizer returns an error when a comparison func doesn't have a non-lossy (eq) or sortable (le, ge, gt, lt) index
* added warning for eq comparison without non-lossy tokenizer
* re-fixed this slippery lil bug
* removed extra glog
1 parent ddcbda6 commit f961af5
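
Note: per the message above, the 64-bit farm fingerprint is replaced by a 256-bit BLAKE2b digest, and the commit calls the new helper Hash256. A minimal sketch of what such a helper could look like follows; the signature shown is an assumption for illustration, not the exact function in Dgraph's x package.

package main

import (
	"fmt"

	"golang.org/x/crypto/blake2b"
)

// Hash256 returns a 256-bit BLAKE2b fingerprint of data. Compared with the
// previous 64-bit farm hash, collisions at 256 bits are negligible in
// practice, which is what lets the hash index be treated as non-lossy.
// NOTE: illustrative sketch; the real helper lives in Dgraph's x package
// and may differ in signature.
func Hash256(data []byte) []byte {
	sum := blake2b.Sum256(data)
	return sum[:]
}

func main() {
	fmt.Printf("%x\n", Hash256([]byte("srfrog"))) // 64 hex chars = 32 bytes
}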

File tree

6 files changed: +224 -87 lines


posting/index.go

-6 lines

@@ -65,12 +65,6 @@ func indexTokens(info *indexMutationInfo) ([]string, error) {
 	var tokens []string
 	for _, it := range info.tokenizers {
-		if it.Name() == "exact" && schemaType == types.StringID && len(sv.Value.(string)) > 100 {
-			// Exact index can only be applied for strings so we can safely try to convert Value to
-			// string.
-			glog.Infof("Long term for exact index on predicate: [%s]. "+
-				"Consider switching to hash for better performance.\n", attr)
-		}
 		toks, err := tok.BuildTokens(sv.Value, tok.GetLangTokenizer(it, lang))
 		if err != nil {
 			return tokens, err

posting/lists.go

+1 -1

@@ -159,7 +159,7 @@ func Cleanup() {
 // to lru cache and returns it.
 //
 // plist := Get(key, group)
-// ... // Use plist
+// ... Use plist
 // TODO: This should take a node id and index. And just append all indices to a list.
 // When doing a commit, it should update all the sync index watermarks.
 // worker pkg would push the indices to the watermarks held by lists.

systest/queries_test.go

+114

@@ -46,6 +46,7 @@ func TestQuery(t *testing.T) {
 	t.Run("schema predicate names", wrap(SchemaQueryTestPredicate1))
 	t.Run("schema specific predicate fields", wrap(SchemaQueryTestPredicate2))
 	t.Run("schema specific predicate field", wrap(SchemaQueryTestPredicate3))
+	t.Run("hash index queries", wrap(QueryHashIndex))
 	t.Run("cleanup", wrap(SchemaQueryCleanup))
 }

@@ -318,3 +319,116 @@ func SchemaQueryTestHTTP(t *testing.T, c *dgo.Dgraph) {
 	}`
 	CompareJSON(t, js, string(m["data"]))
 }
+
+func QueryHashIndex(t *testing.T, c *dgo.Dgraph) {
+	ctx := context.Background()
+
+	require.NoError(t, c.Alter(ctx, &api.Operation{
+		Schema: `
+			name: string @index(hash) @lang .
+		`,
+	}))
+
+	txn := c.NewTxn()
+	_, err := txn.Mutate(ctx, &api.Mutation{
+		SetNquads: []byte(`
+			_:p0 <name> "" .
+			_:p1 <name> "0" .
+			_:p2 <name> "srfrog" .
+			_:p3 <name> "Lorem ipsum" .
+			_:p4 <name> "Lorem ipsum dolor sit amet" .
+			_:p5 <name> "Lorem ipsum dolor sit amet, consectetur adipiscing elit" .
+			_:p6 <name> "Lorem ipsum"@en .
+			_:p7 <name> "Lorem ipsum dolor sit amet"@en .
+			_:p8 <name> "Lorem ipsum dolor sit amet, consectetur adipiscing elit"@en .
+			_:p9 <name> "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed varius tellus ut sem bibendum, eu tristique augue congue. Praesent eget odio tincidunt, pellentesque ante sit amet, tempus sem. Donec et tellus et diam facilisis egestas ut ac risus. Proin feugiat risus tristique erat condimentum placerat. Nulla eget ligula tempus, blandit leo vel, accumsan tortor. Phasellus et felis in diam ultricies porta nec in ipsum. Phasellus id leo sagittis, bibendum enim ut, pretium lectus. Quisque ac ex viverra, suscipit turpis sed, scelerisque metus. Sed non dui facilisis, viverra leo eget, vulputate erat. Etiam nec enim sed nisi imperdiet cursus. Suspendisse sed ligula non nisi pharetra varius." .
+			_:pa <name> ""@fr .
+		`),
+	})
+	require.NoError(t, err)
+	require.NoError(t, txn.Commit(ctx))
+
+	tests := []struct {
+		in, out string
+	}{
+		{
+			in: `schema(pred: [name]) {}`,
+			out: `
+			{
+				"schema": [
+					{
+						"index": true,
+						"lang": true,
+						"predicate": "name",
+						"tokenizer": [
+							"hash"
+						],
+						"type": "string"
+					}
+				]
+			}`,
+		},
+		{
+			in:  `{q(func:eq(name,"")){name}}`,
+			out: `{"q": [{"name":""}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,"0")){name}}`,
+			out: `{"q": [{"name":"0"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,"srfrog")){name}}`,
+			out: `{"q": [{"name":"srfrog"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,"Lorem ipsum")){name}}`,
+			out: `{"q": [{"name":"Lorem ipsum"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,"Lorem ipsum dolor sit amet")){name}}`,
+			out: `{"q": [{"name":"Lorem ipsum dolor sit amet"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@en,"Lorem ipsum")){name@en}}`,
+			out: `{"q": [{"name@en":"Lorem ipsum"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@.,"Lorem ipsum dolor sit amet")){name@en}}`,
+			out: `{"q": [{"name@en":"Lorem ipsum dolor sit amet"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,["srfrog"])){name}}`,
+			out: `{"q": [{"name":"srfrog"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,["srfrog","srf","srfrogg","sr","s"])){name}}`,
+			out: `{"q": [{"name":"srfrog"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,["Lorem ipsum","Lorem ipsum dolor sit amet, consectetur adipiscing elit",""])){name}}`,
+			out: `{"q": [{"name":""},{"name":"Lorem ipsum"},{"name":"Lorem ipsum dolor sit amet, consectetur adipiscing elit"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name,["Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum"])){name}}`,
+			out: `{"q": [{"name":"Lorem ipsum"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@en,["Lorem ipsum","Lorem ipsum dolor sit amet, consectetur adipiscing elit",""])){name@en}}`,
+			out: `{"q": [{"name@en":"Lorem ipsum"},{"name@en":"Lorem ipsum dolor sit amet, consectetur adipiscing elit"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@en,["Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum","Lorem ipsum"])){name@en}}`,
+			out: `{"q": [{"name@en":"Lorem ipsum"}]}`,
+		},
+		{
+			in:  `{q(func:eq(name@.,"")){name@fr}}`,
+			out: `{"q": [{"name@fr":""}]}`,
+		},
+	}

+	for _, tc := range tests {
+		resp, err := c.NewTxn().Query(ctx, tc.in)
+		require.NoError(t, err)
+		CompareJSON(t, tc.out, string(resp.Json))
+	}
+}
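
Note: the duplicate-argument cases above collapse to a single result because each eq value is reduced to its hash token, and equal values produce equal tokens, so they address the same index key. A standalone sketch of that reduction (uniqueTokens is a hypothetical helper for illustration, not Dgraph's API):

package main

import (
	"fmt"

	"golang.org/x/crypto/blake2b"
)

// uniqueTokens maps eq arguments to their 256-bit hash tokens, dropping
// duplicates: identical values always hash to the same token, so they
// address the same index key.
func uniqueTokens(vals []string) map[[32]byte]struct{} {
	toks := make(map[[32]byte]struct{})
	for _, v := range vals {
		toks[blake2b.Sum256([]byte(v))] = struct{}{}
	}
	return toks
}

func main() {
	vals := []string{"Lorem ipsum", "Lorem ipsum", "Lorem ipsum"}
	fmt.Println(len(uniqueTokens(vals))) // 1: duplicates collapse to one token
}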

tok/tok.go

+49 -18

@@ -22,14 +22,36 @@ import (
 	"plugin"
 	"time"

-	farm "github.com/dgryski/go-farm"
 	"github.com/golang/glog"
 	geom "github.com/twpayne/go-geom"
+	"golang.org/x/crypto/blake2b"

 	"github.com/dgraph-io/dgraph/types"
 	"github.com/dgraph-io/dgraph/x"
 )

+// Tokenizer identifiers are unique and can't be reused.
+// The range 0x00 - 0x7f is system reserved.
+// The range 0x80 - 0xff is for custom tokenizers.
+// TODO: use these everywhere where we must ensure a system tokenizer.
+const (
+	IdentNone     = 0x0
+	IdentTerm     = 0x1
+	IdentExact    = 0x2
+	IdentYear     = 0x4
+	IdentMonth    = 0x41
+	IdentDay      = 0x42
+	IdentHour     = 0x43
+	IdentGeo      = 0x5
+	IdentInt      = 0x6
+	IdentFloat    = 0x7
+	IdentFullText = 0x8
+	IdentBool     = 0x9
+	IdentTrigram  = 0xA
+	IdentHash     = 0xB
+	IdentCustom   = 0x80
+)
+
 // Tokenizer defines what a tokenizer must provide.
 type Tokenizer interface {

@@ -103,7 +125,7 @@ func LoadCustomTokenizer(soFile string) {
 	tokenizer := symb.(func() interface{})().(PluginTokenizer)

 	id := tokenizer.Identifier()
-	x.AssertTruef(id >= 0x80,
+	x.AssertTruef(id >= IdentCustom,
 		"custom tokenizer identifier byte must be >= 0x80, but was %#x", id)
 	registerTokenizer(CustomTokenizer{PluginTokenizer: tokenizer})
 }
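
Note: LoadCustomTokenizer above enforces that plugin identifiers start at IdentCustom (0x80). A minimal sketch of a plugin satisfying that contract, modeled on Dgraph's documented custom-tokenizer pattern; names here are illustrative:

// Build with: go build -buildmode=plugin -o anagram.so anagram.go
// Load via the alpha's custom-tokenizer flag (exact flag name varies by version).
package main

import "sort"

// Tokenizer is the symbol LoadCustomTokenizer looks up in the shared object;
// it must return a value implementing the PluginTokenizer methods.
func Tokenizer() interface{} { return AnagramTokenizer{} }

type AnagramTokenizer struct{}

func (AnagramTokenizer) Name() string { return "anagram" }
func (AnagramTokenizer) Type() string { return "string" }

// Tokens emits a single token: the input's bytes in sorted order, so all
// anagrams of a word share one index key.
func (AnagramTokenizer) Tokens(value interface{}) ([]string, error) {
	b := []byte(value.(string))
	sort.Slice(b, func(i, j int) bool { return b[i] < b[j] })
	return []string{string(b)}, nil
}

// Identifier must be >= IdentCustom (0x80); LoadCustomTokenizer asserts this.
func (AnagramTokenizer) Identifier() byte { return 0xfc }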
@@ -142,7 +164,7 @@ func (t GeoTokenizer) Type() string { return "geo" }
 func (t GeoTokenizer) Tokens(v interface{}) ([]string, error) {
 	return types.IndexGeoTokens(v.(geom.T))
 }
-func (t GeoTokenizer) Identifier() byte { return 0x5 }
+func (t GeoTokenizer) Identifier() byte { return IdentGeo }
 func (t GeoTokenizer) IsSortable() bool { return false }
 func (t GeoTokenizer) IsLossy() bool { return true }

@@ -153,7 +175,7 @@ func (t IntTokenizer) Type() string { return "int" }
 func (t IntTokenizer) Tokens(v interface{}) ([]string, error) {
 	return []string{encodeInt(v.(int64))}, nil
 }
-func (t IntTokenizer) Identifier() byte { return 0x6 }
+func (t IntTokenizer) Identifier() byte { return IdentInt }
 func (t IntTokenizer) IsSortable() bool { return true }
 func (t IntTokenizer) IsLossy() bool { return false }

@@ -164,7 +186,7 @@ func (t FloatTokenizer) Type() string { return "float" }
 func (t FloatTokenizer) Tokens(v interface{}) ([]string, error) {
 	return []string{encodeInt(int64(v.(float64)))}, nil
 }
-func (t FloatTokenizer) Identifier() byte { return 0x7 }
+func (t FloatTokenizer) Identifier() byte { return IdentFloat }
 func (t FloatTokenizer) IsSortable() bool { return true }
 func (t FloatTokenizer) IsLossy() bool { return true }

@@ -178,7 +200,7 @@ func (t YearTokenizer) Tokens(v interface{}) ([]string, error) {
 	binary.BigEndian.PutUint16(buf[0:2], uint16(tval.Year()))
 	return []string{string(buf)}, nil
 }
-func (t YearTokenizer) Identifier() byte { return 0x4 }
+func (t YearTokenizer) Identifier() byte { return IdentYear }
 func (t YearTokenizer) IsSortable() bool { return true }
 func (t YearTokenizer) IsLossy() bool { return true }

@@ -193,7 +215,7 @@ func (t MonthTokenizer) Tokens(v interface{}) ([]string, error) {
 	binary.BigEndian.PutUint16(buf[2:4], uint16(tval.Month()))
 	return []string{string(buf)}, nil
 }
-func (t MonthTokenizer) Identifier() byte { return 0x41 }
+func (t MonthTokenizer) Identifier() byte { return IdentMonth }
 func (t MonthTokenizer) IsSortable() bool { return true }
 func (t MonthTokenizer) IsLossy() bool { return true }

@@ -209,7 +231,7 @@ func (t DayTokenizer) Tokens(v interface{}) ([]string, error) {
 	binary.BigEndian.PutUint16(buf[4:6], uint16(tval.Day()))
 	return []string{string(buf)}, nil
 }
-func (t DayTokenizer) Identifier() byte { return 0x42 }
+func (t DayTokenizer) Identifier() byte { return IdentDay }
 func (t DayTokenizer) IsSortable() bool { return true }
 func (t DayTokenizer) IsLossy() bool { return true }

@@ -226,7 +248,7 @@ func (t HourTokenizer) Tokens(v interface{}) ([]string, error) {
 	binary.BigEndian.PutUint16(buf[6:8], uint16(tval.Hour()))
 	return []string{string(buf)}, nil
 }
-func (t HourTokenizer) Identifier() byte { return 0x43 }
+func (t HourTokenizer) Identifier() byte { return IdentHour }
 func (t HourTokenizer) IsSortable() bool { return true }
 func (t HourTokenizer) IsLossy() bool { return true }

@@ -242,7 +264,7 @@ func (t TermTokenizer) Tokens(v interface{}) ([]string, error) {
 	tokens := termAnalyzer.Analyze([]byte(str))
 	return uniqueTerms(tokens), nil
 }
-func (t TermTokenizer) Identifier() byte { return 0x1 }
+func (t TermTokenizer) Identifier() byte { return IdentTerm }
 func (t TermTokenizer) IsSortable() bool { return false }
 func (t TermTokenizer) IsLossy() bool { return true }

@@ -256,7 +278,7 @@ func (t ExactTokenizer) Tokens(v interface{}) ([]string, error) {
 	}
 	return nil, x.Errorf("Exact indices only supported for string types")
 }
-func (t ExactTokenizer) Identifier() byte { return 0x2 }
+func (t ExactTokenizer) Identifier() byte { return IdentExact }
 func (t ExactTokenizer) IsSortable() bool { return true }
 func (t ExactTokenizer) IsLossy() bool { return false }

@@ -279,7 +301,7 @@ func (t FullTextTokenizer) Tokens(v interface{}) ([]string, error) {
 	// finally, return the terms.
 	return uniqueTerms(tokens), nil
 }
-func (t FullTextTokenizer) Identifier() byte { return 0x8 }
+func (t FullTextTokenizer) Identifier() byte { return IdentFullText }
 func (t FullTextTokenizer) IsSortable() bool { return false }
 func (t FullTextTokenizer) IsLossy() bool { return true }

@@ -321,7 +343,7 @@ func (t BoolTokenizer) Tokens(v interface{}) ([]string, error) {
 	}
 	return []string{encodeInt(b)}, nil
 }
-func (t BoolTokenizer) Identifier() byte { return 0x9 }
+func (t BoolTokenizer) Identifier() byte { return IdentBool }
 func (t BoolTokenizer) IsSortable() bool { return false }
 func (t BoolTokenizer) IsLossy() bool { return false }

@@ -345,7 +367,7 @@ func (t TrigramTokenizer) Tokens(v interface{}) ([]string, error) {
 	}
 	return nil, nil
 }
-func (t TrigramTokenizer) Identifier() byte { return 0xA }
+func (t TrigramTokenizer) Identifier() byte { return IdentTrigram }
 func (t TrigramTokenizer) IsSortable() bool { return false }
 func (t TrigramTokenizer) IsLossy() bool { return true }
@@ -358,13 +380,22 @@ func (t HashTokenizer) Tokens(v interface{}) ([]string, error) {
 	if !ok {
 		return nil, x.Errorf("Hash tokenizer only supported for string types")
 	}
-	var hash [8]byte
-	binary.BigEndian.PutUint64(hash[:], farm.Hash64([]byte(term)))
+	// Blake2 is a hash function equivalent of SHA series, but faster. SHA is the best hash function
+	// for doing checksum of content, because they have low collision ratios. See issue #2776.
+	hash := blake2b.Sum256([]byte(term))
+	if len(hash) == 0 {
+		return nil, x.Errorf("Hash tokenizer failed to create hash")
+	}
 	return []string{string(hash[:])}, nil
 }
-func (t HashTokenizer) Identifier() byte { return 0xB }
+func (t HashTokenizer) Identifier() byte { return IdentHash }
 func (t HashTokenizer) IsSortable() bool { return false }
-func (t HashTokenizer) IsLossy() bool { return true }
+
+// We have switched HashTokenizer to be non-lossy. This allows us to avoid having to retrieve values
+// for the returned results, and compare them against the value in the query, which is slow. There
+// is very low probability of collisions with a 256-bit hash. We use that fact to speed up equality
+// query operations using the hash index.
+func (t HashTokenizer) IsLossy() bool { return false }

 // PluginTokenizer is implemented by external plugins loaded dynamically via
 // *.so files. It follows the implementation semantics of the Tokenizer
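
Note: a standalone sketch (not Dgraph code) showing the change in token shape, from an 8-byte farm fingerprint to a 32-byte BLAKE2b digest:

package main

import (
	"encoding/binary"
	"fmt"

	farm "github.com/dgryski/go-farm"
	"golang.org/x/crypto/blake2b"
)

func main() {
	term := []byte("srfrog")

	// Old scheme: 64-bit farm fingerprint, an 8-byte token. Collisions are
	// plausible at scale, so the index had to stay lossy and eq had to
	// re-check the stored values.
	var old [8]byte
	binary.BigEndian.PutUint64(old[:], farm.Hash64(term))

	// New scheme: 256-bit BLAKE2b digest, a 32-byte token. Collisions are
	// negligible, so the index is treated as non-lossy and eq can trust it.
	sum := blake2b.Sum256(term)

	fmt.Println(len(old), len(sum)) // 8 32
}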

worker/task.go

+9

@@ -962,12 +962,21 @@ func (qs *queryState) handleRegexFunction(ctx context.Context, arg funcArgs) error {
 }

 func (qs *queryState) handleCompareFunction(ctx context.Context, arg funcArgs) error {
+	span := otrace.FromContext(ctx)
+	stop := x.SpanTimer(span, "handleCompareFunction")
+	defer stop()
+	if span != nil {
+		span.Annotatef(nil, "Number of uids: %d. args.srcFn: %+v", arg.srcFn.n, arg.srcFn)
+	}
+
 	attr := arg.q.Attr
+	span.Annotatef(nil, "Attr: %s. Fname: %s", attr, arg.srcFn.fname)
 	tokenizer, err := pickTokenizer(attr, arg.srcFn.fname)
 	// We should already have checked this in getInequalityTokens.
 	x.Check(err)
 	// Only if the tokenizer that we used IsLossy, then we need to fetch
 	// and compare the actual values.
+	span.Annotatef(nil, "Tokenizer: %s, Lossy: %t", tokenizer.Name(), tokenizer.IsLossy())
 	if tokenizer.IsLossy() {
 		// Need to evaluate inequality for entries in the first bucket.
 		typ, err := schema.State().TypeOf(attr)
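
Note: handleCompareFunction relies on pickTokenizer having already selected a usable index. Per the commit message, eq needs a non-lossy tokenizer while le/ge/lt/gt need a sortable one, otherwise an error is returned. A rough, self-contained sketch of that rule (illustrative only, not the exact body of pickTokenizer):

package main

import "fmt"

// Tokenizer captures just the methods this sketch needs from tok.Tokenizer.
type Tokenizer interface {
	Name() string
	IsSortable() bool
	IsLossy() bool
}

// pickTokenizerSketch mirrors the selection rule described in the commit
// message: eq prefers a non-lossy tokenizer (e.g. hash); inequalities need
// a sortable one; otherwise fail, or fall back for eq with a warning.
func pickTokenizerSketch(fname string, tokenizers []Tokenizer) (Tokenizer, error) {
	if len(tokenizers) == 0 {
		return nil, fmt.Errorf("attribute is not indexed")
	}
	for _, t := range tokenizers {
		switch fname {
		case "eq":
			if !t.IsLossy() {
				return t, nil // non-lossy: eq needs no value comparison
			}
		default: // le, ge, lt, gt
			if t.IsSortable() {
				return t, nil
			}
		}
	}
	if fname != "eq" {
		return nil, fmt.Errorf("no sortable index for %q", fname)
	}
	// eq with only lossy tokenizers: usable, but values must be compared.
	return tokenizers[0], nil
}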
