Skip to content

Commit a4756c2

Browse files
authored
RegExp filter work without filter (#2913)
* let regexp filter not depend on index. * added func HasTokenizer * using HasTokenizer for regexp index * support regexp without trigram index at root and filter * typo * simplified the logic a bit, lint fixes. * added regexp trigram tests * changed tests since they won't trigger error now. * this should return true sometimes * regexp index is no longer required * when filtering use the uid list provided * handleRegexFunction returns error when no index and regexp is called at root * updated test to expect an error in root case * test cases might not be mutex in the future, revert the order. * dont try to match all posting values against regexp * added comment
1 parent c4cae75 commit a4756c2

File tree

4 files changed

+162
-61
lines changed

4 files changed

+162
-61
lines changed

query/query3_test.go

+7-5
Original file line numberDiff line numberDiff line change
@@ -1509,15 +1509,16 @@ func TestFilterRegex1(t *testing.T) {
15091509
{
15101510
me(func: uid(0x01)) {
15111511
name
1512-
friend @filter(regexp(name, /^[a-z A-Z]+$/)) {
1512+
friend @filter(regexp(name, /^[Glen Rh]+$/)) {
15131513
name
15141514
}
15151515
}
15161516
}
15171517
`
15181518

1519-
_, err := processToFastJson(t, query)
1520-
require.Error(t, err)
1519+
js := processToFastJsonNoErr(t, query)
1520+
require.JSONEq(t,
1521+
`{"data": {"me":[{"name":"Michonne", "friend":[{"name":"Glenn Rhee"}]}]}}`, js)
15211522
}
15221523

15231524
func TestFilterRegex2(t *testing.T) {
@@ -1533,8 +1534,9 @@ func TestFilterRegex2(t *testing.T) {
15331534
}
15341535
`
15351536

1536-
_, err := processToFastJson(t, query)
1537-
require.Error(t, err)
1537+
js := processToFastJsonNoErr(t, query)
1538+
require.JSONEq(t,
1539+
`{"data": {"me":[{"name":"Michonne", "friend":[{"name":"Rick Grimes"},{"name":"Glenn Rhee"}]}]}}`, js)
15381540
}
15391541

15401542
func TestFilterRegex3(t *testing.T) {

schema/schema.go

+11
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,17 @@ func (s *state) TokenizerNames(pred string) []string {
180180
return names
181181
}
182182

183+
// HasTokenizer is a convenience func that checks if a given tokenizer is found in pred.
184+
// Returns true if found, else false.
185+
func (s *state) HasTokenizer(id byte, pred string) bool {
186+
for _, t := range s.Tokenizer(pred) {
187+
if t.Identifier() == id {
188+
return true
189+
}
190+
}
191+
return false
192+
}
193+
183194
// IsReversed returns whether the predicate has reverse edge or not
184195
func (s *state) IsReversed(pred string) bool {
185196
s.RLock()

systest/queries_test.go

+74
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ func TestQuery(t *testing.T) {
5050
t.Run("multiple block eval", wrap(MultipleBlockEval))
5151
t.Run("unmatched var assignment eval", wrap(UnmatchedVarEval))
5252
t.Run("hash index queries", wrap(QueryHashIndex))
53+
t.Run("regexp with toggled trigram index", wrap(RegexpToggleTrigramIndex))
5354
t.Run("cleanup", wrap(SchemaQueryCleanup))
5455
}
5556

@@ -632,3 +633,76 @@ func QueryHashIndex(t *testing.T, c *dgo.Dgraph) {
632633
CompareJSON(t, tc.out, string(resp.Json))
633634
}
634635
}
636+
637+
func RegexpToggleTrigramIndex(t *testing.T, c *dgo.Dgraph) {
638+
ctx := context.Background()
639+
640+
require.NoError(t, c.Alter(ctx, &api.Operation{
641+
Schema: `
642+
name: string @index(term) @lang .
643+
`,
644+
}))
645+
646+
txn := c.NewTxn()
647+
_, err := txn.Mutate(ctx, &api.Mutation{
648+
SetNquads: []byte(`
649+
_:x1 <name> "The luck is in the details" .
650+
_:x1 <name> "The art is in the details"@en .
651+
_:x1 <name> "L'art est dans les détails"@fr .
652+
653+
_:x2 <name> "Detach yourself from ostentation" .
654+
_:x2 <name> "Detach yourself from artificiality"@en .
655+
_:x2 <name> "Desprenderse de la artificialidad"@es .
656+
`),
657+
})
658+
require.NoError(t, err)
659+
require.NoError(t, txn.Commit(ctx))
660+
661+
tests := []struct {
662+
in, out string
663+
}{
664+
{
665+
in: `{q(func:has(name)) @filter(regexp(name, /art/)) {name}}`,
666+
out: `{"q":[]}`,
667+
},
668+
{
669+
in: `{q(func:has(name)) @filter(regexp(name@es, /art/)) {name@es}}`,
670+
out: `{"q":[{"name@es": "Desprenderse de la artificialidad"}]}`,
671+
},
672+
{
673+
in: `{q(func:has(name)) @filter(regexp(name@fr, /art/)) {name@fr}}`,
674+
out: `{"q":[{"name@fr": "L'art est dans les détails"}]}`,
675+
},
676+
}
677+
678+
t.Log("testing without trigram index")
679+
for _, tc := range tests {
680+
resp, err := c.NewTxn().Query(ctx, tc.in)
681+
require.NoError(t, err)
682+
CompareJSON(t, tc.out, string(resp.Json))
683+
}
684+
685+
require.NoError(t, c.Alter(ctx, &api.Operation{
686+
Schema: `
687+
name: string @index(trigram) @lang .
688+
`,
689+
}))
690+
691+
t.Log("testing with trigram index")
692+
for _, tc := range tests {
693+
resp, err := c.NewTxn().Query(ctx, tc.in)
694+
require.NoError(t, err)
695+
CompareJSON(t, tc.out, string(resp.Json))
696+
}
697+
698+
require.NoError(t, c.Alter(ctx, &api.Operation{
699+
Schema: `
700+
name: string @index(term) @lang .
701+
`,
702+
}))
703+
704+
t.Log("testing without trigram index at root")
705+
_, err = c.NewTxn().Query(ctx, `{q(func:regexp(name, /art/)) {name}}`)
706+
require.Error(t, err)
707+
require.Contains(t, err.Error(), "Attribute name does not have trigram index for regex matching.")
708+
}

worker/task.go

+70-56
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ var (
5252
emptyValueList = pb.ValueList{Values: []*pb.TaskValue{}}
5353
)
5454

55-
func invokeNetworkRequest(
56-
ctx context.Context, addr string, f func(context.Context, pb.WorkerClient) (interface{}, error)) (interface{}, error) {
55+
func invokeNetworkRequest(ctx context.Context, addr string,
56+
f func(context.Context, pb.WorkerClient) (interface{}, error)) (interface{}, error) {
5757
pl, err := conn.Get().Get(addr)
5858
if err != nil {
5959
return &emptyResult, x.Wrapf(err, "dispatchTaskOverNetwork: while retrieving connection.")
@@ -270,7 +270,7 @@ func parseFuncTypeHelper(name string) (FuncType, string) {
270270

271271
func needsIndex(fnType FuncType) bool {
272272
switch fnType {
273-
case CompareAttrFn, GeoFn, RegexFn, FullTextSearchFn, StandardFn:
273+
case CompareAttrFn, GeoFn, FullTextSearchFn, StandardFn:
274274
return true
275275
default:
276276
return false
@@ -876,88 +876,102 @@ func (qs *queryState) handleCompareScalarFunction(arg funcArgs) error {
876876
}
877877

878878
func (qs *queryState) handleRegexFunction(ctx context.Context, arg funcArgs) error {
879+
span := otrace.FromContext(ctx)
880+
stop := x.SpanTimer(span, "handleRegexFunction")
881+
defer stop()
882+
if span != nil {
883+
span.Annotatef(nil, "Number of uids: %d. args.srcFn: %+v", arg.srcFn.n, arg.srcFn)
884+
}
885+
879886
attr := arg.q.Attr
880887
typ, err := schema.State().TypeOf(attr)
888+
span.Annotatef(nil, "Attr: %s. Type: %s", attr, typ.Name())
881889
if err != nil || !typ.IsScalar() {
882890
return x.Errorf("Attribute not scalar: %s %v", attr, typ)
883891
}
884892
if typ != types.StringID {
885893
return x.Errorf("Got non-string type. Regex match is allowed only on string type.")
886894
}
887-
tokenizers := schema.State().TokenizerNames(attr)
888-
var found bool
889-
for _, t := range tokenizers {
890-
if t == "trigram" { // TODO(tzdybal) - maybe just rename to 'regex' tokenizer?
891-
found = true
892-
}
893-
}
894-
if !found {
895-
return x.Errorf("Attribute %v does not have trigram index for regex matching.", attr)
896-
}
895+
useIndex := schema.State().HasTokenizer(tok.IdentTrigram, attr)
896+
span.Annotatef(nil, "Trigram index found: %t, func at root: %t",
897+
useIndex, arg.srcFn.isFuncAtRoot)
897898

898899
query := cindex.RegexpQuery(arg.srcFn.regex.Syntax)
899900
empty := pb.List{}
900-
uids, err := uidsForRegex(attr, arg, query, &empty)
901+
uids := &pb.List{}
902+
903+
// Here we determine the list of uids to match.
904+
switch {
905+
// If this is a filter eval, use the given uid list (good)
906+
case arg.q.UidList != nil && len(arg.q.UidList.Uids) != 0:
907+
uids = arg.q.UidList
908+
909+
// Prefer to use an index (fast)
910+
case useIndex:
911+
uids, err = uidsForRegex(attr, arg, query, &empty)
912+
if err != nil {
913+
return err
914+
}
915+
916+
// No index and at root, return error instructing user to use `has` or index.
917+
default:
918+
return x.Errorf(
919+
"Attribute %v does not have trigram index for regex matching. "+
920+
"Please add a trigram index or use has/uid function with regexp() as filter.",
921+
attr)
922+
}
923+
924+
arg.out.UidMatrix = append(arg.out.UidMatrix, uids)
901925
isList := schema.State().IsList(attr)
902926
lang := langForFunc(arg.q.Langs)
903-
if uids != nil {
904-
arg.out.UidMatrix = append(arg.out.UidMatrix, uids)
905927

906-
filtered := &pb.List{}
907-
for _, uid := range uids.Uids {
908-
select {
909-
case <-ctx.Done():
910-
return ctx.Err()
911-
default:
912-
}
913-
pl, err := qs.cache.Get(x.DataKey(attr, uid))
914-
if err != nil {
915-
return err
916-
}
928+
span.Annotatef(nil, "Total uids: %d, list: %t lang: %v", len(uids.Uids), isList, lang)
917929

918-
var val types.Val
919-
if lang != "" {
920-
val, err = pl.ValueForTag(arg.q.ReadTs, lang)
921-
} else if isList {
922-
vals, err := pl.AllUntaggedValues(arg.q.ReadTs)
923-
if err == posting.ErrNoValue {
924-
continue
925-
} else if err != nil {
926-
return err
927-
}
928-
for _, val := range vals {
929-
// convert data from binary to appropriate format
930-
strVal, err := types.Convert(val, types.StringID)
931-
if err == nil && matchRegex(strVal, arg.srcFn.regex) {
932-
filtered.Uids = append(filtered.Uids, uid)
933-
break
934-
}
935-
}
930+
filtered := &pb.List{}
931+
for _, uid := range uids.Uids {
932+
select {
933+
case <-ctx.Done():
934+
return ctx.Err()
935+
default:
936+
}
937+
pl, err := qs.cache.Get(x.DataKey(attr, uid))
938+
if err != nil {
939+
return err
940+
}
936941

937-
continue
938-
} else {
939-
val, err = pl.Value(arg.q.ReadTs)
940-
}
942+
vals := make([]types.Val, 1)
943+
switch {
944+
case lang != "":
945+
vals[0], err = pl.ValueForTag(arg.q.ReadTs, lang)
946+
947+
case isList:
948+
vals, err = pl.AllUntaggedValues(arg.q.ReadTs)
941949

950+
default:
951+
vals[0], err = pl.Value(arg.q.ReadTs)
952+
}
953+
if err != nil {
942954
if err == posting.ErrNoValue {
943955
continue
944-
} else if err != nil {
945-
return err
946956
}
957+
return err
958+
}
947959

960+
for _, val := range vals {
948961
// convert data from binary to appropriate format
949962
strVal, err := types.Convert(val, types.StringID)
950963
if err == nil && matchRegex(strVal, arg.srcFn.regex) {
951964
filtered.Uids = append(filtered.Uids, uid)
965+
// NOTE: We only add the uid once.
966+
break
952967
}
953968
}
969+
}
954970

955-
for i := 0; i < len(arg.out.UidMatrix); i++ {
956-
algo.IntersectWith(arg.out.UidMatrix[i], filtered, arg.out.UidMatrix[i])
957-
}
958-
} else {
959-
return err
971+
for i := 0; i < len(arg.out.UidMatrix); i++ {
972+
algo.IntersectWith(arg.out.UidMatrix[i], filtered, arg.out.UidMatrix[i])
960973
}
974+
961975
return nil
962976
}
963977

0 commit comments

Comments
 (0)