Commit 8fc4301

GuptaManan100 authored and tanjinx committed
backport upstream 16655
1 parent 44b64bb commit 8fc4301

7 files changed: +302 −65 lines changed

go/vt/discovery/keyspace_events.go (+87 −12)

@@ -19,7 +19,9 @@ package discovery
 import (
 	"context"
 	"fmt"
+	"slices"
 	"sync"
+	"time"
 
 	"google.golang.org/protobuf/proto"
 
@@ -36,6 +38,11 @@ import (
 	vschemapb "vitess.io/vitess/go/vt/proto/vschema"
 )
 
+var (
+	// waitConsistentKeyspacesCheck is the amount of time to wait for between checks to verify the keyspace is consistent.
+	waitConsistentKeyspacesCheck = 100 * time.Millisecond
+)
+
 // KeyspaceEventWatcher is an auxiliary watcher that watches all availability incidents
 // for all keyspaces in a Vitess cell and notifies listeners when the events have been resolved.
 // Right now this is capable of detecting the end of failovers, both planned and unplanned,
@@ -643,28 +650,53 @@ func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(ctx context.Context, tar
 	return ks.beingResharded(target.Shard)
 }
 
-// PrimaryIsNotServing checks if the reason why the given target is not accessible right now is
-// that the primary tablet for that shard is not serving. This is possible during a Planned Reparent Shard
-// operation. Just as the operation completes, a new primary will be elected, and it will send its own healthcheck
-// stating that it is serving. We should buffer requests until that point.
-// There are use cases where people do not run with a Primary server at all, so we must verify that
-// we only start buffering when a primary was present, and it went not serving.
-// The shard state keeps track of the current primary and the last externally reparented time, which we can use
-// to determine that there was a serving primary which now became non serving. This is only possible in a DemotePrimary
-// RPC which are only called from ERS and PRS. So buffering will stop when these operations succeed.
-// We return the tablet alias of the primary if it is serving.
-func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(ctx context.Context, target *querypb.Target) (*topodatapb.TabletAlias, bool) {
+// ShouldStartBufferingForTarget checks if we should be starting buffering for the given target.
+// We check the following things before we start buffering -
+// 1. The shard must have a primary.
+// 2. The primary must be non-serving.
+// 3. The keyspace must be marked inconsistent.
+//
+// This buffering is meant to kick in during a Planned Reparent Shard operation.
+// As part of that operation the old primary will become non-serving. At that point
+// this code should return true to start buffering requests.
+// Just as the PRS operation completes, a new primary will be elected, and
+// it will send its own healthcheck stating that it is serving. We should buffer requests until
+// that point.
+//
+// There are use cases where people do not run with a Primary server at all, so we must
+// verify that we only start buffering when a primary was present, and it went not serving.
+// The shard state keeps track of the current primary and the last externally reparented time, which
+// we can use to determine that there was a serving primary which now became non serving. This is
+// only possible in a DemotePrimary RPC which are only called from ERS and PRS. So buffering will
+// stop when these operations succeed. We also return the tablet alias of the primary if it is serving.
+func (kew *KeyspaceEventWatcher) ShouldStartBufferingForTarget(ctx context.Context, target *querypb.Target) (*topodatapb.TabletAlias, bool) {
 	if target.TabletType != topodatapb.TabletType_PRIMARY {
+		// We don't support buffering for any target tablet type other than the primary.
 		return nil, false
 	}
 	ks := kew.getKeyspaceStatus(ctx, target.Keyspace)
 	if ks == nil {
+		// If the keyspace status is nil, then the keyspace must be deleted.
+		// The user query is trying to access a keyspace that has been deleted.
+		// There is no reason to buffer this query.
 		return nil, false
 	}
 	ks.mu.Lock()
 	defer ks.mu.Unlock()
 	if state, ok := ks.shards[target.Shard]; ok {
-		// If the primary tablet was present then externallyReparented will be non-zero and currentPrimary will be not nil
+		// As described in the function comment, we only want to start buffering when all the following conditions are met -
+		// 1. The shard must have a primary. We check this by checking the currentPrimary and externallyReparented fields being non-empty.
+		//    They are set the first time the shard registers an update from a serving primary and are never cleared out after that.
+		//    If the user has configured vtgates to wait for the primary tablet healthchecks before starting query service, this condition
+		//    will always be true.
+		// 2. The primary must be non-serving. We check this by checking the serving field in the shard state.
+		//    When a primary becomes non-serving, it also marks the keyspace inconsistent. So the next check is only added
+		//    for being defensive against any bugs.
+		// 3. The keyspace must be marked inconsistent. We check this by checking the consistent field in the keyspace state.
+		//
+		// The reason we need all the three checks is that we want to be very defensive in when we start buffering.
+		// We don't want to start buffering when we don't know for sure if the primary
+		// is not serving and we will receive an update that stops buffering soon.
 		return state.currentPrimary, !state.serving && !ks.consistent && state.externallyReparented != 0 && state.currentPrimary != nil
 	}
 	return nil, false
@@ -716,3 +748,46 @@ func (kew *KeyspaceEventWatcher) MarkShardNotServing(ctx context.Context, keyspa
 	}
 	return true
 }
+
+// WaitForConsistentKeyspaces waits for the given set of keyspaces to be marked consistent.
+func (kew *KeyspaceEventWatcher) WaitForConsistentKeyspaces(ctx context.Context, ksList []string) error {
+	// We don't want to change the original keyspace list that we receive, so we clone it
+	// before we empty its elements down below.
+	keyspaces := slices.Clone(ksList)
+	for {
+		// We empty keyspaces as we find them to be consistent.
+		allConsistent := true
+		for i, ks := range keyspaces {
+			if ks == "" {
+				continue
+			}
+
+			// Get the keyspace status and see if it is consistent yet or not.
+			kss := kew.getKeyspaceStatus(ctx, ks)
+			// If kss is nil, then it must be deleted. In that case too it is fine for us to consider
+			// it consistent since the keyspace has been deleted.
+			if kss == nil || kss.consistent {
+				keyspaces[i] = ""
+			} else {
+				allConsistent = false
+			}
+		}
+
+		if allConsistent {
+			// All the keyspaces are consistent.
+			return nil
+		}
+
+		// Unblock after the sleep or when the context has expired.
+		select {
+		case <-ctx.Done():
+			for _, ks := range keyspaces {
+				if ks != "" {
+					log.Infof("keyspace %v didn't become consistent", ks)
+				}
+			}
+			return ctx.Err()
+		case <-time.After(waitConsistentKeyspacesCheck):
+		}
+	}
+}
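
Note (not part of the diff): below is a minimal, hypothetical sketch of how a caller might combine the two APIs added above, using ShouldStartBufferingForTarget to decide whether queries to a primary should be buffered and WaitForConsistentKeyspaces to block until the watcher marks the keyspace consistent again. The helper name, the timeout, and the wiring are assumptions for illustration only; the two method signatures come from this commit, and vtgate's real buffering is handled elsewhere.

package example // hypothetical package, for illustration only

import (
	"context"
	"time"

	"vitess.io/vitess/go/vt/discovery"
	querypb "vitess.io/vitess/go/vt/proto/query"
)

// maybeWaitForConsistency is a hypothetical helper: if the watcher says buffering
// should start for this primary target, it waits (bounded by a timeout) until the
// target's keyspace is marked consistent again.
func maybeWaitForConsistency(ctx context.Context, kew *discovery.KeyspaceEventWatcher, target *querypb.Target) error {
	if _, shouldBuffer := kew.ShouldStartBufferingForTarget(ctx, target); !shouldBuffer {
		// Nothing to wait for: no primary was ever seen, the primary is serving,
		// or the keyspace is already consistent.
		return nil
	}
	waitCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()
	// Polls every waitConsistentKeyspacesCheck (100ms) until the keyspace is
	// consistent or the context expires.
	return kew.WaitForConsistentKeyspaces(waitCtx, []string{target.Keyspace})
}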

go/vt/discovery/keyspace_events_test.go (+100 −19)

@@ -86,11 +86,11 @@ func TestKeyspaceEventTypes(t *testing.T) {
 	kew := NewKeyspaceEventWatcher(ctx, ts2, hc, cell)
 
 	type testCase struct {
-		name                    string
-		kss                     *keyspaceState
-		shardToCheck            string
-		expectResharding        bool
-		expectPrimaryNotServing bool
+		name               string
+		kss                *keyspaceState
+		shardToCheck       string
+		expectResharding   bool
+		expectShouldBuffer bool
 	}
 
 	testCases := []testCase{
@@ -127,9 +127,9 @@ func TestKeyspaceEventTypes(t *testing.T) {
 				},
 				consistent: false,
 			},
-			shardToCheck:            "-",
-			expectResharding:        true,
-			expectPrimaryNotServing: false,
+			shardToCheck:       "-",
+			expectResharding:   true,
+			expectShouldBuffer: false,
 		},
 		{
 			name: "two to four resharding in progress",
@@ -188,9 +188,9 @@ func TestKeyspaceEventTypes(t *testing.T) {
 				},
 				consistent: false,
 			},
-			shardToCheck:            "-80",
-			expectResharding:        true,
-			expectPrimaryNotServing: false,
+			shardToCheck:       "-80",
+			expectResharding:   true,
+			expectShouldBuffer: false,
 		},
 		{
 			name: "unsharded primary not serving",
@@ -214,9 +214,9 @@ func TestKeyspaceEventTypes(t *testing.T) {
 				},
 				consistent: false,
 			},
-			shardToCheck:            "-",
-			expectResharding:        false,
-			expectPrimaryNotServing: true,
+			shardToCheck:       "-",
+			expectResharding:   false,
+			expectShouldBuffer: true,
 		},
 		{
 			name: "sharded primary not serving",
@@ -248,9 +248,9 @@ func TestKeyspaceEventTypes(t *testing.T) {
 				},
 				consistent: false,
 			},
-			shardToCheck:            "-80",
-			expectResharding:        false,
-			expectPrimaryNotServing: true,
+			shardToCheck:       "-80",
+			expectResharding:   false,
+			expectShouldBuffer: true,
 		},
 	}
 
@@ -265,8 +265,89 @@ func TestKeyspaceEventTypes(t *testing.T) {
 			resharding := kew.TargetIsBeingResharded(ctx, tc.kss.shards[tc.shardToCheck].target)
 			require.Equal(t, resharding, tc.expectResharding, "TargetIsBeingResharded should return %t", tc.expectResharding)
 
-			_, primaryDown := kew.PrimaryIsNotServing(ctx, tc.kss.shards[tc.shardToCheck].target)
-			require.Equal(t, primaryDown, tc.expectPrimaryNotServing, "PrimaryIsNotServing should return %t", tc.expectPrimaryNotServing)
+			_, shouldBuffer := kew.ShouldStartBufferingForTarget(ctx, tc.kss.shards[tc.shardToCheck].target)
+			require.Equal(t, shouldBuffer, tc.expectShouldBuffer, "ShouldStartBufferingForTarget should return %t", tc.expectShouldBuffer)
+		})
+	}
+}
+
+// TestWaitForConsistentKeyspaces tests the behaviour of WaitForConsistentKeyspaces for different scenarios.
+func TestWaitForConsistentKeyspaces(t *testing.T) {
+	testcases := []struct {
+		name        string
+		ksMap       map[string]*keyspaceState
+		ksList      []string
+		errExpected string
+	}{
+		{
+			name:   "Empty keyspace list",
+			ksList: nil,
+			ksMap: map[string]*keyspaceState{
+				"ks1": {},
+			},
+			errExpected: "",
+		},
+		{
+			name:   "All keyspaces consistent",
+			ksList: []string{"ks1", "ks2"},
+			ksMap: map[string]*keyspaceState{
+				"ks1": {
+					consistent: true,
+				},
+				"ks2": {
+					consistent: true,
+				},
+			},
+			errExpected: "",
+		},
+		{
+			name:   "One keyspace inconsistent",
+			ksList: []string{"ks1", "ks2"},
+			ksMap: map[string]*keyspaceState{
+				"ks1": {
+					consistent: true,
+				},
+				"ks2": {
+					consistent: false,
+				},
+			},
+			errExpected: "context canceled",
+		},
+		{
+			name:   "One deleted keyspace - consistent",
+			ksList: []string{"ks1", "ks2"},
+			ksMap: map[string]*keyspaceState{
+				"ks1": {
+					consistent: true,
+				},
+				"ks2": {
+					deleted: true,
+				},
+			},
+			errExpected: "",
+		},
+	}
+
+	for _, tt := range testcases {
+		t.Run(tt.name, func(t *testing.T) {
+			// We create a cancelable context and immediately cancel it.
+			// We don't want the unit tests to wait, so we only test the first
+			// iteration of whether the keyspace event watcher returns
+			// that the keyspaces are consistent or not.
+			ctx, cancel := context.WithCancel(context.Background())
+			cancel()
+			kew := KeyspaceEventWatcher{
+				keyspaces: tt.ksMap,
+				mu:        sync.Mutex{},
+				ts:        &fakeTopoServer{},
+			}
+			err := kew.WaitForConsistentKeyspaces(ctx, tt.ksList)
+			if tt.errExpected != "" {
+				require.ErrorContains(t, err, tt.errExpected)
+			} else {
+				require.NoError(t, err)
+			}
+
 		})
 	}
 }

go/vt/srvtopo/discover.go (+7 −7)

@@ -17,9 +17,8 @@ limitations under the License.
 package srvtopo
 
 import (
-	"sync"
-
 	"context"
+	"sync"
 
 	"vitess.io/vitess/go/vt/concurrency"
 	"vitess.io/vitess/go/vt/log"
@@ -29,15 +28,16 @@ import (
 	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
 )
 
-// FindAllTargets goes through all serving shards in the topology for the provided keyspaces
+// FindAllTargetsAndKeyspaces goes through all serving shards in the topology for the provided keyspaces
 // and tablet types. If no keyspaces are provided all available keyspaces in the topo are
 // fetched. It returns one Target object per keyspace/shard/matching TabletType.
-func FindAllTargets(ctx context.Context, ts Server, cell string, keyspaces []string, tabletTypes []topodatapb.TabletType) ([]*querypb.Target, error) {
+// It also returns all the keyspaces that it found.
+func FindAllTargetsAndKeyspaces(ctx context.Context, ts Server, cell string, keyspaces []string, tabletTypes []topodatapb.TabletType) ([]*querypb.Target, []string, error) {
 	var err error
 	if len(keyspaces) == 0 {
 		keyspaces, err = ts.GetSrvKeyspaceNames(ctx, cell, true)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 	}
 
@@ -95,8 +95,8 @@ func FindAllTargets(ctx context.Context, ts Server, cell string, keyspaces []str
 	}
 	wg.Wait()
 	if errRecorder.HasErrors() {
-		return nil, errRecorder.Error()
+		return nil, nil, errRecorder.Error()
 	}
 
-	return targets, nil
+	return targets, keyspaces, nil
 }
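
Note (not part of the diff): since the function now also returns the keyspaces it scanned, every call site gains a second return value. Below is a hedged sketch of an updated caller; the wrapper name and the idea of later handing the keyspace list to KeyspaceEventWatcher.WaitForConsistentKeyspaces are assumptions for illustration, while the FindAllTargetsAndKeyspaces signature is the one from this diff.

package example // hypothetical caller, for illustration only

import (
	"context"

	querypb "vitess.io/vitess/go/vt/proto/query"
	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
	"vitess.io/vitess/go/vt/srvtopo"
)

// primaryTargetsAndKeyspaces shows the updated call site: the extra []string return
// value carries every keyspace that was scanned, so a caller can later wait for those
// keyspaces to become consistent.
func primaryTargetsAndKeyspaces(ctx context.Context, ts srvtopo.Server, cell string) ([]*querypb.Target, []string, error) {
	tabletTypes := []topodatapb.TabletType{topodatapb.TabletType_PRIMARY}
	// Passing nil for keyspaces means "all keyspaces in the cell".
	targets, keyspaces, err := srvtopo.FindAllTargetsAndKeyspaces(ctx, ts, cell, nil, tabletTypes)
	if err != nil {
		return nil, nil, err
	}
	return targets, keyspaces, nil
}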
