-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Only refresh required tablet's information in VTOrc #11220
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ac661ec
89fb875
7b24036
817fa7a
f0f97b8
d6ff07a
7b26517
2049e16
4a09c3f
e851828
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -256,6 +256,10 @@ func DiscoverInstance(instanceKey inst.InstanceKey, forceDiscovery bool) { | |
| return | ||
| } | ||
|
|
||
| if forceDiscovery { | ||
| log.Infof("Force discovered - %+v", instance) | ||
| } | ||
|
Comment on lines
+259
to
+261
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This addition of logging is intentional. Until we have a metrics page where we export the internal database information of VTOrc, this is going to be very useful for debugging. I had it in my mind to add this log and I am just piggy-backing on this PR. |
||
|
|
||
| discoveryMetrics.Append(&discovery.Metric{ | ||
| Timestamp: time.Now(), | ||
| InstanceKey: instanceKey, | ||
|
|
@@ -452,7 +456,7 @@ func ContinuousDiscovery() { | |
| }() | ||
| case <-tabletTopoTick: | ||
| go RefreshAllKeyspaces() | ||
| go RefreshTablets(false /* forceRefresh */) | ||
| go refreshAllTablets() | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,7 +20,6 @@ import ( | |
| "context" | ||
| "errors" | ||
| "flag" | ||
| "fmt" | ||
| "strings" | ||
| "sync" | ||
| "sync/atomic" | ||
|
|
@@ -48,6 +47,8 @@ var ( | |
| clustersToWatch = flag.String("clusters_to_watch", "", "Comma-separated list of keyspaces or keyspace/shards that this instance will monitor and repair. Defaults to all clusters in the topology. Example: \"ks1,ks2/-80\"") | ||
| shutdownWaitTime = flag.Duration("shutdown_wait_time", 30*time.Second, "maximum time to wait for vtorc to release all the locks that it is holding before shutting down on SIGTERM") | ||
| shardsLockCounter int32 | ||
| // ErrNoPrimaryTablet is the sentinel error returned when no primary tablet is found. | ||
| ErrNoPrimaryTablet = errors.New("no primary tablet found") | ||
| ) | ||
|
|
||
| // OpenTabletDiscovery opens the vitess topo if enabled and returns a ticker | ||
|
|
@@ -69,11 +70,11 @@ func OpenTabletDiscovery() <-chan time.Time { | |
| return time.Tick(15 * time.Second) //nolint SA1015: using time.Tick leaks the underlying ticker | ||
| } | ||
|
|
||
| // RefreshTablets reloads the tablets from topo. | ||
| func RefreshTablets(forceRefresh bool) { | ||
| // refreshAllTablets reloads the tablets from topo and discovers the ones which haven't been refreshed in a while | ||
| func refreshAllTablets() { | ||
| refreshTabletsUsing(func(instanceKey *inst.InstanceKey) { | ||
| DiscoverInstance(*instanceKey, forceRefresh) | ||
| }, forceRefresh) | ||
| DiscoverInstance(*instanceKey, false /* forceDiscovery */) | ||
| }, false /* forceRefresh */) | ||
| } | ||
|
|
||
| func refreshTabletsUsing(loader func(instanceKey *inst.InstanceKey), forceRefresh bool) { | ||
|
|
@@ -157,6 +158,28 @@ func refreshTabletsInCell(ctx context.Context, cell string, loader func(instance | |
| refreshTablets(tablets, query, args, loader, forceRefresh) | ||
| } | ||
|
|
||
| // forceRefreshAllTabletsInShard is used to refresh the information (both MySQL information and topo records) of all the tablets | ||
| // in a given shard. This function is meant to be called before or after a cluster-wide operation that we know will | ||
| // change the replication information for the entire cluster drastically enough to warrant a full forceful refresh. | ||
| func forceRefreshAllTabletsInShard(ctx context.Context, keyspace, shard string) { | ||
| log.Infof("force refresh of all tablets in shard - %v/%v", keyspace, shard) | ||
| refreshCtx, refreshCancel := context.WithTimeout(ctx, *topo.RemoteOperationTimeout) | ||
| defer refreshCancel() | ||
| refreshTabletsInKeyspaceShard(refreshCtx, keyspace, shard, func(instanceKey *inst.InstanceKey) { | ||
| DiscoverInstance(*instanceKey, true) | ||
| }, true) | ||
| } | ||
|
|
||
| // refreshTabletInfoOfShard only refreshes the tablet records from the topo-server for all the tablets | ||
| // of the given keyspace-shard. | ||
| func refreshTabletInfoOfShard(ctx context.Context, keyspace, shard string) { | ||
| log.Infof("refresh of tablet records of shard - %v/%v", keyspace, shard) | ||
| refreshTabletsInKeyspaceShard(ctx, keyspace, shard, func(instanceKey *inst.InstanceKey) { | ||
GuptaManan100 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // No-op | ||
| // We only want to refresh the tablet information for the given shard | ||
| }, false) | ||
| } | ||
|
|
||
| func refreshTabletsInKeyspaceShard(ctx context.Context, keyspace, shard string, loader func(instanceKey *inst.InstanceKey), forceRefresh bool) { | ||
| tablets, err := ts.GetTabletMapForShard(ctx, keyspace, shard) | ||
| if err != nil { | ||
|
|
@@ -285,21 +308,32 @@ func setReplicationSource(ctx context.Context, replica *topodatapb.Tablet, prima | |
| return tmc.SetReplicationSource(ctx, replica, primary.Alias, 0, "", true, semiSync) | ||
| } | ||
|
|
||
| // shardPrimary finds the primary of the given keyspace-shard by reading the topo server | ||
| func shardPrimary(ctx context.Context, keyspace string, shard string) (primary *topodatapb.Tablet, err error) { | ||
| si, err := ts.GetShard(ctx, keyspace, shard) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| if !si.HasPrimary() { | ||
| return nil, fmt.Errorf("no primary tablet for shard %v/%v", keyspace, shard) | ||
| } | ||
| // TODO(GuptaManan100): Instead of another topo call, use the local information by calling | ||
| // ReadTablet. Currently this isn't possible since we only have the primary alias and not the source host and port | ||
| // This should be fixed once the tablet alias is changed to be the primary key of the table | ||
| primaryInfo, err := ts.GetTablet(ctx, si.PrimaryAlias) | ||
| if err != nil { | ||
| return nil, err | ||
| // shardPrimary finds the primary of the given keyspace-shard by reading the orchestrator backend | ||
| func shardPrimary(keyspace string, shard string) (primary *topodatapb.Tablet, err error) { | ||
| query := `SELECT | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. General comment: should we not put all query execution in some retryable template function?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, that would be a good addition, but so far we have not really needed it, because even if the read fails, we just fail the recovery and then retry later. |
||
| info, | ||
| hostname, | ||
| port, | ||
| tablet_type, | ||
| primary_timestamp | ||
| FROM | ||
| vitess_tablet | ||
| WHERE | ||
| keyspace = ? AND shard = ? | ||
| AND tablet_type = ? | ||
| ORDER BY | ||
| primary_timestamp DESC | ||
| LIMIT 1 | ||
| ` | ||
| err = db.Db.QueryOrchestrator(query, sqlutils.Args(keyspace, shard, topodatapb.TabletType_PRIMARY), func(m sqlutils.RowMap) error { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As part of cleaning up, we should rename some of these functions.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I had it in my mind to get rid of the |
||
| if primary == nil { | ||
| primary = &topodatapb.Tablet{} | ||
| return prototext.Unmarshal([]byte(m.GetString("info")), primary) | ||
| } | ||
| return nil | ||
| }) | ||
| if primary == nil && err == nil { | ||
| err = ErrNoPrimaryTablet | ||
| } | ||
| return primaryInfo.Tablet, nil | ||
| return primary, err | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We also need the name of the shard that was analyzed, now that we want to restrict the number of tablets we refresh.