From ea4f3f0f58d7348d61b79faeee4df6688bb31dc1 Mon Sep 17 00:00:00 2001
From: lhy1024
Date: Sun, 26 Jan 2025 11:21:15 +0800
Subject: [PATCH 1/3] tso: add retry for UpdateTSO

Signed-off-by: lhy1024
---
 pkg/tso/global_allocator.go | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/pkg/tso/global_allocator.go b/pkg/tso/global_allocator.go
index 553e0b0effd..2e170f0b7e8 100644
--- a/pkg/tso/global_allocator.go
+++ b/pkg/tso/global_allocator.go
@@ -140,8 +140,18 @@ func (gta *GlobalTSOAllocator) IsInitialize() bool {
 }
 
 // UpdateTSO is used to update the TSO in memory and the time window in etcd.
-func (gta *GlobalTSOAllocator) UpdateTSO() error {
-	return gta.timestampOracle.UpdateTimestamp()
+func (gta *GlobalTSOAllocator) UpdateTSO() (err error) {
+	// When meet network partition, we need to manually retry to update the global tso,
+	// next request succeeds with the new endpoint, according to https://github.com/etcd-io/etcd/issues/8711
+	for i := 0; i < 3; i++ {
+		err = gta.timestampOracle.UpdateTimestamp()
+		if err == nil {
+			return nil
+		}
+		log.Warn("try to update the global tso but failed", errs.ZapError(err))
+		time.Sleep(gta.am.updatePhysicalInterval)
+	}
+	return
 }
 
 // SetTSO sets the physical part with given TSO.
From c01a203c5f108d84bca7e3e22146d0aae7b75708 Mon Sep 17 00:00:00 2001
From: lhy1024
Date: Sun, 26 Jan 2025 11:37:44 +0800
Subject: [PATCH 2/3] fix lint

Signed-off-by: lhy1024
---
 pkg/tso/global_allocator.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pkg/tso/global_allocator.go b/pkg/tso/global_allocator.go
index 2e170f0b7e8..9ad5a25d8b8 100644
--- a/pkg/tso/global_allocator.go
+++ b/pkg/tso/global_allocator.go
@@ -143,7 +143,8 @@ func (gta *GlobalTSOAllocator) IsInitialize() bool {
 func (gta *GlobalTSOAllocator) UpdateTSO() (err error) {
 	// When meet network partition, we need to manually retry to update the global tso,
 	// next request succeeds with the new endpoint, according to https://github.com/etcd-io/etcd/issues/8711
-	for i := 0; i < 3; i++ {
+	maxRetryCount := 3
+	for range maxRetryCount {
 		err = gta.timestampOracle.UpdateTimestamp()
 		if err == nil {
 			return nil

From a557b407949cfdb0a7d6752325d07e5400532599 Mon Sep 17 00:00:00 2001
From: lhy1024
Date: Wed, 5 Feb 2025 11:58:35 +0800
Subject: [PATCH 3/3] update interval

Signed-off-by: lhy1024
---
 pkg/tso/global_allocator.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/tso/global_allocator.go b/pkg/tso/global_allocator.go
index 9ad5a25d8b8..84325cf6820 100644
--- a/pkg/tso/global_allocator.go
+++ b/pkg/tso/global_allocator.go
@@ -150,7 +150,9 @@ func (gta *GlobalTSOAllocator) UpdateTSO() (err error) {
 			return nil
 		}
 		log.Warn("try to update the global tso but failed", errs.ZapError(err))
-		time.Sleep(gta.am.updatePhysicalInterval)
+		// Etcd client retry with roundRobinQuorumBackoff https://github.com/etcd-io/etcd/blob/d62cdeee4863001b09e772ed013eb1342a1d0f89/client/v3/client.go#L488
+		// And its default interval is 25ms, so we sleep 50ms here. https://github.com/etcd-io/etcd/blob/d62cdeee4863001b09e772ed013eb1342a1d0f89/client/v3/options.go#L53
+		time.Sleep(50 * time.Millisecond)
 	}
 	return
 }