From a1adff0e29ce63d4beed5298ba970cb85be96125 Mon Sep 17 00:00:00 2001 From: Flo Date: Tue, 24 Feb 2026 10:04:47 +0100 Subject: [PATCH 1/3] fix: retry memberlist creation --- pkg/cluster/bridge.go | 51 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/pkg/cluster/bridge.go b/pkg/cluster/bridge.go index e483e7e033..18d7562881 100644 --- a/pkg/cluster/bridge.go +++ b/pkg/cluster/bridge.go @@ -48,7 +48,14 @@ func (c *gossipCluster) evaluateBridge() { } // promoteToBridge creates a WAN memberlist and joins WAN seeds. +// +// memberlist.Create is performed outside the lock because it does network I/O +// (binding a port) and must not block Broadcast/Members/IsBridge. Retries with +// exponential backoff handle the case where a previous WAN memberlist's socket +// hasn't been fully released by the OS yet (e.g. after a rapid demote→promote +// cycle). func (c *gossipCluster) promoteToBridge() { + // Phase 1: check state and capture config under lock. c.mu.Lock() if c.isBridge { c.mu.Unlock() @@ -67,13 +74,47 @@ func (c *gossipCluster) promoteToBridge() { } wanCfg.LogOutput = newLogWriter("wan") wanCfg.SecretKey = c.config.SecretKey - wanCfg.Delegate = newWANDelegate(c) - wanList, err := memberlist.Create(wanCfg) - if err != nil { + seeds := c.config.WANSeeds + c.mu.Unlock() + + // Phase 2: create memberlist outside lock, retry with exponential backoff. + backoff := 500 * time.Millisecond + var wanList *memberlist.Memberlist + + for { + if c.closing.Load() { + return + } + + var err error + wanList, err = memberlist.Create(wanCfg) + if err == nil { + break + } + + logger.Warn("Failed to create WAN memberlist, retrying", + "error", err, + "next_backoff", backoff, + ) + + select { + case <-c.done: + return + case <-time.After(backoff): + } + + backoff = min(backoff*2, reconnectInterval) + } + + // Phase 3: re-check state and commit under lock. + c.mu.Lock() + if c.isBridge || c.closing.Load() { c.mu.Unlock() - logger.Error("Failed to create WAN memberlist", "error", err) + // Another goroutine promoted or the node is shutting down; discard. + wanList.Leave(5 * time.Second) //nolint:errcheck + wanList.Shutdown() //nolint:errcheck return } @@ -82,9 +123,7 @@ func (c *gossipCluster) promoteToBridge() { NumNodes: func() int { return wanList.NumMembers() }, RetransmitMult: 4, } - c.isBridge = true - seeds := c.config.WANSeeds c.mu.Unlock() metrics.ClusterBridgeStatus.Set(1) From 3b24569f77e2610c729603f750c6da45c614efbe Mon Sep 17 00:00:00 2001 From: Flo Date: Tue, 24 Feb 2026 10:06:39 +0100 Subject: [PATCH 2/3] remove comments --- pkg/cluster/bridge.go | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/pkg/cluster/bridge.go b/pkg/cluster/bridge.go index 18d7562881..5d4c18e45a 100644 --- a/pkg/cluster/bridge.go +++ b/pkg/cluster/bridge.go @@ -48,14 +48,11 @@ func (c *gossipCluster) evaluateBridge() { } // promoteToBridge creates a WAN memberlist and joins WAN seeds. -// // memberlist.Create is performed outside the lock because it does network I/O -// (binding a port) and must not block Broadcast/Members/IsBridge. Retries with -// exponential backoff handle the case where a previous WAN memberlist's socket -// hasn't been fully released by the OS yet (e.g. after a rapid demote→promote -// cycle). +// and must not block Broadcast/Members/IsBridge. Retries with exponential +// backoff handle the case where a previous WAN memberlist's socket hasn't been +// fully released by the OS yet (e.g. after a rapid demote→promote cycle). func (c *gossipCluster) promoteToBridge() { - // Phase 1: check state and capture config under lock. c.mu.Lock() if c.isBridge { c.mu.Unlock() @@ -79,7 +76,6 @@ func (c *gossipCluster) promoteToBridge() { seeds := c.config.WANSeeds c.mu.Unlock() - // Phase 2: create memberlist outside lock, retry with exponential backoff. backoff := 500 * time.Millisecond var wanList *memberlist.Memberlist @@ -108,11 +104,9 @@ func (c *gossipCluster) promoteToBridge() { backoff = min(backoff*2, reconnectInterval) } - // Phase 3: re-check state and commit under lock. c.mu.Lock() if c.isBridge || c.closing.Load() { c.mu.Unlock() - // Another goroutine promoted or the node is shutting down; discard. wanList.Leave(5 * time.Second) //nolint:errcheck wanList.Shutdown() //nolint:errcheck return From cb824242f4cfada948be80402c541d31ed184f77 Mon Sep 17 00:00:00 2001 From: Flo Date: Tue, 24 Feb 2026 11:46:41 +0100 Subject: [PATCH 3/3] move to const --- pkg/cluster/bridge.go | 4 ++-- pkg/cluster/cluster.go | 14 +++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pkg/cluster/bridge.go b/pkg/cluster/bridge.go index 5d4c18e45a..eadcbd29df 100644 --- a/pkg/cluster/bridge.go +++ b/pkg/cluster/bridge.go @@ -76,7 +76,7 @@ func (c *gossipCluster) promoteToBridge() { seeds := c.config.WANSeeds c.mu.Unlock() - backoff := 500 * time.Millisecond + backoff := initialBackoff var wanList *memberlist.Memberlist for { @@ -105,7 +105,7 @@ func (c *gossipCluster) promoteToBridge() { } c.mu.Lock() - if c.isBridge || c.closing.Load() { + if c.closing.Load() { c.mu.Unlock() wanList.Leave(5 * time.Second) //nolint:errcheck wanList.Shutdown() //nolint:errcheck diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index dd87ff0c88..d2bf25dc30 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -14,9 +14,14 @@ import ( "google.golang.org/protobuf/proto" ) -// reconnectInterval is how often the background loop checks whether -// the node is isolated and needs to re-join seeds. -const reconnectInterval = 30 * time.Second +const ( + // initialBackoff is the starting delay for exponential backoff retries. + initialBackoff = 500 * time.Millisecond + + // reconnectInterval is how often the background loop checks whether + // the node is isolated and needs to re-join seeds. + reconnectInterval = 30 * time.Second +) // Cluster is the public interface for gossip-based cluster membership. type Cluster interface { @@ -135,8 +140,7 @@ func New(cfg Config) (Cluster, error) { // stays connected to seeds. It handles initial join (with backoff for DNS // readiness) and periodic reconnection if the node becomes isolated. func (c *gossipCluster) maintainMembership(pool string, list func() *memberlist.Memberlist, seeds []string, onJoin func()) { - // Initial join with exponential backoff — DNS may not resolve immediately. - backoff := 500 * time.Millisecond + backoff := initialBackoff for { select {