From 2b111081f20d534208e6525f83abc5766b677995 Mon Sep 17 00:00:00 2001 From: srikar-jilugu Date: Fri, 5 Jul 2024 13:43:49 +0530 Subject: [PATCH 1/2] fix node routing when all nodes are failing --- osscluster.go | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/osscluster.go b/osscluster.go index e28cb1e39..40f5bd110 100644 --- a/osscluster.go +++ b/osscluster.go @@ -341,6 +341,8 @@ func (n *clusterNode) Close() error { return n.Client.Close() } +const maximumNodeLatency = 1 * time.Minute + func (n *clusterNode) updateLatency() { const numProbe = 10 var dur uint64 @@ -361,7 +363,7 @@ func (n *clusterNode) updateLatency() { if successes == 0 { // If none of the pings worked, set latency to some arbitrarily high value so this node gets // least priority. - latency = float64((1 * time.Minute) / time.Microsecond) + latency = float64((maximumNodeLatency) / time.Microsecond) } else { latency = float64(dur) / float64(successes) } @@ -735,21 +737,39 @@ func (c *clusterState) slotClosestNode(slot int) (*clusterNode, error) { return c.nodes.Random() } - var node *clusterNode + var allNodesFailing = true + var ( + closestNonFailingNode *clusterNode + closestNode *clusterNode + minLatency time.Duration + ) + for _, n := range nodes { - if n.Failing() { - continue - } - if node == nil || n.Latency() < node.Latency() { - node = n + if closestNode == nil || n.Latency() < minLatency { + closestNode = n + minLatency = n.Latency() + if !n.Failing() { + closestNonFailingNode = n + allNodesFailing = false + } } } - if node != nil { - return node, nil + + // pick the healthly node with the lowest latency + if !allNodesFailing && closestNonFailingNode != nil { + return closestNonFailingNode, nil } - // If all nodes are failing - return random node - return c.nodes.Random() + // if all nodes are failing, we will pick the temporarily failing node with lowest latency + if minLatency < maximumNodeLatency { + internal.Logger.Printf(context.TODO(), "redis: all nodes are failing, picking the temporarily failing node with lowest latency") + return closestNode, nil + } + + // If all nodes are having the maximum latency(all pings are failing) - return a random node within the nodes corresponding to the slot + randomNodes := rand.Perm(len(nodes)) + internal.Logger.Printf(context.TODO(), "redis: pings to all nodes are failing, picking a random node within the slot") + return nodes[randomNodes[0]], nil } func (c *clusterState) slotRandomNode(slot int) (*clusterNode, error) { From 34c1db18c2346c9776cdd3fa778916663b5ecfc8 Mon Sep 17 00:00:00 2001 From: srikar-jilugu Date: Mon, 8 Jul 2024 19:17:28 +0530 Subject: [PATCH 2/2] fix minlatency zero value --- osscluster.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/osscluster.go b/osscluster.go index 40f5bd110..73a9e2b74 100644 --- a/osscluster.go +++ b/osscluster.go @@ -744,6 +744,9 @@ func (c *clusterState) slotClosestNode(slot int) (*clusterNode, error) { minLatency time.Duration ) + // setting the max possible duration as zerovalue for minlatency + minLatency = time.Duration(math.MaxInt64) + for _, n := range nodes { if closestNode == nil || n.Latency() < minLatency { closestNode = n @@ -761,15 +764,14 @@ func (c *clusterState) slotClosestNode(slot int) (*clusterNode, error) { } // if all nodes are failing, we will pick the temporarily failing node with lowest latency - if minLatency < maximumNodeLatency { - internal.Logger.Printf(context.TODO(), "redis: all nodes are failing, picking the temporarily failing node with lowest latency") + if minLatency < maximumNodeLatency && closestNode != nil { + internal.Logger.Printf(context.TODO(), "redis: all nodes are marked as failed, picking the temporarily failing node with lowest latency") return closestNode, nil } - // If all nodes are having the maximum latency(all pings are failing) - return a random node within the nodes corresponding to the slot - randomNodes := rand.Perm(len(nodes)) - internal.Logger.Printf(context.TODO(), "redis: pings to all nodes are failing, picking a random node within the slot") - return nodes[randomNodes[0]], nil + // If all nodes are having the maximum latency(all pings are failing) - return a random node across the cluster + internal.Logger.Printf(context.TODO(), "redis: pings to all nodes are failing, picking a random node across the cluster") + return c.nodes.Random() } func (c *clusterState) slotRandomNode(slot int) (*clusterNode, error) {