From d64445b8c6b09ce8c8ffc7daf9e81dd3ce0308f0 Mon Sep 17 00:00:00 2001 From: Gregorius Soedharmo Date: Wed, 18 Mar 2026 20:28:20 +0700 Subject: [PATCH 1/2] Fix heartbeat self-conflict causes unnecessary lease release after transient timeout --- .../LeaseActorSpec.cs | 122 ++++++++++++++++++ .../Akka.Coordination.Azure/LeaseActor.cs | 12 ++ .../LeaseActorSpec.cs | 122 ++++++++++++++++++ .../LeaseActor.cs | 12 ++ 4 files changed, 268 insertions(+) diff --git a/src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs b/src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs index f29e6fbff..54007f907 100644 --- a/src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs +++ b/src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs @@ -452,6 +452,128 @@ public void HeartBeatFailureShouldNotCallLeaseLostCallbackDuringRetry() }); } + /// + /// Reproduces the self-conflict bug: a heartbeat PUT succeeds on the API server but times out + /// on the client. The actor retries (per #3404 fix) with a stale version. The retry gets a CAS + /// conflict (Left<>), but the conflict response shows the SAME owner — the node is fighting + /// itself because its previous PUT actually succeeded and bumped the version. + /// + /// Current behavior (BUG): the actor unconditionally releases the lease on any heartbeat conflict. + /// Expected behavior: the actor should recognize it still owns the lease, update the version, + /// and stay in Granted state. + /// + [Fact(DisplayName = "Bug #3407: heartbeat self-conflict after timeout should stay granted")] + public void HeartbeatSelfConflictAfterTimeoutShouldStayGranted() + { + RunTest(() => + { + AcquireLease(); + ExpectHeartBeat(); + Granted.Value.Should().BeTrue(); + + // Step 1: Heartbeat fires, but PUT times out on client (server succeeded and bumped version) + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + UpdateProbe.Reply(new Status.Failure( + new LeaseException("API server request timed out"))); + + // Should still be granted — within TTL retry window (#3404 fix) + Granted.Value.Should().BeTrue(); + + // Step 2: Actor retries heartbeat with stale version (server already moved to next version) + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + + // Step 3: Retry gets CAS conflict — but the owner is US (previous PUT succeeded) + // The server version moved forward because our timed-out PUT actually went through + IncrementVersion(); + UpdateProbe.Reply( + new Left( + new LeaseResource(OwnerName, CurrentVersion, CurrentTime))); + + // BUG: Actor currently releases the lease here (lines 400-404) + // EXPECTED: Actor should recognize it's still the owner and stay Granted + Granted.Value.Should().BeTrue(); + + // Step 4: Heartbeat should continue normally with the updated version + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + IncrementVersion(); + UpdateProbe.Reply( + new Right( + new LeaseResource(OwnerName, CurrentVersion, CurrentTime))); + + Granted.Value.Should().BeTrue(); + }); + } + + /// + /// Verifies that the lease lost callback is NOT called when a heartbeat self-conflict occurs. + /// The node still owns the lease — the conflict is only due to a stale version from a + /// previously-timed-out-but-successful PUT. + /// + [Fact(DisplayName = "Bug #3407: heartbeat self-conflict should not call lease lost callback")] + public void HeartbeatSelfConflictShouldNotCallLeaseLostCallback() + { + RunTest(() => + { + var callbackCalled = false; + AcquireLease(e => + { + callbackCalled = true; + }); + ExpectHeartBeat(); + Granted.Value.Should().BeTrue(); + + // Heartbeat times out on client + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + UpdateProbe.Reply(new Status.Failure( + new LeaseException("API server request timed out"))); + + // Retry gets self-conflict + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + IncrementVersion(); + UpdateProbe.Reply( + new Left( + new LeaseResource(OwnerName, CurrentVersion, CurrentTime))); + + // Callback should NOT be called — we still own the lease + callbackCalled.Should().BeFalse(); + Granted.Value.Should().BeTrue(); + }); + } + + /// + /// Verifies that a genuine conflict (different owner) during heartbeat still correctly + /// releases the lease. This ensures the self-conflict fix doesn't break the existing + /// behavior for real conflicts. + /// + [Fact(DisplayName = "Bug #3407: heartbeat conflict with different owner after timeout should still release")] + public void HeartbeatConflictWithDifferentOwnerAfterTimeoutShouldRelease() + { + RunTest(() => + { + AcquireLease(); + ExpectHeartBeat(); + Granted.Value.Should().BeTrue(); + + // Heartbeat times out on client + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + UpdateProbe.Reply(new Status.Failure( + new LeaseException("API server request timed out"))); + + // Retry gets conflict with a DIFFERENT owner — lease was genuinely stolen + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + IncrementVersion(); + UpdateProbe.Reply( + new Left( + new LeaseResource("another-node-stole-it", CurrentVersion, CurrentTime))); + + // Should release — this is a genuine conflict + AwaitAssert(() => + { + Granted.Value.Should().BeFalse(); + }); + }); + } + [Fact(DisplayName = "lock should be acquire-able after heart beat conflict")] public void LockShouldAcquireAfterHeartBeatConflict() { diff --git a/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs b/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs index 36e9f8306..d5b59cd29 100644 --- a/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs +++ b/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs @@ -404,6 +404,18 @@ public LeaseActor(IAzureApi client, LeaseSettings settings, string leaseName, At return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow)); case WriteResponse {Response: Left resource}: + if (string.Equals(resource.Value.Owner, _ownerName, StringComparison.Ordinal)) + { + // Self-conflict: our previous PUT succeeded on the server but timed out + // on the client, so we retried with a stale version. The lease is still ours — + // update the version and continue heartbeating. + _log.Warning( + "Heartbeat conflict for lease {0}: current holder is [{1}] (we are [{2}]). " + + "Lease is still ours, updating version from {3} to {4} and continuing.", + leaseName, resource.Value.Owner, _ownerName, version, resource.Value.Version); + Timers!.StartSingleTimer("heartbeat", Heartbeat.Instance, settings.TimeoutSettings.HeartbeatInterval); + return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow)); + } _log.Warning("Conflict during heartbeat to lease {0}. Lease assumed to be released.", resource.Value); localGranted.GetAndSet(false); ExecuteLeaseLockCallback(leaseLost, null); diff --git a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs index 733e27910..07b7c6279 100644 --- a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs +++ b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs @@ -450,6 +450,128 @@ public void HeartBeatFailureShouldNotCallLeaseLostCallbackDuringRetry() }); } + /// + /// Reproduces the self-conflict bug: a heartbeat PUT succeeds on the API server but times out + /// on the client. The actor retries (per #3404 fix) with a stale version. The retry gets a CAS + /// conflict (Left<>), but the conflict response shows the SAME owner — the node is fighting + /// itself because its previous PUT actually succeeded and bumped the version. + /// + /// Current behavior (BUG): the actor unconditionally releases the lease on any heartbeat conflict. + /// Expected behavior: the actor should recognize it still owns the lease, update the version, + /// and stay in Granted state. + /// + [Fact(DisplayName = "Bug #3407: heartbeat self-conflict after timeout should stay granted")] + public void HeartbeatSelfConflictAfterTimeoutShouldStayGranted() + { + RunTest(() => + { + AcquireLease(); + ExpectHeartBeat(); + Granted.Value.Should().BeTrue(); + + // Step 1: Heartbeat fires, but PUT times out on client (server succeeded and bumped version) + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + UpdateProbe.Reply(new Status.Failure( + new LeaseException("API server request timed out"))); + + // Should still be granted — within TTL retry window (#3404 fix) + Granted.Value.Should().BeTrue(); + + // Step 2: Actor retries heartbeat with stale version (server already moved to next version) + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + + // Step 3: Retry gets CAS conflict — but the owner is US (previous PUT succeeded) + // The server version moved forward because our timed-out PUT actually went through + IncrementVersion(); + UpdateProbe.Reply( + new Left( + new LeaseResource(OwnerName, CurrentVersion, CurrentTime))); + + // BUG: Actor currently releases the lease here (lines 400-404) + // EXPECTED: Actor should recognize it's still the owner and stay Granted + Granted.Value.Should().BeTrue(); + + // Step 4: Heartbeat should continue normally with the updated version + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + IncrementVersion(); + UpdateProbe.Reply( + new Right( + new LeaseResource(OwnerName, CurrentVersion, CurrentTime))); + + Granted.Value.Should().BeTrue(); + }); + } + + /// + /// Verifies that the lease lost callback is NOT called when a heartbeat self-conflict occurs. + /// The node still owns the lease — the conflict is only due to a stale version from a + /// previously-timed-out-but-successful PUT. + /// + [Fact(DisplayName = "Bug #3407: heartbeat self-conflict should not call lease lost callback")] + public void HeartbeatSelfConflictShouldNotCallLeaseLostCallback() + { + RunTest(() => + { + var callbackCalled = false; + AcquireLease(e => + { + callbackCalled = true; + }); + ExpectHeartBeat(); + Granted.Value.Should().BeTrue(); + + // Heartbeat times out on client + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + UpdateProbe.Reply(new Status.Failure( + new LeaseException("API server request timed out"))); + + // Retry gets self-conflict + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + IncrementVersion(); + UpdateProbe.Reply( + new Left( + new LeaseResource(OwnerName, CurrentVersion, CurrentTime))); + + // Callback should NOT be called — we still own the lease + callbackCalled.Should().BeFalse(); + Granted.Value.Should().BeTrue(); + }); + } + + /// + /// Verifies that a genuine conflict (different owner) during heartbeat still correctly + /// releases the lease. This ensures the self-conflict fix doesn't break the existing + /// behavior for real conflicts. + /// + [Fact(DisplayName = "Bug #3407: heartbeat conflict with different owner after timeout should still release")] + public void HeartbeatConflictWithDifferentOwnerAfterTimeoutShouldRelease() + { + RunTest(() => + { + AcquireLease(); + ExpectHeartBeat(); + Granted.Value.Should().BeTrue(); + + // Heartbeat times out on client + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + UpdateProbe.Reply(new Status.Failure( + new LeaseException("API server request timed out"))); + + // Retry gets conflict with a DIFFERENT owner — lease was genuinely stolen + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + IncrementVersion(); + UpdateProbe.Reply( + new Left( + new LeaseResource("another-node-stole-it", CurrentVersion, CurrentTime))); + + // Should release — this is a genuine conflict + AwaitAssert(() => + { + Granted.Value.Should().BeFalse(); + }); + }); + } + [Fact(DisplayName = "lock should be acquire-able after heart beat conflict")] public void LockShouldAcquireAfterHeartBeatConflict() { diff --git a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs index b35fcfec1..64845f518 100644 --- a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs +++ b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs @@ -398,6 +398,18 @@ public LeaseActor(IKubernetesApi client, LeaseSettings settings, string leaseNam return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow)); case WriteResponse {Response: Left resource}: + if (string.Equals(resource.Value.Owner, _ownerName, StringComparison.Ordinal)) + { + // Self-conflict: our previous PUT succeeded on the server but timed out + // on the client, so we retried with a stale version. The lease is still ours — + // update the version and continue heartbeating. + _log.Warning( + "Heartbeat conflict for lease {0}: current holder is [{1}] (we are [{2}]). " + + "Lease is still ours, updating version from {3} to {4} and continuing.", + leaseName, resource.Value.Owner, _ownerName, version, resource.Value.Version); + Timers!.StartSingleTimer("heartbeat", Heartbeat.Instance, settings.TimeoutSettings.HeartbeatInterval); + return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow)); + } _log.Warning("Conflict during heartbeat to lease {0}. Lease assumed to be released.", resource.Value); localGranted.GetAndSet(false); ExecuteLeaseLockCallback(leaseLost, null); From 0ff5f629b6e40b99a5bb861c2147b03cab1b4995 Mon Sep 17 00:00:00 2001 From: Aaron Stannard Date: Wed, 18 Mar 2026 20:15:21 +0000 Subject: [PATCH 2/2] Simplify self-conflict log message to reduce noise --- .../azure/Akka.Coordination.Azure/LeaseActor.cs | 6 +++--- .../Akka.Coordination.KubernetesApi/LeaseActor.cs | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs b/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs index d5b59cd29..82c6252dd 100644 --- a/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs +++ b/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs @@ -410,9 +410,9 @@ public LeaseActor(IAzureApi client, LeaseSettings settings, string leaseName, At // on the client, so we retried with a stale version. The lease is still ours — // update the version and continue heartbeating. _log.Warning( - "Heartbeat conflict for lease {0}: current holder is [{1}] (we are [{2}]). " + - "Lease is still ours, updating version from {3} to {4} and continuing.", - leaseName, resource.Value.Owner, _ownerName, version, resource.Value.Version); + "Heartbeat self-conflict for lease {0}: our previous write succeeded but timed out. " + + "Updating version from {1} to {2} and continuing.", + leaseName, version, resource.Value.Version); Timers!.StartSingleTimer("heartbeat", Heartbeat.Instance, settings.TimeoutSettings.HeartbeatInterval); return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow)); } diff --git a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs index 64845f518..abcb7b7fd 100644 --- a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs +++ b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs @@ -404,9 +404,9 @@ public LeaseActor(IKubernetesApi client, LeaseSettings settings, string leaseNam // on the client, so we retried with a stale version. The lease is still ours — // update the version and continue heartbeating. _log.Warning( - "Heartbeat conflict for lease {0}: current holder is [{1}] (we are [{2}]). " + - "Lease is still ours, updating version from {3} to {4} and continuing.", - leaseName, resource.Value.Owner, _ownerName, version, resource.Value.Version); + "Heartbeat self-conflict for lease {0}: our previous write succeeded but timed out. " + + "Updating version from {1} to {2} and continuing.", + leaseName, version, resource.Value.Version); Timers!.StartSingleTimer("heartbeat", Heartbeat.Instance, settings.TimeoutSettings.HeartbeatInterval); return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow)); }