diff --git a/src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs b/src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs index f29e6fbff..54007f907 100644 --- a/src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs +++ b/src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs @@ -452,6 +452,128 @@ public void HeartBeatFailureShouldNotCallLeaseLostCallbackDuringRetry() }); } + /// + /// Reproduces the self-conflict bug: a heartbeat PUT succeeds on the API server but times out + /// on the client. The actor retries (per #3404 fix) with a stale version. The retry gets a CAS + /// conflict (Left<>), but the conflict response shows the SAME owner — the node is fighting + /// itself because its previous PUT actually succeeded and bumped the version. + /// + /// Current behavior (BUG): the actor unconditionally releases the lease on any heartbeat conflict. + /// Expected behavior: the actor should recognize it still owns the lease, update the version, + /// and stay in Granted state. + /// + [Fact(DisplayName = "Bug #3407: heartbeat self-conflict after timeout should stay granted")] + public void HeartbeatSelfConflictAfterTimeoutShouldStayGranted() + { + RunTest(() => + { + AcquireLease(); + ExpectHeartBeat(); + Granted.Value.Should().BeTrue(); + + // Step 1: Heartbeat fires, but PUT times out on client (server succeeded and bumped version) + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + UpdateProbe.Reply(new Status.Failure( + new LeaseException("API server request timed out"))); + + // Should still be granted — within TTL retry window (#3404 fix) + Granted.Value.Should().BeTrue(); + + // Step 2: Actor retries heartbeat with stale version (server already moved to next version) + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + + // Step 3: Retry gets CAS conflict — but the owner is US (previous PUT succeeded) + // The server version moved forward because our timed-out PUT actually went through + IncrementVersion(); + UpdateProbe.Reply( + new Left( + new LeaseResource(OwnerName, CurrentVersion, CurrentTime))); + + // BUG: Actor currently releases the lease here (lines 400-404) + // EXPECTED: Actor should recognize it's still the owner and stay Granted + Granted.Value.Should().BeTrue(); + + // Step 4: Heartbeat should continue normally with the updated version + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + IncrementVersion(); + UpdateProbe.Reply( + new Right( + new LeaseResource(OwnerName, CurrentVersion, CurrentTime))); + + Granted.Value.Should().BeTrue(); + }); + } + + /// + /// Verifies that the lease lost callback is NOT called when a heartbeat self-conflict occurs. + /// The node still owns the lease — the conflict is only due to a stale version from a + /// previously-timed-out-but-successful PUT. + /// + [Fact(DisplayName = "Bug #3407: heartbeat self-conflict should not call lease lost callback")] + public void HeartbeatSelfConflictShouldNotCallLeaseLostCallback() + { + RunTest(() => + { + var callbackCalled = false; + AcquireLease(e => + { + callbackCalled = true; + }); + ExpectHeartBeat(); + Granted.Value.Should().BeTrue(); + + // Heartbeat times out on client + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + UpdateProbe.Reply(new Status.Failure( + new LeaseException("API server request timed out"))); + + // Retry gets self-conflict + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + IncrementVersion(); + UpdateProbe.Reply( + new Left( + new LeaseResource(OwnerName, CurrentVersion, CurrentTime))); + + // Callback should NOT be called — we still own the lease + callbackCalled.Should().BeFalse(); + Granted.Value.Should().BeTrue(); + }); + } + + /// + /// Verifies that a genuine conflict (different owner) during heartbeat still correctly + /// releases the lease. This ensures the self-conflict fix doesn't break the existing + /// behavior for real conflicts. + /// + [Fact(DisplayName = "Bug #3407: heartbeat conflict with different owner after timeout should still release")] + public void HeartbeatConflictWithDifferentOwnerAfterTimeoutShouldRelease() + { + RunTest(() => + { + AcquireLease(); + ExpectHeartBeat(); + Granted.Value.Should().BeTrue(); + + // Heartbeat times out on client + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + UpdateProbe.Reply(new Status.Failure( + new LeaseException("API server request timed out"))); + + // Retry gets conflict with a DIFFERENT owner — lease was genuinely stolen + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + IncrementVersion(); + UpdateProbe.Reply( + new Left( + new LeaseResource("another-node-stole-it", CurrentVersion, CurrentTime))); + + // Should release — this is a genuine conflict + AwaitAssert(() => + { + Granted.Value.Should().BeFalse(); + }); + }); + } + [Fact(DisplayName = "lock should be acquire-able after heart beat conflict")] public void LockShouldAcquireAfterHeartBeatConflict() { diff --git a/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs b/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs index 36e9f8306..82c6252dd 100644 --- a/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs +++ b/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs @@ -404,6 +404,18 @@ public LeaseActor(IAzureApi client, LeaseSettings settings, string leaseName, At return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow)); case WriteResponse {Response: Left resource}: + if (string.Equals(resource.Value.Owner, _ownerName, StringComparison.Ordinal)) + { + // Self-conflict: our previous PUT succeeded on the server but timed out + // on the client, so we retried with a stale version. The lease is still ours — + // update the version and continue heartbeating. + _log.Warning( + "Heartbeat self-conflict for lease {0}: our previous write succeeded but timed out. " + + "Updating version from {1} to {2} and continuing.", + leaseName, version, resource.Value.Version); + Timers!.StartSingleTimer("heartbeat", Heartbeat.Instance, settings.TimeoutSettings.HeartbeatInterval); + return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow)); + } _log.Warning("Conflict during heartbeat to lease {0}. Lease assumed to be released.", resource.Value); localGranted.GetAndSet(false); ExecuteLeaseLockCallback(leaseLost, null); diff --git a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs index 733e27910..07b7c6279 100644 --- a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs +++ b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs @@ -450,6 +450,128 @@ public void HeartBeatFailureShouldNotCallLeaseLostCallbackDuringRetry() }); } + /// + /// Reproduces the self-conflict bug: a heartbeat PUT succeeds on the API server but times out + /// on the client. The actor retries (per #3404 fix) with a stale version. The retry gets a CAS + /// conflict (Left<>), but the conflict response shows the SAME owner — the node is fighting + /// itself because its previous PUT actually succeeded and bumped the version. + /// + /// Current behavior (BUG): the actor unconditionally releases the lease on any heartbeat conflict. + /// Expected behavior: the actor should recognize it still owns the lease, update the version, + /// and stay in Granted state. + /// + [Fact(DisplayName = "Bug #3407: heartbeat self-conflict after timeout should stay granted")] + public void HeartbeatSelfConflictAfterTimeoutShouldStayGranted() + { + RunTest(() => + { + AcquireLease(); + ExpectHeartBeat(); + Granted.Value.Should().BeTrue(); + + // Step 1: Heartbeat fires, but PUT times out on client (server succeeded and bumped version) + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + UpdateProbe.Reply(new Status.Failure( + new LeaseException("API server request timed out"))); + + // Should still be granted — within TTL retry window (#3404 fix) + Granted.Value.Should().BeTrue(); + + // Step 2: Actor retries heartbeat with stale version (server already moved to next version) + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + + // Step 3: Retry gets CAS conflict — but the owner is US (previous PUT succeeded) + // The server version moved forward because our timed-out PUT actually went through + IncrementVersion(); + UpdateProbe.Reply( + new Left( + new LeaseResource(OwnerName, CurrentVersion, CurrentTime))); + + // BUG: Actor currently releases the lease here (lines 400-404) + // EXPECTED: Actor should recognize it's still the owner and stay Granted + Granted.Value.Should().BeTrue(); + + // Step 4: Heartbeat should continue normally with the updated version + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + IncrementVersion(); + UpdateProbe.Reply( + new Right( + new LeaseResource(OwnerName, CurrentVersion, CurrentTime))); + + Granted.Value.Should().BeTrue(); + }); + } + + /// + /// Verifies that the lease lost callback is NOT called when a heartbeat self-conflict occurs. + /// The node still owns the lease — the conflict is only due to a stale version from a + /// previously-timed-out-but-successful PUT. + /// + [Fact(DisplayName = "Bug #3407: heartbeat self-conflict should not call lease lost callback")] + public void HeartbeatSelfConflictShouldNotCallLeaseLostCallback() + { + RunTest(() => + { + var callbackCalled = false; + AcquireLease(e => + { + callbackCalled = true; + }); + ExpectHeartBeat(); + Granted.Value.Should().BeTrue(); + + // Heartbeat times out on client + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + UpdateProbe.Reply(new Status.Failure( + new LeaseException("API server request timed out"))); + + // Retry gets self-conflict + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + IncrementVersion(); + UpdateProbe.Reply( + new Left( + new LeaseResource(OwnerName, CurrentVersion, CurrentTime))); + + // Callback should NOT be called — we still own the lease + callbackCalled.Should().BeFalse(); + Granted.Value.Should().BeTrue(); + }); + } + + /// + /// Verifies that a genuine conflict (different owner) during heartbeat still correctly + /// releases the lease. This ensures the self-conflict fix doesn't break the existing + /// behavior for real conflicts. + /// + [Fact(DisplayName = "Bug #3407: heartbeat conflict with different owner after timeout should still release")] + public void HeartbeatConflictWithDifferentOwnerAfterTimeoutShouldRelease() + { + RunTest(() => + { + AcquireLease(); + ExpectHeartBeat(); + Granted.Value.Should().BeTrue(); + + // Heartbeat times out on client + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + UpdateProbe.Reply(new Status.Failure( + new LeaseException("API server request timed out"))); + + // Retry gets conflict with a DIFFERENT owner — lease was genuinely stolen + UpdateProbe.ExpectMsg((OwnerName, CurrentVersion)); + IncrementVersion(); + UpdateProbe.Reply( + new Left( + new LeaseResource("another-node-stole-it", CurrentVersion, CurrentTime))); + + // Should release — this is a genuine conflict + AwaitAssert(() => + { + Granted.Value.Should().BeFalse(); + }); + }); + } + [Fact(DisplayName = "lock should be acquire-able after heart beat conflict")] public void LockShouldAcquireAfterHeartBeatConflict() { diff --git a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs index b35fcfec1..abcb7b7fd 100644 --- a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs +++ b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs @@ -398,6 +398,18 @@ public LeaseActor(IKubernetesApi client, LeaseSettings settings, string leaseNam return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow)); case WriteResponse {Response: Left resource}: + if (string.Equals(resource.Value.Owner, _ownerName, StringComparison.Ordinal)) + { + // Self-conflict: our previous PUT succeeded on the server but timed out + // on the client, so we retried with a stale version. The lease is still ours — + // update the version and continue heartbeating. + _log.Warning( + "Heartbeat self-conflict for lease {0}: our previous write succeeded but timed out. " + + "Updating version from {1} to {2} and continuing.", + leaseName, version, resource.Value.Version); + Timers!.StartSingleTimer("heartbeat", Heartbeat.Instance, settings.TimeoutSettings.HeartbeatInterval); + return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow)); + } _log.Warning("Conflict during heartbeat to lease {0}. Lease assumed to be released.", resource.Value); localGranted.GetAndSet(false); ExecuteLeaseLockCallback(leaseLost, null);