diff --git a/src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs b/src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs
index f29e6fbff..54007f907 100644
--- a/src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs
+++ b/src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs
@@ -452,6 +452,128 @@ public void HeartBeatFailureShouldNotCallLeaseLostCallbackDuringRetry()
});
}
+ ///
+ /// Reproduces the self-conflict bug: a heartbeat PUT succeeds on the API server but times out
+ /// on the client. The actor retries (per #3404 fix) with a stale version. The retry gets a CAS
+ /// conflict (Left<>), but the conflict response shows the SAME owner — the node is fighting
+ /// itself because its previous PUT actually succeeded and bumped the version.
+ ///
+ /// Current behavior (BUG): the actor unconditionally releases the lease on any heartbeat conflict.
+ /// Expected behavior: the actor should recognize it still owns the lease, update the version,
+ /// and stay in Granted state.
+ ///
+ [Fact(DisplayName = "Bug #3407: heartbeat self-conflict after timeout should stay granted")]
+ public void HeartbeatSelfConflictAfterTimeoutShouldStayGranted()
+ {
+ RunTest(() =>
+ {
+ AcquireLease();
+ ExpectHeartBeat();
+ Granted.Value.Should().BeTrue();
+
+ // Step 1: Heartbeat fires, but PUT times out on client (server succeeded and bumped version)
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+ UpdateProbe.Reply(new Status.Failure(
+ new LeaseException("API server request timed out")));
+
+ // Should still be granted — within TTL retry window (#3404 fix)
+ Granted.Value.Should().BeTrue();
+
+ // Step 2: Actor retries heartbeat with stale version (server already moved to next version)
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+
+ // Step 3: Retry gets CAS conflict — but the owner is US (previous PUT succeeded)
+ // The server version moved forward because our timed-out PUT actually went through
+ IncrementVersion();
+ UpdateProbe.Reply(
+ new Left(
+ new LeaseResource(OwnerName, CurrentVersion, CurrentTime)));
+
+ // BUG: Actor currently releases the lease here (lines 400-404)
+ // EXPECTED: Actor should recognize it's still the owner and stay Granted
+ Granted.Value.Should().BeTrue();
+
+ // Step 4: Heartbeat should continue normally with the updated version
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+ IncrementVersion();
+ UpdateProbe.Reply(
+ new Right(
+ new LeaseResource(OwnerName, CurrentVersion, CurrentTime)));
+
+ Granted.Value.Should().BeTrue();
+ });
+ }
+
+ ///
+ /// Verifies that the lease lost callback is NOT called when a heartbeat self-conflict occurs.
+ /// The node still owns the lease — the conflict is only due to a stale version from a
+ /// previously-timed-out-but-successful PUT.
+ ///
+ [Fact(DisplayName = "Bug #3407: heartbeat self-conflict should not call lease lost callback")]
+ public void HeartbeatSelfConflictShouldNotCallLeaseLostCallback()
+ {
+ RunTest(() =>
+ {
+ var callbackCalled = false;
+ AcquireLease(e =>
+ {
+ callbackCalled = true;
+ });
+ ExpectHeartBeat();
+ Granted.Value.Should().BeTrue();
+
+ // Heartbeat times out on client
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+ UpdateProbe.Reply(new Status.Failure(
+ new LeaseException("API server request timed out")));
+
+ // Retry gets self-conflict
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+ IncrementVersion();
+ UpdateProbe.Reply(
+ new Left(
+ new LeaseResource(OwnerName, CurrentVersion, CurrentTime)));
+
+ // Callback should NOT be called — we still own the lease
+ callbackCalled.Should().BeFalse();
+ Granted.Value.Should().BeTrue();
+ });
+ }
+
+ ///
+ /// Verifies that a genuine conflict (different owner) during heartbeat still correctly
+ /// releases the lease. This ensures the self-conflict fix doesn't break the existing
+ /// behavior for real conflicts.
+ ///
+ [Fact(DisplayName = "Bug #3407: heartbeat conflict with different owner after timeout should still release")]
+ public void HeartbeatConflictWithDifferentOwnerAfterTimeoutShouldRelease()
+ {
+ RunTest(() =>
+ {
+ AcquireLease();
+ ExpectHeartBeat();
+ Granted.Value.Should().BeTrue();
+
+ // Heartbeat times out on client
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+ UpdateProbe.Reply(new Status.Failure(
+ new LeaseException("API server request timed out")));
+
+ // Retry gets conflict with a DIFFERENT owner — lease was genuinely stolen
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+ IncrementVersion();
+ UpdateProbe.Reply(
+ new Left(
+ new LeaseResource("another-node-stole-it", CurrentVersion, CurrentTime)));
+
+ // Should release — this is a genuine conflict
+ AwaitAssert(() =>
+ {
+ Granted.Value.Should().BeFalse();
+ });
+ });
+ }
+
[Fact(DisplayName = "lock should be acquire-able after heart beat conflict")]
public void LockShouldAcquireAfterHeartBeatConflict()
{
diff --git a/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs b/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs
index 36e9f8306..82c6252dd 100644
--- a/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs
+++ b/src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs
@@ -404,6 +404,18 @@ public LeaseActor(IAzureApi client, LeaseSettings settings, string leaseName, At
return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow));
case WriteResponse {Response: Left resource}:
+ if (string.Equals(resource.Value.Owner, _ownerName, StringComparison.Ordinal))
+ {
+ // Self-conflict: our previous PUT succeeded on the server but timed out
+ // on the client, so we retried with a stale version. The lease is still ours —
+ // update the version and continue heartbeating.
+ _log.Warning(
+ "Heartbeat self-conflict for lease {0}: our previous write succeeded but timed out. " +
+ "Updating version from {1} to {2} and continuing.",
+ leaseName, version, resource.Value.Version);
+ Timers!.StartSingleTimer("heartbeat", Heartbeat.Instance, settings.TimeoutSettings.HeartbeatInterval);
+ return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow));
+ }
_log.Warning("Conflict during heartbeat to lease {0}. Lease assumed to be released.", resource.Value);
localGranted.GetAndSet(false);
ExecuteLeaseLockCallback(leaseLost, null);
diff --git a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs
index 733e27910..07b7c6279 100644
--- a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs
+++ b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi.Tests/LeaseActorSpec.cs
@@ -450,6 +450,128 @@ public void HeartBeatFailureShouldNotCallLeaseLostCallbackDuringRetry()
});
}
+ ///
+ /// Reproduces the self-conflict bug: a heartbeat PUT succeeds on the API server but times out
+ /// on the client. The actor retries (per #3404 fix) with a stale version. The retry gets a CAS
+ /// conflict (Left<>), but the conflict response shows the SAME owner — the node is fighting
+ /// itself because its previous PUT actually succeeded and bumped the version.
+ ///
+ /// Current behavior (BUG): the actor unconditionally releases the lease on any heartbeat conflict.
+ /// Expected behavior: the actor should recognize it still owns the lease, update the version,
+ /// and stay in Granted state.
+ ///
+ [Fact(DisplayName = "Bug #3407: heartbeat self-conflict after timeout should stay granted")]
+ public void HeartbeatSelfConflictAfterTimeoutShouldStayGranted()
+ {
+ RunTest(() =>
+ {
+ AcquireLease();
+ ExpectHeartBeat();
+ Granted.Value.Should().BeTrue();
+
+ // Step 1: Heartbeat fires, but PUT times out on client (server succeeded and bumped version)
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+ UpdateProbe.Reply(new Status.Failure(
+ new LeaseException("API server request timed out")));
+
+ // Should still be granted — within TTL retry window (#3404 fix)
+ Granted.Value.Should().BeTrue();
+
+ // Step 2: Actor retries heartbeat with stale version (server already moved to next version)
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+
+ // Step 3: Retry gets CAS conflict — but the owner is US (previous PUT succeeded)
+ // The server version moved forward because our timed-out PUT actually went through
+ IncrementVersion();
+ UpdateProbe.Reply(
+ new Left(
+ new LeaseResource(OwnerName, CurrentVersion, CurrentTime)));
+
+ // BUG: Actor currently releases the lease here (lines 400-404)
+ // EXPECTED: Actor should recognize it's still the owner and stay Granted
+ Granted.Value.Should().BeTrue();
+
+ // Step 4: Heartbeat should continue normally with the updated version
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+ IncrementVersion();
+ UpdateProbe.Reply(
+ new Right(
+ new LeaseResource(OwnerName, CurrentVersion, CurrentTime)));
+
+ Granted.Value.Should().BeTrue();
+ });
+ }
+
+ ///
+ /// Verifies that the lease lost callback is NOT called when a heartbeat self-conflict occurs.
+ /// The node still owns the lease — the conflict is only due to a stale version from a
+ /// previously-timed-out-but-successful PUT.
+ ///
+ [Fact(DisplayName = "Bug #3407: heartbeat self-conflict should not call lease lost callback")]
+ public void HeartbeatSelfConflictShouldNotCallLeaseLostCallback()
+ {
+ RunTest(() =>
+ {
+ var callbackCalled = false;
+ AcquireLease(e =>
+ {
+ callbackCalled = true;
+ });
+ ExpectHeartBeat();
+ Granted.Value.Should().BeTrue();
+
+ // Heartbeat times out on client
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+ UpdateProbe.Reply(new Status.Failure(
+ new LeaseException("API server request timed out")));
+
+ // Retry gets self-conflict
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+ IncrementVersion();
+ UpdateProbe.Reply(
+ new Left(
+ new LeaseResource(OwnerName, CurrentVersion, CurrentTime)));
+
+ // Callback should NOT be called — we still own the lease
+ callbackCalled.Should().BeFalse();
+ Granted.Value.Should().BeTrue();
+ });
+ }
+
+ ///
+ /// Verifies that a genuine conflict (different owner) during heartbeat still correctly
+ /// releases the lease. This ensures the self-conflict fix doesn't break the existing
+ /// behavior for real conflicts.
+ ///
+ [Fact(DisplayName = "Bug #3407: heartbeat conflict with different owner after timeout should still release")]
+ public void HeartbeatConflictWithDifferentOwnerAfterTimeoutShouldRelease()
+ {
+ RunTest(() =>
+ {
+ AcquireLease();
+ ExpectHeartBeat();
+ Granted.Value.Should().BeTrue();
+
+ // Heartbeat times out on client
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+ UpdateProbe.Reply(new Status.Failure(
+ new LeaseException("API server request timed out")));
+
+ // Retry gets conflict with a DIFFERENT owner — lease was genuinely stolen
+ UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
+ IncrementVersion();
+ UpdateProbe.Reply(
+ new Left(
+ new LeaseResource("another-node-stole-it", CurrentVersion, CurrentTime)));
+
+ // Should release — this is a genuine conflict
+ AwaitAssert(() =>
+ {
+ Granted.Value.Should().BeFalse();
+ });
+ });
+ }
+
[Fact(DisplayName = "lock should be acquire-able after heart beat conflict")]
public void LockShouldAcquireAfterHeartBeatConflict()
{
diff --git a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs
index b35fcfec1..abcb7b7fd 100644
--- a/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs
+++ b/src/coordination/kubernetes/Akka.Coordination.KubernetesApi/LeaseActor.cs
@@ -398,6 +398,18 @@ public LeaseActor(IKubernetesApi client, LeaseSettings settings, string leaseNam
return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow));
case WriteResponse {Response: Left resource}:
+ if (string.Equals(resource.Value.Owner, _ownerName, StringComparison.Ordinal))
+ {
+ // Self-conflict: our previous PUT succeeded on the server but timed out
+ // on the client, so we retried with a stale version. The lease is still ours —
+ // update the version and continue heartbeating.
+ _log.Warning(
+ "Heartbeat self-conflict for lease {0}: our previous write succeeded but timed out. " +
+ "Updating version from {1} to {2} and continuing.",
+ leaseName, version, resource.Value.Version);
+ Timers!.StartSingleTimer("heartbeat", Heartbeat.Instance, settings.TimeoutSettings.HeartbeatInterval);
+ return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow));
+ }
_log.Warning("Conflict during heartbeat to lease {0}. Lease assumed to be released.", resource.Value);
localGranted.GetAndSet(false);
ExecuteLeaseLockCallback(leaseLost, null);