Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 122 additions & 0 deletions src/coordination/azure/Akka.Coordination.Azure.Tests/LeaseActorSpec.cs
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,128 @@ public void HeartBeatFailureShouldNotCallLeaseLostCallbackDuringRetry()
});
}

/// <summary>
/// Reproduces the self-conflict bug: a heartbeat PUT succeeds on the API server but times out
/// on the client. The actor retries (per #3404 fix) with a stale version. The retry gets a CAS
/// conflict (Left&lt;&gt;), but the conflict response shows the SAME owner — the node is fighting
/// itself because its previous PUT actually succeeded and bumped the version.
///
/// Current behavior (BUG): the actor unconditionally releases the lease on any heartbeat conflict.
/// Expected behavior: the actor should recognize it still owns the lease, update the version,
/// and stay in Granted state.
/// </summary>
[Fact(DisplayName = "Bug #3407: heartbeat self-conflict after timeout should stay granted")]
public void HeartbeatSelfConflictAfterTimeoutShouldStayGranted()
{
RunTest(() =>
{
AcquireLease();
ExpectHeartBeat();
Granted.Value.Should().BeTrue();

// Step 1: Heartbeat fires, but PUT times out on client (server succeeded and bumped version)
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
UpdateProbe.Reply(new Status.Failure(
new LeaseException("API server request timed out")));

// Should still be granted — within TTL retry window (#3404 fix)
Granted.Value.Should().BeTrue();

// Step 2: Actor retries heartbeat with stale version (server already moved to next version)
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));

// Step 3: Retry gets CAS conflict — but the owner is US (previous PUT succeeded)
// The server version moved forward because our timed-out PUT actually went through
IncrementVersion();
UpdateProbe.Reply(
new Left<LeaseResource, LeaseResource>(
new LeaseResource(OwnerName, CurrentVersion, CurrentTime)));

// BUG: Actor currently releases the lease here (lines 400-404)
// EXPECTED: Actor should recognize it's still the owner and stay Granted
Granted.Value.Should().BeTrue();

// Step 4: Heartbeat should continue normally with the updated version
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
IncrementVersion();
UpdateProbe.Reply(
new Right<LeaseResource, LeaseResource>(
new LeaseResource(OwnerName, CurrentVersion, CurrentTime)));

Granted.Value.Should().BeTrue();
});
}

/// <summary>
/// Verifies that the lease lost callback is NOT called when a heartbeat self-conflict occurs.
/// The node still owns the lease — the conflict is only due to a stale version from a
/// previously-timed-out-but-successful PUT.
/// </summary>
[Fact(DisplayName = "Bug #3407: heartbeat self-conflict should not call lease lost callback")]
public void HeartbeatSelfConflictShouldNotCallLeaseLostCallback()
{
RunTest(() =>
{
var callbackCalled = false;
AcquireLease(e =>
{
callbackCalled = true;
});
ExpectHeartBeat();
Granted.Value.Should().BeTrue();

// Heartbeat times out on client
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
UpdateProbe.Reply(new Status.Failure(
new LeaseException("API server request timed out")));

// Retry gets self-conflict
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
IncrementVersion();
UpdateProbe.Reply(
new Left<LeaseResource, LeaseResource>(
new LeaseResource(OwnerName, CurrentVersion, CurrentTime)));

// Callback should NOT be called — we still own the lease
callbackCalled.Should().BeFalse();
Granted.Value.Should().BeTrue();
});
}

/// <summary>
/// Verifies that a genuine conflict (different owner) during heartbeat still correctly
/// releases the lease. This ensures the self-conflict fix doesn't break the existing
/// behavior for real conflicts.
/// </summary>
[Fact(DisplayName = "Bug #3407: heartbeat conflict with different owner after timeout should still release")]
public void HeartbeatConflictWithDifferentOwnerAfterTimeoutShouldRelease()
{
RunTest(() =>
{
AcquireLease();
ExpectHeartBeat();
Granted.Value.Should().BeTrue();

// Heartbeat times out on client
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
UpdateProbe.Reply(new Status.Failure(
new LeaseException("API server request timed out")));

// Retry gets conflict with a DIFFERENT owner — lease was genuinely stolen
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
IncrementVersion();
UpdateProbe.Reply(
new Left<LeaseResource, LeaseResource>(
new LeaseResource("another-node-stole-it", CurrentVersion, CurrentTime)));

// Should release — this is a genuine conflict
AwaitAssert(() =>
{
Granted.Value.Should().BeFalse();
});
});
}

[Fact(DisplayName = "lock should be acquire-able after heart beat conflict")]
public void LockShouldAcquireAfterHeartBeatConflict()
{
Expand Down
12 changes: 12 additions & 0 deletions src/coordination/azure/Akka.Coordination.Azure/LeaseActor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,18 @@ public LeaseActor(IAzureApi client, LeaseSettings settings, string leaseName, At
return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow));

case WriteResponse {Response: Left<LeaseResource, LeaseResource> resource}:
if (string.Equals(resource.Value.Owner, _ownerName, StringComparison.Ordinal))
{
// Self-conflict: our previous PUT succeeded on the server but timed out
// on the client, so we retried with a stale version. The lease is still ours —
// update the version and continue heartbeating.
_log.Warning(
"Heartbeat self-conflict for lease {0}: our previous write succeeded but timed out. " +
"Updating version from {1} to {2} and continuing.",
leaseName, version, resource.Value.Version);
Timers!.StartSingleTimer("heartbeat", Heartbeat.Instance, settings.TimeoutSettings.HeartbeatInterval);
return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow));
}
_log.Warning("Conflict during heartbeat to lease {0}. Lease assumed to be released.", resource.Value);
localGranted.GetAndSet(false);
ExecuteLeaseLockCallback(leaseLost, null);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,128 @@ public void HeartBeatFailureShouldNotCallLeaseLostCallbackDuringRetry()
});
}

/// <summary>
/// Reproduces the self-conflict bug: a heartbeat PUT succeeds on the API server but times out
/// on the client. The actor retries (per #3404 fix) with a stale version. The retry gets a CAS
/// conflict (Left&lt;&gt;), but the conflict response shows the SAME owner — the node is fighting
/// itself because its previous PUT actually succeeded and bumped the version.
///
/// Current behavior (BUG): the actor unconditionally releases the lease on any heartbeat conflict.
/// Expected behavior: the actor should recognize it still owns the lease, update the version,
/// and stay in Granted state.
/// </summary>
[Fact(DisplayName = "Bug #3407: heartbeat self-conflict after timeout should stay granted")]
public void HeartbeatSelfConflictAfterTimeoutShouldStayGranted()
{
RunTest(() =>
{
AcquireLease();
ExpectHeartBeat();
Granted.Value.Should().BeTrue();

// Step 1: Heartbeat fires, but PUT times out on client (server succeeded and bumped version)
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
UpdateProbe.Reply(new Status.Failure(
new LeaseException("API server request timed out")));

// Should still be granted — within TTL retry window (#3404 fix)
Granted.Value.Should().BeTrue();

// Step 2: Actor retries heartbeat with stale version (server already moved to next version)
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));

// Step 3: Retry gets CAS conflict — but the owner is US (previous PUT succeeded)
// The server version moved forward because our timed-out PUT actually went through
IncrementVersion();
UpdateProbe.Reply(
new Left<LeaseResource, LeaseResource>(
new LeaseResource(OwnerName, CurrentVersion, CurrentTime)));

// BUG: Actor currently releases the lease here (lines 400-404)
// EXPECTED: Actor should recognize it's still the owner and stay Granted
Granted.Value.Should().BeTrue();

// Step 4: Heartbeat should continue normally with the updated version
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
IncrementVersion();
UpdateProbe.Reply(
new Right<LeaseResource, LeaseResource>(
new LeaseResource(OwnerName, CurrentVersion, CurrentTime)));

Granted.Value.Should().BeTrue();
});
}

/// <summary>
/// Verifies that the lease lost callback is NOT called when a heartbeat self-conflict occurs.
/// The node still owns the lease — the conflict is only due to a stale version from a
/// previously-timed-out-but-successful PUT.
/// </summary>
[Fact(DisplayName = "Bug #3407: heartbeat self-conflict should not call lease lost callback")]
public void HeartbeatSelfConflictShouldNotCallLeaseLostCallback()
{
RunTest(() =>
{
var callbackCalled = false;
AcquireLease(e =>
{
callbackCalled = true;
});
ExpectHeartBeat();
Granted.Value.Should().BeTrue();

// Heartbeat times out on client
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
UpdateProbe.Reply(new Status.Failure(
new LeaseException("API server request timed out")));

// Retry gets self-conflict
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
IncrementVersion();
UpdateProbe.Reply(
new Left<LeaseResource, LeaseResource>(
new LeaseResource(OwnerName, CurrentVersion, CurrentTime)));

// Callback should NOT be called — we still own the lease
callbackCalled.Should().BeFalse();
Granted.Value.Should().BeTrue();
});
}

/// <summary>
/// Verifies that a genuine conflict (different owner) during heartbeat still correctly
/// releases the lease. This ensures the self-conflict fix doesn't break the existing
/// behavior for real conflicts.
/// </summary>
[Fact(DisplayName = "Bug #3407: heartbeat conflict with different owner after timeout should still release")]
public void HeartbeatConflictWithDifferentOwnerAfterTimeoutShouldRelease()
{
RunTest(() =>
{
AcquireLease();
ExpectHeartBeat();
Granted.Value.Should().BeTrue();

// Heartbeat times out on client
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
UpdateProbe.Reply(new Status.Failure(
new LeaseException("API server request timed out")));

// Retry gets conflict with a DIFFERENT owner — lease was genuinely stolen
UpdateProbe.ExpectMsg((OwnerName, CurrentVersion));
IncrementVersion();
UpdateProbe.Reply(
new Left<LeaseResource, LeaseResource>(
new LeaseResource("another-node-stole-it", CurrentVersion, CurrentTime)));

// Should release — this is a genuine conflict
AwaitAssert(() =>
{
Granted.Value.Should().BeFalse();
});
});
}

[Fact(DisplayName = "lock should be acquire-able after heart beat conflict")]
public void LockShouldAcquireAfterHeartBeatConflict()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,18 @@ public LeaseActor(IKubernetesApi client, LeaseSettings settings, string leaseNam
return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow));

case WriteResponse {Response: Left<LeaseResource, LeaseResource> resource}:
if (string.Equals(resource.Value.Owner, _ownerName, StringComparison.Ordinal))
{
// Self-conflict: our previous PUT succeeded on the server but timed out
// on the client, so we retried with a stale version. The lease is still ours —
// update the version and continue heartbeating.
_log.Warning(

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

resource.Value.Owner and _ownerName are always the same so we could clean this log message up and simplify it.

"Heartbeat self-conflict for lease {0}: our previous write succeeded but timed out. " +
"Updating version from {1} to {2} and continuing.",
leaseName, version, resource.Value.Version);
Timers!.StartSingleTimer("heartbeat", Heartbeat.Instance, settings.TimeoutSettings.HeartbeatInterval);
return Stay().Using(gv.Copy(version: resource.Value.Version, lastSuccessfulHeartbeat: DateTime.UtcNow));
}
_log.Warning("Conflict during heartbeat to lease {0}. Lease assumed to be released.", resource.Value);
localGranted.GetAndSet(false);
ExecuteLeaseLockCallback(leaseLost, null);
Expand Down