diff --git a/src/Persistence/PostgresqlTests/Bugs/Bug_advisory_lock_stacking_blocks_failover.cs b/src/Persistence/PostgresqlTests/Bugs/Bug_advisory_lock_stacking_blocks_failover.cs
new file mode 100644
index 000000000..97682164b
--- /dev/null
+++ b/src/Persistence/PostgresqlTests/Bugs/Bug_advisory_lock_stacking_blocks_failover.cs
@@ -0,0 +1,172 @@
+using IntegrationTests;
+using Microsoft.Extensions.Logging.Abstractions;
+using Npgsql;
+using Shouldly;
+using Xunit;
+using AdvisoryLock = Wolverine.Postgresql.AdvisoryLock;
+
+namespace PostgresqlTests.Bugs;
+
+/// <summary>
+/// Regression test for the leader-failover stall introduced by the heartbeat
+/// lease-renewal change in a84d6a262. After that commit
+/// NodeAgentController.DoHealthChecksAsync calls
+/// TryAttainLeadershipLockAsync on every tick — including ticks
+/// where the leader already holds the lock. Postgres session-level
+/// advisory locks STACK: "Multiple lock requests stack, so that if the
+/// same resource is locked three times it must then be unlocked three
+/// times to be released." The leader's lock count therefore grew by one
+/// per heartbeat. The single ReleaseLeadershipLockAsync call during
+/// DisableAgentsAsync / stepDownAsync only decremented once,
+/// leaving the lock still held server-side and silently blocking
+/// failover — no error logged, just a stalled election. Surfaced in
+/// SlowTests.SharedMemory.leadership_compliance.take_over_leader_ship_if_leader_becomes_stale
+/// (fails consistently on main: 0/10 runs in isolation; passes 3/3 with this fix).
+///
+/// The fix makes <see cref="AdvisoryLock.TryAttainLockAsync"/> idempotent
+/// against re-entrant calls on a still-held lock — a no-op short-circuit
+/// when _locks already contains the id and the held connection is
+/// still alive. <see cref="AdvisoryLock.HasLock"/> performs the same
+/// liveness check, so this also keeps the GH-2602 split-brain detection
+/// behaviour intact.
+/// </summary>
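+/// <remarks>
+/// The stacking semantics, sketched in psql (one session unless noted):
+/// <code>
+/// SELECT pg_try_advisory_lock(42);  -- true, session lock count 1
+/// SELECT pg_try_advisory_lock(42);  -- true, count 2 (the same session stacks)
+/// SELECT pg_advisory_unlock(42);    -- true, count back to 1: still held
+/// -- a second session's pg_try_advisory_lock(42) returns false here
+/// </code>
+/// </remarks>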
+public class Bug_advisory_lock_stacking_blocks_failover : PostgresqlContext
+{
+ [Fact]
+ public async Task repeated_TryAttainLockAsync_does_not_stack_so_one_release_actually_releases()
+ {
+ const int lockId = unchecked((int)0xDEADBEEF);
+
+ var dataSource = NpgsqlDataSource.Create(Servers.PostgresConnectionString);
+ try
+ {
+ var holder = new AdvisoryLock(dataSource, NullLogger.Instance, "stacking-test");
+ try
+ {
+ // Simulate ten heartbeat ticks on the same leader: every
+ // DoHealthChecksAsync now calls TryAttainLeadershipLockAsync.
+ // Pre-fix this stacked the Postgres advisory lock ten times.
+ for (var i = 0; i < 10; i++)
+ {
+ (await holder.TryAttainLockAsync(lockId, CancellationToken.None))
+ .ShouldBeTrue($"holder must still report success on tick {i}");
+ }
+
+ holder.HasLock(lockId).ShouldBeTrue();
+
+ // The leader steps down or is disabled — exactly ONE release
+ // call, matching DisableAgentsAsync / stepDownAsync semantics.
+ await holder.ReleaseLockAsync(lockId);
+
+ // A would-be new leader on a different connection tries to
+ // attain. Pre-fix this returned false because the holder's
+ // session still held nine stacked locks; post-fix it
+ // succeeds.
+ var contender = new AdvisoryLock(dataSource, NullLogger.Instance, "contender");
+ try
+ {
+ (await contender.TryAttainLockAsync(lockId, CancellationToken.None))
+ .ShouldBeTrue(
+ "A single ReleaseLockAsync after repeated TryAttainLockAsync calls on the same session " +
+ "must fully release the Postgres advisory lock so a different node can take over.");
+ }
+ finally
+ {
+ await contender.ReleaseLockAsync(lockId);
+ await contender.DisposeAsync();
+ }
+ }
+ finally
+ {
+ await holder.DisposeAsync();
+ }
+ }
+ finally
+ {
+ await dataSource.DisposeAsync();
+ }
+ }
+
+ [Fact]
+ public async Task TryAttainLockAsync_remains_idempotent_after_lock_lost_server_side()
+ {
+ // Defense in depth for the GH-2602 split-brain detector. If the
+ // holder's backend is killed and Postgres releases the lock
+ // server-side, HasLock returns false and clears _locks. A
+ // subsequent TryAttainLockAsync must NOT short-circuit on stale
+ // in-memory state — it must actually re-acquire (or fail honestly).
+ const int lockId = unchecked((int)0xCAFEBABE);
+
+ var dataSource = NpgsqlDataSource.Create(Servers.PostgresConnectionString);
+ try
+ {
+ var holder = new AdvisoryLock(dataSource, NullLogger.Instance, "idempotent-after-loss-test");
+ try
+ {
+ (await holder.TryAttainLockAsync(lockId, CancellationToken.None)).ShouldBeTrue();
+
+ var holderPid = await findAdvisoryLockHolderPidAsync(lockId);
+ holderPid.ShouldNotBeNull();
+ await terminateBackendAsync(holderPid!.Value);
+
+ // After a backend kill HasLock detects the loss and clears
+ // _locks. The next TryAttainLockAsync must therefore NOT
+ // short-circuit on the stale in-memory state.
+ holder.HasLock(lockId).ShouldBeFalse();
+
+ // Re-attain on a fresh connection.
+ (await holder.TryAttainLockAsync(lockId, CancellationToken.None))
+ .ShouldBeTrue("After backend loss the holder should be able to re-attain on a fresh connection");
+ }
+ finally
+ {
+ await holder.DisposeAsync();
+ }
+ }
+ finally
+ {
+ await dataSource.DisposeAsync();
+ }
+ }
+
+ private static async Task<int?> findAdvisoryLockHolderPidAsync(int lockId)
+ {
+ await using var conn = new NpgsqlConnection(Servers.PostgresConnectionString);
+ await conn.OpenAsync();
+ await using var cmd = conn.CreateCommand();
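+ // pg_locks reports a 64-bit advisory key split across two int4 columns:
+ // classid holds the high 32 bits and objid the low 32 bits (per the
+ // pg_locks docs), so reassemble the key before comparing.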
+ cmd.CommandText = @"
+ select pid from pg_locks
+ where locktype = 'advisory'
+ and granted = true
+ and ((classid::bigint << 32) | (objid::bigint & x'FFFFFFFF'::bigint)) = @lockId
+ limit 1";
+ cmd.Parameters.AddWithValue("@lockId", (long)lockId);
+ var raw = await cmd.ExecuteScalarAsync();
+ return raw is int pid ? pid : null;
+ }
+
+ private static async Task terminateBackendAsync(int pid)
+ {
+ await using var conn = new NpgsqlConnection(Servers.PostgresConnectionString);
+ await conn.OpenAsync();
+ await using var cmd = conn.CreateCommand();
+ cmd.CommandText = "select pg_terminate_backend(@pid)";
+ cmd.Parameters.AddWithValue("@pid", pid);
+ await cmd.ExecuteScalarAsync();
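+ // Give the server a moment to finish cleaning up the terminated
+ // backend before the caller inspects lock state again.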
+ await Task.Delay(250);
+ }
+}
diff --git a/src/Persistence/SqlServerTests/Bug_advisory_lock_stacking_blocks_failover.cs b/src/Persistence/SqlServerTests/Bug_advisory_lock_stacking_blocks_failover.cs
new file mode 100644
index 000000000..ec966c147
--- /dev/null
+++ b/src/Persistence/SqlServerTests/Bug_advisory_lock_stacking_blocks_failover.cs
@@ -0,0 +1,88 @@
+using IntegrationTests;
+using Microsoft.Data.SqlClient;
+using Microsoft.Extensions.Logging.Abstractions;
+using Shouldly;
+using Wolverine.SqlServer.Persistence;
+using Xunit;
+
+namespace SqlServerTests;
+
+/// <summary>
+/// SQL Server companion to Bug_advisory_lock_stacking_blocks_failover
+/// in PostgresqlTests. SQL Server session-scoped application locks
+/// (sp_getapplock) are reentrant — quoting the docs: "If a lock has been
+/// requested in the current transaction or by the current session,
+/// sp_getapplock can be called multiple times for it ... For each request
+/// that returns success ... sp_releaseapplock must also be called." The
+/// heartbeat-renewal change in a84d6a262 calls
+/// TryAttainLeadershipLockAsync on every tick, including ticks
+/// where the leader already holds the lock, so the leader's lock count
+/// grows by one per heartbeat. The single ReleaseLeadershipLockAsync
+/// call during DisableAgentsAsync / stepDownAsync only
+/// decrements once, leaving the lock still held server-side and silently
+/// blocking failover. The fix makes
+/// <see cref="SqlServerAdvisoryLock.TryAttainLockAsync"/> idempotent
+/// against re-entrant calls on a still-held lock.
+/// </summary>
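+/// <remarks>
+/// The reentrancy, sketched in T-SQL (single session):
+/// <code>
+/// EXEC sp_getapplock @Resource = 'L', @LockMode = 'Exclusive', @LockOwner = 'Session';  -- grant, count 1
+/// EXEC sp_getapplock @Resource = 'L', @LockMode = 'Exclusive', @LockOwner = 'Session';  -- grants again, count 2
+/// EXEC sp_releaseapplock @Resource = 'L', @LockOwner = 'Session';  -- count back to 1: still held
+/// </code>
+/// </remarks>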
+public class Bug_advisory_lock_stacking_blocks_failover
+{
+ [Fact]
+ public async Task repeated_TryAttainLockAsync_does_not_stack_so_one_release_actually_releases()
+ {
+ const int lockId = unchecked((int)0xDEADBEEF);
+
+ var holder = new SqlServerAdvisoryLock(
+ () => new SqlConnection(Servers.SqlServerConnectionString),
+ NullLogger.Instance,
+ "stacking-test");
+ try
+ {
+ // Simulate ten heartbeat ticks on the same leader: every
+ // DoHealthChecksAsync now calls TryAttainLeadershipLockAsync.
+ // Pre-fix this stacked the SQL Server application lock ten times.
+ for (var i = 0; i < 10; i++)
+ {
+ (await holder.TryAttainLockAsync(lockId, CancellationToken.None))
+ .ShouldBeTrue($"holder must still report success on tick {i}");
+ }
+
+ holder.HasLock(lockId).ShouldBeTrue();
+
+ // The leader steps down or is disabled — exactly ONE release
+ // call, matching DisableAgentsAsync / stepDownAsync semantics.
+ await holder.ReleaseLockAsync(lockId);
+
+ // A would-be new leader on a different connection tries to
+ // attain. Pre-fix this returned false because the holder's
+ // session still held nine stacked locks; post-fix it succeeds.
+ var contender = new SqlServerAdvisoryLock(
+ () => new SqlConnection(Servers.SqlServerConnectionString),
+ NullLogger.Instance,
+ "contender");
+ try
+ {
+ (await contender.TryAttainLockAsync(lockId, CancellationToken.None))
+ .ShouldBeTrue(
+ "A single ReleaseLockAsync after repeated TryAttainLockAsync calls on the same session " +
+ "must fully release the SQL Server application lock so a different node can take over.");
+ }
+ finally
+ {
+ await contender.ReleaseLockAsync(lockId);
+ await contender.DisposeAsync();
+ }
+ }
+ finally
+ {
+ await holder.DisposeAsync();
+ }
+ }
+}
diff --git a/src/Persistence/Wolverine.Postgresql/PostgresqlNodePersistence.cs b/src/Persistence/Wolverine.Postgresql/PostgresqlNodePersistence.cs
index 543898a50..ebbbc804f 100644
--- a/src/Persistence/Wolverine.Postgresql/PostgresqlNodePersistence.cs
+++ b/src/Persistence/Wolverine.Postgresql/PostgresqlNodePersistence.cs
@@ -433,6 +433,25 @@ public bool HasLock(int lockId)
public async Task<bool> TryAttainLockAsync(int lockId, CancellationToken token)
{
+ // Idempotent against repeated calls on the same session. Postgres
+ // session-level advisory locks STACK ("Multiple lock requests stack,
+ // so that if the same resource is locked three times it must then be
+ // unlocked three times to be released" — Postgres docs). Since the
+ // a84d6a262 heartbeat-renewal change calls TryAttainLeadershipLockAsync
+ // every tick — including ticks where the leader already holds the
+ // lock — without this short-circuit the leader's lock count grows by
+ // one per heartbeat. The single ReleaseLeadershipLockAsync call
+ // during DisableAgentsAsync or stepDownAsync then only decrements
+ // once, leaving the lock still held server-side and silently
+ // blocking failover (no error logged, just a stalled election).
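+ // HasLock also verifies that the held connection is still alive, so a
+ // lock lost server-side (a killed backend, the GH-2602 split-brain
+ // case) is never masked by stale in-memory state.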
+ if (_locks.Contains(lockId) && HasLock(lockId))
+ {
+ return true;
+ }
+
if (_conn == null)
{
_conn = _source.CreateConnection();
diff --git a/src/Persistence/Wolverine.SqlServer/Persistence/SqlServerAdvisoryLock.cs b/src/Persistence/Wolverine.SqlServer/Persistence/SqlServerAdvisoryLock.cs
index 5591f20ac..5fef01347 100644
--- a/src/Persistence/Wolverine.SqlServer/Persistence/SqlServerAdvisoryLock.cs
+++ b/src/Persistence/Wolverine.SqlServer/Persistence/SqlServerAdvisoryLock.cs
@@ -73,6 +73,26 @@ public bool HasLock(int lockId)
public async Task<bool> TryAttainLockAsync(int lockId, CancellationToken token)
{
+ // Idempotent against repeated calls on the same session. SQL Server
+ // session-scoped application locks (sp_getapplock) are reentrant —
+ // "If a lock has been requested in the current transaction or by the
+ // current session, sp_getapplock can be called multiple times for it
+ // (with the same name and lock owner). For each request that returns
+ // success ... sp_releaseapplock must also be called." The
+ // a84d6a262 heartbeat-renewal change calls TryAttainLeadershipLockAsync
+ // every tick — including ticks where the leader already holds the
+ // lock — so without this short-circuit the leader's lock count grows
+ // by one per heartbeat. The single ReleaseLeadershipLockAsync call
+ // during DisableAgentsAsync or stepDownAsync then only decrements
+ // once, leaving the lock still held server-side and silently
+ // blocking failover (no error logged, just a stalled election).
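+ // As with the Postgres guard, HasLock re-checks that the held
+ // connection is still alive, so a lock lost server-side is not masked.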
+ if (_locks.Contains(lockId) && HasLock(lockId))
+ {
+ return true;
+ }
+
if (_conn == null)
{
_conn = _source();