From 41b62cb35f407c97f7a2a5ce7be5fc80822338a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Louv-Jansen?= Date: Wed, 2 Apr 2025 13:21:20 +0200 Subject: [PATCH] Retry release to fix flaky tests (#216781) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Related to https://github.com/elastic/kibana/pull/216397 Closes https://github.com/elastic/kibana/issues/216763 This change ensures that we do not send the `release` request and `extendTtl` request simultaneously in `withLock`. Previously the two requests could be in flight at the same time, producing a version conflict that caused tests to fail: ``` └-> "before all" hook for "should return the result of the callback" │ERROR Failed to release lock "my_lock_with_ttl_extension": version_conflict_engine_exception │ Root causes: │ version_conflict_engine_exception: [my_lock_with_ttl_extension]: version conflict, required seqNo [43], primary term [1]. current document has seqNo [44] and primary term [1] ``` Flaky tests: https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/8142 (cherry picked from commit 7275d2e8bd834303898e3f95c5fd1ab734e92947) --- .../lock_manager_client.ts | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/x-pack/platform/plugins/shared/observability_ai_assistant/server/service/distributed_lock_manager/lock_manager_client.ts b/x-pack/platform/plugins/shared/observability_ai_assistant/server/service/distributed_lock_manager/lock_manager_client.ts index 18412a17bde30..8fedfd81d978e 100644 --- a/x-pack/platform/plugins/shared/observability_ai_assistant/server/service/distributed_lock_manager/lock_manager_client.ts +++ b/x-pack/platform/plugins/shared/observability_ai_assistant/server/service/distributed_lock_manager/lock_manager_client.ts @@ -285,20 +285,26 @@ export async function withLock( `Lock "${lockId}" acquired. 
Extending TTL every ${prettyMilliseconds(extendInterval)}` ); + let extendTTlPromise = Promise.resolve(true); const intervalId = setInterval(() => { - lockManager.extendTtl().catch((err) => { - logger.error(`Failed to extend lock "${lockId}":`, err); - }); + // wait for the previous extendTtl request to finish before sending the next one. This is to avoid flooding ES with extendTtl requests in cases where ES is slow to respond. + extendTTlPromise = extendTTlPromise + .then(() => lockManager.extendTtl()) + .catch((err) => { + logger.error(`Failed to extend lock "${lockId}":`, err); + return false; + }); }, extendInterval); try { return await callback(); } finally { - clearInterval(intervalId); try { + clearInterval(intervalId); + await extendTTlPromise; await lockManager.release(); } catch (error) { - logger.error(`Failed to release lock "${lockId}": ${error.message}`); + logger.error(`Failed to release lock "${lockId}" in withLock: ${error.message}`); } } }