-
Notifications
You must be signed in to change notification settings - Fork 25.8k
Include clusterApplyListener in long cluster apply warnings #120087
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
035bbc0
95694da
c0395b6
915951e
eaddfdb
a71181e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| pr: 120087 | ||
| summary: Include `clusterApplyListener` in long cluster apply warnings | ||
| area: Cluster Coordination | ||
| type: enhancement | ||
| issues: [] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -158,6 +158,31 @@ public void run() { | |
| } | ||
| } | ||
|
|
||
| private record TimedListener(ActionListener<Void> listener, Recorder recorder) implements ActionListener<Void> { | ||
|
|
||
| @Override | ||
| public void onResponse(Void response) { | ||
| try (Releasable ignored = recorder.record("listener.onResponse")) { | ||
| listener.onResponse(null); | ||
| } catch (Exception e) { | ||
| assert false : e; | ||
| logger.error("exception thrown by listener.onResponse", e); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public void onFailure(Exception e) { | ||
| assert e != null; | ||
| try (Releasable ignored = recorder.record("listener.onFailure")) { | ||
| listener.onFailure(e); | ||
| } catch (Exception inner) { | ||
| e.addSuppressed(inner); | ||
| assert false : e; | ||
| logger.error(() -> "exception thrown by listener.onFailure", e); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| protected synchronized void doStop() { | ||
| for (Map.Entry<TimeoutClusterStateListener, NotifyTimeout> onGoingTimeout : timeoutClusterStateListeners.entrySet()) { | ||
|
|
@@ -394,12 +419,14 @@ private void runTask(String source, Function<ClusterState, ClusterState> updateF | |
|
|
||
| final long startTimeMillis = threadPool.relativeTimeInMillis(); | ||
| final Recorder stopWatch = new Recorder(threadPool, slowTaskThreadDumpTimeout); | ||
| final TimedListener timedListener = new TimedListener(clusterApplyListener, stopWatch); | ||
| final ClusterState newClusterState; | ||
| try { | ||
| try (Releasable ignored = stopWatch.record("running task [" + source + ']')) { | ||
| newClusterState = updateFunction.apply(previousClusterState); | ||
| } | ||
| } catch (Exception e) { | ||
| timedListener.onFailure(e); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If the listener throws, the following code will not run. This probably shouldn't happen in practice, but it may still be worthwhile to wrap the call in try-finally. I also wonder whether we should add more details to the log message about the time spent applying the cluster state and calling the listener?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The Is that what you meant by "time spent ... calling the listener" I will look at using one of the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think Yang is right, we just need a
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually no real need to propagate the exception to the caller either, it just bubbles up to the unhandled exception handler which logs it and drops it. We may as well catch and log (and assert) in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
| TimeValue executionTime = getTimeSince(startTimeMillis); | ||
| logger.trace( | ||
| () -> format( | ||
|
|
@@ -412,15 +439,14 @@ private void runTask(String source, Function<ClusterState, ClusterState> updateF | |
| e | ||
| ); | ||
| warnAboutSlowTaskIfNeeded(executionTime, source, stopWatch); | ||
| clusterApplyListener.onFailure(e); | ||
| return; | ||
| } | ||
|
|
||
| if (previousClusterState == newClusterState) { | ||
| timedListener.onResponse(null); | ||
| TimeValue executionTime = getTimeSince(startTimeMillis); | ||
| logger.debug("processing [{}]: took [{}] no change in cluster state", source, executionTime); | ||
| warnAboutSlowTaskIfNeeded(executionTime, source, stopWatch); | ||
| clusterApplyListener.onResponse(null); | ||
| } else { | ||
| if (logger.isTraceEnabled()) { | ||
| logger.debug("cluster state updated, version [{}], source [{}]\n{}", newClusterState.version(), source, newClusterState); | ||
|
|
@@ -430,6 +456,7 @@ private void runTask(String source, Function<ClusterState, ClusterState> updateF | |
| try { | ||
| setIsApplyingClusterState(); | ||
| applyChanges(previousClusterState, newClusterState, source, stopWatch); | ||
| timedListener.onResponse(null); | ||
| TimeValue executionTime = getTimeSince(startTimeMillis); | ||
| logger.debug( | ||
| "processing [{}]: took [{}] done applying updated cluster state (version: {}, uuid: {})", | ||
|
|
@@ -439,8 +466,11 @@ private void runTask(String source, Function<ClusterState, ClusterState> updateF | |
| newClusterState.stateUUID() | ||
| ); | ||
| warnAboutSlowTaskIfNeeded(executionTime, source, stopWatch); | ||
| clusterApplyListener.onResponse(null); | ||
| } catch (Exception e) { | ||
| // failing to apply a cluster state with an exception indicates a bug in validation or in one of the appliers; if we | ||
| // continue we will retry with the same cluster state but that might not help. | ||
| assert applicationMayFail(); | ||
| timedListener.onFailure(e); | ||
| TimeValue executionTime = getTimeSince(startTimeMillis); | ||
| if (logger.isTraceEnabled()) { | ||
| logger.warn(() -> format(""" | ||
|
|
@@ -460,10 +490,6 @@ private void runTask(String source, Function<ClusterState, ClusterState> updateF | |
| e | ||
| ); | ||
| } | ||
| // failing to apply a cluster state with an exception indicates a bug in validation or in one of the appliers; if we | ||
| // continue we will retry with the same cluster state but that might not help. | ||
| assert applicationMayFail(); | ||
| clusterApplyListener.onFailure(e); | ||
| } finally { | ||
| clearIsApplyingClusterState(); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -66,6 +66,12 @@ public long relativeTimeInMillis() { | |
| assertThat(Thread.currentThread().getName(), containsString(ClusterApplierService.CLUSTER_UPDATE_THREAD_NAME)); | ||
| return currentTimeMillis; | ||
| } | ||
|
|
||
| @Override | ||
| public long rawRelativeTimeInMillis() { | ||
| assertThat(Thread.currentThread().getName(), containsString(ClusterApplierService.CLUSTER_UPDATE_THREAD_NAME)); | ||
| return currentTimeMillis; | ||
| } | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I notice that we use
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess if the cluster is under heavy GC or the cachedTimer thread is experiencing starvation, the elapsed-time calculation will be off if we use the cached timer. This can be detected by the log message "timer thread slept ...", but that's a bit indirect. Not sure if that was the intention behind using
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The cached timer only updates every 200ms or so, and although the total threshold is many seconds in length many of the individual steps we record should take much less than 200ms. It's not a huge deal to call |
||
| }; | ||
| clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); | ||
| allowClusterStateApplicationFailure = false; | ||
|
|
@@ -207,15 +213,33 @@ public void testLongClusterStateUpdateLogging() throws Exception { | |
| ); | ||
| mockLog.addExpectation( | ||
| new MockLog.SeenEventExpectation( | ||
| "test4", | ||
| "test3", | ||
| ClusterApplierService.class.getCanonicalName(), | ||
| Level.WARN, | ||
| "*cluster state applier task [test3] took [34s] which is above the warn threshold of [*]: " | ||
| + "[running task [test3]] took [*" | ||
| ) | ||
| ); | ||
| mockLog.addExpectation( | ||
| new MockLog.SeenEventExpectation( | ||
| "test4", | ||
| ClusterApplierService.class.getCanonicalName(), | ||
| Level.WARN, | ||
| "*cluster state applier task [test4] took [36s] which is above the warn threshold of [*]: " | ||
| + "[running task [test4]] took [*" | ||
| ) | ||
| ); | ||
| mockLog.addExpectation( | ||
| new MockLog.SeenEventExpectation( | ||
| "test5", | ||
| ClusterApplierService.class.getCanonicalName(), | ||
| Level.WARN, | ||
| "*cluster state applier task [test5] took [38s] which is above the warn threshold of [*]: " | ||
| + "[running task [test5]] took [*" | ||
| ) | ||
| ); | ||
|
|
||
| final CountDownLatch latch = new CountDownLatch(4); | ||
| final CountDownLatch latch = new CountDownLatch(6); | ||
| final CountDownLatch processedFirstTask = new CountDownLatch(1); | ||
| currentTimeMillis = randomLongBetween(0L, Long.MAX_VALUE / 2); | ||
| clusterApplierService.runOnApplierThread( | ||
|
|
@@ -266,9 +290,39 @@ public void onFailure(Exception e) { | |
| } | ||
| } | ||
| ); | ||
| clusterApplierService.runOnApplierThread("test4", Priority.HIGH, currentState -> { | ||
| // do nothing (testing that onResponse is included in timing) | ||
| }, new ActionListener<>() { | ||
|
|
||
| @Override | ||
| public void onResponse(Void unused) { | ||
| advanceTime(TimeValue.timeValueSeconds(36).millis()); | ||
| latch.countDown(); | ||
| } | ||
|
|
||
| @Override | ||
| public void onFailure(Exception e) { | ||
| fail(); | ||
| } | ||
| }); | ||
| clusterApplierService.runOnApplierThread("test5", Priority.HIGH, currentState -> { | ||
| throw new IllegalArgumentException("Testing that onFailure is included in timing"); | ||
| }, new ActionListener<>() { | ||
|
|
||
| @Override | ||
| public void onResponse(Void unused) { | ||
| fail(); | ||
| } | ||
|
|
||
| @Override | ||
| public void onFailure(Exception e) { | ||
| advanceTime(TimeValue.timeValueSeconds(38).millis()); | ||
| latch.countDown(); | ||
| } | ||
| }); | ||
| // Additional update task to make sure all previous logging made it to the loggerName | ||
| // We don't check logging for this one since there is no guarantee that it will occur before our check | ||
| clusterApplierService.runOnApplierThread("test4", Priority.HIGH, currentState -> {}, new ActionListener<>() { | ||
| clusterApplierService.runOnApplierThread("test6", Priority.HIGH, currentState -> {}, new ActionListener<>() { | ||
| @Override | ||
| public void onResponse(Void ignored) { | ||
| latch.countDown(); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.