-
Notifications
You must be signed in to change notification settings - Fork 25.8k
logging hot threads on large queue of the management threadpool #140251
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c231fcf
8088379
d2c8b15
a920ab5
0c5c6c2
66c5171
ace3225
53730f2
a644445
5dabd1b
6202a70
61687b1
f1c9fa4
acd1665
d9470bb
0e6ec3d
8328d5d
84a98d4
5dbb68c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,15 +9,21 @@ | |
|
|
||
| package org.elasticsearch.common.util.concurrent; | ||
|
|
||
| import org.apache.logging.log4j.Level; | ||
| import org.apache.logging.log4j.LogManager; | ||
| import org.apache.logging.log4j.Logger; | ||
| import org.elasticsearch.common.ReferenceDocs; | ||
| import org.elasticsearch.core.SuppressForbidden; | ||
| import org.elasticsearch.core.TimeValue; | ||
| import org.elasticsearch.monitor.jvm.HotThreads; | ||
|
|
||
| import java.util.concurrent.BlockingQueue; | ||
| import java.util.concurrent.RejectedExecutionHandler; | ||
| import java.util.concurrent.ThreadFactory; | ||
| import java.util.concurrent.ThreadPoolExecutor; | ||
| import java.util.concurrent.TimeUnit; | ||
| import java.util.concurrent.atomic.AtomicLong; | ||
| import java.util.function.LongSupplier; | ||
| import java.util.stream.Stream; | ||
|
|
||
| import static org.elasticsearch.core.Strings.format; | ||
|
|
@@ -28,6 +34,7 @@ | |
| public class EsThreadPoolExecutor extends ThreadPoolExecutor { | ||
|
|
||
| private static final Logger logger = LogManager.getLogger(EsThreadPoolExecutor.class); | ||
| private static final long NOT_TRACKED_TIME = -1L; | ||
|
|
||
| // noop probe to prevent starvation of work in the work queue due to ForceQueuePolicy | ||
| // https://github.com/elastic/elasticsearch/issues/124667 | ||
|
|
@@ -45,6 +52,15 @@ public void run() {} | |
| */ | ||
| private final String name; | ||
|
|
||
| private final EsExecutors.HotThreadsOnLargeQueueConfig hotThreadsOnLargeQueueConfig; | ||
| private final LongSupplier currentTimeMillisSupplier; | ||
|
|
||
| // There may be racing on updating this field. It's OK since hot threads logging is very coarse grained time wise | ||
| // and can tolerate some inaccuracies. | ||
| private volatile long startTimeMillisOfLargeQueue = NOT_TRACKED_TIME; | ||
|
|
||
| private final AtomicLong lastLoggingTimeMillisForHotThreads; | ||
|
|
||
| EsThreadPoolExecutor( | ||
| String name, | ||
| int corePoolSize, | ||
|
|
@@ -55,7 +71,45 @@ public void run() {} | |
| ThreadFactory threadFactory, | ||
| ThreadContext contextHolder | ||
| ) { | ||
| this(name, corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory, new EsAbortPolicy(), contextHolder); | ||
| this( | ||
| name, | ||
| corePoolSize, | ||
| maximumPoolSize, | ||
| keepAliveTime, | ||
| unit, | ||
| workQueue, | ||
| threadFactory, | ||
| new EsAbortPolicy(), | ||
| contextHolder, | ||
| EsExecutors.HotThreadsOnLargeQueueConfig.DISABLED | ||
| ); | ||
| } | ||
|
|
||
| EsThreadPoolExecutor( | ||
| String name, | ||
| int corePoolSize, | ||
| int maximumPoolSize, | ||
| long keepAliveTime, | ||
| TimeUnit unit, | ||
| BlockingQueue<Runnable> workQueue, | ||
| ThreadFactory threadFactory, | ||
| RejectedExecutionHandler handler, | ||
| ThreadContext contextHolder, | ||
| EsExecutors.HotThreadsOnLargeQueueConfig hotThreadsOnLargeQueueConfig | ||
| ) { | ||
| this( | ||
| name, | ||
| corePoolSize, | ||
| maximumPoolSize, | ||
| keepAliveTime, | ||
| unit, | ||
| workQueue, | ||
| threadFactory, | ||
| handler, | ||
| contextHolder, | ||
| hotThreadsOnLargeQueueConfig, | ||
| System::currentTimeMillis | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we use the cached time from the threadpool?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That was indeed the first thing I had in mind. Though it is technically possible. It implies quite a bit cascading changes to the builders, subclasses and utility methods. In addition, executors are built in ThreadPool's constructor so that we would need this-escape to pass the relativeTimeInMillis method. Overall I think it is not really worth the effort especially since we might not want to support it in long term. |
||
| ); | ||
| } | ||
|
|
||
| @SuppressForbidden(reason = "properly rethrowing errors, see EsExecutors.rethrowErrors") | ||
|
|
@@ -68,11 +122,18 @@ public void run() {} | |
| BlockingQueue<Runnable> workQueue, | ||
| ThreadFactory threadFactory, | ||
| RejectedExecutionHandler handler, | ||
| ThreadContext contextHolder | ||
| ThreadContext contextHolder, | ||
| EsExecutors.HotThreadsOnLargeQueueConfig hotThreadsOnLargeQueueConfig, | ||
| LongSupplier currentTimeMillisSupplier // For test to configure a custom time supplier | ||
| ) { | ||
| super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory, handler); | ||
| this.name = name; | ||
| this.contextHolder = contextHolder; | ||
| this.hotThreadsOnLargeQueueConfig = hotThreadsOnLargeQueueConfig; | ||
| this.currentTimeMillisSupplier = currentTimeMillisSupplier; | ||
| this.lastLoggingTimeMillisForHotThreads = hotThreadsOnLargeQueueConfig.isEnabled() | ||
| ? new AtomicLong(currentTimeMillisSupplier.getAsLong() - hotThreadsOnLargeQueueConfig.intervalInMillis()) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NIt: this could also just be zero couldn't it?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not really. Zero effectively means the logging has an initial delay of 60 min (the default interval) on node start. So we can miss the logging if a large queue size occurs within that time frame.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it? because if (now - lastLoggingTime >= hotThreadsOnLargeQueueConfig.intervalInMillis()
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry I miss understood your suggestion. Yes, it can technically be 0 since we are using absolute timer. I didn't do it because that make test manipulation harder, i.e. the initial time is fixed and cannot controlled by the unit test, hence the choice. I prefer to keep it as is for the time being. |
||
| : null; | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -88,6 +149,9 @@ public void setMaximumPoolSize(int maximumPoolSize) { | |
| @Override | ||
| public void execute(Runnable command) { | ||
| final Runnable wrappedRunnable = command != WORKER_PROBE ? wrapRunnable(command) : WORKER_PROBE; | ||
|
|
||
| maybeLogForLargeQueueSize(); | ||
|
|
||
| try { | ||
| super.execute(wrappedRunnable); | ||
| } catch (Exception e) { | ||
|
|
@@ -109,6 +173,56 @@ public void execute(Runnable command) { | |
| } | ||
| } | ||
|
|
||
| private void maybeLogForLargeQueueSize() { | ||
| if (hotThreadsOnLargeQueueConfig.isEnabled() == false) { | ||
| return; | ||
| } | ||
|
|
||
| final int queueSize = getQueue().size(); | ||
| // Use queueSize + 1 so that we start to track when queueSize is 499 and this task is most likely to be queued as well, | ||
| // thus reaching the threshold of 500. It won't log right away due to the duration threshold. | ||
| if (queueSize + 1 >= hotThreadsOnLargeQueueConfig.sizeThreshold()) { | ||
| final long startTime = startTimeMillisOfLargeQueue; | ||
| final long now = currentTimeMillisSupplier.getAsLong(); | ||
| if (startTime == NOT_TRACKED_TIME) { | ||
| startTimeMillisOfLargeQueue = now; | ||
| return; | ||
| } | ||
| final long duration = now - startTime; | ||
| if (duration >= hotThreadsOnLargeQueueConfig.durationThresholdInMillis()) { | ||
| final var lastLoggingTime = lastLoggingTimeMillisForHotThreads.get(); | ||
| if (now - lastLoggingTime >= hotThreadsOnLargeQueueConfig.intervalInMillis() | ||
| && lastLoggingTimeMillisForHotThreads.compareAndSet(lastLoggingTime, now)) { | ||
| logger.info("start logging hot-threads for large queue size [{}] on [{}] executor", queueSize, name); | ||
| HotThreads.logLocalHotThreads( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This emits its logs after some delay - might be worth logging a message right now saying we're starting to capture hot threads at this point too.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added in acd1665 |
||
| logger, | ||
| Level.INFO, | ||
| "ThreadPoolExecutor [" | ||
| + name | ||
| + "] queue size [" | ||
| + queueSize | ||
| + "] has been over threshold for [" | ||
| + TimeValue.timeValueMillis(duration) | ||
| + "]", | ||
| ReferenceDocs.LOGGING | ||
| ); | ||
| } | ||
| } | ||
| } else { | ||
| startTimeMillisOfLargeQueue = NOT_TRACKED_TIME; | ||
| } | ||
| } | ||
|
|
||
| // package private for testing | ||
| EsExecutors.HotThreadsOnLargeQueueConfig getHotThreadsOnLargeQueueConfig() { | ||
| return hotThreadsOnLargeQueueConfig; | ||
| } | ||
|
|
||
| // package private for testing | ||
| long getStartTimeMillisOfLargeQueue() { | ||
| return startTimeMillisOfLargeQueue; | ||
| } | ||
|
|
||
| // package-visible for testing | ||
| void logException(AbstractRunnable r, Exception e) { | ||
| logger.error(() -> format("[%s] unexpected exception when submitting task [%s] for execution", name, r), e); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we use
== DISABLEDrather than needing to call this method?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To do that, we will need to ensure the
DISABLEDinstance is returned whenever the class is instantiated withsizeThreshold == 0. It is technically possible but would require private access to the class's constructor which means it cannot be a record and the need to add utility method. I am not sure whether this is a better trade-off compare to having this method. Please let me know if you think otherwise. Thanks!There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok we can go with this at least for now.