-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Add support to perform heapDump on exceeded memory limit failures #16669
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,9 +17,12 @@ | |
| import com.facebook.presto.spi.StandardErrorCode; | ||
| import io.airlift.units.DataSize; | ||
|
|
||
| import java.util.Optional; | ||
|
|
||
| import static com.facebook.presto.spi.StandardErrorCode.EXCEEDED_GLOBAL_MEMORY_LIMIT; | ||
| import static com.facebook.presto.spi.StandardErrorCode.EXCEEDED_LOCAL_MEMORY_LIMIT; | ||
| import static com.facebook.presto.spi.StandardErrorCode.EXCEEDED_REVOCABLE_MEMORY_LIMIT; | ||
| import static com.facebook.presto.util.HeapDumper.dumpHeap; | ||
| import static java.lang.String.format; | ||
|
|
||
| public class ExceededMemoryLimitException | ||
|
|
@@ -35,8 +38,13 @@ public static ExceededMemoryLimitException exceededGlobalTotalLimit(DataSize max | |
| return new ExceededMemoryLimitException(EXCEEDED_GLOBAL_MEMORY_LIMIT, format("Query exceeded distributed total memory limit of %s defined at the %s", maxMemory, limitSource)); | ||
| } | ||
|
|
||
| public static ExceededMemoryLimitException exceededLocalUserMemoryLimit(DataSize maxMemory, String additionalFailureInfo) | ||
| public static ExceededMemoryLimitException exceededLocalUserMemoryLimit( | ||
| DataSize maxMemory, | ||
| String additionalFailureInfo, | ||
| boolean heapDumpOnExceededMemoryLimitEnabled, | ||
| Optional<String> heapDumpFilePath) | ||
| { | ||
| performHeapDumpIfEnabled(heapDumpOnExceededMemoryLimitEnabled, heapDumpFilePath); | ||
| return new ExceededMemoryLimitException(EXCEEDED_LOCAL_MEMORY_LIMIT, | ||
| format("Query exceeded per-node user memory limit of %s [%s]", maxMemory, additionalFailureInfo)); | ||
| } | ||
|
|
@@ -47,19 +55,38 @@ public static ExceededMemoryLimitException exceededLocalBroadcastMemoryLimit(Dat | |
| format("Query exceeded per-node broadcast memory limit of %s [%s]", maxMemory, additionalFailureInfo)); | ||
| } | ||
|
|
||
| public static ExceededMemoryLimitException exceededLocalTotalMemoryLimit(DataSize maxMemory, String additionalFailureInfo) | ||
| public static ExceededMemoryLimitException exceededLocalTotalMemoryLimit( | ||
| DataSize maxMemory, | ||
| String additionalFailureInfo, | ||
| boolean heapDumpOnExceededMemoryLimitEnabled, | ||
| Optional<String> heapDumpFilePath) | ||
| { | ||
| performHeapDumpIfEnabled(heapDumpOnExceededMemoryLimitEnabled, heapDumpFilePath); | ||
| return new ExceededMemoryLimitException(EXCEEDED_LOCAL_MEMORY_LIMIT, | ||
| format("Query exceeded per-node total memory limit of %s [%s]", maxMemory, additionalFailureInfo)); | ||
| } | ||
|
|
||
| public static ExceededMemoryLimitException exceededLocalRevocableMemoryLimit(DataSize maxMemory, String additionalFailureInfo) | ||
| public static ExceededMemoryLimitException exceededLocalRevocableMemoryLimit( | ||
|
||
| DataSize maxMemory, | ||
| String additionalFailureInfo, | ||
| boolean heapDumpOnExceededMemoryLimitEnabled, | ||
| Optional<String> heapDumpFilePath) | ||
| { | ||
| performHeapDumpIfEnabled(heapDumpOnExceededMemoryLimitEnabled, heapDumpFilePath); | ||
| return new ExceededMemoryLimitException( | ||
| EXCEEDED_REVOCABLE_MEMORY_LIMIT, | ||
| format("Query exceeded per-node revocable memory limit of %s [%s]", maxMemory, additionalFailureInfo)); | ||
| } | ||
|
|
||
| // Triggers the heap dump inline on the calling thread (not via an executor) so that the dump captures the heap as it is when the limit is exceeded | ||
| // Does nothing unless the flag is true and a file path is present; this is intended to be used for debugging purposes only | ||
| private static void performHeapDumpIfEnabled(boolean heapDumpOnExceededMemoryLimitEnabled, Optional<String> heapDumpFilePath) | ||
| { | ||
| if (heapDumpOnExceededMemoryLimitEnabled && heapDumpFilePath.isPresent()) { | ||
|
||
| dumpHeap(heapDumpFilePath.get()); | ||
| } | ||
| } | ||
|
|
||
| private ExceededMemoryLimitException(StandardErrorCode errorCode, String message) | ||
| { | ||
| super(errorCode, message); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -111,6 +111,12 @@ public class QueryContext | |
| @GuardedBy("this") | ||
| private boolean verboseExceededMemoryLimitErrorsEnabled; | ||
|
|
||
| @GuardedBy("this") | ||
| private boolean heapDumpOnExceededMemoryLimitEnabled; | ||
|
|
||
| @GuardedBy("this") | ||
| private Optional<String> heapDumpFilePath; | ||
|
||
|
|
||
| public QueryContext( | ||
| QueryId queryId, | ||
| DataSize maxUserMemory, | ||
|
|
@@ -332,6 +338,16 @@ public long getMaxTotalMemory() | |
| return maxTotalMemory; | ||
| } | ||
|
|
||
| public synchronized void setHeapDumpOnExceededMemoryLimitEnabled(boolean heapDumpOnExceededMemoryLimitEnabled) | ||
| { | ||
| this.heapDumpOnExceededMemoryLimitEnabled = heapDumpOnExceededMemoryLimitEnabled; | ||
| } | ||
|
|
||
| public synchronized void setHeapDumpFilePath(String heapDumpFilePath) | ||
| { | ||
| this.heapDumpFilePath = Optional.ofNullable(heapDumpFilePath); | ||
| } | ||
|
|
||
| public TaskContext addTaskContext( | ||
| TaskStateMachine taskStateMachine, | ||
| Session session, | ||
|
|
@@ -477,7 +493,7 @@ private void enforceBroadcastMemoryLimit(long allocated, long delta, long maxMem | |
| private void enforceUserMemoryLimit(long allocated, long delta, long maxMemory) | ||
| { | ||
| if (allocated + delta > maxMemory) { | ||
| throw exceededLocalUserMemoryLimit(succinctBytes(maxMemory), getAdditionalFailureInfo(allocated, delta)); | ||
| throw exceededLocalUserMemoryLimit(succinctBytes(maxMemory), getAdditionalFailureInfo(allocated, delta), heapDumpOnExceededMemoryLimitEnabled, heapDumpFilePath); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -487,15 +503,15 @@ private void enforceTotalMemoryLimit(long allocated, long delta, long maxMemory) | |
| long totalMemory = allocated + delta; | ||
| peakNodeTotalMemory = Math.max(totalMemory, peakNodeTotalMemory); | ||
| if (totalMemory > maxMemory) { | ||
| throw exceededLocalTotalMemoryLimit(succinctBytes(maxMemory), getAdditionalFailureInfo(allocated, delta)); | ||
| throw exceededLocalTotalMemoryLimit(succinctBytes(maxMemory), getAdditionalFailureInfo(allocated, delta), heapDumpOnExceededMemoryLimitEnabled, heapDumpFilePath); | ||
| } | ||
| } | ||
|
|
||
| @GuardedBy("this") | ||
| private void enforceRevocableMemoryLimit(long allocated, long delta, long maxMemory) | ||
| { | ||
| if (allocated + delta > maxMemory) { | ||
| throw exceededLocalRevocableMemoryLimit(succinctBytes(maxMemory), getAdditionalFailureInfo(allocated, delta)); | ||
| throw exceededLocalRevocableMemoryLimit(succinctBytes(maxMemory), getAdditionalFailureInfo(allocated, delta), heapDumpOnExceededMemoryLimitEnabled, heapDumpFilePath); | ||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| /* | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
| package com.facebook.presto.util; | ||
|
|
||
| import com.facebook.airlift.log.Logger; | ||
| import com.sun.management.HotSpotDiagnosticMXBean; | ||
|
|
||
| import javax.management.MBeanServer; | ||
|
|
||
| import java.io.IOException; | ||
| import java.lang.management.ManagementFactory; | ||
| import java.util.concurrent.atomic.AtomicBoolean; | ||
|
|
||
| public final class HeapDumper | ||
| { | ||
| private static final Logger log = Logger.get(HeapDumper.class); | ||
| private static final String HOTSPOT_BEAN_NAME = "com.sun.management:type=HotSpotDiagnostic"; | ||
| private static final AtomicBoolean IS_HEAPDUMP_TRIGGERED = new AtomicBoolean(false); | ||
|
|
||
| private static volatile HotSpotDiagnosticMXBean hotspotMBean; | ||
|
|
||
| private HeapDumper() {} | ||
|
|
||
| /** | ||
| * Writes a heap dump to the given file, at most once: the first call flips a | ||
| * static flag and every later call returns without dumping, even if that | ||
| * first attempt failed. Failures are logged and never thrown to the caller. | ||
| * The dump is requested with {@code live = false}. | ||
| * | ||
| * @param fileName name of the heap dump file | ||
| */ | ||
| public static void dumpHeap(String fileName) | ||
| { | ||
| if (IS_HEAPDUMP_TRIGGERED.compareAndSet(false, true)) { | ||
| log.info("Performing heapdump to file: " + fileName); | ||
| try { | ||
| if (hotspotMBean == null) { | ||
| hotspotMBean = getHotspotMBean(); | ||
| } | ||
| hotspotMBean.dumpHeap(fileName, false); | ||
| } | ||
| catch (Throwable throwable) { | ||
| // Deliberately swallowed (logged only): a failed heap dump must not propagate to the caller | ||
| log.error(throwable, "Unable to perform heap dump"); | ||
| } | ||
|
||
| } | ||
| } | ||
|
|
||
| private static HotSpotDiagnosticMXBean getHotspotMBean() | ||
| throws IOException | ||
| { | ||
| MBeanServer server = ManagementFactory.getPlatformMBeanServer(); | ||
| return ManagementFactory.newPlatformMXBeanProxy(server, HOTSPOT_BEAN_NAME, HotSpotDiagnosticMXBean.class); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's kinda weird to have a dump inside an exception and to do it synchronously... Can we instead, at the call site, throw the error when memory is exceeded and use an executor to do the dump asynchronously?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is basically a catch-all method for any exceeded-memory OOM. If we add this logic at the call sites, it will be spread throughout the code and might not look clean. A Presto exceeded-memory-limit failure is like an OOM, so getting a heap dump here makes the most sense for debugging purposes.
Also, this is a debugging feature and will be disabled in prod. I didn't want to over-engineer the solution, since this logic will only be triggered in a controlled environment while testing a specific query for OOM failures.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's important that the heap dump be triggered synchronously with the out-of-memory error, because the purpose of the entire thing is to capture the state of the heap when the error occurs. If it's asynchronous, it allows time for queries to fail and for cleanup to occur, both in this query and in other queries that are running concurrently.
WDYT?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If that is the case, shall we add comments to this method (and others in this class) to indicate that this is needed and is only for debugging purposes?