From 5684f90036682a17018c8ab48e6072eda549a5ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Osipiuk?= Date: Wed, 20 Jul 2022 18:14:00 +0200 Subject: [PATCH 1/2] Make thread name for split runner threads depend on time --- .../src/main/java/io/trino/execution/executor/TaskExecutor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/TaskExecutor.java b/core/trino-main/src/main/java/io/trino/execution/executor/TaskExecutor.java index af8ae5667e1e..1eee07c287c8 100644 --- a/core/trino-main/src/main/java/io/trino/execution/executor/TaskExecutor.java +++ b/core/trino-main/src/main/java/io/trino/execution/executor/TaskExecutor.java @@ -479,7 +479,7 @@ public void run() return; } - String threadId = split.getTaskHandle().getTaskId() + "-" + split.getSplitId(); + String threadId = split.getTaskHandle().getTaskId() + "-" + split.getSplitId() + "-" + System.nanoTime(); try (SetThreadName splitName = new SetThreadName(threadId)) { RunningSplitInfo splitInfo = new RunningSplitInfo(ticker.read(), threadId, Thread.currentThread(), split); runningSplitInfos.add(splitInfo); From d55973a920e3949f876f4a75befb5254bcb54786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Osipiuk?= Date: Wed, 20 Jul 2022 18:02:48 +0200 Subject: [PATCH 2/2] Harden runaway split detection Improve detection of runaway splits and related task killing code to ensure that we do not kill a thread which we suppose hung, but moved to execute on behalf of another query, just before we issue kill command. --- .../src/main/java/io/trino/execution/SqlTaskManager.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/core/trino-main/src/main/java/io/trino/execution/SqlTaskManager.java b/core/trino-main/src/main/java/io/trino/execution/SqlTaskManager.java index a38e465eb395..feef6caed261 100644 --- a/core/trino-main/src/main/java/io/trino/execution/SqlTaskManager.java +++ b/core/trino-main/src/main/java/io/trino/execution/SqlTaskManager.java @@ -735,7 +735,14 @@ private void failStuckSplitTasks() log.warn("%s is long running with stackTrace:\n%s", splitInfo.getSplitInfo(), stackTraceElements.stream().map(Object::toString).collect(joining(lineSeparator()))); } - return stuckSplitStackTracePredicate.test(stackTraceElements); + boolean isThreadStack = stuckSplitStackTracePredicate.test(stackTraceElements); + + // We check if thread's name matches threadId stored in RunningSplitInfo to be sure that stacktrace we obtained belongs to the execution of split + // described by RunningSplitInfo. + // There is still a chance that we may observe the stacktrace from execution of new split before thread name is set in io.trino.execution.executor.TaskExecutor.TaskRunner.run() + // Yet, we assume that such stacktrace would not be classified as "stack" by stuckSplitStackTracePredicate. + boolean splitAssignmentDidNotChange = splitInfo.getThread().getName().startsWith(splitInfo.getThreadId()); + return isThreadStack && splitAssignmentDidNotChange; }); for (TaskId stuckSplitTaskId : stuckSplitTaskIds) {