From b661b11ec03eb9b9b1daca52ff61896346406643 Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Fri, 7 Jul 2023 10:48:05 -0700 Subject: [PATCH 1/7] Move stuck split test to a separate class The test mixed class-level state with method-level state and wasn't consistent with everything that was going on in that class. --- .../trino/execution/TestSqlTaskManager.java | 174 ------------ .../TestTaskExecutorStuckSplits.java | 261 ++++++++++++++++++ 2 files changed, 261 insertions(+), 174 deletions(-) create mode 100644 core/trino-main/src/test/java/io/trino/execution/TestTaskExecutorStuckSplits.java diff --git a/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java b/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java index 6015e4eb3ed3..c7eb8d06f05a 100644 --- a/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java +++ b/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java @@ -17,12 +17,8 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; -import com.google.common.util.concurrent.ListenableFuture; -import com.google.common.util.concurrent.SettableFuture; -import com.google.errorprone.annotations.concurrent.GuardedBy; import io.airlift.node.NodeInfo; import io.airlift.stats.TestingGcMonitor; -import io.airlift.testing.TestingTicker; import io.airlift.units.DataSize; import io.airlift.units.DataSize.Unit; import io.airlift.units.Duration; @@ -38,7 +34,6 @@ import io.trino.execution.buffer.PipelinedOutputBuffers; import io.trino.execution.buffer.PipelinedOutputBuffers.OutputBufferId; import io.trino.execution.executor.TaskExecutor; -import io.trino.execution.executor.TaskHandle; import io.trino.memory.LocalMemoryManager; import io.trino.memory.NodeMemoryConfig; import io.trino.memory.QueryContext; @@ -60,14 +55,11 @@ import java.net.URI; import java.util.List; import java.util.Optional; -import 
java.util.OptionalInt; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; -import java.util.function.Predicate; -import static com.google.common.util.concurrent.Futures.immediateVoidFuture; import static com.google.common.util.concurrent.MoreExecutors.directExecutor; import static io.airlift.tracing.Tracing.noopTracer; import static io.trino.SessionTestUtils.TEST_SESSION; @@ -263,58 +255,6 @@ public void testRemoveOldTasks() } } - @Test - public void testFailStuckSplitTasks() - throws InterruptedException, ExecutionException, TimeoutException - { - TestingTicker ticker = new TestingTicker(); - - TaskHandle taskHandle = taskExecutor.addTask( - TASK_ID, - () -> 1.0, - 1, - new Duration(1, SECONDS), - OptionalInt.of(1)); - MockSplitRunner mockSplitRunner = new MockSplitRunner(); - - TaskExecutor taskExecutor = new TaskExecutor(4, 8, 3, 4, ticker); - // Here we explicitly enqueue an indefinite running split runner - taskExecutor.enqueueSplits(taskHandle, false, ImmutableList.of(mockSplitRunner)); - - taskExecutor.start(); - try { - // wait for the task executor to start processing the split - mockSplitRunner.waitForStart(); - - TaskManagerConfig taskManagerConfig = new TaskManagerConfig() - .setInterruptStuckSplitTasksEnabled(true) - .setInterruptStuckSplitTasksDetectionInterval(new Duration(10, SECONDS)) - .setInterruptStuckSplitTasksWarningThreshold(new Duration(10, SECONDS)) - .setInterruptStuckSplitTasksTimeout(new Duration(10, SECONDS)); - - try (SqlTaskManager sqlTaskManager = createSqlTaskManager(taskManagerConfig, new NodeMemoryConfig(), taskExecutor, stackTraceElements -> true)) { - sqlTaskManager.addStateChangeListener(TASK_ID, (state) -> { - if (state.isTerminatingOrDone() && !taskHandle.isDestroyed()) { - taskExecutor.removeTask(taskHandle); - } - }); - - ticker.increment(30, SECONDS); - sqlTaskManager.failStuckSplitTasks(); - - 
mockSplitRunner.waitForFinish(); - List taskInfos = sqlTaskManager.getAllTaskInfo(); - assertEquals(taskInfos.size(), 1); - - TaskInfo taskInfo = pollTerminatingTaskInfoUntilDone(sqlTaskManager, taskInfos.get(0)); - assertEquals(taskInfo.getTaskStatus().getState(), TaskState.FAILED); - } - } - finally { - taskExecutor.stop(); - } - } - @Test public void testSessionPropertyMemoryLimitOverride() { @@ -393,32 +333,6 @@ private SqlTaskManager createSqlTaskManager(TaskManagerConfig taskManagerConfig, new ExchangeManagerRegistry()); } - private SqlTaskManager createSqlTaskManager( - TaskManagerConfig taskManagerConfig, - NodeMemoryConfig nodeMemoryConfig, - TaskExecutor taskExecutor, - Predicate> stuckSplitStackTracePredicate) - { - return new SqlTaskManager( - new EmbedVersion("testversion"), - new NoConnectorServicesProvider(), - createTestingPlanner(), - new MockLocationFactory(), - taskExecutor, - createTestSplitMonitor(), - new NodeInfo("test"), - localMemoryManager, - taskManagementExecutor, - taskManagerConfig, - nodeMemoryConfig, - localSpillManager, - new NodeSpillConfig(), - new TestingGcMonitor(), - noopTracer(), - new ExchangeManagerRegistry(), - stuckSplitStackTracePredicate); - } - private TaskInfo createTask(SqlTaskManager sqlTaskManager, TaskId taskId, ImmutableSet splits, OutputBuffers outputBuffers) { return sqlTaskManager.updateTask(TEST_SESSION, @@ -500,94 +414,6 @@ public URI createMemoryInfoLocation(InternalNode node) } } - private static class MockSplitRunner - implements SplitRunner - { - private final SettableFuture startedFuture = SettableFuture.create(); - private final SettableFuture finishedFuture = SettableFuture.create(); - - @GuardedBy("this") - private Thread runnerThread; - @GuardedBy("this") - private boolean closed; - - public void waitForStart() - throws ExecutionException, InterruptedException, TimeoutException - { - startedFuture.get(10, SECONDS); - } - - public void waitForFinish() - throws ExecutionException, InterruptedException, 
TimeoutException - { - finishedFuture.get(10, SECONDS); - } - - @Override - public int getPipelineId() - { - return 0; - } - - @Override - public Span getPipelineSpan() - { - return Span.getInvalid(); - } - - @Override - public synchronized boolean isFinished() - { - return closed; - } - - @Override - public ListenableFuture processFor(Duration duration) - { - startedFuture.set(null); - synchronized (this) { - runnerThread = Thread.currentThread(); - - if (closed) { - finishedFuture.set(null); - return immediateVoidFuture(); - } - } - - while (true) { - try { - Thread.sleep(100000); - } - catch (InterruptedException e) { - break; - } - } - - synchronized (this) { - closed = true; - } - finishedFuture.set(null); - - return immediateVoidFuture(); - } - - @Override - public String getInfo() - { - return "MockSplitRunner"; - } - - @Override - public synchronized void close() - { - closed = true; - - if (runnerThread != null) { - runnerThread.interrupt(); - } - } - } - private static class NoConnectorServicesProvider implements ConnectorServicesProvider { diff --git a/core/trino-main/src/test/java/io/trino/execution/TestTaskExecutorStuckSplits.java b/core/trino-main/src/test/java/io/trino/execution/TestTaskExecutorStuckSplits.java new file mode 100644 index 000000000000..ee044ceeea59 --- /dev/null +++ b/core/trino-main/src/test/java/io/trino/execution/TestTaskExecutorStuckSplits.java @@ -0,0 +1,261 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution; + +import com.google.common.collect.ImmutableList; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.SettableFuture; +import com.google.errorprone.annotations.concurrent.GuardedBy; +import io.airlift.node.NodeInfo; +import io.airlift.stats.TestingGcMonitor; +import io.airlift.testing.TestingTicker; +import io.airlift.units.Duration; +import io.opentelemetry.api.trace.Span; +import io.trino.Session; +import io.trino.connector.CatalogProperties; +import io.trino.connector.ConnectorServices; +import io.trino.connector.ConnectorServicesProvider; +import io.trino.exchange.ExchangeManagerRegistry; +import io.trino.execution.executor.TaskExecutor; +import io.trino.execution.executor.TaskHandle; +import io.trino.memory.LocalMemoryManager; +import io.trino.memory.NodeMemoryConfig; +import io.trino.spi.connector.CatalogHandle; +import io.trino.spiller.LocalSpillManager; +import io.trino.spiller.NodeSpillConfig; +import io.trino.version.EmbedVersion; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.OptionalInt; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import java.util.function.Predicate; + +import static com.google.common.util.concurrent.Futures.immediateVoidFuture; +import static io.airlift.tracing.Tracing.noopTracer; +import static io.trino.execution.TaskTestUtils.createTestSplitMonitor; +import static io.trino.execution.TaskTestUtils.createTestingPlanner; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +public class TestTaskExecutorStuckSplits +{ + @Test + public void testFailStuckSplitTasks() + throws InterruptedException, ExecutionException, TimeoutException + { + TestingTicker ticker = new TestingTicker(); + TaskManagementExecutor taskManagementExecutor = new 
TaskManagementExecutor(); + + TaskId taskId = new TaskId(new StageId("query", 0), 1, 0); + + TaskExecutor taskExecutor = new TaskExecutor(4, 8, 3, 4, ticker); + TaskHandle taskHandle = taskExecutor.addTask( + taskId, + () -> 1.0, + 1, + new Duration(1, SECONDS), + OptionalInt.of(1)); + + // Here we explicitly enqueue an indefinite running split runner + MockSplitRunner mockSplitRunner = new MockSplitRunner(); + taskExecutor.enqueueSplits(taskHandle, false, ImmutableList.of(mockSplitRunner)); + + taskExecutor.start(); + try { + // wait for the task executor to start processing the split + mockSplitRunner.waitForStart(); + + TaskManagerConfig taskManagerConfig = new TaskManagerConfig() + .setInterruptStuckSplitTasksEnabled(true) + .setInterruptStuckSplitTasksDetectionInterval(new Duration(10, SECONDS)) + .setInterruptStuckSplitTasksWarningThreshold(new Duration(10, SECONDS)) + .setInterruptStuckSplitTasksTimeout(new Duration(10, SECONDS)); + + try (SqlTaskManager sqlTaskManager = createSqlTaskManager(taskManagerConfig, new NodeMemoryConfig(), taskExecutor, taskManagementExecutor, stackTraceElements -> true)) { + sqlTaskManager.addStateChangeListener(taskId, (state) -> { + if (state.isTerminatingOrDone() && !taskHandle.isDestroyed()) { + taskExecutor.removeTask(taskHandle); + } + }); + + ticker.increment(30, SECONDS); + sqlTaskManager.failStuckSplitTasks(); + + mockSplitRunner.waitForFinish(); + List taskInfos = sqlTaskManager.getAllTaskInfo(); + assertEquals(taskInfos.size(), 1); + + TaskInfo taskInfo = pollTerminatingTaskInfoUntilDone(sqlTaskManager, taskInfos.get(0)); + assertEquals(taskInfo.getTaskStatus().getState(), TaskState.FAILED); + } + } + finally { + taskExecutor.stop(); + taskManagementExecutor.close(); + } + } + + private SqlTaskManager createSqlTaskManager( + TaskManagerConfig taskManagerConfig, + NodeMemoryConfig nodeMemoryConfig, + TaskExecutor taskExecutor, + TaskManagementExecutor taskManagementExecutor, + Predicate> stuckSplitStackTracePredicate) + 
{ + return new SqlTaskManager( + new EmbedVersion("testversion"), + new NoConnectorServicesProvider(), + createTestingPlanner(), + new TestSqlTaskManager.MockLocationFactory(), + taskExecutor, + createTestSplitMonitor(), + new NodeInfo("test"), + new LocalMemoryManager(new NodeMemoryConfig()), + taskManagementExecutor, + taskManagerConfig, + nodeMemoryConfig, + new LocalSpillManager(new NodeSpillConfig()), + new NodeSpillConfig(), + new TestingGcMonitor(), + noopTracer(), + new ExchangeManagerRegistry(), + stuckSplitStackTracePredicate); + } + + private static TaskInfo pollTerminatingTaskInfoUntilDone(SqlTaskManager taskManager, TaskInfo taskInfo) + throws InterruptedException, ExecutionException, TimeoutException + { + assertTrue(taskInfo.getTaskStatus().getState().isTerminatingOrDone()); + int attempts = 3; + while (attempts > 0 && taskInfo.getTaskStatus().getState().isTerminating()) { + taskInfo = taskManager.getTaskInfo(taskInfo.getTaskStatus().getTaskId(), taskInfo.getTaskStatus().getVersion()).get(5, SECONDS); + attempts--; + } + return taskInfo; + } + + private static class NoConnectorServicesProvider + implements ConnectorServicesProvider + { + @Override + public void loadInitialCatalogs() {} + + @Override + public void ensureCatalogsLoaded(Session session, List catalogs) {} + + @Override + public void pruneCatalogs(Set catalogsInUse) + { + throw new UnsupportedOperationException(); + } + + @Override + public ConnectorServices getConnectorServices(CatalogHandle catalogHandle) + { + throw new UnsupportedOperationException(); + } + } + + private static class MockSplitRunner + implements SplitRunner + { + private final SettableFuture startedFuture = SettableFuture.create(); + private final SettableFuture finishedFuture = SettableFuture.create(); + + @GuardedBy("this") + private Thread runnerThread; + @GuardedBy("this") + private boolean closed; + + public void waitForStart() + throws ExecutionException, InterruptedException, TimeoutException + { + 
startedFuture.get(10, SECONDS); + } + + public void waitForFinish() + throws ExecutionException, InterruptedException, TimeoutException + { + finishedFuture.get(10, SECONDS); + } + + @Override + public int getPipelineId() + { + return 0; + } + + @Override + public Span getPipelineSpan() + { + return Span.getInvalid(); + } + + @Override + public synchronized boolean isFinished() + { + return closed; + } + + @Override + public ListenableFuture processFor(Duration duration) + { + startedFuture.set(null); + synchronized (this) { + runnerThread = Thread.currentThread(); + + if (closed) { + finishedFuture.set(null); + return immediateVoidFuture(); + } + } + + while (true) { + try { + Thread.sleep(100000); + } + catch (InterruptedException e) { + break; + } + } + + synchronized (this) { + closed = true; + } + finishedFuture.set(null); + + return immediateVoidFuture(); + } + + @Override + public String getInfo() + { + return "MockSplitRunner"; + } + + @Override + public synchronized void close() + { + closed = true; + + if (runnerThread != null) { + runnerThread.interrupt(); + } + } + } +} From 1a47841a171e38bf18b264fb1aec512b0ce97334 Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Fri, 7 Jul 2023 10:59:05 -0700 Subject: [PATCH 2/7] Remove unnecessary shared state in test These objects don't need lifecycle managed throughout the test, so they can be created where they are needed --- .../test/java/io/trino/execution/TestSqlTaskManager.java | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java b/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java index c7eb8d06f05a..8ed4277d09ea 100644 --- a/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java +++ b/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java @@ -87,14 +87,10 @@ public class TestSqlTaskManager private TaskExecutor taskExecutor; private TaskManagementExecutor 
taskManagementExecutor; - private LocalMemoryManager localMemoryManager; - private LocalSpillManager localSpillManager; @BeforeClass public void setUp() { - localMemoryManager = new LocalMemoryManager(new NodeMemoryConfig()); - localSpillManager = new LocalSpillManager(new NodeSpillConfig()); taskExecutor = new TaskExecutor(8, 16, 3, 4, Ticker.systemTicker()); taskExecutor.start(); taskManagementExecutor = new TaskManagementExecutor(); @@ -322,11 +318,11 @@ private SqlTaskManager createSqlTaskManager(TaskManagerConfig taskManagerConfig, taskExecutor, createTestSplitMonitor(), new NodeInfo("test"), - localMemoryManager, + new LocalMemoryManager(nodeMemoryConfig), taskManagementExecutor, taskManagerConfig, nodeMemoryConfig, - localSpillManager, + new LocalSpillManager(new NodeSpillConfig()), new NodeSpillConfig(), new TestingGcMonitor(), noopTracer(), From 5b9ab8851d8d9b8cc8eb049ac3968db31b3c6797 Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Fri, 7 Jul 2023 10:54:42 -0700 Subject: [PATCH 3/7] Migrate TestSqlTaskManager to JUnit --- .../trino/execution/TestSqlTaskManager.java | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java b/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java index 8ed4277d09ea..0ce54e2fdec9 100644 --- a/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java +++ b/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java @@ -48,9 +48,11 @@ import io.trino.spiller.LocalSpillManager; import io.trino.spiller.NodeSpillConfig; import io.trino.version.EmbedVersion; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; +import org.junit.jupiter.api.Timeout; 
import java.net.URI; import java.util.List; @@ -73,6 +75,7 @@ import static io.trino.execution.buffer.PipelinedOutputBuffers.BufferType.PARTITIONED; import static io.trino.testing.TestingSession.testSessionBuilder; import static java.util.concurrent.TimeUnit.SECONDS; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotEquals; @@ -80,6 +83,7 @@ import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; +@TestInstance(PER_CLASS) public class TestSqlTaskManager { private static final TaskId TASK_ID = new TaskId(new StageId("query", 0), 1, 0); @@ -88,7 +92,7 @@ public class TestSqlTaskManager private TaskExecutor taskExecutor; private TaskManagementExecutor taskManagementExecutor; - @BeforeClass + @BeforeAll public void setUp() { taskExecutor = new TaskExecutor(8, 16, 3, 4, Ticker.systemTicker()); @@ -96,7 +100,7 @@ public void setUp() taskManagementExecutor = new TaskManagementExecutor(); } - @AfterClass(alwaysRun = true) + @AfterAll public void tearDown() { taskExecutor.stop(); @@ -124,7 +128,8 @@ public void testEmptyQuery() } } - @Test(timeOut = 30_000) + @Test + @Timeout(30) public void testSimpleQuery() throws Exception { @@ -205,7 +210,8 @@ public void testAbort() } } - @Test(timeOut = 30_000) + @Test + @Timeout(30) public void testAbortResults() throws Exception { From ade3ef5b3107d8210b701cb796486eaac5ee23cb Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Thu, 25 May 2023 13:28:28 -0700 Subject: [PATCH 4/7] Extract interface for TaskExecutor In preparation for adding a ThreadPerSplitTaskExecutor --- .../io/trino/execution/SqlTaskManager.java | 6 +- .../execution/executor/RunningSplitInfo.java | 98 ++ .../execution/executor/TaskExecutor.java | 983 +----------------- .../trino/execution/executor/TaskHandle.java | 185 +--- .../MultilevelSplitQueue.java | 2 +- 
.../PrioritizedSplitRunner.java | 8 +- .../executor/{ => timesharing}/Priority.java | 2 +- .../timesharing/TimeSharingTaskExecutor.java | 953 +++++++++++++++++ .../timesharing/TimeSharingTaskHandle.java | 203 ++++ .../io/trino/server/ServerMainModule.java | 10 +- .../io/trino/server/TaskExecutorResource.java | 6 +- .../io/trino/sql/gen/ExpressionProfiler.java | 2 +- ...nager.java => BaseTestSqlTaskManager.java} | 27 +- .../io/trino/execution/TaskTestUtils.java | 2 +- .../TestMemoryRevokingScheduler.java | 3 +- .../java/io/trino/execution/TestSqlTask.java | 3 +- .../trino/execution/TestSqlTaskExecution.java | 3 +- .../TestSqlTaskManagerTimeSharing.java | 28 + .../TestTaskExecutorStuckSplits.java | 5 +- .../executor/{ => timesharing}/Histogram.java | 2 +- .../SimulationController.java | 16 +- .../{ => timesharing}/SimulationSplit.java | 2 +- .../{ => timesharing}/SimulationTask.java | 17 +- .../{ => timesharing}/SplitGenerators.java | 8 +- .../{ => timesharing}/SplitSpecification.java | 6 +- .../TestTimeSharingTaskExecutor.java} | 38 +- .../TimeSharingTaskExecutorSimulation.java} | 56 +- .../operator/project/TestPageProcessor.java | 2 +- .../server/remotetask/TestHttpRemoteTask.java | 4 +- .../benchmark/AbstractOperatorBenchmark.java | 2 +- .../io/trino/benchmark/HashJoinBenchmark.java | 2 +- 31 files changed, 1419 insertions(+), 1265 deletions(-) create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/RunningSplitInfo.java rename core/trino-main/src/main/java/io/trino/execution/executor/{ => timesharing}/MultilevelSplitQueue.java (99%) rename core/trino-main/src/main/java/io/trino/execution/executor/{ => timesharing}/PrioritizedSplitRunner.java (97%) rename core/trino-main/src/main/java/io/trino/execution/executor/{ => timesharing}/Priority.java (97%) create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/timesharing/TimeSharingTaskExecutor.java create mode 100644 
core/trino-main/src/main/java/io/trino/execution/executor/timesharing/TimeSharingTaskHandle.java rename core/trino-main/src/test/java/io/trino/execution/{TestSqlTaskManager.java => BaseTestSqlTaskManager.java} (96%) create mode 100644 core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManagerTimeSharing.java rename core/trino-main/src/test/java/io/trino/execution/executor/{ => timesharing}/Histogram.java (99%) rename core/trino-main/src/test/java/io/trino/execution/executor/{ => timesharing}/SimulationController.java (93%) rename core/trino-main/src/test/java/io/trino/execution/executor/{ => timesharing}/SimulationSplit.java (99%) rename core/trino-main/src/test/java/io/trino/execution/executor/{ => timesharing}/SimulationTask.java (86%) rename core/trino-main/src/test/java/io/trino/execution/executor/{ => timesharing}/SplitGenerators.java (97%) rename core/trino-main/src/test/java/io/trino/execution/executor/{ => timesharing}/SplitSpecification.java (93%) rename core/trino-main/src/test/java/io/trino/execution/executor/{TestTaskExecutor.java => timesharing/TestTimeSharingTaskExecutor.java} (92%) rename core/trino-main/src/test/java/io/trino/execution/executor/{TaskExecutorSimulator.java => timesharing/TimeSharingTaskExecutorSimulation.java} (90%) diff --git a/core/trino-main/src/main/java/io/trino/execution/SqlTaskManager.java b/core/trino-main/src/main/java/io/trino/execution/SqlTaskManager.java index 66c841b5d894..ec6c2b041105 100644 --- a/core/trino-main/src/main/java/io/trino/execution/SqlTaskManager.java +++ b/core/trino-main/src/main/java/io/trino/execution/SqlTaskManager.java @@ -40,9 +40,9 @@ import io.trino.execution.buffer.BufferResult; import io.trino.execution.buffer.OutputBuffers; import io.trino.execution.buffer.PipelinedOutputBuffers; -import io.trino.execution.executor.PrioritizedSplitRunner; +import io.trino.execution.executor.RunningSplitInfo; import io.trino.execution.executor.TaskExecutor; -import 
io.trino.execution.executor.TaskExecutor.RunningSplitInfo; +import io.trino.execution.executor.timesharing.PrioritizedSplitRunner; import io.trino.memory.LocalMemoryManager; import io.trino.memory.NodeMemoryConfig; import io.trino.memory.QueryContext; @@ -90,7 +90,7 @@ import static io.trino.SystemSessionProperties.resourceOvercommit; import static io.trino.cache.SafeCaches.buildNonEvictableCache; import static io.trino.execution.SqlTask.createSqlTask; -import static io.trino.execution.executor.PrioritizedSplitRunner.SPLIT_RUN_QUANTA; +import static io.trino.execution.executor.timesharing.PrioritizedSplitRunner.SPLIT_RUN_QUANTA; import static io.trino.operator.RetryPolicy.TASK; import static io.trino.spi.StandardErrorCode.ABANDONED_TASK; import static io.trino.spi.StandardErrorCode.GENERIC_USER_ERROR; diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/RunningSplitInfo.java b/core/trino-main/src/main/java/io/trino/execution/executor/RunningSplitInfo.java new file mode 100644 index 000000000000..6669d7ef597b --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/RunningSplitInfo.java @@ -0,0 +1,98 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor; + +import com.google.common.collect.ComparisonChain; +import io.trino.execution.TaskId; + +import java.util.function.Supplier; + +import static java.util.Objects.requireNonNull; + +/** + * A class representing a split that is running on the TaskRunner. + * It has a Thread object that gets assigned while assigning the split + * to the TaskRunner. However, when the TaskRunner moves to a different split, + * the thread stored here will not remain assigned to this split anymore. + */ +public class RunningSplitInfo + implements Comparable +{ + private final long startTime; + private final String threadId; + private final Thread thread; + private boolean printed; + private final TaskId taskId; + private final Supplier splitInfo; + + public RunningSplitInfo(long startTime, String threadId, Thread thread, TaskId taskId, Supplier splitInfo) + { + this.startTime = startTime; + this.threadId = requireNonNull(threadId, "threadId is null"); + this.thread = requireNonNull(thread, "thread is null"); + this.taskId = requireNonNull(taskId, "taskId is null"); + this.splitInfo = requireNonNull(splitInfo, "split is null"); + this.printed = false; + } + + public long getStartTime() + { + return startTime; + } + + public String getThreadId() + { + return threadId; + } + + public Thread getThread() + { + return thread; + } + + public TaskId getTaskId() + { + return taskId; + } + + /** + * {@link io.trino.execution.executor.timesharing.PrioritizedSplitRunner#getInfo()} provides runtime statistics for the split (such as total cpu utilization so far). + * A value returned from this method changes over time and cannot be cached as a field of {@link RunningSplitInfo}. + * + * @return Formatted string containing runtime statistics for the split.
+ */ + public String getSplitInfo() + { + return splitInfo.get(); + } + + public boolean isPrinted() + { + return printed; + } + + public void setPrinted() + { + printed = true; + } + + @Override + public int compareTo(RunningSplitInfo o) + { + return ComparisonChain.start() + .compare(startTime, o.getStartTime()) + .compare(threadId, o.getThreadId()) + .result(); + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/TaskExecutor.java b/core/trino-main/src/main/java/io/trino/execution/executor/TaskExecutor.java index f7ed83aa7b7e..d9ddc32772be 100644 --- a/core/trino-main/src/main/java/io/trino/execution/executor/TaskExecutor.java +++ b/core/trino-main/src/main/java/io/trino/execution/executor/TaskExecutor.java @@ -13,1000 +13,33 @@ */ package io.trino.execution.executor; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Ticker; -import com.google.common.collect.ComparisonChain; import com.google.common.util.concurrent.ListenableFuture; -import com.google.errorprone.annotations.ThreadSafe; -import com.google.errorprone.annotations.concurrent.GuardedBy; -import com.google.inject.Inject; -import io.airlift.concurrent.SetThreadName; -import io.airlift.concurrent.ThreadPoolExecutorMBean; -import io.airlift.log.Logger; -import io.airlift.stats.CounterStat; -import io.airlift.stats.DistributionStat; -import io.airlift.stats.TimeDistribution; -import io.airlift.stats.TimeStat; import io.airlift.units.Duration; -import io.opentelemetry.api.trace.Span; -import io.opentelemetry.api.trace.Tracer; -import io.opentelemetry.context.Context; import io.trino.execution.SplitRunner; import io.trino.execution.TaskId; -import io.trino.execution.TaskManagerConfig; -import io.trino.spi.TrinoException; -import io.trino.spi.VersionEmbedder; -import io.trino.tracing.TrinoAttributes; -import jakarta.annotation.PostConstruct; -import jakarta.annotation.PreDestroy; -import org.weakref.jmx.Managed; -import org.weakref.jmx.Nested; 
-import java.util.ArrayList; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; import java.util.List; -import java.util.Map; import java.util.OptionalInt; import java.util.Set; -import java.util.SortedSet; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentSkipListSet; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; -import java.util.concurrent.RejectedExecutionException; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.atomic.AtomicLongArray; import java.util.function.DoubleSupplier; import java.util.function.Predicate; -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkState; -import static com.google.common.collect.ImmutableSet.toImmutableSet; -import static com.google.common.collect.Sets.newConcurrentHashSet; -import static io.airlift.concurrent.Threads.threadsNamed; -import static io.airlift.tracing.Tracing.noopTracer; -import static io.trino.execution.executor.MultilevelSplitQueue.computeLevel; -import static io.trino.version.EmbedVersion.testingVersionEmbedder; -import static java.lang.Math.min; -import static java.lang.String.format; -import static java.util.Objects.requireNonNull; -import static java.util.concurrent.Executors.newCachedThreadPool; -import static java.util.concurrent.TimeUnit.MICROSECONDS; -import static java.util.concurrent.TimeUnit.NANOSECONDS; - -@ThreadSafe -public class TaskExecutor +public interface TaskExecutor { - private static final Logger log = Logger.get(TaskExecutor.class); - private static final AtomicLong NEXT_RUNNER_ID = new AtomicLong(); - - private final ExecutorService executor; - private final ThreadPoolExecutorMBean executorMBean; - - private final int runnerThreads; 
- private final int minimumNumberOfDrivers; - private final int guaranteedNumberOfDriversPerTask; - private final int maximumNumberOfDriversPerTask; - private final VersionEmbedder versionEmbedder; - private final Tracer tracer; - - private final Ticker ticker; - - private final Duration stuckSplitsWarningThreshold; - private final SortedSet runningSplitInfos = new ConcurrentSkipListSet<>(); - - @GuardedBy("this") - private final List tasks; - - /** - * All splits registered with the task executor. - */ - @GuardedBy("this") - private final Set allSplits = new HashSet<>(); - - /** - * Intermediate splits (i.e. splits that should not be queued). - */ - @GuardedBy("this") - private final Set intermediateSplits = new HashSet<>(); - - /** - * Splits waiting for a runner thread. - */ - private final MultilevelSplitQueue waitingSplits; - - /** - * Splits running on a thread. - */ - private final Set runningSplits = newConcurrentHashSet(); - - /** - * Splits blocked by the driver. - */ - private final Map> blockedSplits = new ConcurrentHashMap<>(); - - private final AtomicLongArray completedTasksPerLevel = new AtomicLongArray(5); - private final AtomicLongArray completedSplitsPerLevel = new AtomicLongArray(5); - - private final TimeStat splitQueuedTime = new TimeStat(NANOSECONDS); - private final TimeStat splitWallTime = new TimeStat(NANOSECONDS); - - private final TimeDistribution leafSplitWallTime = new TimeDistribution(MICROSECONDS); - private final TimeDistribution intermediateSplitWallTime = new TimeDistribution(MICROSECONDS); - - private final TimeDistribution leafSplitScheduledTime = new TimeDistribution(MICROSECONDS); - private final TimeDistribution intermediateSplitScheduledTime = new TimeDistribution(MICROSECONDS); - - private final TimeDistribution leafSplitWaitTime = new TimeDistribution(MICROSECONDS); - private final TimeDistribution intermediateSplitWaitTime = new TimeDistribution(MICROSECONDS); - - private final TimeDistribution leafSplitCpuTime = new 
TimeDistribution(MICROSECONDS); - private final TimeDistribution intermediateSplitCpuTime = new TimeDistribution(MICROSECONDS); - - // shared between SplitRunners - private final CounterStat globalCpuTimeMicros = new CounterStat(); - private final CounterStat globalScheduledTimeMicros = new CounterStat(); - - private final TimeStat blockedQuantaWallTime = new TimeStat(MICROSECONDS); - private final TimeStat unblockedQuantaWallTime = new TimeStat(MICROSECONDS); - - private final DistributionStat leafSplitsSize = new DistributionStat(); - @GuardedBy("this") - private long lastLeafSplitsSizeRecordTime; - @GuardedBy("this") - private long lastLeafSplitsSize; - - private volatile boolean closed; - - @Inject - public TaskExecutor(TaskManagerConfig config, VersionEmbedder versionEmbedder, Tracer tracer, MultilevelSplitQueue splitQueue) - { - this( - config.getMaxWorkerThreads(), - config.getMinDrivers(), - config.getMinDriversPerTask(), - config.getMaxDriversPerTask(), - config.getInterruptStuckSplitTasksWarningThreshold(), - versionEmbedder, - tracer, - splitQueue, - Ticker.systemTicker()); - } - - @VisibleForTesting - public TaskExecutor(int runnerThreads, int minDrivers, int guaranteedNumberOfDriversPerTask, int maximumNumberOfDriversPerTask, Ticker ticker) - { - this(runnerThreads, minDrivers, guaranteedNumberOfDriversPerTask, maximumNumberOfDriversPerTask, new Duration(10, TimeUnit.MINUTES), testingVersionEmbedder(), noopTracer(), new MultilevelSplitQueue(2), ticker); - } - - @VisibleForTesting - public TaskExecutor(int runnerThreads, int minDrivers, int guaranteedNumberOfDriversPerTask, int maximumNumberOfDriversPerTask, MultilevelSplitQueue splitQueue, Ticker ticker) - { - this(runnerThreads, minDrivers, guaranteedNumberOfDriversPerTask, maximumNumberOfDriversPerTask, new Duration(10, TimeUnit.MINUTES), testingVersionEmbedder(), noopTracer(), splitQueue, ticker); - } - - @VisibleForTesting - public TaskExecutor( - int runnerThreads, - int minDrivers, - int 
guaranteedNumberOfDriversPerTask, - int maximumNumberOfDriversPerTask, - Duration stuckSplitsWarningThreshold, - VersionEmbedder versionEmbedder, - Tracer tracer, - MultilevelSplitQueue splitQueue, - Ticker ticker) - { - checkArgument(runnerThreads > 0, "runnerThreads must be at least 1"); - checkArgument(guaranteedNumberOfDriversPerTask > 0, "guaranteedNumberOfDriversPerTask must be at least 1"); - checkArgument(maximumNumberOfDriversPerTask > 0, "maximumNumberOfDriversPerTask must be at least 1"); - checkArgument(guaranteedNumberOfDriversPerTask <= maximumNumberOfDriversPerTask, "guaranteedNumberOfDriversPerTask cannot be greater than maximumNumberOfDriversPerTask"); - - // we manage thread pool size directly, so create an unlimited pool - this.executor = newCachedThreadPool(threadsNamed("task-processor-%s")); - this.executorMBean = new ThreadPoolExecutorMBean((ThreadPoolExecutor) executor); - this.runnerThreads = runnerThreads; - this.versionEmbedder = requireNonNull(versionEmbedder, "versionEmbedder is null"); - this.tracer = requireNonNull(tracer, "tracer is null"); - - this.ticker = requireNonNull(ticker, "ticker is null"); - this.stuckSplitsWarningThreshold = requireNonNull(stuckSplitsWarningThreshold, "stuckSplitsWarningThreshold is null"); - - this.minimumNumberOfDrivers = minDrivers; - this.guaranteedNumberOfDriversPerTask = guaranteedNumberOfDriversPerTask; - this.maximumNumberOfDriversPerTask = maximumNumberOfDriversPerTask; - this.waitingSplits = requireNonNull(splitQueue, "splitQueue is null"); - this.tasks = new LinkedList<>(); - this.lastLeafSplitsSizeRecordTime = ticker.read(); - } - - @PostConstruct - public synchronized void start() - { - checkState(!closed, "TaskExecutor is closed"); - for (int i = 0; i < runnerThreads; i++) { - addRunnerThread(); - } - } - - @PreDestroy - public synchronized void stop() - { - closed = true; - executor.shutdownNow(); - } - - @Override - public synchronized String toString() - { - return toStringHelper(this) - 
.add("runnerThreads", runnerThreads) - .add("allSplits", allSplits.size()) - .add("intermediateSplits", intermediateSplits.size()) - .add("waitingSplits", waitingSplits.size()) - .add("runningSplits", runningSplits.size()) - .add("blockedSplits", blockedSplits.size()) - .toString(); - } - - private synchronized void addRunnerThread() - { - try { - executor.execute(versionEmbedder.embedVersion(new TaskRunner())); - } - catch (RejectedExecutionException ignored) { - } - } - - public synchronized TaskHandle addTask( + TaskHandle addTask( TaskId taskId, DoubleSupplier utilizationSupplier, int initialSplitConcurrency, Duration splitConcurrencyAdjustFrequency, - OptionalInt maxDriversPerTask) - { - requireNonNull(taskId, "taskId is null"); - requireNonNull(utilizationSupplier, "utilizationSupplier is null"); - checkArgument(maxDriversPerTask.isEmpty() || maxDriversPerTask.getAsInt() <= maximumNumberOfDriversPerTask, - "maxDriversPerTask cannot be greater than the configured value"); - - log.debug("Task scheduled %s", taskId); - - TaskHandle taskHandle = new TaskHandle(taskId, waitingSplits, utilizationSupplier, initialSplitConcurrency, splitConcurrencyAdjustFrequency, maxDriversPerTask); - - tasks.add(taskHandle); - return taskHandle; - } - - public void removeTask(TaskHandle taskHandle) - { - try (SetThreadName ignored = new SetThreadName("Task-%s", taskHandle.getTaskId())) { - // Skip additional scheduling if the task was already destroyed - if (!doRemoveTask(taskHandle)) { - return; - } - } - - // replace blocked splits that were terminated - synchronized (this) { - addNewEntrants(); - recordLeafSplitsSize(); - } - } - - /** - * Returns true if the task handle was destroyed and removed splits as a result that may need to be replaced. Otherwise, - * if the {@link TaskHandle} was already destroyed or no splits were removed then this method returns false and no additional - * splits need to be scheduled. 
- */ - private boolean doRemoveTask(TaskHandle taskHandle) - { - List splits; - synchronized (this) { - tasks.remove(taskHandle); - - // Task is already destroyed - if (taskHandle.isDestroyed()) { - return false; - } - - splits = taskHandle.destroy(); - // stop tracking splits (especially blocked splits which may never unblock) - allSplits.removeAll(splits); - intermediateSplits.removeAll(splits); - blockedSplits.keySet().removeAll(splits); - waitingSplits.removeAll(splits); - recordLeafSplitsSize(); - } - - // call destroy outside of synchronized block as it is expensive and doesn't need a lock on the task executor - for (PrioritizedSplitRunner split : splits) { - split.destroy(); - } - - // record completed stats - long threadUsageNanos = taskHandle.getScheduledNanos(); - completedTasksPerLevel.incrementAndGet(computeLevel(threadUsageNanos)); - - log.debug("Task finished or failed %s", taskHandle.getTaskId()); - return !splits.isEmpty(); - } - - public List> enqueueSplits(TaskHandle taskHandle, boolean intermediate, List taskSplits) - { - List splitsToDestroy = new ArrayList<>(); - List> finishedFutures = new ArrayList<>(taskSplits.size()); - synchronized (this) { - for (SplitRunner taskSplit : taskSplits) { - TaskId taskId = taskHandle.getTaskId(); - int splitId = taskHandle.getNextSplitId(); - - Span splitSpan = tracer.spanBuilder(intermediate ? 
"split (intermediate)" : "split (leaf)") - .setParent(Context.current().with(taskSplit.getPipelineSpan())) - .setAttribute(TrinoAttributes.QUERY_ID, taskId.getQueryId().toString()) - .setAttribute(TrinoAttributes.STAGE_ID, taskId.getStageId().toString()) - .setAttribute(TrinoAttributes.TASK_ID, taskId.toString()) - .setAttribute(TrinoAttributes.PIPELINE_ID, taskId.getStageId() + "-" + taskSplit.getPipelineId()) - .setAttribute(TrinoAttributes.SPLIT_ID, taskId + "-" + splitId) - .startSpan(); - - PrioritizedSplitRunner prioritizedSplitRunner = new PrioritizedSplitRunner( - taskHandle, - splitId, - taskSplit, - splitSpan, - tracer, - ticker, - globalCpuTimeMicros, - globalScheduledTimeMicros, - blockedQuantaWallTime, - unblockedQuantaWallTime); - - if (intermediate) { - // add the runner to the handle so it can be destroyed if the task is canceled - if (taskHandle.recordIntermediateSplit(prioritizedSplitRunner)) { - // Note: we do not record queued time for intermediate splits - startIntermediateSplit(prioritizedSplitRunner); - } - else { - splitsToDestroy.add(prioritizedSplitRunner); - } - } - else { - // add this to the work queue for the task - if (taskHandle.enqueueSplit(prioritizedSplitRunner)) { - // if task is under the limit for guaranteed splits, start one - scheduleTaskIfNecessary(taskHandle); - // if globally we have more resources, start more - addNewEntrants(); - } - else { - splitsToDestroy.add(prioritizedSplitRunner); - } - } - - finishedFutures.add(prioritizedSplitRunner.getFinishedFuture()); - } - recordLeafSplitsSize(); - } - for (PrioritizedSplitRunner split : splitsToDestroy) { - split.destroy(); - } - return finishedFutures; - } - - private void splitFinished(PrioritizedSplitRunner split) - { - completedSplitsPerLevel.incrementAndGet(split.getPriority().getLevel()); - synchronized (this) { - allSplits.remove(split); - - long wallNanos = System.nanoTime() - split.getCreatedNanos(); - splitWallTime.add(Duration.succinctNanos(wallNanos)); - - if 
(intermediateSplits.remove(split)) { - intermediateSplitWallTime.add(wallNanos); - intermediateSplitScheduledTime.add(split.getScheduledNanos()); - intermediateSplitWaitTime.add(split.getWaitNanos()); - intermediateSplitCpuTime.add(split.getCpuTimeNanos()); - } - else { - leafSplitWallTime.add(wallNanos); - leafSplitScheduledTime.add(split.getScheduledNanos()); - leafSplitWaitTime.add(split.getWaitNanos()); - leafSplitCpuTime.add(split.getCpuTimeNanos()); - } - - TaskHandle taskHandle = split.getTaskHandle(); - taskHandle.splitComplete(split); - - scheduleTaskIfNecessary(taskHandle); - - addNewEntrants(); - recordLeafSplitsSize(); - } - // call destroy outside of synchronized block as it is expensive and doesn't need a lock on the task executor - split.destroy(); - } - - private synchronized void scheduleTaskIfNecessary(TaskHandle taskHandle) - { - // if task has less than the minimum guaranteed splits running, - // immediately schedule new splits for this task. This assures - // that a task gets its fair amount of consideration (you have to - // have splits to be considered for running on a thread). - int splitsToSchedule = min(guaranteedNumberOfDriversPerTask, taskHandle.getMaxDriversPerTask().orElse(Integer.MAX_VALUE)) - taskHandle.getRunningLeafSplits(); - for (int i = 0; i < splitsToSchedule; ++i) { - PrioritizedSplitRunner split = taskHandle.pollNextSplit(); - if (split == null) { - // no more splits to schedule - return; - } - - startSplit(split); - splitQueuedTime.add(Duration.nanosSince(split.getCreatedNanos())); - } - recordLeafSplitsSize(); - } - - private synchronized void addNewEntrants() - { - // Ignore intermediate splits when checking minimumNumberOfDrivers. - // Otherwise with (for example) minimumNumberOfDrivers = 100, 200 intermediate splits - // and 100 leaf splits, depending on order of appearing splits, number of - // simultaneously running splits may vary. If leaf splits start first, there will - // be 300 running splits. 
If intermediate splits start first, there will be only - // 200 running splits. - int running = allSplits.size() - intermediateSplits.size(); - for (int i = 0; i < minimumNumberOfDrivers - running; i++) { - PrioritizedSplitRunner split = pollNextSplitWorker(); - if (split == null) { - break; - } - - splitQueuedTime.add(Duration.nanosSince(split.getCreatedNanos())); - startSplit(split); - } - } - - private synchronized void startIntermediateSplit(PrioritizedSplitRunner split) - { - startSplit(split); - intermediateSplits.add(split); - } - - private synchronized void startSplit(PrioritizedSplitRunner split) - { - allSplits.add(split); - waitingSplits.offer(split); - } - - private synchronized PrioritizedSplitRunner pollNextSplitWorker() - { - // todo find a better algorithm for this - // find the first task that produces a split, then move that task to the - // end of the task list, so we get round robin - for (Iterator iterator = tasks.iterator(); iterator.hasNext(); ) { - TaskHandle task = iterator.next(); - // skip tasks that are already running the configured max number of drivers - if (task.getRunningLeafSplits() >= task.getMaxDriversPerTask().orElse(maximumNumberOfDriversPerTask)) { - continue; - } - PrioritizedSplitRunner split = task.pollNextSplit(); - if (split != null) { - // move task to end of list - iterator.remove(); - - // CAUTION: we are modifying the list in the loop which would normally - // cause a ConcurrentModificationException but we exit immediately - tasks.add(task); - return split; - } - } - return null; - } - - private synchronized void recordLeafSplitsSize() - { - long now = ticker.read(); - long timeDifference = now - this.lastLeafSplitsSizeRecordTime; - if (timeDifference > 0) { - this.leafSplitsSize.add(lastLeafSplitsSize, timeDifference); - this.lastLeafSplitsSizeRecordTime = now; - } - // always record new lastLeafSplitsSize as it might have changed - // even if timeDifference is 0 - this.lastLeafSplitsSize = allSplits.size() - 
intermediateSplits.size(); - } - - private class TaskRunner - implements Runnable - { - private final long runnerId = NEXT_RUNNER_ID.getAndIncrement(); - - @Override - public void run() - { - try (SetThreadName runnerName = new SetThreadName("SplitRunner-%s", runnerId)) { - while (!closed && !Thread.currentThread().isInterrupted()) { - // select next worker - PrioritizedSplitRunner split; - try { - split = waitingSplits.take(); - } - catch (InterruptedException e) { - Thread.currentThread().interrupt(); - return; - } - - String threadId = split.getTaskHandle().getTaskId() + "-" + split.getSplitId(); - try (SetThreadName splitName = new SetThreadName(threadId)) { - RunningSplitInfo splitInfo = new RunningSplitInfo(ticker.read(), threadId, Thread.currentThread(), split); - runningSplitInfos.add(splitInfo); - runningSplits.add(split); - - ListenableFuture blocked; - try { - blocked = split.process(); - } - finally { - runningSplitInfos.remove(splitInfo); - runningSplits.remove(split); - } - - if (split.isFinished()) { - if (log.isDebugEnabled()) { - log.debug("%s is finished", split.getInfo()); - } - splitFinished(split); - } - else { - if (blocked.isDone()) { - waitingSplits.offer(split); - } - else { - blockedSplits.put(split, blocked); - blocked.addListener(() -> { - blockedSplits.remove(split); - // reset the level priority to prevent previously-blocked splits from starving existing splits - split.resetLevelPriority(); - waitingSplits.offer(split); - }, executor); - } - } - } - catch (Throwable t) { - // ignore random errors due to driver thread interruption - if (!split.isDestroyed()) { - if (t instanceof TrinoException trinoException) { - log.error(t, "Error processing %s: %s: %s", split.getInfo(), trinoException.getErrorCode().getName(), trinoException.getMessage()); - } - else { - log.error(t, "Error processing %s", split.getInfo()); - } - } - splitFinished(split); - } - finally { - // Clear the interrupted flag on the current thread, driver cancellation may 
have triggered an interrupt - if (Thread.interrupted()) { - if (closed) { - // reset interrupted flag if closed before interrupt - Thread.currentThread().interrupt(); - } - } - } - } - } - finally { - // unless we have been closed, we need to replace this thread - if (!closed) { - addRunnerThread(); - } - } - } - } - - // - // STATS - // - - @Managed - public synchronized int getTasks() - { - return tasks.size(); - } - - @Managed - public int getRunnerThreads() - { - return runnerThreads; - } - - @Managed - public int getMinimumNumberOfDrivers() - { - return minimumNumberOfDrivers; - } - - @Managed - public synchronized int getTotalSplits() - { - return allSplits.size(); - } - - @Managed - public synchronized int getIntermediateSplits() - { - return intermediateSplits.size(); - } - - @Managed - public int getWaitingSplits() - { - return waitingSplits.size(); - } - - @Managed - @Nested - public DistributionStat getLeafSplitsSize() - { - return leafSplitsSize; - } - - @Managed - public int getRunningSplits() - { - return runningSplits.size(); - } - - @Managed - public int getBlockedSplits() - { - return blockedSplits.size(); - } - - @Managed - public long getCompletedTasksLevel0() - { - return completedTasksPerLevel.get(0); - } - - @Managed - public long getCompletedTasksLevel1() - { - return completedTasksPerLevel.get(1); - } - - @Managed - public long getCompletedTasksLevel2() - { - return completedTasksPerLevel.get(2); - } - - @Managed - public long getCompletedTasksLevel3() - { - return completedTasksPerLevel.get(3); - } - - @Managed - public long getCompletedTasksLevel4() - { - return completedTasksPerLevel.get(4); - } - - @Managed - public long getCompletedSplitsLevel0() - { - return completedSplitsPerLevel.get(0); - } - - @Managed - public long getCompletedSplitsLevel1() - { - return completedSplitsPerLevel.get(1); - } - - @Managed - public long getCompletedSplitsLevel2() - { - return completedSplitsPerLevel.get(2); - } - - @Managed - public long 
getCompletedSplitsLevel3() - { - return completedSplitsPerLevel.get(3); - } - - @Managed - public long getCompletedSplitsLevel4() - { - return completedSplitsPerLevel.get(4); - } - - @Managed - public long getRunningTasksLevel0() - { - return getRunningTasksForLevel(0); - } - - @Managed - public long getRunningTasksLevel1() - { - return getRunningTasksForLevel(1); - } - - @Managed - public long getRunningTasksLevel2() - { - return getRunningTasksForLevel(2); - } - - @Managed - public long getRunningTasksLevel3() - { - return getRunningTasksForLevel(3); - } - - @Managed - public long getRunningTasksLevel4() - { - return getRunningTasksForLevel(4); - } - - @Managed - @Nested - public TimeStat getSplitQueuedTime() - { - return splitQueuedTime; - } - - @Managed - @Nested - public TimeStat getSplitWallTime() - { - return splitWallTime; - } - - @Managed - @Nested - public TimeStat getBlockedQuantaWallTime() - { - return blockedQuantaWallTime; - } - - @Managed - @Nested - public TimeStat getUnblockedQuantaWallTime() - { - return unblockedQuantaWallTime; - } - - @Managed - @Nested - public TimeDistribution getLeafSplitScheduledTime() - { - return leafSplitScheduledTime; - } - - @Managed - @Nested - public TimeDistribution getIntermediateSplitScheduledTime() - { - return intermediateSplitScheduledTime; - } - - @Managed - @Nested - public TimeDistribution getLeafSplitWallTime() - { - return leafSplitWallTime; - } - - @Managed - @Nested - public TimeDistribution getIntermediateSplitWallTime() - { - return intermediateSplitWallTime; - } - - @Managed - @Nested - public TimeDistribution getLeafSplitWaitTime() - { - return leafSplitWaitTime; - } - - @Managed - @Nested - public TimeDistribution getIntermediateSplitWaitTime() - { - return intermediateSplitWaitTime; - } - - @Managed - @Nested - public TimeDistribution getLeafSplitCpuTime() - { - return leafSplitCpuTime; - } - - @Managed - @Nested - public TimeDistribution getIntermediateSplitCpuTime() - { - return 
intermediateSplitCpuTime; - } - - @Managed - @Nested - public CounterStat getGlobalScheduledTimeMicros() - { - return globalScheduledTimeMicros; - } - - @Managed - @Nested - public CounterStat getGlobalCpuTimeMicros() - { - return globalCpuTimeMicros; - } - - private synchronized int getRunningTasksForLevel(int level) - { - int count = 0; - for (TaskHandle task : tasks) { - if (task.getPriority().getLevel() == level) { - count++; - } - } - return count; - } - - public String getMaxActiveSplitsInfo() - { - // Sample output: - // - // 2 splits have been continuously active for more than 600.00ms seconds - // - // "20180907_054754_00000_88xi4.1.0-2" tid=99 - // at java.util.Formatter$FormatSpecifier.(Formatter.java:2708) - // at java.util.Formatter.parse(Formatter.java:2560) - // at java.util.Formatter.format(Formatter.java:2501) - // at ... (more lines of stacktrace) - // - // "20180907_054754_00000_88xi4.1.0-3" tid=106 - // at java.util.Formatter$FormatSpecifier.(Formatter.java:2709) - // at java.util.Formatter.parse(Formatter.java:2560) - // at java.util.Formatter.format(Formatter.java:2501) - // at ... 
(more line of stacktrace) - StringBuilder stackTrace = new StringBuilder(); - int maxActiveSplitCount = 0; - String message = "%s splits have been continuously active for more than %s seconds\n"; - for (RunningSplitInfo splitInfo : runningSplitInfos) { - Duration duration = Duration.succinctNanos(ticker.read() - splitInfo.getStartTime()); - if (duration.compareTo(stuckSplitsWarningThreshold) >= 0) { - maxActiveSplitCount++; - stackTrace.append("\n"); - stackTrace.append(format("\"%s\" tid=%s", splitInfo.getThreadId(), splitInfo.getThread().getId())).append("\n"); - for (StackTraceElement traceElement : splitInfo.getThread().getStackTrace()) { - stackTrace.append("\tat ").append(traceElement).append("\n"); - } - } - } - - return format(message, maxActiveSplitCount, stuckSplitsWarningThreshold).concat(stackTrace.toString()); - } - - @Managed - public long getRunAwaySplitCount() - { - int count = 0; - for (RunningSplitInfo splitInfo : runningSplitInfos) { - Duration duration = Duration.succinctNanos(ticker.read() - splitInfo.getStartTime()); - if (duration.compareTo(stuckSplitsWarningThreshold) > 0) { - count++; - } - } - return count; - } - - public Set getStuckSplitTaskIds(Duration processingDurationThreshold, Predicate filter) - { - return runningSplitInfos.stream() - .filter((RunningSplitInfo splitInfo) -> { - Duration splitProcessingDuration = Duration.succinctNanos(ticker.read() - splitInfo.getStartTime()); - return splitProcessingDuration.compareTo(processingDurationThreshold) > 0; - }) - .filter(filter).map(RunningSplitInfo::getTaskId).collect(toImmutableSet()); - } - - /** - * A class representing a split that is running on the TaskRunner. - * It has a Thread object that gets assigned while assigning the split - * to the taskRunner. However, when the TaskRunner moves to a different split, - * the thread stored here will not remain assigned to this split anymore. 
- */ - public static class RunningSplitInfo - implements Comparable - { - private final long startTime; - private final String threadId; - private final Thread thread; - private boolean printed; - private final PrioritizedSplitRunner split; - - public RunningSplitInfo(long startTime, String threadId, Thread thread, PrioritizedSplitRunner split) - { - this.startTime = startTime; - this.threadId = requireNonNull(threadId, "threadId is null"); - this.thread = requireNonNull(thread, "thread is null"); - this.split = requireNonNull(split, "split is null"); - this.printed = false; - } - - public long getStartTime() - { - return startTime; - } - - public String getThreadId() - { - return threadId; - } - - public Thread getThread() - { - return thread; - } - - public TaskId getTaskId() - { - return split.getTaskHandle().getTaskId(); - } + OptionalInt maxDriversPerTask); - /** - * {@link PrioritizedSplitRunner#getInfo()} provides runtime statistics for the split (such as total cpu utilization so far). - * A value returned from this method changes over time and cannot be cached as a field of {@link RunningSplitInfo}. - * - * @return Formatted string containing runtime statistics for the split. 
- */ - public String getSplitInfo() - { - return split.getInfo(); - } + void removeTask(TaskHandle taskHandle); - public boolean isPrinted() - { - return printed; - } + List> enqueueSplits(TaskHandle taskHandle, boolean intermediate, List taskSplits); - public void setPrinted() - { - printed = true; - } + Set getStuckSplitTaskIds(Duration processingDurationThreshold, Predicate filter); - @Override - public int compareTo(RunningSplitInfo o) - { - return ComparisonChain.start() - .compare(startTime, o.getStartTime()) - .compare(threadId, o.getThreadId()) - .result(); - } - } + void start(); - @Managed(description = "Task processor executor") - @Nested - public ThreadPoolExecutorMBean getProcessorExecutor() - { - return executorMBean; - } + void stop(); } diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/TaskHandle.java b/core/trino-main/src/main/java/io/trino/execution/executor/TaskHandle.java index f3d99d0fb1b7..2a032768feea 100644 --- a/core/trino-main/src/main/java/io/trino/execution/executor/TaskHandle.java +++ b/core/trino-main/src/main/java/io/trino/execution/executor/TaskHandle.java @@ -13,188 +13,7 @@ */ package io.trino.execution.executor; -import com.google.common.collect.ImmutableList; -import com.google.errorprone.annotations.ThreadSafe; -import com.google.errorprone.annotations.concurrent.GuardedBy; -import io.airlift.units.Duration; -import io.trino.execution.SplitConcurrencyController; -import io.trino.execution.TaskId; - -import java.util.ArrayDeque; -import java.util.ArrayList; -import java.util.List; -import java.util.OptionalInt; -import java.util.Queue; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicReference; -import java.util.function.DoubleSupplier; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static java.util.Objects.requireNonNull; - -@ThreadSafe -public class TaskHandle +public interface TaskHandle { - private volatile boolean destroyed; - 
private final TaskId taskId; - private final DoubleSupplier utilizationSupplier; - - @GuardedBy("this") - protected final Queue queuedLeafSplits = new ArrayDeque<>(10); - @GuardedBy("this") - protected final List runningLeafSplits = new ArrayList<>(10); - @GuardedBy("this") - protected final List runningIntermediateSplits = new ArrayList<>(10); - @GuardedBy("this") - protected long scheduledNanos; - @GuardedBy("this") - protected final SplitConcurrencyController concurrencyController; - - private final AtomicInteger nextSplitId = new AtomicInteger(); - - private final AtomicReference priority = new AtomicReference<>(new Priority(0, 0)); - private final MultilevelSplitQueue splitQueue; - private final OptionalInt maxDriversPerTask; - - public TaskHandle( - TaskId taskId, - MultilevelSplitQueue splitQueue, - DoubleSupplier utilizationSupplier, - int initialSplitConcurrency, - Duration splitConcurrencyAdjustFrequency, - OptionalInt maxDriversPerTask) - { - this.taskId = requireNonNull(taskId, "taskId is null"); - this.splitQueue = requireNonNull(splitQueue, "splitQueue is null"); - this.utilizationSupplier = requireNonNull(utilizationSupplier, "utilizationSupplier is null"); - this.maxDriversPerTask = requireNonNull(maxDriversPerTask, "maxDriversPerTask is null"); - this.concurrencyController = new SplitConcurrencyController( - initialSplitConcurrency, - requireNonNull(splitConcurrencyAdjustFrequency, "splitConcurrencyAdjustFrequency is null")); - } - - public synchronized Priority addScheduledNanos(long durationNanos) - { - concurrencyController.update(durationNanos, utilizationSupplier.getAsDouble(), runningLeafSplits.size()); - scheduledNanos += durationNanos; - - Priority newPriority = splitQueue.updatePriority(priority.get(), durationNanos, scheduledNanos); - - priority.set(newPriority); - return newPriority; - } - - public synchronized Priority resetLevelPriority() - { - Priority currentPriority = priority.get(); - long levelMinPriority = 
splitQueue.getLevelMinPriority(currentPriority.getLevel(), scheduledNanos); - - if (currentPriority.getLevelPriority() < levelMinPriority) { - Priority newPriority = new Priority(currentPriority.getLevel(), levelMinPriority); - priority.set(newPriority); - return newPriority; - } - - return currentPriority; - } - - public boolean isDestroyed() - { - return destroyed; - } - - public Priority getPriority() - { - return priority.get(); - } - - public TaskId getTaskId() - { - return taskId; - } - - public OptionalInt getMaxDriversPerTask() - { - return maxDriversPerTask; - } - - // Returns any remaining splits. The caller must destroy these. - public synchronized List destroy() - { - destroyed = true; - - ImmutableList.Builder builder = ImmutableList.builderWithExpectedSize(runningIntermediateSplits.size() + runningLeafSplits.size() + queuedLeafSplits.size()); - builder.addAll(runningIntermediateSplits); - builder.addAll(runningLeafSplits); - builder.addAll(queuedLeafSplits); - runningIntermediateSplits.clear(); - runningLeafSplits.clear(); - queuedLeafSplits.clear(); - return builder.build(); - } - - public synchronized boolean enqueueSplit(PrioritizedSplitRunner split) - { - if (destroyed) { - return false; - } - queuedLeafSplits.add(split); - return true; - } - - public synchronized boolean recordIntermediateSplit(PrioritizedSplitRunner split) - { - if (destroyed) { - return false; - } - runningIntermediateSplits.add(split); - return true; - } - - synchronized int getRunningLeafSplits() - { - return runningLeafSplits.size(); - } - - public synchronized long getScheduledNanos() - { - return scheduledNanos; - } - - public synchronized PrioritizedSplitRunner pollNextSplit() - { - if (destroyed) { - return null; - } - - if (runningLeafSplits.size() >= concurrencyController.getTargetConcurrency()) { - return null; - } - - PrioritizedSplitRunner split = queuedLeafSplits.poll(); - if (split != null) { - runningLeafSplits.add(split); - } - return split; - } - - public 
synchronized void splitComplete(PrioritizedSplitRunner split) - { - concurrencyController.splitFinished(split.getScheduledNanos(), utilizationSupplier.getAsDouble(), runningLeafSplits.size()); - runningIntermediateSplits.remove(split); - runningLeafSplits.remove(split); - } - - public int getNextSplitId() - { - return nextSplitId.getAndIncrement(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("taskId", taskId) - .toString(); - } + boolean isDestroyed(); } diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/MultilevelSplitQueue.java b/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/MultilevelSplitQueue.java similarity index 99% rename from core/trino-main/src/main/java/io/trino/execution/executor/MultilevelSplitQueue.java rename to core/trino-main/src/main/java/io/trino/execution/executor/timesharing/MultilevelSplitQueue.java index 317e28bdbd09..70403cb6c7ca 100644 --- a/core/trino-main/src/main/java/io/trino/execution/executor/MultilevelSplitQueue.java +++ b/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/MultilevelSplitQueue.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.execution.executor; +package io.trino.execution.executor.timesharing; import com.google.common.annotations.VisibleForTesting; import com.google.errorprone.annotations.ThreadSafe; diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/PrioritizedSplitRunner.java b/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/PrioritizedSplitRunner.java similarity index 97% rename from core/trino-main/src/main/java/io/trino/execution/executor/PrioritizedSplitRunner.java rename to core/trino-main/src/main/java/io/trino/execution/executor/timesharing/PrioritizedSplitRunner.java index 1c3b5d15de9e..157c6cdd0cd9 100644 --- a/core/trino-main/src/main/java/io/trino/execution/executor/PrioritizedSplitRunner.java +++ b/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/PrioritizedSplitRunner.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.execution.executor; +package io.trino.execution.executor.timesharing; import com.google.common.base.Ticker; import com.google.common.util.concurrent.ListenableFuture; @@ -50,7 +50,7 @@ public final class PrioritizedSplitRunner private final long createdNanos = System.nanoTime(); - private final TaskHandle taskHandle; + private final TimeSharingTaskHandle taskHandle; private final int splitId; private final long workerId; private final SplitRunner split; @@ -81,7 +81,7 @@ public final class PrioritizedSplitRunner private final TimeStat unblockedQuantaWallTime; PrioritizedSplitRunner( - TaskHandle taskHandle, + TimeSharingTaskHandle taskHandle, int splitId, SplitRunner split, Span splitSpan, @@ -107,7 +107,7 @@ public final class PrioritizedSplitRunner updateLevelPriority(); } - public TaskHandle getTaskHandle() + public TimeSharingTaskHandle getTaskHandle() { return taskHandle; } diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/Priority.java 
b/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/Priority.java similarity index 97% rename from core/trino-main/src/main/java/io/trino/execution/executor/Priority.java rename to core/trino-main/src/main/java/io/trino/execution/executor/timesharing/Priority.java index 180f698742ae..a7c0ac0c9f82 100644 --- a/core/trino-main/src/main/java/io/trino/execution/executor/Priority.java +++ b/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/Priority.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.execution.executor; +package io.trino.execution.executor.timesharing; import com.google.errorprone.annotations.Immutable; diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/TimeSharingTaskExecutor.java b/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/TimeSharingTaskExecutor.java new file mode 100644 index 000000000000..f5c40ff449f1 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/TimeSharingTaskExecutor.java @@ -0,0 +1,953 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor.timesharing; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Ticker; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.errorprone.annotations.ThreadSafe; +import com.google.errorprone.annotations.concurrent.GuardedBy; +import com.google.inject.Inject; +import io.airlift.concurrent.SetThreadName; +import io.airlift.concurrent.ThreadPoolExecutorMBean; +import io.airlift.log.Logger; +import io.airlift.stats.CounterStat; +import io.airlift.stats.DistributionStat; +import io.airlift.stats.TimeDistribution; +import io.airlift.stats.TimeStat; +import io.airlift.units.Duration; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.Tracer; +import io.opentelemetry.context.Context; +import io.trino.execution.SplitRunner; +import io.trino.execution.TaskId; +import io.trino.execution.TaskManagerConfig; +import io.trino.execution.executor.RunningSplitInfo; +import io.trino.execution.executor.TaskExecutor; +import io.trino.execution.executor.TaskHandle; +import io.trino.spi.TrinoException; +import io.trino.spi.VersionEmbedder; +import io.trino.tracing.TrinoAttributes; +import jakarta.annotation.PostConstruct; +import jakarta.annotation.PreDestroy; +import org.weakref.jmx.Managed; +import org.weakref.jmx.Nested; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.OptionalInt; +import java.util.Set; +import java.util.SortedSet; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentSkipListSet; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; +import java.util.concurrent.RejectedExecutionException; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import 
java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicLongArray; +import java.util.function.DoubleSupplier; +import java.util.function.Predicate; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.google.common.collect.Sets.newConcurrentHashSet; +import static io.airlift.concurrent.Threads.daemonThreadsNamed; +import static io.airlift.concurrent.Threads.threadsNamed; +import static io.airlift.tracing.Tracing.noopTracer; +import static io.trino.execution.executor.timesharing.MultilevelSplitQueue.computeLevel; +import static io.trino.version.EmbedVersion.testingVersionEmbedder; +import static java.lang.Math.min; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.Executors.newCachedThreadPool; +import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor; +import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static java.util.concurrent.TimeUnit.NANOSECONDS; + +@ThreadSafe +public class TimeSharingTaskExecutor + implements TaskExecutor +{ + private static final Logger log = Logger.get(TimeSharingTaskExecutor.class); + private static final AtomicLong NEXT_RUNNER_ID = new AtomicLong(); + + private final ExecutorService executor; + private final ThreadPoolExecutorMBean executorMBean; + + private final int runnerThreads; + private final int minimumNumberOfDrivers; + private final int guaranteedNumberOfDriversPerTask; + private final int maximumNumberOfDriversPerTask; + private final VersionEmbedder versionEmbedder; + private final Tracer tracer; + + private final Ticker ticker; + + private final Duration stuckSplitsWarningThreshold; + private final ScheduledExecutorService splitMonitorExecutor = 
newSingleThreadScheduledExecutor(daemonThreadsNamed("TaskExecutor")); + private final SortedSet runningSplitInfos = new ConcurrentSkipListSet<>(); + + @GuardedBy("this") + private final List tasks; + + /** + * All splits registered with the task executor. + */ + @GuardedBy("this") + private final Set allSplits = new HashSet<>(); + + /** + * Intermediate splits (i.e. splits that should not be queued). + */ + @GuardedBy("this") + private final Set intermediateSplits = new HashSet<>(); + + /** + * Splits waiting for a runner thread. + */ + private final MultilevelSplitQueue waitingSplits; + + /** + * Splits running on a thread. + */ + private final Set runningSplits = newConcurrentHashSet(); + + /** + * Splits blocked by the driver. + */ + private final Map> blockedSplits = new ConcurrentHashMap<>(); + + private final AtomicLongArray completedTasksPerLevel = new AtomicLongArray(5); + private final AtomicLongArray completedSplitsPerLevel = new AtomicLongArray(5); + + private final TimeStat splitQueuedTime = new TimeStat(NANOSECONDS); + private final TimeStat splitWallTime = new TimeStat(NANOSECONDS); + + private final TimeDistribution leafSplitWallTime = new TimeDistribution(MICROSECONDS); + private final TimeDistribution intermediateSplitWallTime = new TimeDistribution(MICROSECONDS); + + private final TimeDistribution leafSplitScheduledTime = new TimeDistribution(MICROSECONDS); + private final TimeDistribution intermediateSplitScheduledTime = new TimeDistribution(MICROSECONDS); + + private final TimeDistribution leafSplitWaitTime = new TimeDistribution(MICROSECONDS); + private final TimeDistribution intermediateSplitWaitTime = new TimeDistribution(MICROSECONDS); + + private final TimeDistribution leafSplitCpuTime = new TimeDistribution(MICROSECONDS); + private final TimeDistribution intermediateSplitCpuTime = new TimeDistribution(MICROSECONDS); + + // shared between SplitRunners + private final CounterStat globalCpuTimeMicros = new CounterStat(); + private final 
CounterStat globalScheduledTimeMicros = new CounterStat(); + + private final TimeStat blockedQuantaWallTime = new TimeStat(MICROSECONDS); + private final TimeStat unblockedQuantaWallTime = new TimeStat(MICROSECONDS); + + private final DistributionStat leafSplitsSize = new DistributionStat(); + @GuardedBy("this") + private long lastLeafSplitsSizeRecordTime; + @GuardedBy("this") + private long lastLeafSplitsSize; + + private volatile boolean closed; + + @Inject + public TimeSharingTaskExecutor(TaskManagerConfig config, VersionEmbedder versionEmbedder, Tracer tracer, MultilevelSplitQueue splitQueue) + { + this( + config.getMaxWorkerThreads(), + config.getMinDrivers(), + config.getMinDriversPerTask(), + config.getMaxDriversPerTask(), + config.getInterruptStuckSplitTasksWarningThreshold(), + versionEmbedder, + tracer, + splitQueue, + Ticker.systemTicker()); + } + + @VisibleForTesting + public TimeSharingTaskExecutor(int runnerThreads, int minDrivers, int guaranteedNumberOfDriversPerTask, int maximumNumberOfDriversPerTask, Ticker ticker) + { + this(runnerThreads, minDrivers, guaranteedNumberOfDriversPerTask, maximumNumberOfDriversPerTask, new Duration(10, TimeUnit.MINUTES), testingVersionEmbedder(), noopTracer(), new MultilevelSplitQueue(2), ticker); + } + + @VisibleForTesting + public TimeSharingTaskExecutor(int runnerThreads, int minDrivers, int guaranteedNumberOfDriversPerTask, int maximumNumberOfDriversPerTask, MultilevelSplitQueue splitQueue, Ticker ticker) + { + this(runnerThreads, minDrivers, guaranteedNumberOfDriversPerTask, maximumNumberOfDriversPerTask, new Duration(10, TimeUnit.MINUTES), testingVersionEmbedder(), noopTracer(), splitQueue, ticker); + } + + @VisibleForTesting + public TimeSharingTaskExecutor( + int runnerThreads, + int minDrivers, + int guaranteedNumberOfDriversPerTask, + int maximumNumberOfDriversPerTask, + Duration stuckSplitsWarningThreshold, + VersionEmbedder versionEmbedder, + Tracer tracer, + MultilevelSplitQueue splitQueue, + Ticker 
ticker) + { + checkArgument(runnerThreads > 0, "runnerThreads must be at least 1"); + checkArgument(guaranteedNumberOfDriversPerTask > 0, "guaranteedNumberOfDriversPerTask must be at least 1"); + checkArgument(maximumNumberOfDriversPerTask > 0, "maximumNumberOfDriversPerTask must be at least 1"); + checkArgument(guaranteedNumberOfDriversPerTask <= maximumNumberOfDriversPerTask, "guaranteedNumberOfDriversPerTask cannot be greater than maximumNumberOfDriversPerTask"); + + // we manage thread pool size directly, so create an unlimited pool + this.executor = newCachedThreadPool(threadsNamed("task-processor-%s")); + this.executorMBean = new ThreadPoolExecutorMBean((ThreadPoolExecutor) executor); + this.runnerThreads = runnerThreads; + this.versionEmbedder = requireNonNull(versionEmbedder, "versionEmbedder is null"); + this.tracer = requireNonNull(tracer, "tracer is null"); + + this.ticker = requireNonNull(ticker, "ticker is null"); + this.stuckSplitsWarningThreshold = requireNonNull(stuckSplitsWarningThreshold, "stuckSplitsWarningThreshold is null"); + + this.minimumNumberOfDrivers = minDrivers; + this.guaranteedNumberOfDriversPerTask = guaranteedNumberOfDriversPerTask; + this.maximumNumberOfDriversPerTask = maximumNumberOfDriversPerTask; + this.waitingSplits = requireNonNull(splitQueue, "splitQueue is null"); + this.tasks = new LinkedList<>(); + this.lastLeafSplitsSizeRecordTime = ticker.read(); + } + + @PostConstruct + @Override + public synchronized void start() + { + checkState(!closed, "TaskExecutor is closed"); + for (int i = 0; i < runnerThreads; i++) { + addRunnerThread(); + } + } + + @PreDestroy + @Override + public synchronized void stop() + { + closed = true; + executor.shutdownNow(); + splitMonitorExecutor.shutdownNow(); + } + + @Override + public synchronized String toString() + { + return toStringHelper(this) + .add("runnerThreads", runnerThreads) + .add("allSplits", allSplits.size()) + .add("intermediateSplits", intermediateSplits.size()) + 
.add("waitingSplits", waitingSplits.size()) + .add("runningSplits", runningSplits.size()) + .add("blockedSplits", blockedSplits.size()) + .toString(); + } + + private synchronized void addRunnerThread() + { + try { + executor.execute(versionEmbedder.embedVersion(new TaskRunner())); + } + catch (RejectedExecutionException ignored) { + } + } + + @Override + public synchronized TimeSharingTaskHandle addTask( + TaskId taskId, + DoubleSupplier utilizationSupplier, + int initialSplitConcurrency, + Duration splitConcurrencyAdjustFrequency, + OptionalInt maxDriversPerTask) + { + requireNonNull(taskId, "taskId is null"); + requireNonNull(utilizationSupplier, "utilizationSupplier is null"); + checkArgument(maxDriversPerTask.isEmpty() || maxDriversPerTask.getAsInt() <= maximumNumberOfDriversPerTask, + "maxDriversPerTask cannot be greater than the configured value"); + + log.debug("Task scheduled %s", taskId); + + TimeSharingTaskHandle taskHandle = new TimeSharingTaskHandle(taskId, waitingSplits, utilizationSupplier, initialSplitConcurrency, splitConcurrencyAdjustFrequency, maxDriversPerTask); + + tasks.add(taskHandle); + return taskHandle; + } + + @Override + public void removeTask(TaskHandle taskHandle) + { + TimeSharingTaskHandle handle = (TimeSharingTaskHandle) taskHandle; + try (SetThreadName ignored = new SetThreadName("Task-%s", handle.getTaskId())) { + // Skip additional scheduling if the task was already destroyed + if (!doRemoveTask(handle)) { + return; + } + } + + // replace blocked splits that were terminated + synchronized (this) { + addNewEntrants(); + recordLeafSplitsSize(); + } + } + + /** + * Returns true if the task handle was destroyed and removed splits as a result that may need to be replaced. Otherwise, + * if the {@link TimeSharingTaskHandle} was already destroyed or no splits were removed then this method returns false and no additional + * splits need to be scheduled. 
+ */ + private boolean doRemoveTask(TimeSharingTaskHandle taskHandle) + { + List splits; + synchronized (this) { + tasks.remove(taskHandle); + + // Task is already destroyed + if (taskHandle.isDestroyed()) { + return false; + } + + splits = taskHandle.destroy(); + // stop tracking splits (especially blocked splits which may never unblock) + allSplits.removeAll(splits); + intermediateSplits.removeAll(splits); + blockedSplits.keySet().removeAll(splits); + waitingSplits.removeAll(splits); + recordLeafSplitsSize(); + } + + // call destroy outside of synchronized block as it is expensive and doesn't need a lock on the task executor + for (PrioritizedSplitRunner split : splits) { + split.destroy(); + } + + // record completed stats + long threadUsageNanos = taskHandle.getScheduledNanos(); + completedTasksPerLevel.incrementAndGet(computeLevel(threadUsageNanos)); + + log.debug("Task finished or failed %s", taskHandle.getTaskId()); + return !splits.isEmpty(); + } + + @Override + public List> enqueueSplits(TaskHandle taskHandle, boolean intermediate, List taskSplits) + { + TimeSharingTaskHandle handle = (TimeSharingTaskHandle) taskHandle; + List splitsToDestroy = new ArrayList<>(); + List> finishedFutures = new ArrayList<>(taskSplits.size()); + synchronized (this) { + for (SplitRunner taskSplit : taskSplits) { + TaskId taskId = handle.getTaskId(); + int splitId = handle.getNextSplitId(); + + Span splitSpan = tracer.spanBuilder(intermediate ? 
"split (intermediate)" : "split (leaf)") + .setParent(Context.current().with(taskSplit.getPipelineSpan())) + .setAttribute(TrinoAttributes.QUERY_ID, taskId.getQueryId().toString()) + .setAttribute(TrinoAttributes.STAGE_ID, taskId.getStageId().toString()) + .setAttribute(TrinoAttributes.TASK_ID, taskId.toString()) + .setAttribute(TrinoAttributes.PIPELINE_ID, taskId.getStageId() + "-" + taskSplit.getPipelineId()) + .setAttribute(TrinoAttributes.SPLIT_ID, taskId + "-" + splitId) + .startSpan(); + + PrioritizedSplitRunner prioritizedSplitRunner = new PrioritizedSplitRunner( + handle, + splitId, + taskSplit, + splitSpan, + tracer, + ticker, + globalCpuTimeMicros, + globalScheduledTimeMicros, + blockedQuantaWallTime, + unblockedQuantaWallTime); + + if (intermediate) { + // add the runner to the handle so it can be destroyed if the task is canceled + if (handle.recordIntermediateSplit(prioritizedSplitRunner)) { + // Note: we do not record queued time for intermediate splits + startIntermediateSplit(prioritizedSplitRunner); + } + else { + splitsToDestroy.add(prioritizedSplitRunner); + } + } + else { + // add this to the work queue for the task + if (handle.enqueueSplit(prioritizedSplitRunner)) { + // if task is under the limit for guaranteed splits, start one + scheduleTaskIfNecessary(handle); + // if globally we have more resources, start more + addNewEntrants(); + } + else { + splitsToDestroy.add(prioritizedSplitRunner); + } + } + + finishedFutures.add(prioritizedSplitRunner.getFinishedFuture()); + } + recordLeafSplitsSize(); + } + for (PrioritizedSplitRunner split : splitsToDestroy) { + split.destroy(); + } + return finishedFutures; + } + + private void splitFinished(PrioritizedSplitRunner split) + { + completedSplitsPerLevel.incrementAndGet(split.getPriority().getLevel()); + synchronized (this) { + allSplits.remove(split); + + long wallNanos = System.nanoTime() - split.getCreatedNanos(); + splitWallTime.add(Duration.succinctNanos(wallNanos)); + + if 
(intermediateSplits.remove(split)) { + intermediateSplitWallTime.add(wallNanos); + intermediateSplitScheduledTime.add(split.getScheduledNanos()); + intermediateSplitWaitTime.add(split.getWaitNanos()); + intermediateSplitCpuTime.add(split.getCpuTimeNanos()); + } + else { + leafSplitWallTime.add(wallNanos); + leafSplitScheduledTime.add(split.getScheduledNanos()); + leafSplitWaitTime.add(split.getWaitNanos()); + leafSplitCpuTime.add(split.getCpuTimeNanos()); + } + + TimeSharingTaskHandle taskHandle = split.getTaskHandle(); + taskHandle.splitComplete(split); + + scheduleTaskIfNecessary(taskHandle); + + addNewEntrants(); + recordLeafSplitsSize(); + } + // call destroy outside of synchronized block as it is expensive and doesn't need a lock on the task executor + split.destroy(); + } + + private synchronized void scheduleTaskIfNecessary(TimeSharingTaskHandle taskHandle) + { + // if task has less than the minimum guaranteed splits running, + // immediately schedule new splits for this task. This assures + // that a task gets its fair amount of consideration (you have to + // have splits to be considered for running on a thread). + int splitsToSchedule = min(guaranteedNumberOfDriversPerTask, taskHandle.getMaxDriversPerTask().orElse(Integer.MAX_VALUE)) - taskHandle.getRunningLeafSplits(); + for (int i = 0; i < splitsToSchedule; ++i) { + PrioritizedSplitRunner split = taskHandle.pollNextSplit(); + if (split == null) { + // no more splits to schedule + return; + } + + startSplit(split); + splitQueuedTime.add(Duration.nanosSince(split.getCreatedNanos())); + } + recordLeafSplitsSize(); + } + + private synchronized void addNewEntrants() + { + // Ignore intermediate splits when checking minimumNumberOfDrivers. + // Otherwise with (for example) minimumNumberOfDrivers = 100, 200 intermediate splits + // and 100 leaf splits, depending on order of appearing splits, number of + // simultaneously running splits may vary. 
If leaf splits start first, there will + // be 300 running splits. If intermediate splits start first, there will be only + // 200 running splits. + int running = allSplits.size() - intermediateSplits.size(); + for (int i = 0; i < minimumNumberOfDrivers - running; i++) { + PrioritizedSplitRunner split = pollNextSplitWorker(); + if (split == null) { + break; + } + + splitQueuedTime.add(Duration.nanosSince(split.getCreatedNanos())); + startSplit(split); + } + } + + private synchronized void startIntermediateSplit(PrioritizedSplitRunner split) + { + startSplit(split); + intermediateSplits.add(split); + } + + private synchronized void startSplit(PrioritizedSplitRunner split) + { + allSplits.add(split); + waitingSplits.offer(split); + } + + private synchronized PrioritizedSplitRunner pollNextSplitWorker() + { + // todo find a better algorithm for this + // find the first task that produces a split, then move that task to the + // end of the task list, so we get round robin + for (Iterator iterator = tasks.iterator(); iterator.hasNext(); ) { + TimeSharingTaskHandle task = iterator.next(); + // skip tasks that are already running the configured max number of drivers + if (task.getRunningLeafSplits() >= task.getMaxDriversPerTask().orElse(maximumNumberOfDriversPerTask)) { + continue; + } + PrioritizedSplitRunner split = task.pollNextSplit(); + if (split != null) { + // move task to end of list + iterator.remove(); + + // CAUTION: we are modifying the list in the loop which would normally + // cause a ConcurrentModificationException but we exit immediately + tasks.add(task); + return split; + } + } + return null; + } + + private synchronized void recordLeafSplitsSize() + { + long now = ticker.read(); + long timeDifference = now - this.lastLeafSplitsSizeRecordTime; + if (timeDifference > 0) { + this.leafSplitsSize.add(lastLeafSplitsSize, timeDifference); + this.lastLeafSplitsSizeRecordTime = now; + } + // always record new lastLeafSplitsSize as it might have changed + // even 
if timeDifference is 0 + this.lastLeafSplitsSize = allSplits.size() - intermediateSplits.size(); + } + + private class TaskRunner + implements Runnable + { + private final long runnerId = NEXT_RUNNER_ID.getAndIncrement(); + + @Override + public void run() + { + try (SetThreadName runnerName = new SetThreadName("SplitRunner-%s", runnerId)) { + while (!closed && !Thread.currentThread().isInterrupted()) { + // select next worker + PrioritizedSplitRunner split; + try { + split = waitingSplits.take(); + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + + String threadId = split.getTaskHandle().getTaskId() + "-" + split.getSplitId(); + try (SetThreadName splitName = new SetThreadName(threadId)) { + RunningSplitInfo splitInfo = new RunningSplitInfo(ticker.read(), threadId, Thread.currentThread(), split.getTaskHandle().getTaskId(), split::getInfo); + runningSplitInfos.add(splitInfo); + runningSplits.add(split); + + ListenableFuture blocked; + try { + blocked = split.process(); + } + finally { + runningSplitInfos.remove(splitInfo); + runningSplits.remove(split); + } + + if (split.isFinished()) { + if (log.isDebugEnabled()) { + log.debug("%s is finished", split.getInfo()); + } + splitFinished(split); + } + else { + if (blocked.isDone()) { + waitingSplits.offer(split); + } + else { + blockedSplits.put(split, blocked); + blocked.addListener(() -> { + blockedSplits.remove(split); + // reset the level priority to prevent previously-blocked splits from starving existing splits + split.resetLevelPriority(); + waitingSplits.offer(split); + }, executor); + } + } + } + catch (Throwable t) { + // ignore random errors due to driver thread interruption + if (!split.isDestroyed()) { + if (t instanceof TrinoException trinoException) { + log.error(t, "Error processing %s: %s: %s", split.getInfo(), trinoException.getErrorCode().getName(), trinoException.getMessage()); + } + else { + log.error(t, "Error processing %s", split.getInfo()); + } + } + 
splitFinished(split); + } + finally { + // Clear the interrupted flag on the current thread, driver cancellation may have triggered an interrupt + if (Thread.interrupted()) { + if (closed) { + // reset interrupted flag if closed before interrupt + Thread.currentThread().interrupt(); + } + } + } + } + } + finally { + // unless we have been closed, we need to replace this thread + if (!closed) { + addRunnerThread(); + } + } + } + } + + // + // STATS + // + + @Managed + public synchronized int getTasks() + { + return tasks.size(); + } + + @Managed + public int getRunnerThreads() + { + return runnerThreads; + } + + @Managed + public int getMinimumNumberOfDrivers() + { + return minimumNumberOfDrivers; + } + + @Managed + public synchronized int getTotalSplits() + { + return allSplits.size(); + } + + @Managed + public synchronized int getIntermediateSplits() + { + return intermediateSplits.size(); + } + + @Managed + public int getWaitingSplits() + { + return waitingSplits.size(); + } + + @Managed + @Nested + public DistributionStat getLeafSplitsSize() + { + return leafSplitsSize; + } + + @Managed + public int getRunningSplits() + { + return runningSplits.size(); + } + + @Managed + public int getBlockedSplits() + { + return blockedSplits.size(); + } + + @Managed + public long getCompletedTasksLevel0() + { + return completedTasksPerLevel.get(0); + } + + @Managed + public long getCompletedTasksLevel1() + { + return completedTasksPerLevel.get(1); + } + + @Managed + public long getCompletedTasksLevel2() + { + return completedTasksPerLevel.get(2); + } + + @Managed + public long getCompletedTasksLevel3() + { + return completedTasksPerLevel.get(3); + } + + @Managed + public long getCompletedTasksLevel4() + { + return completedTasksPerLevel.get(4); + } + + @Managed + public long getCompletedSplitsLevel0() + { + return completedSplitsPerLevel.get(0); + } + + @Managed + public long getCompletedSplitsLevel1() + { + return completedSplitsPerLevel.get(1); + } + + @Managed + public long 
getCompletedSplitsLevel2() + { + return completedSplitsPerLevel.get(2); + } + + @Managed + public long getCompletedSplitsLevel3() + { + return completedSplitsPerLevel.get(3); + } + + @Managed + public long getCompletedSplitsLevel4() + { + return completedSplitsPerLevel.get(4); + } + + @Managed + public long getRunningTasksLevel0() + { + return getRunningTasksForLevel(0); + } + + @Managed + public long getRunningTasksLevel1() + { + return getRunningTasksForLevel(1); + } + + @Managed + public long getRunningTasksLevel2() + { + return getRunningTasksForLevel(2); + } + + @Managed + public long getRunningTasksLevel3() + { + return getRunningTasksForLevel(3); + } + + @Managed + public long getRunningTasksLevel4() + { + return getRunningTasksForLevel(4); + } + + @Managed + @Nested + public TimeStat getSplitQueuedTime() + { + return splitQueuedTime; + } + + @Managed + @Nested + public TimeStat getSplitWallTime() + { + return splitWallTime; + } + + @Managed + @Nested + public TimeStat getBlockedQuantaWallTime() + { + return blockedQuantaWallTime; + } + + @Managed + @Nested + public TimeStat getUnblockedQuantaWallTime() + { + return unblockedQuantaWallTime; + } + + @Managed + @Nested + public TimeDistribution getLeafSplitScheduledTime() + { + return leafSplitScheduledTime; + } + + @Managed + @Nested + public TimeDistribution getIntermediateSplitScheduledTime() + { + return intermediateSplitScheduledTime; + } + + @Managed + @Nested + public TimeDistribution getLeafSplitWallTime() + { + return leafSplitWallTime; + } + + @Managed + @Nested + public TimeDistribution getIntermediateSplitWallTime() + { + return intermediateSplitWallTime; + } + + @Managed + @Nested + public TimeDistribution getLeafSplitWaitTime() + { + return leafSplitWaitTime; + } + + @Managed + @Nested + public TimeDistribution getIntermediateSplitWaitTime() + { + return intermediateSplitWaitTime; + } + + @Managed + @Nested + public TimeDistribution getLeafSplitCpuTime() + { + return leafSplitCpuTime; + } + + 
@Managed + @Nested + public TimeDistribution getIntermediateSplitCpuTime() + { + return intermediateSplitCpuTime; + } + + @Managed + @Nested + public CounterStat getGlobalScheduledTimeMicros() + { + return globalScheduledTimeMicros; + } + + @Managed + @Nested + public CounterStat getGlobalCpuTimeMicros() + { + return globalCpuTimeMicros; + } + + private synchronized int getRunningTasksForLevel(int level) + { + int count = 0; + for (TimeSharingTaskHandle task : tasks) { + if (task.getPriority().getLevel() == level) { + count++; + } + } + return count; + } + + public String getMaxActiveSplitsInfo() + { + // Sample output: + // + // 2 splits have been continuously active for more than 600.00ms seconds + // + // "20180907_054754_00000_88xi4.1.0-2" tid=99 + // at java.util.Formatter$FormatSpecifier.(Formatter.java:2708) + // at java.util.Formatter.parse(Formatter.java:2560) + // at java.util.Formatter.format(Formatter.java:2501) + // at ... (more lines of stacktrace) + // + // "20180907_054754_00000_88xi4.1.0-3" tid=106 + // at java.util.Formatter$FormatSpecifier.(Formatter.java:2709) + // at java.util.Formatter.parse(Formatter.java:2560) + // at java.util.Formatter.format(Formatter.java:2501) + // at ... 
(more line of stacktrace) + StringBuilder stackTrace = new StringBuilder(); + int maxActiveSplitCount = 0; + String message = "%s splits have been continuously active for more than %s seconds\n"; + for (RunningSplitInfo splitInfo : runningSplitInfos) { + Duration duration = Duration.succinctNanos(ticker.read() - splitInfo.getStartTime()); + if (duration.compareTo(stuckSplitsWarningThreshold) >= 0) { + maxActiveSplitCount++; + stackTrace.append("\n"); + stackTrace.append(format("\"%s\" tid=%s", splitInfo.getThreadId(), splitInfo.getThread().getId())).append("\n"); + for (StackTraceElement traceElement : splitInfo.getThread().getStackTrace()) { + stackTrace.append("\tat ").append(traceElement).append("\n"); + } + } + } + + return format(message, maxActiveSplitCount, stuckSplitsWarningThreshold).concat(stackTrace.toString()); + } + + @Managed + public long getRunAwaySplitCount() + { + int count = 0; + for (RunningSplitInfo splitInfo : runningSplitInfos) { + Duration duration = Duration.succinctNanos(ticker.read() - splitInfo.getStartTime()); + if (duration.compareTo(stuckSplitsWarningThreshold) > 0) { + count++; + } + } + return count; + } + + @Override + public Set getStuckSplitTaskIds(Duration processingDurationThreshold, Predicate filter) + { + return runningSplitInfos.stream() + .filter((RunningSplitInfo splitInfo) -> { + Duration splitProcessingDuration = Duration.succinctNanos(ticker.read() - splitInfo.getStartTime()); + return splitProcessingDuration.compareTo(processingDurationThreshold) > 0; + }) + .filter(filter).map(RunningSplitInfo::getTaskId).collect(toImmutableSet()); + } + + @Managed(description = "Task processor executor") + @Nested + public ThreadPoolExecutorMBean getProcessorExecutor() + { + return executorMBean; + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/TimeSharingTaskHandle.java b/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/TimeSharingTaskHandle.java new file mode 100644 
index 000000000000..6c235ba90b48 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/TimeSharingTaskHandle.java @@ -0,0 +1,203 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.execution.executor.timesharing; + +import com.google.common.collect.ImmutableList; +import com.google.errorprone.annotations.ThreadSafe; +import com.google.errorprone.annotations.concurrent.GuardedBy; +import io.airlift.units.Duration; +import io.trino.execution.SplitConcurrencyController; +import io.trino.execution.TaskId; +import io.trino.execution.executor.TaskHandle; + +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.List; +import java.util.OptionalInt; +import java.util.Queue; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.DoubleSupplier; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@ThreadSafe +public class TimeSharingTaskHandle + implements TaskHandle +{ + private volatile boolean destroyed; + private final TaskId taskId; + private final DoubleSupplier utilizationSupplier; + + @GuardedBy("this") + protected final Queue queuedLeafSplits = new ArrayDeque<>(10); + @GuardedBy("this") + protected final List runningLeafSplits = new ArrayList<>(10); + @GuardedBy("this") + protected final List runningIntermediateSplits = new 
ArrayList<>(10); + @GuardedBy("this") + protected long scheduledNanos; + @GuardedBy("this") + protected final SplitConcurrencyController concurrencyController; + + private final AtomicInteger nextSplitId = new AtomicInteger(); + + private final AtomicReference priority = new AtomicReference<>(new Priority(0, 0)); + private final MultilevelSplitQueue splitQueue; + private final OptionalInt maxDriversPerTask; + + public TimeSharingTaskHandle( + TaskId taskId, + MultilevelSplitQueue splitQueue, + DoubleSupplier utilizationSupplier, + int initialSplitConcurrency, + Duration splitConcurrencyAdjustFrequency, + OptionalInt maxDriversPerTask) + { + this.taskId = requireNonNull(taskId, "taskId is null"); + this.splitQueue = requireNonNull(splitQueue, "splitQueue is null"); + this.utilizationSupplier = requireNonNull(utilizationSupplier, "utilizationSupplier is null"); + this.maxDriversPerTask = requireNonNull(maxDriversPerTask, "maxDriversPerTask is null"); + this.concurrencyController = new SplitConcurrencyController( + initialSplitConcurrency, + requireNonNull(splitConcurrencyAdjustFrequency, "splitConcurrencyAdjustFrequency is null")); + } + + public synchronized Priority addScheduledNanos(long durationNanos) + { + concurrencyController.update(durationNanos, utilizationSupplier.getAsDouble(), runningLeafSplits.size()); + scheduledNanos += durationNanos; + + Priority newPriority = splitQueue.updatePriority(priority.get(), durationNanos, scheduledNanos); + + priority.set(newPriority); + return newPriority; + } + + public synchronized Priority resetLevelPriority() + { + Priority currentPriority = priority.get(); + long levelMinPriority = splitQueue.getLevelMinPriority(currentPriority.getLevel(), scheduledNanos); + + if (currentPriority.getLevelPriority() < levelMinPriority) { + Priority newPriority = new Priority(currentPriority.getLevel(), levelMinPriority); + priority.set(newPriority); + return newPriority; + } + + return currentPriority; + } + + @Override + public 
boolean isDestroyed() + { + return destroyed; + } + + public Priority getPriority() + { + return priority.get(); + } + + public TaskId getTaskId() + { + return taskId; + } + + public OptionalInt getMaxDriversPerTask() + { + return maxDriversPerTask; + } + + // Returns any remaining splits. The caller must destroy these. + public synchronized List destroy() + { + destroyed = true; + + ImmutableList.Builder builder = ImmutableList.builderWithExpectedSize(runningIntermediateSplits.size() + runningLeafSplits.size() + queuedLeafSplits.size()); + builder.addAll(runningIntermediateSplits); + builder.addAll(runningLeafSplits); + builder.addAll(queuedLeafSplits); + runningIntermediateSplits.clear(); + runningLeafSplits.clear(); + queuedLeafSplits.clear(); + return builder.build(); + } + + public synchronized boolean enqueueSplit(PrioritizedSplitRunner split) + { + if (destroyed) { + return false; + } + queuedLeafSplits.add(split); + return true; + } + + public synchronized boolean recordIntermediateSplit(PrioritizedSplitRunner split) + { + if (destroyed) { + return false; + } + runningIntermediateSplits.add(split); + return true; + } + + synchronized int getRunningLeafSplits() + { + return runningLeafSplits.size(); + } + + public synchronized long getScheduledNanos() + { + return scheduledNanos; + } + + public synchronized PrioritizedSplitRunner pollNextSplit() + { + if (destroyed) { + return null; + } + + if (runningLeafSplits.size() >= concurrencyController.getTargetConcurrency()) { + return null; + } + + PrioritizedSplitRunner split = queuedLeafSplits.poll(); + if (split != null) { + runningLeafSplits.add(split); + } + return split; + } + + public synchronized void splitComplete(PrioritizedSplitRunner split) + { + concurrencyController.splitFinished(split.getScheduledNanos(), utilizationSupplier.getAsDouble(), runningLeafSplits.size()); + runningIntermediateSplits.remove(split); + runningLeafSplits.remove(split); + } + + public int getNextSplitId() + { + return 
nextSplitId.getAndIncrement(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("taskId", taskId) + .toString(); + } +} diff --git a/core/trino-main/src/main/java/io/trino/server/ServerMainModule.java b/core/trino-main/src/main/java/io/trino/server/ServerMainModule.java index 8f4502fbe51a..dd9b66321c1b 100644 --- a/core/trino-main/src/main/java/io/trino/server/ServerMainModule.java +++ b/core/trino-main/src/main/java/io/trino/server/ServerMainModule.java @@ -49,8 +49,9 @@ import io.trino.execution.TableExecuteContextManager; import io.trino.execution.TaskManagementExecutor; import io.trino.execution.TaskManagerConfig; -import io.trino.execution.executor.MultilevelSplitQueue; import io.trino.execution.executor.TaskExecutor; +import io.trino.execution.executor.timesharing.MultilevelSplitQueue; +import io.trino.execution.executor.timesharing.TimeSharingTaskExecutor; import io.trino.execution.scheduler.NodeScheduler; import io.trino.execution.scheduler.NodeSchedulerConfig; import io.trino.execution.scheduler.TopologyAwareNodeSelectorModule; @@ -305,7 +306,12 @@ protected void setup(Binder binder) binder.bind(LocalMemoryManagerExporter.class).in(Scopes.SINGLETON); newOptionalBinder(binder, VersionEmbedder.class).setDefault().to(EmbedVersion.class).in(Scopes.SINGLETON); newExporter(binder).export(SqlTaskManager.class).withGeneratedName(); - binder.bind(TaskExecutor.class).in(Scopes.SINGLETON); + + binder.bind(TaskExecutor.class) + .to(TimeSharingTaskExecutor.class) + .in(Scopes.SINGLETON); + binder.bind(TimeSharingTaskExecutor.class).in(Scopes.SINGLETON); + newExporter(binder).export(TaskExecutor.class).withGeneratedName(); binder.bind(MultilevelSplitQueue.class).in(Scopes.SINGLETON); newExporter(binder).export(MultilevelSplitQueue.class).withGeneratedName(); diff --git a/core/trino-main/src/main/java/io/trino/server/TaskExecutorResource.java b/core/trino-main/src/main/java/io/trino/server/TaskExecutorResource.java index 
1248cb1515cf..0f8d19d52fba 100644 --- a/core/trino-main/src/main/java/io/trino/server/TaskExecutorResource.java +++ b/core/trino-main/src/main/java/io/trino/server/TaskExecutorResource.java @@ -14,7 +14,7 @@ package io.trino.server; import com.google.inject.Inject; -import io.trino.execution.executor.TaskExecutor; +import io.trino.execution.executor.timesharing.TimeSharingTaskExecutor; import io.trino.server.security.ResourceSecurity; import jakarta.ws.rs.GET; import jakarta.ws.rs.Path; @@ -27,11 +27,11 @@ @Path("/v1/maxActiveSplits") public class TaskExecutorResource { - private final TaskExecutor taskExecutor; + private final TimeSharingTaskExecutor taskExecutor; @Inject public TaskExecutorResource( - TaskExecutor taskExecutor) + TimeSharingTaskExecutor taskExecutor) { this.taskExecutor = requireNonNull(taskExecutor, "taskExecutor is null"); } diff --git a/core/trino-main/src/main/java/io/trino/sql/gen/ExpressionProfiler.java b/core/trino-main/src/main/java/io/trino/sql/gen/ExpressionProfiler.java index 7839d49b2d30..49f635be2ff4 100644 --- a/core/trino-main/src/main/java/io/trino/sql/gen/ExpressionProfiler.java +++ b/core/trino-main/src/main/java/io/trino/sql/gen/ExpressionProfiler.java @@ -19,7 +19,7 @@ import static com.google.common.base.Ticker.systemTicker; import static com.google.common.base.Verify.verify; -import static io.trino.execution.executor.PrioritizedSplitRunner.SPLIT_RUN_QUANTA; +import static io.trino.execution.executor.timesharing.PrioritizedSplitRunner.SPLIT_RUN_QUANTA; import static java.util.Objects.requireNonNull; import static java.util.concurrent.TimeUnit.NANOSECONDS; diff --git a/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java b/core/trino-main/src/test/java/io/trino/execution/BaseTestSqlTaskManager.java similarity index 96% rename from core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java rename to core/trino-main/src/test/java/io/trino/execution/BaseTestSqlTaskManager.java index 
0ce54e2fdec9..58e468ea2a1f 100644 --- a/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManager.java +++ b/core/trino-main/src/test/java/io/trino/execution/BaseTestSqlTaskManager.java @@ -13,7 +13,6 @@ */ package io.trino.execution; -import com.google.common.base.Ticker; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; @@ -61,6 +60,7 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; import static com.google.common.util.concurrent.MoreExecutors.directExecutor; import static io.airlift.tracing.Tracing.noopTracer; @@ -84,18 +84,20 @@ import static org.testng.Assert.assertTrue; @TestInstance(PER_CLASS) -public class TestSqlTaskManager +public abstract class BaseTestSqlTaskManager { - private static final TaskId TASK_ID = new TaskId(new StageId("query", 0), 1, 0); public static final OutputBufferId OUT = new OutputBufferId(0); + private final AtomicInteger sequence = new AtomicInteger(); private TaskExecutor taskExecutor; private TaskManagementExecutor taskManagementExecutor; + protected abstract TaskExecutor createTaskExecutor(); + @BeforeAll public void setUp() { - taskExecutor = new TaskExecutor(8, 16, 3, 4, Ticker.systemTicker()); + taskExecutor = createTaskExecutor(); taskExecutor.start(); taskManagementExecutor = new TaskManagementExecutor(); } @@ -113,7 +115,7 @@ public void tearDown() public void testEmptyQuery() { try (SqlTaskManager sqlTaskManager = createSqlTaskManager(new TaskManagerConfig())) { - TaskId taskId = TASK_ID; + TaskId taskId = newTaskId(); TaskInfo taskInfo = createTask(sqlTaskManager, taskId, PipelinedOutputBuffers.createInitial(PARTITIONED).withNoMoreBufferIds()); assertEquals(taskInfo.getTaskStatus().getState(), TaskState.RUNNING); @@ -134,7 +136,7 @@ public void testSimpleQuery() throws Exception { try 
(SqlTaskManager sqlTaskManager = createSqlTaskManager(new TaskManagerConfig())) { - TaskId taskId = TASK_ID; + TaskId taskId = newTaskId(); createTask(sqlTaskManager, taskId, ImmutableSet.of(SPLIT), PipelinedOutputBuffers.createInitial(PARTITIONED).withBuffer(OUT, 0).withNoMoreBufferIds()); TaskInfo taskInfo = sqlTaskManager.getTaskInfo(taskId, TaskStatus.STARTING_VERSION).get(); @@ -167,7 +169,7 @@ public void testCancel() throws InterruptedException, ExecutionException, TimeoutException { try (SqlTaskManager sqlTaskManager = createSqlTaskManager(new TaskManagerConfig())) { - TaskId taskId = TASK_ID; + TaskId taskId = newTaskId(); TaskInfo taskInfo = createTask(sqlTaskManager, taskId, PipelinedOutputBuffers.createInitial(PARTITIONED).withBuffer(OUT, 0).withNoMoreBufferIds()); assertEquals(taskInfo.getTaskStatus().getState(), TaskState.RUNNING); assertNull(taskInfo.getStats().getEndTime()); @@ -191,7 +193,7 @@ public void testAbort() throws InterruptedException, ExecutionException, TimeoutException { try (SqlTaskManager sqlTaskManager = createSqlTaskManager(new TaskManagerConfig())) { - TaskId taskId = TASK_ID; + TaskId taskId = newTaskId(); TaskInfo taskInfo = createTask(sqlTaskManager, taskId, PipelinedOutputBuffers.createInitial(PARTITIONED).withBuffer(OUT, 0).withNoMoreBufferIds()); assertEquals(taskInfo.getTaskStatus().getState(), TaskState.RUNNING); assertNull(taskInfo.getStats().getEndTime()); @@ -216,7 +218,7 @@ public void testAbortResults() throws Exception { try (SqlTaskManager sqlTaskManager = createSqlTaskManager(new TaskManagerConfig())) { - TaskId taskId = TASK_ID; + TaskId taskId = newTaskId(); createTask(sqlTaskManager, taskId, ImmutableSet.of(SPLIT), PipelinedOutputBuffers.createInitial(PARTITIONED).withBuffer(OUT, 0).withNoMoreBufferIds()); TaskInfo taskInfo = sqlTaskManager.getTaskInfo(taskId, TaskStatus.STARTING_VERSION).get(); @@ -237,7 +239,7 @@ public void testRemoveOldTasks() throws InterruptedException, ExecutionException, TimeoutException 
{ try (SqlTaskManager sqlTaskManager = createSqlTaskManager(new TaskManagerConfig().setInfoMaxAge(new Duration(5, TimeUnit.MILLISECONDS)))) { - TaskId taskId = TASK_ID; + TaskId taskId = newTaskId(); TaskInfo taskInfo = createTask(sqlTaskManager, taskId, PipelinedOutputBuffers.createInitial(PARTITIONED).withBuffer(OUT, 0).withNoMoreBufferIds()); assertEquals(taskInfo.getTaskStatus().getState(), TaskState.RUNNING); @@ -437,4 +439,9 @@ public ConnectorServices getConnectorServices(CatalogHandle catalogHandle) throw new UnsupportedOperationException(); } } + + private TaskId newTaskId() + { + return new TaskId(new StageId("query" + sequence.incrementAndGet(), 0), 1, 0); + } } diff --git a/core/trino-main/src/test/java/io/trino/execution/TaskTestUtils.java b/core/trino-main/src/test/java/io/trino/execution/TaskTestUtils.java index c330906ccfcd..da666dac24e3 100644 --- a/core/trino-main/src/test/java/io/trino/execution/TaskTestUtils.java +++ b/core/trino-main/src/test/java/io/trino/execution/TaskTestUtils.java @@ -24,7 +24,7 @@ import io.trino.eventlistener.EventListenerConfig; import io.trino.eventlistener.EventListenerManager; import io.trino.exchange.ExchangeManagerRegistry; -import io.trino.execution.TestSqlTaskManager.MockDirectExchangeClientSupplier; +import io.trino.execution.BaseTestSqlTaskManager.MockDirectExchangeClientSupplier; import io.trino.execution.buffer.OutputBuffers; import io.trino.execution.scheduler.NodeScheduler; import io.trino.execution.scheduler.NodeSchedulerConfig; diff --git a/core/trino-main/src/test/java/io/trino/execution/TestMemoryRevokingScheduler.java b/core/trino-main/src/test/java/io/trino/execution/TestMemoryRevokingScheduler.java index b57d796277dc..fc5a1434b130 100644 --- a/core/trino-main/src/test/java/io/trino/execution/TestMemoryRevokingScheduler.java +++ b/core/trino-main/src/test/java/io/trino/execution/TestMemoryRevokingScheduler.java @@ -24,6 +24,7 @@ import io.trino.exchange.ExchangeManagerRegistry; import 
io.trino.execution.buffer.PipelinedOutputBuffers; import io.trino.execution.executor.TaskExecutor; +import io.trino.execution.executor.timesharing.TimeSharingTaskExecutor; import io.trino.memory.MemoryPool; import io.trino.memory.QueryContext; import io.trino.memory.context.LocalMemoryContext; @@ -85,7 +86,7 @@ public void setUp() { memoryPool = new MemoryPool(DataSize.ofBytes(10)); - taskExecutor = new TaskExecutor(8, 16, 3, 4, Ticker.systemTicker()); + taskExecutor = new TimeSharingTaskExecutor(8, 16, 3, 4, Ticker.systemTicker()); taskExecutor.start(); // Must be single threaded diff --git a/core/trino-main/src/test/java/io/trino/execution/TestSqlTask.java b/core/trino-main/src/test/java/io/trino/execution/TestSqlTask.java index 7ce9630da8c1..f47f8c6f4410 100644 --- a/core/trino-main/src/test/java/io/trino/execution/TestSqlTask.java +++ b/core/trino-main/src/test/java/io/trino/execution/TestSqlTask.java @@ -31,6 +31,7 @@ import io.trino.execution.buffer.PipelinedOutputBuffers; import io.trino.execution.buffer.PipelinedOutputBuffers.OutputBufferId; import io.trino.execution.executor.TaskExecutor; +import io.trino.execution.executor.timesharing.TimeSharingTaskExecutor; import io.trino.memory.MemoryPool; import io.trino.memory.QueryContext; import io.trino.operator.TaskContext; @@ -96,7 +97,7 @@ public class TestSqlTask @BeforeClass public void setUp() { - taskExecutor = new TaskExecutor(8, 16, 3, 4, Ticker.systemTicker()); + taskExecutor = new TimeSharingTaskExecutor(8, 16, 3, 4, Ticker.systemTicker()); taskExecutor.start(); taskNotificationExecutor = newScheduledThreadPool(10, threadsNamed("task-notification-%s")); diff --git a/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskExecution.java b/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskExecution.java index aedcb033b15b..6ec7a383fcd2 100644 --- a/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskExecution.java +++ 
b/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskExecution.java @@ -34,6 +34,7 @@ import io.trino.execution.buffer.PipelinedOutputBuffers; import io.trino.execution.buffer.PipelinedOutputBuffers.OutputBufferId; import io.trino.execution.executor.TaskExecutor; +import io.trino.execution.executor.timesharing.TimeSharingTaskExecutor; import io.trino.memory.MemoryPool; import io.trino.memory.QueryContext; import io.trino.memory.context.SimpleLocalMemoryContext; @@ -101,7 +102,7 @@ public void testSimple() { ScheduledExecutorService taskNotificationExecutor = newScheduledThreadPool(10, threadsNamed("task-notification-%s")); ScheduledExecutorService driverYieldExecutor = newScheduledThreadPool(2, threadsNamed("driver-yield-%s")); - TaskExecutor taskExecutor = new TaskExecutor(5, 10, 3, 4, Ticker.systemTicker()); + TaskExecutor taskExecutor = new TimeSharingTaskExecutor(5, 10, 3, 4, Ticker.systemTicker()); taskExecutor.start(); try { diff --git a/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManagerTimeSharing.java b/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManagerTimeSharing.java new file mode 100644 index 000000000000..aab4c2fdd704 --- /dev/null +++ b/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManagerTimeSharing.java @@ -0,0 +1,28 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution; + +import com.google.common.base.Ticker; +import io.trino.execution.executor.TaskExecutor; +import io.trino.execution.executor.timesharing.TimeSharingTaskExecutor; + +public class TestSqlTaskManagerTimeSharing + extends BaseTestSqlTaskManager +{ + @Override + protected TaskExecutor createTaskExecutor() + { + return new TimeSharingTaskExecutor(8, 16, 3, 4, Ticker.systemTicker()); + } +} diff --git a/core/trino-main/src/test/java/io/trino/execution/TestTaskExecutorStuckSplits.java b/core/trino-main/src/test/java/io/trino/execution/TestTaskExecutorStuckSplits.java index ee044ceeea59..09fc44900410 100644 --- a/core/trino-main/src/test/java/io/trino/execution/TestTaskExecutorStuckSplits.java +++ b/core/trino-main/src/test/java/io/trino/execution/TestTaskExecutorStuckSplits.java @@ -29,6 +29,7 @@ import io.trino.exchange.ExchangeManagerRegistry; import io.trino.execution.executor.TaskExecutor; import io.trino.execution.executor.TaskHandle; +import io.trino.execution.executor.timesharing.TimeSharingTaskExecutor; import io.trino.memory.LocalMemoryManager; import io.trino.memory.NodeMemoryConfig; import io.trino.spi.connector.CatalogHandle; @@ -63,7 +64,7 @@ public void testFailStuckSplitTasks() TaskId taskId = new TaskId(new StageId("query", 0), 1, 0); - TaskExecutor taskExecutor = new TaskExecutor(4, 8, 3, 4, ticker); + TaskExecutor taskExecutor = new TimeSharingTaskExecutor(4, 8, 3, 4, ticker); TaskHandle taskHandle = taskExecutor.addTask( taskId, () -> 1.0, @@ -121,7 +122,7 @@ private SqlTaskManager createSqlTaskManager( new EmbedVersion("testversion"), new NoConnectorServicesProvider(), createTestingPlanner(), - new TestSqlTaskManager.MockLocationFactory(), + new BaseTestSqlTaskManager.MockLocationFactory(), taskExecutor, createTestSplitMonitor(), new NodeInfo("test"), diff --git a/core/trino-main/src/test/java/io/trino/execution/executor/Histogram.java 
b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/Histogram.java similarity index 99% rename from core/trino-main/src/test/java/io/trino/execution/executor/Histogram.java rename to core/trino-main/src/test/java/io/trino/execution/executor/timesharing/Histogram.java index 343f02c30607..8262c14d95aa 100644 --- a/core/trino-main/src/test/java/io/trino/execution/executor/Histogram.java +++ b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/Histogram.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.execution.executor; +package io.trino.execution.executor.timesharing; import com.google.common.collect.ImmutableList; diff --git a/core/trino-main/src/test/java/io/trino/execution/executor/SimulationController.java b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SimulationController.java similarity index 93% rename from core/trino-main/src/test/java/io/trino/execution/executor/SimulationController.java rename to core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SimulationController.java index d8771580b4da..990a351170e9 100644 --- a/core/trino-main/src/test/java/io/trino/execution/executor/SimulationController.java +++ b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SimulationController.java @@ -11,16 +11,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.execution.executor; +package io.trino.execution.executor.timesharing; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ListMultimap; import com.google.common.collect.Multimaps; import io.trino.execution.StageId; import io.trino.execution.TaskId; -import io.trino.execution.executor.SimulationTask.IntermediateTask; -import io.trino.execution.executor.SimulationTask.LeafTask; -import io.trino.execution.executor.SplitGenerators.SplitGenerator; +import io.trino.execution.executor.timesharing.SimulationTask.IntermediateTask; +import io.trino.execution.executor.timesharing.SimulationTask.LeafTask; +import io.trino.execution.executor.timesharing.SplitGenerators.SplitGenerator; import java.util.Map; import java.util.OptionalInt; @@ -29,7 +29,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.BiConsumer; -import static io.trino.execution.executor.SimulationController.TaskSpecification.Type.LEAF; +import static io.trino.execution.executor.timesharing.SimulationController.TaskSpecification.Type.LEAF; import static java.util.concurrent.Executors.newSingleThreadExecutor; import static java.util.concurrent.TimeUnit.MILLISECONDS; @@ -37,8 +37,8 @@ class SimulationController { private static final int DEFAULT_MIN_SPLITS_PER_TASK = 3; - private final TaskExecutor taskExecutor; - private final BiConsumer callback; + private final TimeSharingTaskExecutor taskExecutor; + private final BiConsumer callback; private final ExecutorService controllerExecutor = newSingleThreadExecutor(); @@ -50,7 +50,7 @@ class SimulationController private final AtomicBoolean stopped = new AtomicBoolean(); - public SimulationController(TaskExecutor taskExecutor, BiConsumer callback) + public SimulationController(TimeSharingTaskExecutor taskExecutor, BiConsumer callback) { this.taskExecutor = taskExecutor; this.callback = callback; diff --git a/core/trino-main/src/test/java/io/trino/execution/executor/SimulationSplit.java 
b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SimulationSplit.java similarity index 99% rename from core/trino-main/src/test/java/io/trino/execution/executor/SimulationSplit.java rename to core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SimulationSplit.java index 54ac4c2cd2b1..9cf452655c07 100644 --- a/core/trino-main/src/test/java/io/trino/execution/executor/SimulationSplit.java +++ b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SimulationSplit.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.execution.executor; +package io.trino.execution.executor.timesharing; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.SettableFuture; diff --git a/core/trino-main/src/test/java/io/trino/execution/executor/SimulationTask.java b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SimulationTask.java similarity index 86% rename from core/trino-main/src/test/java/io/trino/execution/executor/SimulationTask.java rename to core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SimulationTask.java index ed2ae7c1790c..e42314723244 100644 --- a/core/trino-main/src/test/java/io/trino/execution/executor/SimulationTask.java +++ b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SimulationTask.java @@ -11,13 +11,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.execution.executor; +package io.trino.execution.executor.timesharing; import com.google.common.collect.ImmutableList; import com.google.common.collect.Sets; import io.airlift.units.Duration; import io.trino.execution.TaskId; -import io.trino.execution.executor.SimulationController.TaskSpecification; +import io.trino.execution.executor.TaskHandle; +import io.trino.execution.executor.timesharing.SimulationController.TaskSpecification; import java.util.OptionalInt; import java.util.Set; @@ -36,7 +37,7 @@ abstract class SimulationTask private final TaskHandle taskHandle; private final AtomicBoolean killed = new AtomicBoolean(); - public SimulationTask(TaskExecutor taskExecutor, TaskSpecification specification, TaskId taskId) + public SimulationTask(TimeSharingTaskExecutor taskExecutor, TaskSpecification specification, TaskId taskId) { this.specification = specification; this.taskId = taskId; @@ -123,21 +124,21 @@ public long getScheduledTimeNanos() return runningWallTime + completedWallTime; } - public abstract void schedule(TaskExecutor taskExecutor, int numSplits); + public abstract void schedule(TimeSharingTaskExecutor taskExecutor, int numSplits); public static class LeafTask extends SimulationTask { private final TaskSpecification taskSpecification; - public LeafTask(TaskExecutor taskExecutor, TaskSpecification specification, TaskId taskId) + public LeafTask(TimeSharingTaskExecutor taskExecutor, TaskSpecification specification, TaskId taskId) { super(taskExecutor, specification, taskId); this.taskSpecification = specification; } @Override - public void schedule(TaskExecutor taskExecutor, int numSplits) + public void schedule(TimeSharingTaskExecutor taskExecutor, int numSplits) { ImmutableList.Builder splits = ImmutableList.builder(); for (int i = 0; i < numSplits; i++) { @@ -153,14 +154,14 @@ public static class IntermediateTask { private final SplitSpecification splitSpecification; - public IntermediateTask(TaskExecutor taskExecutor, 
TaskSpecification specification, TaskId taskId) + public IntermediateTask(TimeSharingTaskExecutor taskExecutor, TaskSpecification specification, TaskId taskId) { super(taskExecutor, specification, taskId); this.splitSpecification = specification.nextSpecification(); } @Override - public void schedule(TaskExecutor taskExecutor, int numSplits) + public void schedule(TimeSharingTaskExecutor taskExecutor, int numSplits) { ImmutableList.Builder splits = ImmutableList.builderWithExpectedSize(numSplits); for (int i = 0; i < numSplits; i++) { diff --git a/core/trino-main/src/test/java/io/trino/execution/executor/SplitGenerators.java b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SplitGenerators.java similarity index 97% rename from core/trino-main/src/test/java/io/trino/execution/executor/SplitGenerators.java rename to core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SplitGenerators.java index 9a4aebdd0895..ebcd33f40450 100644 --- a/core/trino-main/src/test/java/io/trino/execution/executor/SplitGenerators.java +++ b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SplitGenerators.java @@ -11,19 +11,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.execution.executor; +package io.trino.execution.executor.timesharing; import com.google.common.collect.ImmutableList; import io.airlift.units.Duration; -import io.trino.execution.executor.SplitSpecification.IntermediateSplitSpecification; -import io.trino.execution.executor.SplitSpecification.LeafSplitSpecification; +import io.trino.execution.executor.timesharing.SplitSpecification.IntermediateSplitSpecification; +import io.trino.execution.executor.timesharing.SplitSpecification.LeafSplitSpecification; import java.util.ArrayList; import java.util.List; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ThreadLocalRandom; -import static io.trino.execution.executor.Histogram.fromContinuous; +import static io.trino.execution.executor.timesharing.Histogram.fromContinuous; import static java.util.concurrent.TimeUnit.DAYS; import static java.util.concurrent.TimeUnit.MICROSECONDS; import static java.util.concurrent.TimeUnit.MILLISECONDS; diff --git a/core/trino-main/src/test/java/io/trino/execution/executor/SplitSpecification.java b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SplitSpecification.java similarity index 93% rename from core/trino-main/src/test/java/io/trino/execution/executor/SplitSpecification.java rename to core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SplitSpecification.java index f998ea8f7a7c..f88b63a893f2 100644 --- a/core/trino-main/src/test/java/io/trino/execution/executor/SplitSpecification.java +++ b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/SplitSpecification.java @@ -11,10 +11,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.execution.executor; +package io.trino.execution.executor.timesharing; -import io.trino.execution.executor.SimulationSplit.IntermediateSplit; -import io.trino.execution.executor.SimulationSplit.LeafSplit; +import io.trino.execution.executor.timesharing.SimulationSplit.IntermediateSplit; +import io.trino.execution.executor.timesharing.SimulationSplit.LeafSplit; import java.util.concurrent.ScheduledExecutorService; diff --git a/core/trino-main/src/test/java/io/trino/execution/executor/TestTaskExecutor.java b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/TestTimeSharingTaskExecutor.java similarity index 92% rename from core/trino-main/src/test/java/io/trino/execution/executor/TestTaskExecutor.java rename to core/trino-main/src/test/java/io/trino/execution/executor/timesharing/TestTimeSharingTaskExecutor.java index 484b58231c44..1625000b2bd0 100644 --- a/core/trino-main/src/test/java/io/trino/execution/executor/TestTaskExecutor.java +++ b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/TestTimeSharingTaskExecutor.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.trino.execution.executor; +package io.trino.execution.executor.timesharing; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; @@ -23,6 +23,8 @@ import io.trino.execution.SplitRunner; import io.trino.execution.StageId; import io.trino.execution.TaskId; +import io.trino.execution.executor.TaskExecutor; +import io.trino.execution.executor.TaskHandle; import io.trino.spi.QueryId; import org.testng.annotations.Test; @@ -39,8 +41,8 @@ import static com.google.common.util.concurrent.Futures.immediateVoidFuture; import static io.airlift.testing.Assertions.assertGreaterThan; import static io.airlift.testing.Assertions.assertLessThan; -import static io.trino.execution.executor.MultilevelSplitQueue.LEVEL_CONTRIBUTION_CAP; -import static io.trino.execution.executor.MultilevelSplitQueue.LEVEL_THRESHOLD_SECONDS; +import static io.trino.execution.executor.timesharing.MultilevelSplitQueue.LEVEL_CONTRIBUTION_CAP; +import static io.trino.execution.executor.timesharing.MultilevelSplitQueue.LEVEL_THRESHOLD_SECONDS; import static java.lang.Double.isNaN; import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.MINUTES; @@ -49,7 +51,7 @@ import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; -public class TestTaskExecutor +public class TestTimeSharingTaskExecutor { @Test(invocationCount = 100) public void testTasksComplete() @@ -58,7 +60,7 @@ public void testTasksComplete() TestingTicker ticker = new TestingTicker(); Duration splitProcessingDurationThreshold = new Duration(10, MINUTES); - TaskExecutor taskExecutor = new TaskExecutor(4, 8, 3, 4, ticker); + TimeSharingTaskExecutor taskExecutor = new TimeSharingTaskExecutor(4, 8, 3, 4, ticker); taskExecutor.start(); try { @@ -156,7 +158,7 @@ public void testTasksComplete() public void testQuantaFairness() { TestingTicker ticker = new TestingTicker(); - TaskExecutor taskExecutor = new TaskExecutor(1, 2, 
3, 4, ticker); + TaskExecutor taskExecutor = new TimeSharingTaskExecutor(1, 2, 3, 4, ticker); taskExecutor.start(); try { @@ -190,12 +192,12 @@ public void testQuantaFairness() public void testLevelMovement() { TestingTicker ticker = new TestingTicker(); - TaskExecutor taskExecutor = new TaskExecutor(2, 2, 3, 4, ticker); + TimeSharingTaskExecutor taskExecutor = new TimeSharingTaskExecutor(2, 2, 3, 4, ticker); taskExecutor.start(); try { ticker.increment(20, MILLISECONDS); - TaskHandle testTaskHandle = taskExecutor.addTask(new TaskId(new StageId("test", 0), 0, 0), () -> 0, 10, new Duration(1, MILLISECONDS), OptionalInt.empty()); + TimeSharingTaskHandle testTaskHandle = taskExecutor.addTask(new TaskId(new StageId("test", 0), 0, 0), () -> 0, 10, new Duration(1, MILLISECONDS), OptionalInt.empty()); Phaser globalPhaser = new Phaser(); globalPhaser.bulkRegister(3); // 2 taskExecutor threads + test thread @@ -229,7 +231,7 @@ public void testLevelMultipliers() throws Exception { TestingTicker ticker = new TestingTicker(); - TaskExecutor taskExecutor = new TaskExecutor(6, 3, 3, 4, new MultilevelSplitQueue(2), ticker); + TimeSharingTaskExecutor taskExecutor = new TimeSharingTaskExecutor(6, 3, 3, 4, new MultilevelSplitQueue(2), ticker); taskExecutor.start(); try { @@ -307,12 +309,12 @@ public void testLevelMultipliers() public void testTaskHandle() { TestingTicker ticker = new TestingTicker(); - TaskExecutor taskExecutor = new TaskExecutor(4, 8, 3, 4, ticker); + TimeSharingTaskExecutor taskExecutor = new TimeSharingTaskExecutor(4, 8, 3, 4, ticker); taskExecutor.start(); try { TaskId taskId = new TaskId(new StageId("test", 0), 0, 0); - TaskHandle taskHandle = taskExecutor.addTask(taskId, () -> 0, 10, new Duration(1, MILLISECONDS), OptionalInt.empty()); + TimeSharingTaskHandle taskHandle = taskExecutor.addTask(taskId, () -> 0, 10, new Duration(1, MILLISECONDS), OptionalInt.empty()); Phaser beginPhase = new Phaser(); beginPhase.register(); @@ -343,8 +345,8 @@ public void 
testTaskHandle() public void testLevelContributionCap() { MultilevelSplitQueue splitQueue = new MultilevelSplitQueue(2); - TaskHandle handle0 = new TaskHandle(new TaskId(new StageId("test0", 0), 0, 0), splitQueue, () -> 1, 1, new Duration(1, SECONDS), OptionalInt.empty()); - TaskHandle handle1 = new TaskHandle(new TaskId(new StageId("test1", 0), 0, 0), splitQueue, () -> 1, 1, new Duration(1, SECONDS), OptionalInt.empty()); + TimeSharingTaskHandle handle0 = new TimeSharingTaskHandle(new TaskId(new StageId("test0", 0), 0, 0), splitQueue, () -> 1, 1, new Duration(1, SECONDS), OptionalInt.empty()); + TimeSharingTaskHandle handle1 = new TimeSharingTaskHandle(new TaskId(new StageId("test1", 0), 0, 0), splitQueue, () -> 1, 1, new Duration(1, SECONDS), OptionalInt.empty()); for (int i = 0; i < (LEVEL_THRESHOLD_SECONDS.length - 1); i++) { long levelAdvanceTime = SECONDS.toNanos(LEVEL_THRESHOLD_SECONDS[i + 1] - LEVEL_THRESHOLD_SECONDS[i]); @@ -363,7 +365,7 @@ public void testLevelContributionCap() public void testUpdateLevelWithCap() { MultilevelSplitQueue splitQueue = new MultilevelSplitQueue(2); - TaskHandle handle0 = new TaskHandle(new TaskId(new StageId("test0", 0), 0, 0), splitQueue, () -> 1, 1, new Duration(1, SECONDS), OptionalInt.empty()); + TimeSharingTaskHandle handle0 = new TimeSharingTaskHandle(new TaskId(new StageId("test0", 0), 0, 0), splitQueue, () -> 1, 1, new Duration(1, SECONDS), OptionalInt.empty()); long quantaNanos = MINUTES.toNanos(10); handle0.addScheduledNanos(quantaNanos); @@ -382,7 +384,7 @@ public void testMinMaxDriversPerTask() int maxDriversPerTask = 2; MultilevelSplitQueue splitQueue = new MultilevelSplitQueue(2); TestingTicker ticker = new TestingTicker(); - TaskExecutor taskExecutor = new TaskExecutor(4, 16, 1, maxDriversPerTask, splitQueue, ticker); + TimeSharingTaskExecutor taskExecutor = new TimeSharingTaskExecutor(4, 16, 1, maxDriversPerTask, splitQueue, ticker); taskExecutor.start(); try { @@ -423,7 +425,7 @@ public void 
testUserSpecifiedMaxDriversPerTask() MultilevelSplitQueue splitQueue = new MultilevelSplitQueue(2); TestingTicker ticker = new TestingTicker(); // create a task executor with min/max drivers per task to be 2 and 4 - TaskExecutor taskExecutor = new TaskExecutor(4, 16, 2, 4, splitQueue, ticker); + TimeSharingTaskExecutor taskExecutor = new TimeSharingTaskExecutor(4, 16, 2, 4, splitQueue, ticker); taskExecutor.start(); try { @@ -463,7 +465,7 @@ public void testMinDriversPerTaskWhenTargetConcurrencyIncreases() MultilevelSplitQueue splitQueue = new MultilevelSplitQueue(2); TestingTicker ticker = new TestingTicker(); // create a task executor with min/max drivers per task to be 2 - TaskExecutor taskExecutor = new TaskExecutor(4, 1, 2, 2, splitQueue, ticker); + TimeSharingTaskExecutor taskExecutor = new TimeSharingTaskExecutor(4, 1, 2, 2, splitQueue, ticker); taskExecutor.start(); try { @@ -508,7 +510,7 @@ public void testLeafSplitsSize() { MultilevelSplitQueue splitQueue = new MultilevelSplitQueue(2); TestingTicker ticker = new TestingTicker(); - TaskExecutor taskExecutor = new TaskExecutor(4, 1, 2, 2, splitQueue, ticker); + TimeSharingTaskExecutor taskExecutor = new TimeSharingTaskExecutor(4, 1, 2, 2, splitQueue, ticker); TaskHandle testTaskHandle = taskExecutor.addTask(new TaskId(new StageId("test", 0), 0, 0), () -> 0, 10, new Duration(1, MILLISECONDS), OptionalInt.empty()); TestingJob driver1 = new TestingJob(ticker, new Phaser(), new Phaser(), new Phaser(), 1, 500); diff --git a/core/trino-main/src/test/java/io/trino/execution/executor/TaskExecutorSimulator.java b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/TimeSharingTaskExecutorSimulation.java similarity index 90% rename from core/trino-main/src/test/java/io/trino/execution/executor/TaskExecutorSimulator.java rename to core/trino-main/src/test/java/io/trino/execution/executor/timesharing/TimeSharingTaskExecutorSimulation.java index 115fa3588dad..91942d567e62 100644 --- 
a/core/trino-main/src/test/java/io/trino/execution/executor/TaskExecutorSimulator.java +++ b/core/trino-main/src/test/java/io/trino/execution/executor/timesharing/TimeSharingTaskExecutorSimulation.java @@ -11,7 +11,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.execution.executor; +package io.trino.execution.executor.timesharing; import com.google.common.base.Ticker; import com.google.common.collect.ImmutableList; @@ -19,14 +19,14 @@ import com.google.common.collect.ListMultimap; import com.google.common.util.concurrent.ListeningExecutorService; import io.airlift.units.Duration; -import io.trino.execution.executor.SimulationController.TaskSpecification; -import io.trino.execution.executor.SplitGenerators.AggregatedLeafSplitGenerator; -import io.trino.execution.executor.SplitGenerators.FastLeafSplitGenerator; -import io.trino.execution.executor.SplitGenerators.IntermediateSplitGenerator; -import io.trino.execution.executor.SplitGenerators.L4LeafSplitGenerator; -import io.trino.execution.executor.SplitGenerators.QuantaExceedingSplitGenerator; -import io.trino.execution.executor.SplitGenerators.SimpleLeafSplitGenerator; -import io.trino.execution.executor.SplitGenerators.SlowLeafSplitGenerator; +import io.trino.execution.executor.timesharing.SimulationController.TaskSpecification; +import io.trino.execution.executor.timesharing.SplitGenerators.AggregatedLeafSplitGenerator; +import io.trino.execution.executor.timesharing.SplitGenerators.FastLeafSplitGenerator; +import io.trino.execution.executor.timesharing.SplitGenerators.IntermediateSplitGenerator; +import io.trino.execution.executor.timesharing.SplitGenerators.L4LeafSplitGenerator; +import io.trino.execution.executor.timesharing.SplitGenerators.QuantaExceedingSplitGenerator; +import io.trino.execution.executor.timesharing.SplitGenerators.SimpleLeafSplitGenerator; +import 
io.trino.execution.executor.timesharing.SplitGenerators.SlowLeafSplitGenerator; import org.joda.time.DateTime; import java.io.Closeable; @@ -42,10 +42,10 @@ import static io.airlift.concurrent.Threads.threadsNamed; import static io.airlift.units.Duration.nanosSince; import static io.airlift.units.Duration.succinctNanos; -import static io.trino.execution.executor.Histogram.fromContinuous; -import static io.trino.execution.executor.Histogram.fromDiscrete; -import static io.trino.execution.executor.SimulationController.TaskSpecification.Type.INTERMEDIATE; -import static io.trino.execution.executor.SimulationController.TaskSpecification.Type.LEAF; +import static io.trino.execution.executor.timesharing.Histogram.fromContinuous; +import static io.trino.execution.executor.timesharing.Histogram.fromDiscrete; +import static io.trino.execution.executor.timesharing.SimulationController.TaskSpecification.Type.INTERMEDIATE; +import static io.trino.execution.executor.timesharing.SimulationController.TaskSpecification.Type.LEAF; import static java.lang.String.format; import static java.util.concurrent.Executors.newCachedThreadPool; import static java.util.concurrent.Executors.newScheduledThreadPool; @@ -57,13 +57,13 @@ import static java.util.concurrent.TimeUnit.SECONDS; import static java.util.function.Function.identity; -public class TaskExecutorSimulator +public class TimeSharingTaskExecutorSimulation implements Closeable { public static void main(String[] args) throws Exception { - try (TaskExecutorSimulator simulator = new TaskExecutorSimulator()) { + try (TimeSharingTaskExecutorSimulation simulator = new TimeSharingTaskExecutorSimulation()) { simulator.run(); } } @@ -73,13 +73,13 @@ public static void main(String[] args) private final ScheduledExecutorService runningSplitsPrintExecutor = newSingleThreadScheduledExecutor(); private final ScheduledExecutorService wakeupExecutor = newScheduledThreadPool(32); - private final TaskExecutor taskExecutor; + private final 
TimeSharingTaskExecutor taskExecutor; private final MultilevelSplitQueue splitQueue; - private TaskExecutorSimulator() + private TimeSharingTaskExecutorSimulation() { splitQueue = new MultilevelSplitQueue(2); - taskExecutor = new TaskExecutor(36, 72, 3, 8, splitQueue, Ticker.systemTicker()); + taskExecutor = new TimeSharingTaskExecutor(36, 72, 3, 8, splitQueue, Ticker.systemTicker()); taskExecutor.start(); } @@ -99,7 +99,7 @@ public void run() long start = System.nanoTime(); scheduleStatusPrinter(start); - SimulationController controller = new SimulationController(taskExecutor, TaskExecutorSimulator::printSummaryStats); + SimulationController controller = new SimulationController(taskExecutor, TimeSharingTaskExecutorSimulation::printSummaryStats); // Uncomment one of these: // runExperimentOverloadedCluster(controller); @@ -306,7 +306,7 @@ private void scheduleStatusPrinter(long start) }, 1, 1, SECONDS); } - private static void printSummaryStats(SimulationController controller, TaskExecutor taskExecutor) + private static void printSummaryStats(SimulationController controller, TimeSharingTaskExecutor taskExecutor) { Map specEnabled = controller.getSpecificationEnabled(); @@ -350,7 +350,7 @@ private static void printSummaryStats(SimulationController controller, TaskExecu SimulationTask::getScheduledTimeNanos, SimulationTask::getProcessedTimeNanos, Duration::succinctNanos, - TaskExecutorSimulator::formatNanos); + TimeSharingTaskExecutorSimulation::formatNanos); System.out.println(); System.out.println("Levels - Running Task Processed Time"); @@ -359,7 +359,7 @@ private static void printSummaryStats(SimulationController controller, TaskExecu SimulationTask::getScheduledTimeNanos, SimulationTask::getProcessedTimeNanos, Duration::succinctNanos, - TaskExecutorSimulator::formatNanos); + TimeSharingTaskExecutorSimulation::formatNanos); System.out.println(); System.out.println("Levels - All Task Wait Time"); @@ -368,7 +368,7 @@ private static void 
printSummaryStats(SimulationController controller, TaskExecu SimulationTask::getScheduledTimeNanos, SimulationTask::getTotalWaitTimeNanos, Duration::succinctNanos, - TaskExecutorSimulator::formatNanos); + TimeSharingTaskExecutorSimulation::formatNanos); System.out.println(); System.out.println("Specification - Processed time"); @@ -378,7 +378,7 @@ private static void printSummaryStats(SimulationController controller, TaskExecu t -> t.getSpecification().getName(), SimulationTask::getProcessedTimeNanos, identity(), - TaskExecutorSimulator::formatNanos); + TimeSharingTaskExecutorSimulation::formatNanos); System.out.println(); System.out.println("Specification - Wait time"); @@ -387,7 +387,7 @@ private static void printSummaryStats(SimulationController controller, TaskExecu t -> t.getSpecification().getName(), SimulationTask::getTotalWaitTimeNanos, identity(), - TaskExecutorSimulator::formatNanos); + TimeSharingTaskExecutorSimulation::formatNanos); System.out.println(); System.out.println("Breakdown by specification"); @@ -415,7 +415,7 @@ private static void printSummaryStats(SimulationController controller, TaskExecu SimulationTask::getScheduledTimeNanos, SimulationTask::getProcessedTimeNanos, Duration::succinctNanos, - TaskExecutorSimulator::formatNanos); + TimeSharingTaskExecutorSimulation::formatNanos); System.out.println(); System.out.println("All Tasks by Scheduled time - Wait Time"); @@ -424,7 +424,7 @@ private static void printSummaryStats(SimulationController controller, TaskExecu SimulationTask::getScheduledTimeNanos, SimulationTask::getTotalWaitTimeNanos, Duration::succinctNanos, - TaskExecutorSimulator::formatNanos); + TimeSharingTaskExecutorSimulation::formatNanos); System.out.println(); System.out.println("Complete Tasks by Scheduled time - Wait Time"); @@ -433,7 +433,7 @@ private static void printSummaryStats(SimulationController controller, TaskExecu SimulationTask::getScheduledTimeNanos, SimulationTask::getTotalWaitTimeNanos, Duration::succinctNanos, - 
TaskExecutorSimulator::formatNanos); + TimeSharingTaskExecutorSimulation::formatNanos); } } diff --git a/core/trino-main/src/test/java/io/trino/operator/project/TestPageProcessor.java b/core/trino-main/src/test/java/io/trino/operator/project/TestPageProcessor.java index 512f7df33832..e8cae580be26 100644 --- a/core/trino-main/src/test/java/io/trino/operator/project/TestPageProcessor.java +++ b/core/trino-main/src/test/java/io/trino/operator/project/TestPageProcessor.java @@ -51,7 +51,7 @@ import static io.trino.block.BlockAssertions.createLongSequenceBlock; import static io.trino.block.BlockAssertions.createSlicesBlock; import static io.trino.block.BlockAssertions.createStringsBlock; -import static io.trino.execution.executor.PrioritizedSplitRunner.SPLIT_RUN_QUANTA; +import static io.trino.execution.executor.timesharing.PrioritizedSplitRunner.SPLIT_RUN_QUANTA; import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; import static io.trino.operator.PageAssertions.assertPageEquals; import static io.trino.operator.project.PageProcessor.MAX_BATCH_SIZE; diff --git a/core/trino-main/src/test/java/io/trino/server/remotetask/TestHttpRemoteTask.java b/core/trino-main/src/test/java/io/trino/server/remotetask/TestHttpRemoteTask.java index f15051a0b84d..d5082fa78a39 100644 --- a/core/trino-main/src/test/java/io/trino/server/remotetask/TestHttpRemoteTask.java +++ b/core/trino-main/src/test/java/io/trino/server/remotetask/TestHttpRemoteTask.java @@ -35,6 +35,7 @@ import io.trino.Session; import io.trino.block.BlockJsonSerde; import io.trino.client.NodeVersion; +import io.trino.execution.BaseTestSqlTaskManager; import io.trino.execution.DynamicFilterConfig; import io.trino.execution.DynamicFiltersCollector.VersionedDynamicFilterDomains; import io.trino.execution.NodeTaskMap; @@ -49,7 +50,6 @@ import io.trino.execution.TaskState; import io.trino.execution.TaskStatus; import io.trino.execution.TaskTestUtils; -import 
io.trino.execution.TestSqlTaskManager; import io.trino.execution.buffer.PipelinedOutputBuffers; import io.trino.metadata.BlockEncodingManager; import io.trino.metadata.HandleJsonModule; @@ -583,7 +583,7 @@ private HttpRemoteTaskFactory createHttpRemoteTaskFactory( new QueryManagerConfig(), TASK_MANAGER_CONFIG, testingHttpClient, - new TestSqlTaskManager.MockLocationFactory(), + new BaseTestSqlTaskManager.MockLocationFactory(), taskStatusCodec, dynamicFilterDomainsCodec, taskInfoCodec, diff --git a/testing/trino-benchmark/src/main/java/io/trino/benchmark/AbstractOperatorBenchmark.java b/testing/trino-benchmark/src/main/java/io/trino/benchmark/AbstractOperatorBenchmark.java index 186d3b0edb20..558d217c0d14 100644 --- a/testing/trino-benchmark/src/main/java/io/trino/benchmark/AbstractOperatorBenchmark.java +++ b/testing/trino-benchmark/src/main/java/io/trino/benchmark/AbstractOperatorBenchmark.java @@ -81,7 +81,7 @@ import static io.airlift.units.DataSize.Unit.MEGABYTE; import static io.trino.SystemSessionProperties.getFilterAndProjectMinOutputPageRowCount; import static io.trino.SystemSessionProperties.getFilterAndProjectMinOutputPageSize; -import static io.trino.execution.executor.PrioritizedSplitRunner.SPLIT_RUN_QUANTA; +import static io.trino.execution.executor.timesharing.PrioritizedSplitRunner.SPLIT_RUN_QUANTA; import static io.trino.spi.connector.Constraint.alwaysTrue; import static io.trino.spi.type.BigintType.BIGINT; import static io.trino.sql.analyzer.TypeSignatureProvider.fromTypes; diff --git a/testing/trino-benchmark/src/main/java/io/trino/benchmark/HashJoinBenchmark.java b/testing/trino-benchmark/src/main/java/io/trino/benchmark/HashJoinBenchmark.java index 87216d0f284d..900e362b6201 100644 --- a/testing/trino-benchmark/src/main/java/io/trino/benchmark/HashJoinBenchmark.java +++ b/testing/trino-benchmark/src/main/java/io/trino/benchmark/HashJoinBenchmark.java @@ -41,7 +41,7 @@ import static com.google.common.collect.ImmutableList.toImmutableList; import 
static io.airlift.concurrent.MoreFutures.getFutureValue; import static io.trino.benchmark.BenchmarkQueryRunner.createLocalQueryRunner; -import static io.trino.execution.executor.PrioritizedSplitRunner.SPLIT_RUN_QUANTA; +import static io.trino.execution.executor.timesharing.PrioritizedSplitRunner.SPLIT_RUN_QUANTA; import static io.trino.operator.HashArraySizeSupplier.incrementalLoadFactorHashArraySizeSupplier; import static io.trino.operator.JoinOperatorType.innerJoin; import static io.trino.operator.OperatorFactories.spillingJoin; From b0bb00ff322a7df2c4f5658d52b4673314a3e06b Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Thu, 25 May 2023 15:05:23 -0700 Subject: [PATCH 5/7] Remove unused field --- .../executor/timesharing/TimeSharingTaskExecutor.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/TimeSharingTaskExecutor.java b/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/TimeSharingTaskExecutor.java index f5c40ff449f1..d9b623481bfb 100644 --- a/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/TimeSharingTaskExecutor.java +++ b/core/trino-main/src/main/java/io/trino/execution/executor/timesharing/TimeSharingTaskExecutor.java @@ -58,7 +58,6 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.RejectedExecutionException; -import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; @@ -71,7 +70,6 @@ import static com.google.common.base.Preconditions.checkState; import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Sets.newConcurrentHashSet; -import static io.airlift.concurrent.Threads.daemonThreadsNamed; import static io.airlift.concurrent.Threads.threadsNamed; import static 
io.airlift.tracing.Tracing.noopTracer; import static io.trino.execution.executor.timesharing.MultilevelSplitQueue.computeLevel; @@ -80,7 +78,6 @@ import static java.lang.String.format; import static java.util.Objects.requireNonNull; import static java.util.concurrent.Executors.newCachedThreadPool; -import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor; import static java.util.concurrent.TimeUnit.MICROSECONDS; import static java.util.concurrent.TimeUnit.NANOSECONDS; @@ -104,7 +101,6 @@ public class TimeSharingTaskExecutor private final Ticker ticker; private final Duration stuckSplitsWarningThreshold; - private final ScheduledExecutorService splitMonitorExecutor = newSingleThreadScheduledExecutor(daemonThreadsNamed("TaskExecutor")); private final SortedSet runningSplitInfos = new ConcurrentSkipListSet<>(); @GuardedBy("this") @@ -248,7 +244,6 @@ public synchronized void stop() { closed = true; executor.shutdownNow(); - splitMonitorExecutor.shutdownNow(); } @Override From 7d0a51aedf04e1b988accd0fa8a3c32e2057ab62 Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Thu, 13 Jul 2023 12:02:44 -0700 Subject: [PATCH 6/7] Add missing driver failure signal If there's an exception while the driver is being created, the error is not being propagated to the state machine. This can cause tasks to not be destroyed and blocked drivers to not be canceled. 
--- .../src/main/java/io/trino/operator/DriverFactory.java | 1 + 1 file changed, 1 insertion(+) diff --git a/core/trino-main/src/main/java/io/trino/operator/DriverFactory.java b/core/trino-main/src/main/java/io/trino/operator/DriverFactory.java index b2a4a27e88b7..a37b29e417cd 100644 --- a/core/trino-main/src/main/java/io/trino/operator/DriverFactory.java +++ b/core/trino-main/src/main/java/io/trino/operator/DriverFactory.java @@ -130,6 +130,7 @@ public Driver createDriver(DriverContext driverContext) } } } + driverContext.failed(failure); throw failure; } } From 82e2a6bb8d031586f2e05a5f3c181109d9934689 Mon Sep 17 00:00:00 2001 From: Martin Traverso Date: Mon, 14 Aug 2023 10:25:38 -0700 Subject: [PATCH 7/7] Add thread-per-driver executor * Uses a fair queue based on Completely Fair Scheduler * Runs each driver in a separate thread --- .../io/trino/execution/TaskManagerConfig.java | 13 + .../executor/dedicated/SplitProcessor.java | 136 +++++++ .../ThreadPerDriverTaskExecutor.java | 206 ++++++++++ .../scheduler/BlockingSchedulingQueue.java | 143 +++++++ .../executor/scheduler/FairScheduler.java | 306 +++++++++++++++ .../execution/executor/scheduler/Gate.java | 70 ++++ .../execution/executor/scheduler/Group.java | 28 ++ .../executor/scheduler/PriorityQueue.java | 149 +++++++ .../executor/scheduler/Reservation.java | 83 ++++ .../executor/scheduler/Schedulable.java | 19 + .../executor/scheduler/SchedulerContext.java | 81 ++++ .../executor/scheduler/SchedulingGroup.java | 194 ++++++++++ .../executor/scheduler/SchedulingQueue.java | 345 +++++++++++++++++ .../execution/executor/scheduler/State.java | 21 + .../execution/executor/scheduler/Task.java | 65 ++++ .../executor/scheduler/TaskControl.java | 366 ++++++++++++++++++ .../scheduler/group-state-diagram.dot | 24 ++ .../io/trino/server/ServerMainModule.java | 26 +- .../io/trino/tracing/TrinoAttributes.java | 2 + .../TestSqlTaskManagerThreadPerDriver.java | 35 ++ .../execution/TestTaskManagerConfig.java | 3 + 
.../TestThreadPerDriverTaskExecutor.java | 258 ++++++++++++ .../executor/scheduler/TestFairScheduler.java | 227 +++++++++++ .../executor/scheduler/TestPriorityQueue.java | 210 ++++++++++ .../scheduler/TestSchedulingQueue.java | 323 ++++++++++++++++ pom.xml | 3 +- 26 files changed, 3328 insertions(+), 8 deletions(-) create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/dedicated/SplitProcessor.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/dedicated/ThreadPerDriverTaskExecutor.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/BlockingSchedulingQueue.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/FairScheduler.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Gate.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Group.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/PriorityQueue.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Reservation.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Schedulable.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/SchedulerContext.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/SchedulingGroup.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/SchedulingQueue.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/State.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Task.java create mode 100644 core/trino-main/src/main/java/io/trino/execution/executor/scheduler/TaskControl.java create mode 100644 
core/trino-main/src/main/java/io/trino/execution/executor/scheduler/group-state-diagram.dot create mode 100644 core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManagerThreadPerDriver.java create mode 100644 core/trino-main/src/test/java/io/trino/execution/executor/dedicated/TestThreadPerDriverTaskExecutor.java create mode 100644 core/trino-main/src/test/java/io/trino/execution/executor/scheduler/TestFairScheduler.java create mode 100644 core/trino-main/src/test/java/io/trino/execution/executor/scheduler/TestPriorityQueue.java create mode 100644 core/trino-main/src/test/java/io/trino/execution/executor/scheduler/TestSchedulingQueue.java diff --git a/core/trino-main/src/main/java/io/trino/execution/TaskManagerConfig.java b/core/trino-main/src/main/java/io/trino/execution/TaskManagerConfig.java index ddd2fb50aa09..6758e362ba88 100644 --- a/core/trino-main/src/main/java/io/trino/execution/TaskManagerConfig.java +++ b/core/trino-main/src/main/java/io/trino/execution/TaskManagerConfig.java @@ -45,6 +45,7 @@ "task.level-absolute-priority"}) public class TaskManagerConfig { + private boolean threadPerDriverSchedulerEnabled; private boolean perOperatorCpuTimerEnabled = true; private boolean taskCpuTimerEnabled = true; private boolean statisticsCpuTimerEnabled = true; @@ -107,6 +108,18 @@ public class TaskManagerConfig private BigDecimal levelTimeMultiplier = new BigDecimal(2.0); + @Config("experimental.thread-per-driver-scheduler-enabled") + public TaskManagerConfig setThreadPerDriverSchedulerEnabled(boolean enabled) + { + this.threadPerDriverSchedulerEnabled = enabled; + return this; + } + + public boolean isThreadPerDriverSchedulerEnabled() + { + return threadPerDriverSchedulerEnabled; + } + @MinDuration("1ms") @MaxDuration("10s") @NotNull diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/dedicated/SplitProcessor.java b/core/trino-main/src/main/java/io/trino/execution/executor/dedicated/SplitProcessor.java new file mode 100644 index 
000000000000..87efbed973a0 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/dedicated/SplitProcessor.java @@ -0,0 +1,136 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.execution.executor.dedicated; + +import com.google.common.base.Ticker; +import com.google.common.util.concurrent.ListenableFuture; +import io.airlift.concurrent.SetThreadName; +import io.airlift.log.Logger; +import io.airlift.stats.CpuTimer; +import io.airlift.units.Duration; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.SpanBuilder; +import io.opentelemetry.api.trace.Tracer; +import io.opentelemetry.context.Context; +import io.trino.execution.SplitRunner; +import io.trino.execution.TaskId; +import io.trino.execution.executor.scheduler.Schedulable; +import io.trino.execution.executor.scheduler.SchedulerContext; +import io.trino.tracing.TrinoAttributes; + +import java.util.concurrent.TimeUnit; + +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.NANOSECONDS; + +class SplitProcessor + implements Schedulable +{ + private static final Logger LOG = Logger.get(SplitProcessor.class); + + private static final Duration SPLIT_RUN_QUANTA = new Duration(1, TimeUnit.SECONDS); + + private final TaskId taskId; + private final int splitId; + private final SplitRunner split; + private final Tracer tracer; + + public SplitProcessor(TaskId taskId, int splitId, SplitRunner split, 
Tracer tracer) + { + this.taskId = requireNonNull(taskId, "taskId is null"); + this.splitId = splitId; + this.split = requireNonNull(split, "split is null"); + this.tracer = requireNonNull(tracer, "tracer is null"); + } + + @Override + public void run(SchedulerContext context) + { + Span splitSpan = tracer.spanBuilder("split") + .setParent(Context.current().with(split.getPipelineSpan())) + .setAttribute(TrinoAttributes.QUERY_ID, taskId.getQueryId().toString()) + .setAttribute(TrinoAttributes.STAGE_ID, taskId.getStageId().toString()) + .setAttribute(TrinoAttributes.TASK_ID, taskId.toString()) + .setAttribute(TrinoAttributes.PIPELINE_ID, taskId.getStageId() + "-" + split.getPipelineId()) + .setAttribute(TrinoAttributes.SPLIT_ID, taskId + "-" + splitId) + .startSpan(); + + Span processSpan = newSpan(splitSpan, null); + + CpuTimer timer = new CpuTimer(Ticker.systemTicker(), false); + long previousCpuNanos = 0; + long previousScheduledNanos = 0; + try (SetThreadName ignored = new SetThreadName("SplitRunner-%s-%s", taskId, splitId)) { + while (!split.isFinished()) { + ListenableFuture blocked = split.processFor(SPLIT_RUN_QUANTA); + CpuTimer.CpuDuration elapsed = timer.elapsedTime(); + + long scheduledNanos = elapsed.getWall().roundTo(NANOSECONDS); + processSpan.setAttribute(TrinoAttributes.SPLIT_SCHEDULED_TIME_NANOS, scheduledNanos - previousScheduledNanos); + previousScheduledNanos = scheduledNanos; + + long cpuNanos = elapsed.getCpu().roundTo(NANOSECONDS); + processSpan.setAttribute(TrinoAttributes.SPLIT_CPU_TIME_NANOS, cpuNanos - previousCpuNanos); + previousCpuNanos = cpuNanos; + + if (!split.isFinished()) { + if (blocked.isDone()) { + processSpan.addEvent("yield"); + processSpan.end(); + if (!context.maybeYield()) { + processSpan = null; + return; + } + } + else { + processSpan.addEvent("blocked"); + processSpan.end(); + if (!context.block(blocked)) { + processSpan = null; + return; + } + } + processSpan = newSpan(splitSpan, processSpan); + } + } + } + catch 
(Exception e) { + LOG.error(e); + } + finally { + if (processSpan != null) { + processSpan.end(); + } + + splitSpan.setAttribute(TrinoAttributes.SPLIT_CPU_TIME_NANOS, timer.elapsedTime().getCpu().roundTo(NANOSECONDS)); + splitSpan.setAttribute(TrinoAttributes.SPLIT_SCHEDULED_TIME_NANOS, context.getScheduledNanos()); + splitSpan.setAttribute(TrinoAttributes.SPLIT_BLOCK_TIME_NANOS, context.getBlockedNanos()); + splitSpan.setAttribute(TrinoAttributes.SPLIT_WAIT_TIME_NANOS, context.getWaitNanos()); + splitSpan.setAttribute(TrinoAttributes.SPLIT_START_TIME_NANOS, context.getStartNanos()); + splitSpan.end(); + } + } + + private Span newSpan(Span parent, Span previous) + { + SpanBuilder builder = tracer.spanBuilder("process") + .setParent(Context.current().with(parent)); + + if (previous != null) { + builder.addLink(previous.getSpanContext()); + } + + return builder.startSpan(); + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/dedicated/ThreadPerDriverTaskExecutor.java b/core/trino-main/src/main/java/io/trino/execution/executor/dedicated/ThreadPerDriverTaskExecutor.java new file mode 100644 index 000000000000..4a2a42ad39ae --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/dedicated/ThreadPerDriverTaskExecutor.java @@ -0,0 +1,206 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor.dedicated; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Ticker; +import com.google.common.collect.ImmutableSet; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.errorprone.annotations.ThreadSafe; +import com.google.errorprone.annotations.concurrent.GuardedBy; +import com.google.inject.Inject; +import io.airlift.units.Duration; +import io.opentelemetry.api.trace.Tracer; +import io.trino.execution.SplitRunner; +import io.trino.execution.TaskId; +import io.trino.execution.TaskManagerConfig; +import io.trino.execution.executor.RunningSplitInfo; +import io.trino.execution.executor.TaskExecutor; +import io.trino.execution.executor.TaskHandle; +import io.trino.execution.executor.scheduler.FairScheduler; +import io.trino.execution.executor.scheduler.Group; +import io.trino.execution.executor.scheduler.Schedulable; +import io.trino.execution.executor.scheduler.SchedulerContext; +import io.trino.spi.VersionEmbedder; +import jakarta.annotation.PostConstruct; +import jakarta.annotation.PreDestroy; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.OptionalInt; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.DoubleSupplier; +import java.util.function.Predicate; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static java.util.Objects.requireNonNull; + +@ThreadSafe +public class ThreadPerDriverTaskExecutor + implements TaskExecutor +{ + private final FairScheduler scheduler; + private final Tracer tracer; + private final VersionEmbedder versionEmbedder; + private volatile boolean closed; + + @Inject + public ThreadPerDriverTaskExecutor(TaskManagerConfig config, Tracer tracer, VersionEmbedder versionEmbedder) + { + this(tracer, versionEmbedder, new 
FairScheduler(config.getMaxWorkerThreads(), "SplitRunner-%d", Ticker.systemTicker())); + } + + @VisibleForTesting + public ThreadPerDriverTaskExecutor(Tracer tracer, VersionEmbedder versionEmbedder, FairScheduler scheduler) + { + this.scheduler = scheduler; + this.tracer = requireNonNull(tracer, "tracer is null"); + this.versionEmbedder = requireNonNull(versionEmbedder, "versionEmbedder is null"); + } + + @PostConstruct + @Override + public synchronized void start() + { + scheduler.start(); + } + + @PreDestroy + @Override + public synchronized void stop() + { + closed = true; + scheduler.close(); + } + + @Override + public synchronized TaskHandle addTask( + TaskId taskId, + DoubleSupplier utilizationSupplier, + int initialSplitConcurrency, + Duration splitConcurrencyAdjustFrequency, + OptionalInt maxDriversPerTask) + { + checkArgument(!closed, "Executor is already closed"); + + Group group = scheduler.createGroup(taskId.toString()); + return new TaskEntry(taskId, group); + } + + @Override + public synchronized void removeTask(TaskHandle handle) + { + TaskEntry entry = (TaskEntry) handle; + + if (!entry.isDestroyed()) { + scheduler.removeGroup(entry.group()); + entry.destroy(); + } + } + + @Override + public synchronized List> enqueueSplits(TaskHandle handle, boolean intermediate, List splits) + { + checkArgument(!closed, "Executor is already closed"); + + TaskEntry entry = (TaskEntry) handle; + + List> futures = new ArrayList<>(); + for (SplitRunner split : splits) { + entry.addSplit(split); + + int splitId = entry.nextSplitId(); + ListenableFuture done = scheduler.submit(entry.group(), splitId, new VersionEmbedderBridge(versionEmbedder, new SplitProcessor(entry.taskId(), splitId, split, tracer))); + done.addListener(split::close, directExecutor()); + futures.add(done); + } + + return futures; + } + + @Override + public Set getStuckSplitTaskIds(Duration processingDurationThreshold, Predicate filter) + { + // TODO + return ImmutableSet.of(); + } + + private static 
class TaskEntry + implements TaskHandle + { + private final TaskId taskId; + private final Group group; + private final AtomicInteger nextSplitId = new AtomicInteger(); + private volatile boolean destroyed; + + @GuardedBy("this") + private Set splits = new HashSet<>(); + + public TaskEntry(TaskId taskId, Group group) + { + this.taskId = taskId; + this.group = group; + } + + public TaskId taskId() + { + return taskId; + } + + public Group group() + { + return group; + } + + public synchronized void destroy() + { + destroyed = true; + + for (SplitRunner split : splits) { + split.close(); + } + } + + public synchronized void addSplit(SplitRunner split) + { + checkArgument(!destroyed, "Task already destroyed: %s", taskId); + splits.add(split); + } + + public int nextSplitId() + { + return nextSplitId.incrementAndGet(); + } + + @Override + public boolean isDestroyed() + { + return destroyed; + } + } + + private record VersionEmbedderBridge(VersionEmbedder versionEmbedder, Schedulable delegate) + implements Schedulable + { + @Override + public void run(SchedulerContext context) + { + Runnable adapter = () -> delegate.run(context); + versionEmbedder.embedVersion(adapter).run(); + } + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/BlockingSchedulingQueue.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/BlockingSchedulingQueue.java new file mode 100644 index 000000000000..093017772fa1 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/BlockingSchedulingQueue.java @@ -0,0 +1,143 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.execution.executor.scheduler; + +import com.google.errorprone.annotations.ThreadSafe; +import com.google.errorprone.annotations.concurrent.GuardedBy; + +import java.util.Set; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +@ThreadSafe +final class BlockingSchedulingQueue +{ + private final Lock lock = new ReentrantLock(); + private final Condition notEmpty = lock.newCondition(); + + @GuardedBy("lock") + private final SchedulingQueue queue = new SchedulingQueue<>(); + + public void startGroup(G group) + { + lock.lock(); + try { + queue.startGroup(group); + } + finally { + lock.unlock(); + } + } + + public Set finishGroup(G group) + { + lock.lock(); + try { + return queue.finishGroup(group); + } + finally { + lock.unlock(); + } + } + + public Set finishAll() + { + lock.lock(); + try { + return queue.finishAll(); + } + finally { + lock.unlock(); + } + } + + public boolean enqueue(G group, T task, long deltaWeight) + { + lock.lock(); + try { + if (!queue.containsGroup(group)) { + return false; + } + + queue.enqueue(group, task, deltaWeight); + notEmpty.signal(); + + return true; + } + finally { + lock.unlock(); + } + } + + public boolean block(G group, T task, long deltaWeight) + { + lock.lock(); + try { + if (!queue.containsGroup(group)) { + return false; + } + + queue.block(group, task, deltaWeight); + return true; + } + finally { + lock.unlock(); + } + } + + public T dequeue(long expectedWeight) + throws InterruptedException + { + 
lock.lock(); + try { + T result; + do { + result = queue.dequeue(expectedWeight); + if (result == null) { + notEmpty.await(); + } + } + while (result == null); + + return result; + } + finally { + lock.unlock(); + } + } + + @Override + public String toString() + { + lock.lock(); + try { + return queue.toString(); + } + finally { + lock.unlock(); + } + } + + public int getRunnableCount() + { + lock.lock(); + try { + return queue.getRunnableCount(); + } + finally { + lock.unlock(); + } + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/FairScheduler.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/FairScheduler.java new file mode 100644 index 000000000000..6c5f82dec7d3 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/FairScheduler.java @@ -0,0 +1,306 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor.scheduler; + +import com.google.common.base.Ticker; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.ListeningExecutorService; +import com.google.common.util.concurrent.MoreExecutors; +import com.google.common.util.concurrent.ThreadFactoryBuilder; +import com.google.errorprone.annotations.ThreadSafe; +import com.google.errorprone.annotations.concurrent.GuardedBy; +import io.airlift.log.Logger; + +import java.util.Set; +import java.util.StringJoiner; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static java.util.Objects.requireNonNull; + +/** + *

Implementation notes

+ * + *
    + *
  • The TaskControl state machine is only modified by the task executor + * thread (i.e., from within {@link FairScheduler#runTask(Schedulable, TaskControl)}). Other threads + * can indirectly affect what the task executor thread does by marking the task as ready or cancelled + * and unblocking the task executor thread, which will then act on that information.
  • + *
+ */ +@ThreadSafe +public final class FairScheduler + implements AutoCloseable +{ + private static final Logger LOG = Logger.get(FairScheduler.class); + + public static final long QUANTUM_NANOS = TimeUnit.MILLISECONDS.toNanos(1000); + + private final ExecutorService schedulerExecutor; + private final ListeningExecutorService taskExecutor; + private final BlockingSchedulingQueue queue = new BlockingSchedulingQueue<>(); + private final Reservation concurrencyControl; + private final Ticker ticker; + + private final Gate paused = new Gate(true); + + @GuardedBy("this") + private boolean closed; + + public FairScheduler(int maxConcurrentTasks, String threadNameFormat, Ticker ticker) + { + this.ticker = requireNonNull(ticker, "ticker is null"); + + concurrencyControl = new Reservation<>(maxConcurrentTasks); + + schedulerExecutor = Executors.newCachedThreadPool(new ThreadFactoryBuilder() + .setNameFormat("fair-scheduler-%d") + .setDaemon(true) + .build()); + + taskExecutor = MoreExecutors.listeningDecorator(Executors.newCachedThreadPool(new ThreadFactoryBuilder() + .setNameFormat(threadNameFormat) + .setDaemon(true) + .build())); + } + + public static FairScheduler newInstance(int maxConcurrentTasks) + { + return newInstance(maxConcurrentTasks, Ticker.systemTicker()); + } + + public static FairScheduler newInstance(int maxConcurrentTasks, Ticker ticker) + { + FairScheduler scheduler = new FairScheduler(maxConcurrentTasks, "fair-scheduler-runner-%d", ticker); + scheduler.start(); + return scheduler; + } + + public void start() + { + schedulerExecutor.submit(this::runScheduler); + } + + public void pause() + { + paused.close(); + } + + public void resume() + { + paused.open(); + } + + @Override + public synchronized void close() + { + if (closed) { + return; + } + closed = true; + + Set tasks = queue.finishAll(); + + for (TaskControl task : tasks) { + task.cancel(); + } + + taskExecutor.shutdownNow(); + schedulerExecutor.shutdownNow(); + } + + public synchronized Group 
createGroup(String name) + { + checkArgument(!closed, "Already closed"); + + Group group = new Group(name); + queue.startGroup(group); + + return group; + } + + public synchronized void removeGroup(Group group) + { + checkArgument(!closed, "Already closed"); + + Set tasks = queue.finishGroup(group); + + for (TaskControl task : tasks) { + task.cancel(); + } + } + + public synchronized ListenableFuture submit(Group group, int id, Schedulable runner) + { + checkArgument(!closed, "Already closed"); + + TaskControl task = new TaskControl(group, id, ticker); + + return taskExecutor.submit(() -> runTask(runner, task), null); + } + + private void runTask(Schedulable runner, TaskControl task) + { + task.setThread(Thread.currentThread()); + + if (!makeRunnableAndAwait(task, 0)) { + return; + } + + SchedulerContext context = new SchedulerContext(this, task); + try { + runner.run(context); + } + catch (Exception e) { + LOG.error(e); + } + finally { + // If the runner exited due to an exception in user code or + // normally (not in response to an interruption during blocking or yield), + // it must have had a semaphore permit reserved, so release it. + if (task.getState() == TaskControl.State.RUNNING) { + concurrencyControl.release(task); + } + task.transitionToFinished(); + } + } + + private boolean makeRunnableAndAwait(TaskControl task, long deltaWeight) + { + if (!task.transitionToWaiting()) { + return false; + } + + if (!queue.enqueue(task.group(), task, deltaWeight)) { + return false; + } + + // wait for the task to be scheduled + return awaitReadyAndTransitionToRunning(task); + } + + /** + * @return false if the transition was unsuccessful due to the task being cancelled + */ + private boolean awaitReadyAndTransitionToRunning(TaskControl task) + { + if (!task.awaitReady()) { + if (task.isReady()) { + // If the task was marked as ready (slot acquired) but then cancelled before + // awaitReady() was notified, we need to release the slot. 
+ concurrencyControl.release(task); + } + return false; + } + + if (!task.transitionToRunning()) { + concurrencyControl.release(task); + return false; + } + + return true; + } + + boolean yield(TaskControl task) + { + checkState(task.getThread() == Thread.currentThread(), "yield() may only be called from the task thread"); + + long delta = task.elapsed(); + if (delta < QUANTUM_NANOS) { + return true; + } + + concurrencyControl.release(task); + + return makeRunnableAndAwait(task, delta); + } + + boolean block(TaskControl task, ListenableFuture future) + { + checkState(task.getThread() == Thread.currentThread(), "block() may only be called from the task thread"); + + long delta = task.elapsed(); + + concurrencyControl.release(task); + + if (!task.transitionToBlocked()) { + return false; + } + + if (!queue.block(task.group(), task, delta)) { + return false; + } + + future.addListener(task::markUnblocked, MoreExecutors.directExecutor()); + task.awaitUnblock(); + + return makeRunnableAndAwait(task, 0); + } + + private void runScheduler() + { + while (true) { + try { + paused.awaitOpen(); + concurrencyControl.reserve(); + TaskControl task = queue.dequeue(QUANTUM_NANOS); + + concurrencyControl.register(task); + if (!task.markReady()) { + concurrencyControl.release(task); + } + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + catch (Exception e) { + LOG.error(e); + } + } + } + + long getStartNanos(TaskControl task) + { + return task.getStartNanos(); + } + + long getScheduledNanos(TaskControl task) + { + return task.getScheduledNanos(); + } + + long getWaitNanos(TaskControl task) + { + return task.getWaitNanos(); + } + + long getBlockedNanos(TaskControl task) + { + return task.getBlockedNanos(); + } + + @Override + public String toString() + { + return new StringJoiner(", ", FairScheduler.class.getSimpleName() + "[", "]") + .add("queue=" + queue) + .add("concurrencyControl=" + concurrencyControl) + .add("closed=" + closed) + 
.toString(); + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Gate.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Gate.java new file mode 100644 index 000000000000..adcc930b6f24 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Gate.java @@ -0,0 +1,70 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.execution.executor.scheduler; + +import com.google.errorprone.annotations.ThreadSafe; + +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +@ThreadSafe +final class Gate +{ + private final Lock lock = new ReentrantLock(); + private final Condition opened = lock.newCondition(); + private boolean open; + + public Gate(boolean opened) + { + this.open = opened; + } + + public void close() + { + lock.lock(); + try { + open = false; + } + finally { + lock.unlock(); + } + } + + public void open() + { + lock.lock(); + try { + open = true; + opened.signalAll(); + } + finally { + lock.unlock(); + } + } + + public void awaitOpen() + throws InterruptedException + { + lock.lock(); + try { + while (!open) { + opened.await(); + } + } + finally { + lock.unlock(); + } + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Group.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Group.java new file mode 100644 
index 000000000000..596abffbe1e3 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Group.java @@ -0,0 +1,28 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.execution.executor.scheduler; + +public record Group(String name, long startTime) +{ + public Group(String name) + { + this(name, System.nanoTime()); + } + + @Override + public String toString() + { + return name; + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/PriorityQueue.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/PriorityQueue.java new file mode 100644 index 000000000000..7724400f4c50 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/PriorityQueue.java @@ -0,0 +1,149 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor.scheduler; + +import io.trino.annotation.NotThreadSafe; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; + +@NotThreadSafe +final class PriorityQueue +{ + // The tree is ordered by the priorities stored in this map, so every operation on the two data + // structures needs to respect the relative order in which they are updated. + // For instance, removing an entry from the map before the corresponding entry is removed from the + // tree will lead to NPEs, because the tree's comparator looks up the priorities in the map. + private final Map priorities = new HashMap<>(); + private final TreeSet queue; + + private long sequence; + + public PriorityQueue() + { + queue = new TreeSet<>((a, b) -> { + Priority first = priorities.get(a); + Priority second = priorities.get(b); + + int result = Long.compare(first.priority(), second.priority()); + if (result == 0) { + result = Long.compare(first.sequence(), second.sequence()); + } + return result; + }); + } + + public void add(T value, long priority) + { + checkArgument(!priorities.containsKey(value), "Value already in queue: %s", value); + priorities.put(value, new Priority(priority, nextSequence())); + queue.add(value); + } + + public void addOrReplace(T value, long priority) + { + if (priorities.containsKey(value)) { + queue.remove(value); + priorities.put(value, new Priority(priority, nextSequence())); + queue.add(value); + } + else { + add(value, priority); + } + } + + public T takeOrThrow() + { + T result = poll(); + checkState(result != null, "Queue is empty"); + return result; + } + + public T poll() + { + T result = queue.pollFirst(); + if (result != null) { + priorities.remove(result); + } + + return result; + } + + public void remove(T value) + { + checkArgument(priorities.containsKey(value), "Value not in queue: %s", value); + queue.remove(value); +
priorities.remove(value); + } + + public void removeIfPresent(T value) + { + if (priorities.containsKey(value)) { + queue.remove(value); + priorities.remove(value); + } + } + + public boolean contains(T value) + { + return priorities.containsKey(value); + } + + public boolean isEmpty() + { + return priorities.isEmpty(); + } + + public Set values() + { + return priorities.keySet(); + } + + public long nextPriority() + { + checkState(!queue.isEmpty(), "Queue is empty"); + return priorities.get(queue.first()).priority(); + } + + public T peek() + { + if (queue.isEmpty()) { + return null; + } + return queue.first(); + } + + public int size() + { + return queue.size(); + } + + @Override + public String toString() + { + return queue.toString(); + } + + private long nextSequence() + { + return sequence++; + } + + private record Priority(long priority, long sequence) {} +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Reservation.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Reservation.java new file mode 100644 index 000000000000..85d484c99d36 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Reservation.java @@ -0,0 +1,83 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor.scheduler; + +import com.google.common.collect.ImmutableSet; +import com.google.errorprone.annotations.ThreadSafe; + +import java.util.HashSet; +import java.util.Set; +import java.util.StringJoiner; +import java.util.concurrent.Semaphore; + +import static com.google.common.base.Preconditions.checkArgument; + +/** + *

Semaphore-like structure that allows for tracking reservations to avoid double-reserving or double-releasing.

+ * + *

Callers are expected to call {@link #reserve()} to acquire a slot, and then {@link #register(T)} to associate + * an entity with the reservation.

+ * + *

Upon completion, callers should call {@link #release(T)} to release the reservation.

+ */ +@ThreadSafe +final class Reservation +{ + private final Semaphore semaphore; + private final Set reservations = new HashSet<>(); + + public Reservation(int slots) + { + semaphore = new Semaphore(slots); + } + + public int availablePermits() + { + return semaphore.availablePermits(); + } + + public void reserve() + throws InterruptedException + { + semaphore.acquire(); + } + + public synchronized void register(T entry) + { + checkArgument(!reservations.contains(entry), "Already acquired: %s", entry); + reservations.add(entry); + } + + public synchronized void release(T entry) + { + checkArgument(reservations.contains(entry), "Already released: %s", entry); + reservations.remove(entry); + + semaphore.release(); + } + + public synchronized Set reservations() + { + return ImmutableSet.copyOf(reservations); + } + + @Override + public synchronized String toString() + { + return new StringJoiner(", ", Reservation.class.getSimpleName() + "[", "]") + .add("semaphore=" + semaphore) + .add("reservations=" + reservations) + .toString(); + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Schedulable.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Schedulable.java new file mode 100644 index 000000000000..39032d7dafff --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Schedulable.java @@ -0,0 +1,19 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor.scheduler; + +public interface Schedulable +{ + void run(SchedulerContext context); +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/SchedulerContext.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/SchedulerContext.java new file mode 100644 index 000000000000..789086ff08e8 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/SchedulerContext.java @@ -0,0 +1,81 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.execution.executor.scheduler; + +import com.google.common.util.concurrent.ListenableFuture; +import io.trino.annotation.NotThreadSafe; + +import static com.google.common.base.Preconditions.checkArgument; + +@NotThreadSafe +public final class SchedulerContext +{ + private final FairScheduler scheduler; + private final TaskControl handle; + + public SchedulerContext(FairScheduler scheduler, TaskControl handle) + { + this.scheduler = scheduler; + this.handle = handle; + } + + /** + * Attempt to relinquish control to let other tasks run. + * + * @return false if the task was interrupted or cancelled while yielding, + * for example if the Java thread was interrupted, the scheduler was shutdown, + * or the scheduling group was removed. The caller is expected to clean up and finish. 
+ */ + public boolean maybeYield() + { + checkArgument(handle.getState() == TaskControl.State.RUNNING, "Task is not running"); + + return scheduler.yield(handle); + } + + /** + * Indicate that the current task is blocked. The method returns when the future + * completes or if the task is interrupted. + * + * @return false if the task was interrupted or cancelled while blocked, + * for example if the Java thread was interrupted, the scheduler was shutdown, + * or the scheduling group was removed. The caller is expected to clean up and finish. + */ + public boolean block(ListenableFuture future) + { + checkArgument(handle.getState() == TaskControl.State.RUNNING, "Task is not running"); + + return scheduler.block(handle, future); + } + + public long getStartNanos() + { + return scheduler.getStartNanos(handle); + } + + public long getWaitNanos() + { + return scheduler.getWaitNanos(handle); + } + + public long getScheduledNanos() + { + return scheduler.getScheduledNanos(handle); + } + + public long getBlockedNanos() + { + return scheduler.getBlockedNanos(handle); + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/SchedulingGroup.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/SchedulingGroup.java new file mode 100644 index 000000000000..6eb20732e667 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/SchedulingGroup.java @@ -0,0 +1,194 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.execution.executor.scheduler; + +import com.google.common.collect.ImmutableSet; +import io.trino.annotation.NotThreadSafe; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.execution.executor.scheduler.State.BLOCKED; +import static io.trino.execution.executor.scheduler.State.RUNNABLE; +import static io.trino.execution.executor.scheduler.State.RUNNING; + +@NotThreadSafe +final class SchedulingGroup +{ + private State state; + private long weight; + private final Map tasks = new HashMap<>(); + private final PriorityQueue runnableQueue = new PriorityQueue<>(); + private final Set blocked = new HashSet<>(); + private final PriorityQueue baselineWeights = new PriorityQueue<>(); + + public SchedulingGroup() + { + this.state = BLOCKED; + } + + public void enqueue(T handle, long deltaWeight) + { + Task task = tasks.get(handle); + + if (task == null) { + // New tasks get assigned the baseline weight so that they don't monopolize the queue + // while they catch up + task = new Task(baselineWeight()); + tasks.put(handle, task); + } + else if (task.state() == BLOCKED) { + blocked.remove(handle); + task.addWeight(baselineWeight()); + } + + weight -= task.uncommittedWeight(); + weight += deltaWeight; + + task.commitWeight(deltaWeight); + task.setState(RUNNABLE); + runnableQueue.add(handle, task.weight()); + baselineWeights.addOrReplace(handle, task.weight()); + + updateState(); + } + + public T dequeue(long expectedWeight) + { + checkArgument(state == RUNNABLE); + + T task = runnableQueue.takeOrThrow(); + + Task info = tasks.get(task); + info.setUncommittedWeight(expectedWeight); + info.setState(RUNNING); + weight += expectedWeight; + + baselineWeights.addOrReplace(task, info.weight()); + + updateState(); + + 
return task; + } + + public void finish(T task) + { + checkArgument(tasks.containsKey(task), "Unknown task: %s", task); + tasks.remove(task); + blocked.remove(task); + runnableQueue.removeIfPresent(task); + baselineWeights.removeIfPresent(task); + + updateState(); + } + + public void block(T handle, long deltaWeight) + { + checkArgument(tasks.containsKey(handle), "Unknown task: %s", handle); + checkArgument(!runnableQueue.contains(handle), "Task is already in queue: %s", handle); + + weight += deltaWeight; + + Task task = tasks.get(handle); + task.commitWeight(deltaWeight); + task.setState(BLOCKED); + task.addWeight(-baselineWeight()); + blocked.add(handle); + baselineWeights.remove(handle); + + updateState(); + } + + public long baselineWeight() + { + if (baselineWeights.isEmpty()) { + return 0; + } + + return baselineWeights.nextPriority(); + } + + public void addWeight(long delta) + { + weight += delta; + } + + private void updateState() + { + if (blocked.size() == tasks.size()) { + state = BLOCKED; + } + else if (runnableQueue.isEmpty()) { + state = RUNNING; + } + else { + state = RUNNABLE; + } + } + + public long weight() + { + return weight; + } + + public Set tasks() + { + return ImmutableSet.copyOf(tasks.keySet()); + } + + public State state() + { + return state; + } + + public T peek() + { + return runnableQueue.peek(); + } + + public int runnableCount() + { + return runnableQueue.size(); + } + + @Override + public String toString() + { + StringBuilder builder = new StringBuilder(); + for (Map.Entry entry : tasks.entrySet()) { + T key = entry.getKey(); + Task task = entry.getValue(); + + String prefix = "%s %s".formatted( + key == peek() ? 
"=>" : " ", + key); + + String details = switch (task.state()) { + case BLOCKED -> "[BLOCKED, saved delta = %s]".formatted(task.weight()); + case RUNNABLE -> "[RUNNABLE, weight = %s]".formatted(task.weight()); + case RUNNING -> "[RUNNING, weight = %s, uncommitted = %s]".formatted(task.weight(), task.uncommittedWeight()); + }; + + builder.append(prefix) + .append(" ") + .append(details) + .append("\n"); + } + + return builder.toString(); + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/SchedulingQueue.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/SchedulingQueue.java new file mode 100644 index 000000000000..89a5278a5fd2 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/SchedulingQueue.java @@ -0,0 +1,345 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor.scheduler; + +import com.google.common.collect.ImmutableSet; +import io.trino.annotation.NotThreadSafe; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Verify.verify; +import static io.trino.execution.executor.scheduler.State.BLOCKED; +import static io.trino.execution.executor.scheduler.State.RUNNABLE; +import static io.trino.execution.executor.scheduler.State.RUNNING; + +/** + *

A queue of tasks that are scheduled for execution. Modeled after the + * Completely Fair Scheduler. + * Tasks are grouped into scheduling groups. Within a group, tasks are ordered based + * on their relative weight. Groups are ordered relative to each other based on the + * accumulated weight of their tasks.

+ * + *

A task can be in one of three states: + *

    + *
  • runnable: the task is ready to run and waiting to be dequeued + *
  • running: the task has been dequeued and is running + *
  • blocked: the task is blocked on some external event and is not running + *
+ *

+ *

+ * A group can be in one of three states: + *

    + *
  • runnable: the group has at least one runnable task + *
  • running: the group has no runnable tasks and at least one of its tasks is currently running + *
  • blocked: all the tasks in the group are currently blocked + *
+ *

+ *

+ * The goal is to balance the consideration among groups to ensure the accumulated + * weight in the long run is equal among groups. Within a group, the goal is to + * balance the consideration among tasks to ensure the accumulated weight in the + * long run is equal among tasks within the group. + * + *

Groups start in the blocked state and transition to the runnable state when a task is + * added via the {@link #enqueue(Object, Object, long)} method.

+ * + *

Tasks are dequeued via the {@link #dequeue(long)} method. When all runnable tasks in a group have + * been dequeued, the group transitions to the running state and is removed from the + * queue.

+ * + *

When a task's time slice completes, the task needs to be re-enqueued via the + * {@link #enqueue(Object, Object, long)} method, which takes the desired + * increment in relative weight to apply to the task for further prioritization. + * The weight increment is also applied to the group. + *

+ * + *

If a task blocks, the caller must call the {@link #block(Object, Object, long)} + * method to indicate that the task is no longer running. A weight increment can be + * included for the portion of time the task was not blocked.

+ *
+ *

Group state transitions

+ *
+ *                                                                 blockTask()
+ *    finishTask()               enqueueTask()                     enqueueTask()
+ *        ┌───┐   ┌──────────────────────────────────────────┐       ┌────┐
+ *        │   │   │                                          │       │    │
+ *        │   ▼   │                                          ▼       ▼    │
+ *      ┌─┴───────┴─┐   all blocked        finishTask()   ┌────────────┐  │
+ *      │           │◄──────────────O◄────────────────────┤            ├──┘
+ * ────►│  BLOCKED  │               │                     │  RUNNABLE  │
+ *      │           │               │   ┌────────────────►│            │◄───┐
+ *      └───────────┘       not all │   │  enqueueTask()  └──────┬─────┘    │
+ *            ▲             blocked │   │                        │          │
+ *            │                     │   │           dequeueTask()│          │
+ *            │ all blocked         ▼   │                        │          │
+ *            │                   ┌─────┴─────┐                  ▼          │
+ *            │                   │           │◄─────────────────O──────────┘
+ *            O◄──────────────────┤  RUNNING  │      queue empty     queue
+ *            │      blockTask()  │           ├───┐                 not empty
+ *            │                   └───────────┘   │
+ *            │                     ▲      ▲      │ finishTask()
+ *            └─────────────────────┘      └──────┘
+ *                not all blocked
+ *
+ * 
+ * + *

Implementation notes

+ *
    + *
  • TODO: Initial weight upon registration
  • + *
  • TODO: Weight adjustment during blocking / unblocking
  • + *
  • TODO: Uncommitted weight on dequeue
  • + *
+ *

+ */ +@NotThreadSafe +final class SchedulingQueue +{ + private final PriorityQueue runnableQueue = new PriorityQueue<>(); + private final Map> groups = new HashMap<>(); + private final PriorityQueue baselineWeights = new PriorityQueue<>(); + + public void startGroup(G group) + { + checkArgument(!groups.containsKey(group), "Group already started: %s", group); + + SchedulingGroup info = new SchedulingGroup<>(); + groups.put(group, info); + } + + public Set finishGroup(G group) + { + SchedulingGroup info = groups.remove(group); + checkArgument(info != null, "Unknown group: %s", group); + + runnableQueue.removeIfPresent(group); + baselineWeights.removeIfPresent(group); + return info.tasks(); + } + + public boolean containsGroup(G group) + { + return groups.containsKey(group); + } + + public Set finishAll() + { + Set groups = ImmutableSet.copyOf(this.groups.keySet()); + return groups.stream() + .map(this::finishGroup) + .flatMap(Collection::stream) + .collect(Collectors.toSet()); + } + + public void finish(G group, T task) + { + checkArgument(groups.containsKey(group), "Unknown group: %s", group); + + SchedulingGroup info = groups.get(group); + + State previousState = info.state(); + info.finish(task); + State newState = info.state(); + + if (newState == RUNNABLE) { + runnableQueue.addOrReplace(group, info.weight()); + baselineWeights.addOrReplace(group, info.weight()); + } + else if (newState == RUNNING) { + runnableQueue.removeIfPresent(group); + baselineWeights.addOrReplace(group, info.weight()); + } + else if (newState == BLOCKED && previousState != BLOCKED) { + info.addWeight(-baselineWeight()); + runnableQueue.removeIfPresent(group); + baselineWeights.removeIfPresent(group); + } + + verifyState(group); + } + + public void enqueue(G group, T task, long deltaWeight) + { + checkArgument(groups.containsKey(group), "Unknown group: %s", group); + + SchedulingGroup info = groups.get(group); + + State previousState = info.state(); + info.enqueue(task, deltaWeight); + 
verify(info.state() == RUNNABLE); + + if (previousState == BLOCKED) { + // When transitioning from blocked, set the baseline weight to the minimum current weight + // to avoid the newly unblocked group from monopolizing the queue while it catches up + info.addWeight(baselineWeight()); + } + + runnableQueue.addOrReplace(group, info.weight()); + baselineWeights.addOrReplace(group, info.weight()); + + verifyState(group); + } + + public void block(G group, T task, long deltaWeight) + { + SchedulingGroup info = groups.get(group); + checkArgument(info != null, "Unknown group: %s", group); + checkArgument(info.state() == RUNNABLE || info.state() == RUNNING, "Group is already blocked: %s", group); + + State previousState = info.state(); + info.block(task, deltaWeight); + + doTransition(group, info, previousState, info.state()); + } + + public T dequeue(long expectedWeight) + { + G group = runnableQueue.poll(); + + if (group == null) { + return null; + } + + SchedulingGroup info = groups.get(group); + verify(info.state() == RUNNABLE, "Group is not runnable: %s", group); + + T task = info.dequeue(expectedWeight); + verify(task != null); + + baselineWeights.addOrReplace(group, info.weight()); + if (info.state() == RUNNABLE) { + runnableQueue.add(group, info.weight()); + } + + checkState(info.state() == RUNNABLE || info.state() == RUNNING); + verifyState(group); + + return task; + } + + public T peek() + { + G group = runnableQueue.peek(); + + if (group == null) { + return null; + } + + SchedulingGroup info = groups.get(group); + verify(info.state() == RUNNABLE, "Group is not runnable: %s", group); + + T task = info.peek(); + checkState(task != null); + + return task; + } + + public int getRunnableCount() + { + return runnableQueue.values().stream() + .map(groups::get) + .mapToInt(SchedulingGroup::runnableCount) + .sum(); + } + + public State state(G group) + { + SchedulingGroup info = groups.get(group); + checkArgument(info != null, "Unknown group: %s", group); + + return 
info.state(); + } + + private long baselineWeight() + { + if (baselineWeights.isEmpty()) { + return 0; + } + + return baselineWeights.nextPriority(); + } + + private void doTransition(G group, SchedulingGroup info, State previousState, State newState) + { + if (newState == RUNNABLE) { + runnableQueue.addOrReplace(group, info.weight()); + baselineWeights.addOrReplace(group, info.weight()); + } + else if (newState == RUNNING) { + runnableQueue.removeIfPresent(group); + baselineWeights.addOrReplace(group, info.weight()); + } + else if (newState == BLOCKED && previousState != BLOCKED) { + info.addWeight(-baselineWeight()); + runnableQueue.removeIfPresent(group); + baselineWeights.removeIfPresent(group); + } + + verifyState(group); + } + + private void verifyState(G groupKey) + { + SchedulingGroup group = groups.get(groupKey); + checkArgument(group != null, "Unknown group: %s", groupKey); + + switch (group.state()) { + case BLOCKED -> { + checkState(!runnableQueue.contains(groupKey), "Group in BLOCKED state should not be in queue: %s", groupKey); + checkState(!baselineWeights.contains(groupKey)); + } + case RUNNABLE -> { + checkState(runnableQueue.contains(groupKey), "Group in RUNNABLE state should be in queue: %s", groupKey); + checkState(baselineWeights.contains(groupKey)); + } + case RUNNING -> { + checkState(!runnableQueue.contains(groupKey), "Group in RUNNING state should not be in queue: %s", groupKey); + checkState(baselineWeights.contains(groupKey)); + } + } + } + + @Override + public String toString() + { + StringBuilder builder = new StringBuilder(); + + builder.append("Baseline weight: %s\n".formatted(baselineWeight())); + builder.append("\n"); + + for (Map.Entry> entry : groups.entrySet()) { + G group = entry.getKey(); + SchedulingGroup info = entry.getValue(); + + String prefix = "%s %s".formatted( + group == runnableQueue.peek() ? 
"=>" : " -", + group); + + String details = switch (entry.getValue().state()) { + case BLOCKED -> "[BLOCKED, saved delta = %s]".formatted(info.weight()); + case RUNNING, RUNNABLE -> "[%s, weight = %s, baseline = %s]".formatted(info.state(), info.weight(), info.baselineWeight()); + }; + + builder.append((prefix + " " + details).indent(4)); + builder.append(info.toString().indent(8)); + } + + return builder.toString(); + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/State.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/State.java new file mode 100644 index 000000000000..35e9bca8a6d6 --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/State.java @@ -0,0 +1,21 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor.scheduler; + +enum State +{ + BLOCKED, // all tasks are blocked + RUNNING, // all tasks are dequeued and running + RUNNABLE // some tasks are enqueued and ready to run +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Task.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Task.java new file mode 100644 index 000000000000..af62f7e4fb1d --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/Task.java @@ -0,0 +1,65 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor.scheduler; + +import io.trino.annotation.NotThreadSafe; + +@NotThreadSafe +final class Task +{ + private State state; + private long weight; + private long uncommittedWeight; + + public Task(long initialWeight) + { + weight = initialWeight; + } + + public void setState(State state) + { + this.state = state; + } + + public void commitWeight(long delta) + { + weight += delta; + uncommittedWeight = 0; + } + + public void addWeight(long delta) + { + weight += delta; + } + + public long weight() + { + return weight + uncommittedWeight; + } + + public void setUncommittedWeight(long weight) + { + this.uncommittedWeight = weight; + } + + public long uncommittedWeight() + { + return uncommittedWeight; + } + + public State state() + { + return state; + } +} diff --git a/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/TaskControl.java b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/TaskControl.java new file mode 100644 index 000000000000..9375ca0b6b0e --- /dev/null +++ b/core/trino-main/src/main/java/io/trino/execution/executor/scheduler/TaskControl.java @@ -0,0 +1,366 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Per-task control block: tracks the task's lifecycle {@link State}, the
 * ready/blocked/cancelled flags used to park and wake a waiting thread, and
 * the accumulated time spent in each state.
 * <p>
 * All flags and the state are guarded by a single {@link ReentrantLock}; a single
 * {@link Condition} ({@code wakeup}) is shared by {@link #awaitReady()} and
 * {@link #awaitUnblock()}.
 * <p>
 * Equality is based on group and id for the purpose of adding to the scheduling queue.
 */
@ThreadSafe
final class TaskControl
{
    private final Group group;
    private final int id;
    private final Ticker ticker;

    private final Lock lock = new ReentrantLock();

    // Signalled by cancel(), markReady() and markUnblocked()
    @GuardedBy("lock")
    private final Condition wakeup = lock.newCondition();

    @GuardedBy("lock")
    private boolean ready;

    @GuardedBy("lock")
    private boolean blocked;

    @GuardedBy("lock")
    private boolean cancelled;

    @GuardedBy("lock")
    private State state;

    // Ticker reading taken at the start of the current state period; reset by recordPeriodEnd()
    private volatile long periodStart;
    // Accumulated nanoseconds spent in NEW, RUNNING, BLOCKED and WAITING respectively
    private final AtomicLong startNanos = new AtomicLong();
    private final AtomicLong scheduledNanos = new AtomicLong();
    private final AtomicLong blockedNanos = new AtomicLong();
    private final AtomicLong waitNanos = new AtomicLong();
    // Set via setThread(); currently only read by getThread() (interruption in cancel() is disabled)
    private volatile Thread thread;

    /**
     * Creates a control block in state {@link State#NEW}, not ready, with the
     * first timing period starting at the ticker's current reading.
     *
     * @throws NullPointerException if {@code group} or {@code ticker} is null
     */
    public TaskControl(Group group, int id, Ticker ticker)
    {
        this.group = requireNonNull(group, "group is null");
        this.id = id;
        this.ticker = requireNonNull(ticker, "ticker is null");
        this.state = State.NEW;
        this.ready = false;
        this.periodStart = ticker.read();
    }

    /**
     * Records the thread associated with this task.
     */
    public void setThread(Thread thread)
    {
        this.thread = thread;
    }

    /**
     * Marks the task as cancelled and signals the wakeup condition so that a
     * thread parked in {@link #awaitReady()} or {@link #awaitUnblock()} re-checks
     * its loop condition and returns. The flag is never reset by this class.
     */
    public void cancel()
    {
        lock.lock();
        try {
            cancelled = true;
            wakeup.signal();

            // TODO: it should be possible to interrupt the thread, but
            //   it appears that it's not safe to do so. It can cause the query
            //   to get stuck (e.g., AbstractDistributedEngineOnlyQueries.testSelectiveLimit)
            //
            //  Thread thread = this.thread;
            //  if (thread != null) {
            //      thread.interrupt();
            //  }
        }
        finally {
            lock.unlock();
        }
    }

    /**
     * Called by the scheduler thread when the task is ready to run. It
     * causes anyone blocking in {@link #awaitReady()} to wake up.
     *
     * @return false if the task was already cancelled
     */
    public boolean markReady()
    {
        lock.lock();
        try {
            if (cancelled) {
                return false;
            }
            ready = true;
            wakeup.signal();
        }
        finally {
            lock.unlock();
        }

        return true;
    }

    /**
     * Clears the ready flag. No signal is sent.
     */
    public void markNotReady()
    {
        lock.lock();
        try {
            ready = false;
        }
        finally {
            lock.unlock();
        }
    }

    /**
     * @return the current value of the ready flag
     */
    public boolean isReady()
    {
        lock.lock();
        try {
            return ready;
        }
        finally {
            lock.unlock();
        }
    }

    /**
     * Parks the calling thread until the task is marked ready or cancelled.
     *
     * @return false if the operation was interrupted due to cancellation
     */
    public boolean awaitReady()
    {
        lock.lock();
        try {
            while (!ready && !cancelled) {
                try {
                    wakeup.await();
                }
                catch (InterruptedException e) {
                    // The interrupt is dropped: the loop simply re-checks its condition and the
                    // thread's interrupt status is not restored.
                    // NOTE(review): presumably intentional, given the TODO in cancel() — confirm.
                }
            }

            return !cancelled;
        }
        finally {
            lock.unlock();
        }
    }

    /**
     * Clears the blocked flag and signals the wakeup condition so that a thread
     * parked in {@link #awaitUnblock()} can proceed.
     */
    public void markUnblocked()
    {
        lock.lock();
        try {
            blocked = false;
            wakeup.signal();
        }
        finally {
            lock.unlock();
        }
    }

    /**
     * Sets the blocked flag. No signal is sent.
     */
    public void markBlocked()
    {
        lock.lock();
        try {
            blocked = true;
        }
        finally {
            lock.unlock();
        }
    }

    /**
     * Parks the calling thread until the task is no longer blocked or has been
     * cancelled. Unlike {@link #awaitReady()}, this method does not report which
     * of the two conditions ended the wait.
     */
    public void awaitUnblock()
    {
        lock.lock();
        try {
            while (blocked && !cancelled) {
                try {
                    wakeup.await();
                }
                catch (InterruptedException e) {
                    // Same handling as in awaitReady(): the interrupt is dropped and the loop re-checks.
                }
            }
        }
        finally {
            lock.unlock();
        }
    }

    /**
     * Moves to {@link State#BLOCKED} and, only if that succeeded, sets the blocked flag.
     * The transition and the flag update are two separate lock acquisitions.
     *
     * @return false if the transition was unsuccessful due to the task being interrupted
     */
    public boolean transitionToBlocked()
    {
        boolean success = transitionTo(State.BLOCKED);

        if (success) {
            markBlocked();
        }

        return success;
    }

    /**
     * Attempts to move to {@link State#FINISHED}. The outcome is not reported; if the
     * task was cancelled the state becomes {@link State#INTERRUPTED} instead
     * (see {@link #transitionTo(State)}).
     */
    public void transitionToFinished()
    {
        transitionTo(State.FINISHED);
    }

    /**
     * Moves to {@link State#WAITING} and, only if that succeeded, clears the ready flag.
     *
     * @return false if the transition was unsuccessful due to the task being interrupted
     */
    public boolean transitionToWaiting()
    {
        boolean success = transitionTo(State.WAITING);

        if (success) {
            markNotReady();
        }

        return success;
    }

    /**
     * @return false if the transition was unsuccessful due to the task being interrupted
     */
    public boolean transitionToRunning()
    {
        return transitionTo(State.RUNNING);
    }

    /**
     * Closes the timing period of the current state, then switches to the requested
     * state, or to {@link State#INTERRUPTED} if the task has been cancelled.
     *
     * @return true if the requested state was entered, false if the task was cancelled
     */
    private boolean transitionTo(State state)
    {
        lock.lock();
        try {
            // Charge the elapsed time to the state being left (this.state), not the one being entered
            recordPeriodEnd(this.state);

            if (cancelled) {
                this.state = State.INTERRUPTED;
                return false;
            }
            else {
                this.state = state;
                return true;
            }
        }
        finally {
            lock.unlock();
        }
    }

    /**
     * Adds the time elapsed since {@code periodStart} to the counter matching
     * {@code state} and starts a new period at the current ticker reading.
     * Time spent in INTERRUPTED or FINISHED is not accumulated anywhere.
     */
    private void recordPeriodEnd(State state)
    {
        long now = ticker.read();
        long elapsed = now - periodStart;
        switch (state) {
            case RUNNING -> scheduledNanos.addAndGet(elapsed);
            case BLOCKED -> blockedNanos.addAndGet(elapsed);
            case NEW -> startNanos.addAndGet(elapsed);
            case WAITING -> waitNanos.addAndGet(elapsed);
            case INTERRUPTED, FINISHED -> {}
        }
        periodStart = now;
    }

    public Group group()
    {
        return group;
    }

    /**
     * @return the current state, read under the lock
     */
    public State getState()
    {
        lock.lock();
        try {
            return state;
        }
        finally {
            lock.unlock();
        }
    }

    /**
     * @return ticker units elapsed since the current state period began (read without the lock)
     */
    public long elapsed()
    {
        return ticker.read() - periodStart;
    }

    /**
     * @return accumulated time spent in {@link State#NEW}
     */
    public long getStartNanos()
    {
        return startNanos.get();
    }

    /**
     * @return accumulated time spent in {@link State#WAITING}
     */
    public long getWaitNanos()
    {
        return waitNanos.get();
    }

    /**
     * @return accumulated time spent in {@link State#RUNNING}
     */
    public long getScheduledNanos()
    {
        return scheduledNanos.get();
    }

    /**
     * @return accumulated time spent in {@link State#BLOCKED}
     */
    public long getBlockedNanos()
    {
        return blockedNanos.get();
    }

    // Identity is (group, id) only; flags, state and timing counters do not participate
    @Override
    public boolean equals(Object o)
    {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        TaskControl that = (TaskControl) o;
        return id == that.id && group.equals(that.group);
    }

    @Override
    public int hashCode()
    {
        return Objects.hash(group, id);
    }

    /**
     * Renders as {@code <group name>-<id> [<state>]}; takes the lock to read the state.
     */
    @Override
    public String toString()
    {
        lock.lock();
        try {
            return group.name() + "-" + id + " [" + state + "]";
        }
        finally {
            lock.unlock();
        }
    }

    /**
     * @return the thread last passed to {@link #setThread(Thread)}, or null if none was set
     */
    public Thread getThread()
    {
        return thread;
    }

    /**
     * Lifecycle states of a task. A task starts in NEW; INTERRUPTED is entered when a
     * transition is attempted after cancellation.
     */
    public enum State
    {
        NEW,
        WAITING,
        RUNNING,
        BLOCKED,
        INTERRUPTED,
        FINISHED
    }
}
io.trino.execution.executor.timesharing.MultilevelSplitQueue; import io.trino.execution.executor.timesharing.TimeSharingTaskExecutor; import io.trino.execution.scheduler.NodeScheduler; @@ -283,8 +284,6 @@ protected void setup(Binder binder) binder.bind(FailureInjector.class).in(Scopes.SINGLETON); jaxrsBinder(binder).bind(TaskResource.class); newExporter(binder).export(TaskResource.class).withGeneratedName(); - jaxrsBinder(binder).bind(TaskExecutorResource.class); - newExporter(binder).export(TaskExecutorResource.class).withGeneratedName(); binder.bind(TaskManagementExecutor.class).in(Scopes.SINGLETON); binder.bind(SqlTaskManager.class).in(Scopes.SINGLETON); binder.bind(TableExecuteContextManager.class).in(Scopes.SINGLETON); @@ -307,11 +306,6 @@ protected void setup(Binder binder) newOptionalBinder(binder, VersionEmbedder.class).setDefault().to(EmbedVersion.class).in(Scopes.SINGLETON); newExporter(binder).export(SqlTaskManager.class).withGeneratedName(); - binder.bind(TaskExecutor.class) - .to(TimeSharingTaskExecutor.class) - .in(Scopes.SINGLETON); - binder.bind(TimeSharingTaskExecutor.class).in(Scopes.SINGLETON); - newExporter(binder).export(TaskExecutor.class).withGeneratedName(); binder.bind(MultilevelSplitQueue.class).in(Scopes.SINGLETON); newExporter(binder).export(MultilevelSplitQueue.class).withGeneratedName(); @@ -322,6 +316,24 @@ protected void setup(Binder binder) binder.bind(PageFunctionCompiler.class).in(Scopes.SINGLETON); newExporter(binder).export(PageFunctionCompiler.class).withGeneratedName(); configBinder(binder).bindConfig(TaskManagerConfig.class); + + // TODO: use conditional module + TaskManagerConfig taskManagerConfig = buildConfigObject(TaskManagerConfig.class); + if (taskManagerConfig.isThreadPerDriverSchedulerEnabled()) { + binder.bind(TaskExecutor.class) + .to(ThreadPerDriverTaskExecutor.class) + .in(Scopes.SINGLETON); + } + else { + jaxrsBinder(binder).bind(TaskExecutorResource.class); + 
newExporter(binder).export(TaskExecutorResource.class).withGeneratedName(); + + binder.bind(TaskExecutor.class) + .to(TimeSharingTaskExecutor.class) + .in(Scopes.SINGLETON); + binder.bind(TimeSharingTaskExecutor.class).in(Scopes.SINGLETON); + } + if (retryPolicy == TASK) { configBinder(binder).bindConfigDefaults(TaskManagerConfig.class, TaskManagerConfig::applyFaultTolerantExecutionDefaults); } diff --git a/core/trino-main/src/main/java/io/trino/tracing/TrinoAttributes.java b/core/trino-main/src/main/java/io/trino/tracing/TrinoAttributes.java index 31368e066e70..d32429a28f16 100644 --- a/core/trino-main/src/main/java/io/trino/tracing/TrinoAttributes.java +++ b/core/trino-main/src/main/java/io/trino/tracing/TrinoAttributes.java @@ -55,6 +55,8 @@ private TrinoAttributes() {} public static final AttributeKey SPLIT_SCHEDULED_TIME_NANOS = longKey("trino.split.scheduled_time_nanos"); public static final AttributeKey SPLIT_CPU_TIME_NANOS = longKey("trino.split.cpu_time_nanos"); public static final AttributeKey SPLIT_WAIT_TIME_NANOS = longKey("trino.split.wait_time_nanos"); + public static final AttributeKey SPLIT_START_TIME_NANOS = longKey("trino.split.start_time_nanos"); + public static final AttributeKey SPLIT_BLOCK_TIME_NANOS = longKey("trino.split.block_time_nanos"); public static final AttributeKey SPLIT_BLOCKED = booleanKey("trino.split.blocked"); public static final AttributeKey EVENT_STATE = stringKey("state"); diff --git a/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManagerThreadPerDriver.java b/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManagerThreadPerDriver.java new file mode 100644 index 000000000000..bbec2769cdc5 --- /dev/null +++ b/core/trino-main/src/test/java/io/trino/execution/TestSqlTaskManagerThreadPerDriver.java @@ -0,0 +1,35 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.execution; + +import com.google.common.base.Ticker; +import io.airlift.tracing.Tracing; +import io.trino.execution.executor.TaskExecutor; +import io.trino.execution.executor.dedicated.ThreadPerDriverTaskExecutor; +import io.trino.execution.executor.scheduler.FairScheduler; + +import static io.trino.version.EmbedVersion.testingVersionEmbedder; + +public class TestSqlTaskManagerThreadPerDriver + extends BaseTestSqlTaskManager +{ + @Override + protected TaskExecutor createTaskExecutor() + { + return new ThreadPerDriverTaskExecutor( + Tracing.noopTracer(), + testingVersionEmbedder(), + new FairScheduler(8, "Runner-%d", Ticker.systemTicker())); + } +} diff --git a/core/trino-main/src/test/java/io/trino/execution/TestTaskManagerConfig.java b/core/trino-main/src/test/java/io/trino/execution/TestTaskManagerConfig.java index ed74d9203560..0030d5d88fe2 100644 --- a/core/trino-main/src/test/java/io/trino/execution/TestTaskManagerConfig.java +++ b/core/trino-main/src/test/java/io/trino/execution/TestTaskManagerConfig.java @@ -40,6 +40,7 @@ public class TestTaskManagerConfig public void testDefaults() { assertRecordedDefaults(recordDefaults(TaskManagerConfig.class) + .setThreadPerDriverSchedulerEnabled(false) .setInitialSplitsPerNode(Runtime.getRuntime().availableProcessors() * 2) .setSplitConcurrencyAdjustmentInterval(new Duration(100, TimeUnit.MILLISECONDS)) .setStatusRefreshMaxWait(new Duration(1, TimeUnit.SECONDS)) @@ -85,6 +86,7 @@ public void testExplicitPropertyMappings() int processorCount = DEFAULT_PROCESSOR_COUNT == 32 ? 
16 : 32; int maxWriterCount = DEFAULT_SCALE_WRITERS_MAX_WRITER_COUNT == 32 ? 16 : 32; Map properties = ImmutableMap.builder() + .put("experimental.thread-per-driver-scheduler-enabled", "true") .put("task.initial-splits-per-node", "1") .put("task.split-concurrency-adjustment-interval", "1s") .put("task.status-refresh-max-wait", "2s") @@ -125,6 +127,7 @@ public void testExplicitPropertyMappings() .buildOrThrow(); TaskManagerConfig expected = new TaskManagerConfig() + .setThreadPerDriverSchedulerEnabled(true) .setInitialSplitsPerNode(1) .setSplitConcurrencyAdjustmentInterval(new Duration(1, TimeUnit.SECONDS)) .setStatusRefreshMaxWait(new Duration(2, TimeUnit.SECONDS)) diff --git a/core/trino-main/src/test/java/io/trino/execution/executor/dedicated/TestThreadPerDriverTaskExecutor.java b/core/trino-main/src/test/java/io/trino/execution/executor/dedicated/TestThreadPerDriverTaskExecutor.java new file mode 100644 index 000000000000..2648bfbc581b --- /dev/null +++ b/core/trino-main/src/test/java/io/trino/execution/executor/dedicated/TestThreadPerDriverTaskExecutor.java @@ -0,0 +1,258 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor.dedicated; + +import com.google.common.collect.ImmutableList; +import com.google.common.util.concurrent.AbstractFuture; +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; +import io.airlift.testing.TestingTicker; +import io.airlift.units.Duration; +import io.opentelemetry.api.trace.Span; +import io.trino.execution.SplitRunner; +import io.trino.execution.StageId; +import io.trino.execution.TaskId; +import io.trino.execution.TaskManagerConfig; +import io.trino.execution.executor.TaskHandle; +import io.trino.execution.executor.scheduler.FairScheduler; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; + +import java.util.List; +import java.util.OptionalInt; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executor; +import java.util.concurrent.Phaser; +import java.util.concurrent.TimeUnit; +import java.util.function.Function; + +import static io.airlift.tracing.Tracing.noopTracer; +import static io.trino.version.EmbedVersion.testingVersionEmbedder; +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static org.assertj.core.api.Assertions.assertThat; + +public class TestThreadPerDriverTaskExecutor +{ + @Test + @Timeout(10) + public void testCancellationWhileProcessing() + throws ExecutionException, InterruptedException + { + ThreadPerDriverTaskExecutor executor = new ThreadPerDriverTaskExecutor(new TaskManagerConfig(), noopTracer(), testingVersionEmbedder()); + executor.start(); + try { + TaskId taskId = new TaskId(new StageId("query", 1), 1, 1); + TaskHandle task = executor.addTask(taskId, () -> 0, 10, new Duration(1, MILLISECONDS), OptionalInt.empty()); + + CountDownLatch started = new CountDownLatch(1); + + SplitRunner split = new TestingSplitRunner(ImmutableList.of(duration -> { + started.countDown(); + try { + Thread.currentThread().join(); + } + catch 
(InterruptedException e) { + Thread.currentThread().interrupt(); + } + + return Futures.immediateVoidFuture(); + })); + + ListenableFuture splitDone = executor.enqueueSplits(task, false, ImmutableList.of(split)).get(0); + + started.await(); + executor.removeTask(task); + + splitDone.get(); + assertThat(split.isFinished()).isTrue(); + } + finally { + executor.stop(); + } + } + + @Test + @Timeout(10) + public void testBlocking() + throws ExecutionException, InterruptedException + { + ThreadPerDriverTaskExecutor executor = new ThreadPerDriverTaskExecutor(new TaskManagerConfig(), noopTracer(), testingVersionEmbedder()); + executor.start(); + + try { + TaskId taskId = new TaskId(new StageId("query", 1), 1, 1); + TaskHandle task = executor.addTask(taskId, () -> 0, 10, new Duration(1, MILLISECONDS), OptionalInt.empty()); + + TestFuture blocked = new TestFuture(); + + SplitRunner split = new TestingSplitRunner(ImmutableList.of( + duration -> blocked, + duration -> Futures.immediateVoidFuture())); + + ListenableFuture splitDone = executor.enqueueSplits(task, false, ImmutableList.of(split)).get(0); + + blocked.awaitListenerAdded(); + blocked.set(null); // unblock the split + + splitDone.get(); + assertThat(split.isFinished()).isTrue(); + } + finally { + executor.stop(); + } + } + + @Test + @Timeout(10) + public void testYielding() + throws ExecutionException, InterruptedException + { + TestingTicker ticker = new TestingTicker(); + FairScheduler scheduler = new FairScheduler(1, "Runner-%d", ticker); + ThreadPerDriverTaskExecutor executor = new ThreadPerDriverTaskExecutor(noopTracer(), testingVersionEmbedder(), scheduler); + executor.start(); + + try { + TaskId taskId = new TaskId(new StageId("query", 1), 1, 1); + TaskHandle task = executor.addTask(taskId, () -> 0, 10, new Duration(1, MILLISECONDS), OptionalInt.empty()); + + Phaser phaser = new Phaser(2); + SplitRunner split = new TestingSplitRunner(ImmutableList.of( + duration -> { + phaser.arriveAndAwaitAdvance(); // wait to 
start + phaser.arriveAndAwaitAdvance(); // wait to advance time + return Futures.immediateVoidFuture(); + }, + duration -> { + phaser.arriveAndAwaitAdvance(); + return Futures.immediateVoidFuture(); + })); + + ListenableFuture splitDone = executor.enqueueSplits(task, false, ImmutableList.of(split)).get(0); + + phaser.arriveAndAwaitAdvance(); // wait for split to start + + // cause the task to yield + ticker.increment(FairScheduler.QUANTUM_NANOS * 2, TimeUnit.NANOSECONDS); + phaser.arriveAndAwaitAdvance(); + + // wait for reschedule + assertThat(phaser.arriveAndAwaitAdvance()).isEqualTo(3); // wait for reschedule + + splitDone.get(); + assertThat(split.isFinished()).isTrue(); + } + finally { + executor.stop(); + } + } + + private static class TestFuture + extends AbstractFuture + { + private final CountDownLatch listenerAdded = new CountDownLatch(1); + + @Override + public void addListener(Runnable listener, Executor executor) + { + super.addListener(listener, executor); + listenerAdded.countDown(); + } + + @Override + public boolean set(Void value) + { + return super.set(value); + } + + public void awaitListenerAdded() + throws InterruptedException + { + listenerAdded.await(); + } + } + + private static class TestingSplitRunner + implements SplitRunner + { + private final List>> invocations; + private int invocation; + private volatile boolean finished; + private volatile Thread runnerThread; + + public TestingSplitRunner(List>> invocations) + { + this.invocations = invocations; + } + + @Override + public final int getPipelineId() + { + return 0; + } + + @Override + public final Span getPipelineSpan() + { + return Span.getInvalid(); + } + + @Override + public final boolean isFinished() + { + return finished; + } + + @Override + public final ListenableFuture processFor(Duration duration) + { + ListenableFuture blocked; + + runnerThread = Thread.currentThread(); + try { + blocked = invocations.get(invocation).apply(duration); + } + finally { + runnerThread = null; + 
} + + invocation++; + + if (invocation == invocations.size()) { + finished = true; + } + + return blocked; + } + + @Override + public final String getInfo() + { + return ""; + } + + @Override + public final void close() + { + finished = true; + + Thread runnerThread = this.runnerThread; + + if (runnerThread != null) { + runnerThread.interrupt(); + } + } + } +} diff --git a/core/trino-main/src/test/java/io/trino/execution/executor/scheduler/TestFairScheduler.java b/core/trino-main/src/test/java/io/trino/execution/executor/scheduler/TestFairScheduler.java new file mode 100644 index 000000000000..24dd6809159e --- /dev/null +++ b/core/trino-main/src/test/java/io/trino/execution/executor/scheduler/TestFairScheduler.java @@ -0,0 +1,227 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor.scheduler; + +import com.google.common.util.concurrent.AbstractFuture; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.SettableFuture; +import io.airlift.testing.TestingTicker; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; + +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TestFairScheduler +{ + @Test + public void testBasic() + throws ExecutionException, InterruptedException + { + try (FairScheduler scheduler = FairScheduler.newInstance(1)) { + Group group = scheduler.createGroup("G1"); + + AtomicBoolean ran = new AtomicBoolean(); + ListenableFuture done = scheduler.submit(group, 1, context -> ran.set(true)); + + done.get(); + assertThat(ran.get()) + .describedAs("Ran task") + .isTrue(); + } + } + + @Test + @Timeout(5) + public void testYield() + throws ExecutionException, InterruptedException + { + TestingTicker ticker = new TestingTicker(); + try (FairScheduler scheduler = FairScheduler.newInstance(1, ticker)) { + Group group = scheduler.createGroup("G"); + + CountDownLatch task1Started = new CountDownLatch(1); + AtomicBoolean task2Ran = new AtomicBoolean(); + + ListenableFuture task1 = scheduler.submit(group, 1, context -> { + task1Started.countDown(); + while (!task2Ran.get()) { + if (!context.maybeYield()) { + return; + } + } + }); + + task1Started.await(); + + ListenableFuture task2 = scheduler.submit(group, 2, context -> { + task2Ran.set(true); + }); + + while (!task2.isDone()) { + ticker.increment(FairScheduler.QUANTUM_NANOS * 2, TimeUnit.NANOSECONDS); + } + + task1.get(); + } + } + + @Test + public void testBlocking() + throws InterruptedException, ExecutionException + { + try 
(FairScheduler scheduler = FairScheduler.newInstance(1)) { + Group group = scheduler.createGroup("G"); + + CountDownLatch task1Started = new CountDownLatch(1); + CountDownLatch task2Submitted = new CountDownLatch(1); + CountDownLatch task2Started = new CountDownLatch(1); + AtomicBoolean task2Ran = new AtomicBoolean(); + + SettableFuture task1Blocked = SettableFuture.create(); + + ListenableFuture task1 = scheduler.submit(group, 1, context -> { + try { + task1Started.countDown(); + task2Submitted.await(); + + assertThat(task2Ran.get()) + .describedAs("Task 2 run") + .isFalse(); + + context.block(task1Blocked); + + assertThat(task2Ran.get()) + .describedAs("Task 2 run") + .isTrue(); + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException(e); + } + }); + + task1Started.await(); + + ListenableFuture task2 = scheduler.submit(group, 2, context -> { + task2Started.countDown(); + task2Ran.set(true); + }); + + task2Submitted.countDown(); + task2Started.await(); + + // unblock task 1 + task1Blocked.set(null); + + task1.get(); + task2.get(); + } + } + + @Test + public void testCancelWhileYielding() + throws InterruptedException, ExecutionException + { + TestingTicker ticker = new TestingTicker(); + try (FairScheduler scheduler = FairScheduler.newInstance(1, ticker)) { + Group group = scheduler.createGroup("G"); + + CountDownLatch task1Started = new CountDownLatch(1); + CountDownLatch task1TimeAdvanced = new CountDownLatch(1); + + ListenableFuture task1 = scheduler.submit(group, 1, context -> { + try { + task1Started.countDown(); + task1TimeAdvanced.await(); + + assertThat(context.maybeYield()) + .describedAs("Cancelled while yielding") + .isFalse(); + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException(e); + } + }); + + task1Started.await(); + scheduler.pause(); // prevent rescheduling after yield + + ticker.increment(FairScheduler.QUANTUM_NANOS * 2, TimeUnit.NANOSECONDS); + 
task1TimeAdvanced.countDown(); + + scheduler.removeGroup(group); + task1.get(); + } + } + + @Test + public void testCancelWhileBlocking() + throws InterruptedException, ExecutionException + { + TestingTicker ticker = new TestingTicker(); + try (FairScheduler scheduler = FairScheduler.newInstance(1, ticker)) { + Group group = scheduler.createGroup("G"); + + CountDownLatch task1Started = new CountDownLatch(1); + TestFuture task1Blocked = new TestFuture(); + + ListenableFuture task1 = scheduler.submit(group, 1, context -> { + task1Started.countDown(); + + assertThat(context.block(task1Blocked)) + .describedAs("Cancelled while blocking") + .isFalse(); + }); + + task1Started.await(); + + task1Blocked.awaitListenerAdded(); // When the listener is added, we know the task is blocked + + scheduler.removeGroup(group); + task1.get(); + } + } + + private static class TestFuture + extends AbstractFuture + { + private final CountDownLatch listenerAdded = new CountDownLatch(1); + + @Override + public void addListener(Runnable listener, Executor executor) + { + super.addListener(listener, executor); + listenerAdded.countDown(); + } + + @Override + public boolean set(Void value) + { + return super.set(value); + } + + public void awaitListenerAdded() + throws InterruptedException + { + listenerAdded.await(); + } + } +} diff --git a/core/trino-main/src/test/java/io/trino/execution/executor/scheduler/TestPriorityQueue.java b/core/trino-main/src/test/java/io/trino/execution/executor/scheduler/TestPriorityQueue.java new file mode 100644 index 000000000000..d4f799e1a5e3 --- /dev/null +++ b/core/trino-main/src/test/java/io/trino/execution/executor/scheduler/TestPriorityQueue.java @@ -0,0 +1,210 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.execution.executor.scheduler; + +import com.google.common.collect.ImmutableSet; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +public class TestPriorityQueue +{ + @Test + public void testEmpty() + { + PriorityQueue queue = new PriorityQueue<>(); + + assertThat(queue.poll()).isNull(); + assertThat(queue.isEmpty()).isTrue(); + } + + @Test + public void testNotEmpty() + { + PriorityQueue queue = new PriorityQueue<>(); + + queue.add("hello", 1); + assertThat(queue.isEmpty()).isFalse(); + } + + @Test + public void testDuplicate() + { + PriorityQueue queue = new PriorityQueue<>(); + + queue.add("hello", 1); + assertThatThrownBy(() -> queue.add("hello", 2)) + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + public void testOrder() + { + PriorityQueue queue = new PriorityQueue<>(); + + queue.add("jumps", 5); + queue.add("fox", 4); + queue.add("over", 6); + queue.add("brown", 3); + queue.add("dog", 8); + queue.add("the", 1); + queue.add("lazy", 7); + queue.add("quick", 2); + + assertThat(queue.poll()).isEqualTo("the"); + assertThat(queue.poll()).isEqualTo("quick"); + assertThat(queue.poll()).isEqualTo("brown"); + assertThat(queue.poll()).isEqualTo("fox"); + assertThat(queue.poll()).isEqualTo("jumps"); + assertThat(queue.poll()).isEqualTo("over"); + assertThat(queue.poll()).isEqualTo("lazy"); + assertThat(queue.poll()).isEqualTo("dog"); + assertThat(queue.poll()).isNull(); + } + + @Test + public void 
testInterleaved() + { + PriorityQueue queue = new PriorityQueue<>(); + + queue.add("jumps", 5); + queue.add("over", 6); + queue.add("fox", 4); + + assertThat(queue.poll()).isEqualTo("fox"); + assertThat(queue.poll()).isEqualTo("jumps"); + + queue.add("brown", 3); + queue.add("dog", 8); + queue.add("the", 1); + + assertThat(queue.poll()).isEqualTo("the"); + assertThat(queue.poll()).isEqualTo("brown"); + assertThat(queue.poll()).isEqualTo("over"); + + queue.add("lazy", 7); + queue.add("quick", 2); + + assertThat(queue.poll()).isEqualTo("quick"); + assertThat(queue.poll()).isEqualTo("lazy"); + assertThat(queue.poll()).isEqualTo("dog"); + assertThat(queue.poll()).isNull(); + } + + @Test + public void testRemove() + { + PriorityQueue queue = new PriorityQueue<>(); + + queue.add("fox", 4); + queue.add("brown", 3); + queue.add("the", 1); + queue.add("quick", 2); + + queue.remove("brown"); + + assertThat(queue.poll()).isEqualTo("the"); + assertThat(queue.poll()).isEqualTo("quick"); + assertThat(queue.poll()).isEqualTo("fox"); + assertThat(queue.poll()).isNull(); + } + + @Test + public void testRemoveMissing() + { + PriorityQueue queue = new PriorityQueue<>(); + + queue.add("the", 1); + queue.add("quick", 2); + queue.add("brown", 3); + + assertThatThrownBy(() -> queue.remove("fox")) + .isInstanceOf(IllegalArgumentException.class); + + queue.removeIfPresent("fox"); + } + + @Test + public void testContains() + { + PriorityQueue queue = new PriorityQueue<>(); + + queue.add("the", 1); + queue.add("quick", 2); + queue.add("brown", 3); + + assertThat(queue.contains("quick")).isTrue(); + assertThat(queue.contains("fox")).isFalse(); + } + + @Test + public void testRecycle() + { + PriorityQueue queue = new PriorityQueue<>(); + + queue.add("hello", 1); + assertThat(queue.poll()).isEqualTo("hello"); + + queue.add("hello", 2); + assertThat(queue.poll()).isEqualTo("hello"); + } + + @Test + public void testValues() + { + PriorityQueue queue = new PriorityQueue<>(); + + 
assertThat(queue.values()).isEmpty(); + + queue.add("hello", 1); + queue.add("world", 2); + + assertThat(queue.values()) + .isEqualTo(ImmutableSet.of("hello", "world")); + } + + @Test + public void testNextPriority() + { + PriorityQueue queue = new PriorityQueue<>(); + + assertThatThrownBy(queue::nextPriority) + .isInstanceOf(IllegalStateException.class); + + queue.add("hello", 10); + queue.add("world", 20); + + assertThat(queue.nextPriority()).isEqualTo(10); + + queue.poll(); + assertThat(queue.nextPriority()).isEqualTo(20); + + queue.poll(); + assertThatThrownBy(queue::nextPriority) + .isInstanceOf(IllegalStateException.class); + } + + @Test + public void testSamePriority() + { + PriorityQueue queue = new PriorityQueue<>(); + + queue.add("hello", 1); + queue.add("world", 1); + + assertThat(queue.poll()).isEqualTo("hello"); + assertThat(queue.poll()).isEqualTo("world"); + } +} diff --git a/core/trino-main/src/test/java/io/trino/execution/executor/scheduler/TestSchedulingQueue.java b/core/trino-main/src/test/java/io/trino/execution/executor/scheduler/TestSchedulingQueue.java new file mode 100644 index 000000000000..cc31c9db2d86 --- /dev/null +++ b/core/trino-main/src/test/java/io/trino/execution/executor/scheduler/TestSchedulingQueue.java @@ -0,0 +1,323 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.execution.executor.scheduler; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TestSchedulingQueue +{ + @Test + public void testEmpty() + { + SchedulingQueue queue = new SchedulingQueue<>(); + + assertThat(queue.dequeue(1)).isNull(); + } + + @Test + public void testSingleGroup() + { + SchedulingQueue queue = new SchedulingQueue<>(); + + queue.startGroup("G1"); + + queue.enqueue("G1", "T1", 1); + queue.enqueue("G1", "T2", 3); + queue.enqueue("G1", "T3", 5); + queue.enqueue("G1", "T4", 7); + + assertThat(queue.dequeue(1)).isEqualTo("T1"); + assertThat(queue.dequeue(1)).isEqualTo("T2"); + assertThat(queue.dequeue(1)).isEqualTo("T3"); + assertThat(queue.dequeue(1)).isEqualTo("T4"); + + queue.enqueue("G1", "T1", 10); + queue.enqueue("G1", "T2", 10); + queue.enqueue("G1", "T3", 10); + queue.enqueue("G1", "T4", 10); + + assertThat(queue.dequeue(1)).isEqualTo("T1"); + assertThat(queue.dequeue(1)).isEqualTo("T2"); + assertThat(queue.dequeue(1)).isEqualTo("T3"); + assertThat(queue.dequeue(1)).isEqualTo("T4"); + + queue.enqueue("G1", "T1", 16); + queue.enqueue("G1", "T2", 12); + queue.enqueue("G1", "T3", 8); + queue.enqueue("G1", "T4", 4); + + assertThat(queue.dequeue(1)).isEqualTo("T4"); + assertThat(queue.dequeue(1)).isEqualTo("T3"); + assertThat(queue.dequeue(1)).isEqualTo("T2"); + assertThat(queue.dequeue(1)).isEqualTo("T1"); + + queue.finish("G1", "T1"); + queue.finish("G1", "T2"); + queue.finish("G1", "T3"); + queue.finish("G1", "T4"); + + assertThat(queue.state("G1")).isEqualTo(State.BLOCKED); + } + + @Test + public void testBasic() + { + SchedulingQueue queue = new SchedulingQueue<>(); + + queue.startGroup("G1"); + queue.startGroup("G2"); + + queue.enqueue("G1", "T1.0", 1); + queue.enqueue("G1", "T1.1", 2); + queue.enqueue("G2", "T2.0", 3); + queue.enqueue("G2", "T2.1", 4); + + assertThat(queue.dequeue(1)).isEqualTo("T1.0"); + assertThat(queue.dequeue(1)).isEqualTo("T1.1"); + 
assertThat(queue.dequeue(1)).isEqualTo("T2.0"); + assertThat(queue.dequeue(1)).isEqualTo("T2.1"); + + queue.enqueue("G1", "T1.0", 10); + queue.enqueue("G1", "T1.1", 20); + queue.enqueue("G2", "T2.0", 15); + queue.enqueue("G2", "T2.1", 5); + + assertThat(queue.dequeue(1)).isEqualTo("T2.1"); + assertThat(queue.dequeue(1)).isEqualTo("T2.0"); + assertThat(queue.dequeue(1)).isEqualTo("T1.0"); + assertThat(queue.dequeue(1)).isEqualTo("T1.1"); + + queue.enqueue("G1", "T1.0", 100); + queue.enqueue("G2", "T2.0", 90); + assertThat(queue.dequeue(1)).isEqualTo("T2.0"); + assertThat(queue.dequeue(1)).isEqualTo("T1.0"); + } + + @Test + public void testSomeEmptyGroups() + { + SchedulingQueue queue = new SchedulingQueue<>(); + + queue.startGroup("G1"); + queue.startGroup("G2"); + + queue.enqueue("G2", "T1", 0); + + assertThat(queue.dequeue(1)).isEqualTo("T1"); + } + + @Test + public void testDelayedCreation() + { + SchedulingQueue queue = new SchedulingQueue<>(); + + queue.startGroup("G1"); + queue.startGroup("G2"); + + queue.enqueue("G1", "T1.0", 100); + queue.enqueue("G2", "T2.0", 200); + + queue.startGroup("G3"); // new group gets a priority baseline equal to the minimum current priority + queue.enqueue("G3", "T3.0", 50); + + assertThat(queue.dequeue(1)).isEqualTo("T1.0"); + assertThat(queue.dequeue(1)).isEqualTo("T3.0"); + assertThat(queue.dequeue(1)).isEqualTo("T2.0"); + } + + @Test + public void testDelayedCreationWhileAllRunning() + { + SchedulingQueue queue = new SchedulingQueue<>(); + + queue.startGroup("G1"); + queue.startGroup("G2"); + + queue.enqueue("G1", "T1.0", 0); + + queue.enqueue("G2", "T2.0", 100); + queue.dequeue(50); + queue.dequeue(50); + + queue.startGroup("G3"); // new group gets a priority baseline equal to the minimum current priority + queue.enqueue("G3", "T3.0", 10); + + queue.enqueue("G1", "T1.0", 50); + queue.enqueue("G2", "T2.0", 50); + + assertThat(queue.dequeue(1)).isEqualTo("T1.0"); + assertThat(queue.dequeue(1)).isEqualTo("T3.0"); + 
assertThat(queue.dequeue(1)).isEqualTo("T2.0"); + } + + @Test + public void testGroupState() + { + SchedulingQueue queue = new SchedulingQueue<>(); + + // initial state with no tasks + queue.startGroup("G1"); + assertThat(queue.state("G1")).isEqualTo(State.BLOCKED); + + // after adding a task, it should be runnable + queue.enqueue("G1", "T1", 0); + assertThat(queue.state("G1")).isEqualTo(State.RUNNABLE); + queue.enqueue("G1", "T2", 0); + assertThat(queue.state("G1")).isEqualTo(State.RUNNABLE); + + // after dequeueing, still runnable if there's at least one runnable task + queue.dequeue(1); + assertThat(queue.state("G1")).isEqualTo(State.RUNNABLE); + + // after all tasks are dequeued, it should be running + queue.dequeue(1); + assertThat(queue.state("G1")).isEqualTo(State.RUNNING); + + // still running while at least one task is running and there are no runnable tasks + queue.block("G1", "T1", 1); + assertThat(queue.state("G1")).isEqualTo(State.RUNNING); + + // runnable after blocking when there are still runnable tasks + queue.enqueue("G1", "T1", 1); + queue.block("G1", "T2", 1); + assertThat(queue.state("G1")).isEqualTo(State.RUNNABLE); + + // blocked when all tasks are blocked + queue.dequeue(1); + queue.block("G1", "T1", 1); + assertThat(queue.state("G1")).isEqualTo(State.BLOCKED); + + // back to runnable after unblocking + queue.enqueue("G1", "T1", 1); + assertThat(queue.state("G1")).isEqualTo(State.RUNNABLE); + } + + @Test + public void testNonGreedyDeque() + { + SchedulingQueue queue = new SchedulingQueue<>(); + + queue.startGroup("G1"); + queue.startGroup("G2"); + + queue.enqueue("G1", "T1.0", 0); + queue.enqueue("G2", "T2.0", 1); + + queue.enqueue("G1", "T1.1", 2); + queue.enqueue("G1", "T1.2", 3); + + queue.enqueue("G2", "T2.1", 2); + queue.enqueue("G2", "T2.2", 3); + + assertThat(queue.dequeue(2)).isEqualTo("T1.0"); + assertThat(queue.dequeue(2)).isEqualTo("T2.0"); + assertThat(queue.dequeue(2)).isEqualTo("T1.1"); + 
assertThat(queue.dequeue(2)).isEqualTo("T2.1"); + assertThat(queue.dequeue(2)).isEqualTo("T1.2"); + assertThat(queue.dequeue(2)).isEqualTo("T2.2"); + assertThat(queue.dequeue(2)).isNull(); + } + + @Test + public void testFinishTask() + { + SchedulingQueue queue = new SchedulingQueue<>(); + + queue.startGroup("G1"); + queue.enqueue("G1", "T1", 0); + queue.enqueue("G1", "T2", 1); + queue.enqueue("G1", "T3", 2); + + assertThat(queue.peek()).isEqualTo("T1"); + queue.finish("G1", "T1"); + assertThat(queue.peek()).isEqualTo("T2"); + assertThat(queue.state("G1")).isEqualTo(State.RUNNABLE); + + // check that the group becomes not-runnable + queue.finish("G1", "T2"); + queue.finish("G1", "T3"); + assertThat(queue.peek()).isNull(); + assertThat(queue.state("G1")).isEqualTo(State.BLOCKED); + + // check that the group becomes runnable again + queue.enqueue("G1", "T4", 0); + assertThat(queue.peek()).isEqualTo("T4"); + assertThat(queue.state("G1")).isEqualTo(State.RUNNABLE); + } + + @Test + public void testFinishTaskWhileRunning() + { + SchedulingQueue queue = new SchedulingQueue<>(); + queue.startGroup("G1"); + + queue.enqueue("G1", "T1", 0); + queue.enqueue("G1", "T2", 1); + queue.enqueue("G1", "T3", 2); + assertThat(queue.dequeue(0)).isEqualTo("T1"); + assertThat(queue.dequeue(0)).isEqualTo("T2"); + assertThat(queue.peek()).isEqualTo("T3"); + assertThat(queue.state("G1")).isEqualTo(State.RUNNABLE); + + queue.finish("G1", "T3"); + assertThat(queue.state("G1")).isEqualTo(State.RUNNING); + + queue.finish("G1", "T1"); + assertThat(queue.state("G1")).isEqualTo(State.RUNNING); + + queue.finish("G1", "T2"); + assertThat(queue.state("G1")).isEqualTo(State.BLOCKED); + } + + @Test + public void testFinishTaskWhileBlocked() + { + SchedulingQueue queue = new SchedulingQueue<>(); + queue.startGroup("G1"); + + queue.enqueue("G1", "T1", 0); + queue.enqueue("G1", "T2", 1); + assertThat(queue.dequeue(0)).isEqualTo("T1"); + assertThat(queue.dequeue(0)).isEqualTo("T2"); + queue.block("G1", 
"T1", 0); + queue.block("G1", "T2", 0); + assertThat(queue.state("G1")).isEqualTo(State.BLOCKED); + + queue.finish("G1", "T1"); + assertThat(queue.state("G1")).isEqualTo(State.BLOCKED); + + queue.finish("G1", "T2"); + assertThat(queue.state("G1")).isEqualTo(State.BLOCKED); + } + + @Test + public void testFinishGroup() + { + SchedulingQueue queue = new SchedulingQueue<>(); + + queue.startGroup("G1"); + queue.enqueue("G1", "T1.1", 0); + assertThat(queue.peek()).isEqualTo("T1.1"); + + queue.startGroup("G2"); + queue.enqueue("G2", "T2.1", 1); + assertThat(queue.peek()).isEqualTo("T1.1"); + + queue.finishGroup("G1"); + assertThat(queue.containsGroup("G1")).isFalse(); + assertThat(queue.peek()).isEqualTo("T2.1"); + } +} diff --git a/pom.xml b/pom.xml index 42d224497844..1b97f9a074ac 100644 --- a/pom.xml +++ b/pom.xml @@ -2497,7 +2497,8 @@ -Xep:StreamResourceLeak:ERROR \ -Xep:UnnecessaryMethodReference:ERROR \ -Xep:UnnecessaryOptionalGet:ERROR \ - -Xep:UnusedVariable:ERROR \ + + -Xep:UseEnumSwitch:ERROR \ -XepExcludedPaths:.*/target/generated-(|test-)sources/.*