-
Notifications
You must be signed in to change notification settings - Fork 5.5k
fix(scheduler): Coordinator Task Throttling Bug #27146
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1098,4 +1098,215 @@ public String getName() | |
|
|
||
| return new ClusterResourceChecker(mockPolicy, config, createNodeManager()); | ||
| } | ||
|
|
||
| // Tests that when task limit is exceeded, new queries are queued instead of starting immediately | ||
| @Test(timeOut = 10_000) | ||
| public void testTaskLimitExceededQueuesQuery() | ||
| { | ||
| RootInternalResourceGroup root = new RootInternalResourceGroup( | ||
| "root", | ||
| (group, export) -> {}, | ||
| directExecutor(), | ||
| ignored -> Optional.empty(), | ||
| rg -> false, | ||
| createNodeManager(), | ||
| createClusterResourceChecker(), | ||
| QueryPacingContext.NOOP); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion (testing): Consider adding a test that covers the The production logic now treats the coordinator as overloaded if either Suggested implementation: private static final class TestQueryPacingContext
implements QueryPacingContext
{
private final java.util.concurrent.atomic.AtomicBoolean admit;
private final java.util.concurrent.atomic.AtomicInteger inUseSlots = new java.util.concurrent.atomic.AtomicInteger();
private TestQueryPacingContext(boolean initialAdmit)
{
this.admit = new java.util.concurrent.atomic.AtomicBoolean(initialAdmit);
}
@Override
public boolean tryAcquireAdmissionSlot()
{
if (!admit.get()) {
return false;
}
inUseSlots.incrementAndGet();
return true;
}
@Override
public void releaseAdmissionSlot()
{
inUseSlots.decrementAndGet();
}
public void setAdmit(boolean value)
{
admit.set(value);
}
public int getInUseSlots()
{
return inUseSlots.get();
}
}
@Test
public void testAdmissionSlotFailureQueuesAndStartsQueries()
{
TestQueryPacingContext pacingContext = new TestQueryPacingContext(false);
RootInternalResourceGroup root = new RootInternalResourceGroup(
"root",
(group, export) -> {},
directExecutor(),
ignored -> Optional.empty(),
rg -> false,
createNodeManager(),
createClusterResourceChecker(),
pacingContext);
root.setSoftMemoryLimit(new DataSize(1, MEGABYTE));
root.setMaxQueuedQueries(10);
root.setHardConcurrencyLimit(10);
// No task limit; overload purely due to admission slots
root.setTaskLimitExceeded(false);
// Immediate-start candidate should be queued because tryAcquireAdmissionSlot() fails
MockManagedQueryExecution first = new MockManagedQueryExecution(0);
first.startWaitingForPrerequisites();
root.run(first);
assertEquals(root.getQueuedQueries(), 1);
assertFalse(first.getState().isDone());
// Allow admissions and process queued queries
pacingContext.setAdmit(true);
root.processQueuedQueries();
assertEquals(root.getQueuedQueries(), 0);
assertTrue(first.isDone());
}
public void testTaskLimitExceededQueuesQuery()To integrate this cleanly with the existing file, you’ll likely need to:
|
||
| root.setSoftMemoryLimit(new DataSize(1, MEGABYTE)); | ||
| root.setMaxQueuedQueries(10); | ||
| root.setHardConcurrencyLimit(10); | ||
|
|
||
| // Set task limit exceeded | ||
| root.setTaskLimitExceeded(true); | ||
|
|
||
| // Submit a query - it should be queued because task limit is exceeded | ||
| MockManagedQueryExecution query1 = new MockManagedQueryExecution(0); | ||
| query1.startWaitingForPrerequisites(); | ||
| root.run(query1); | ||
|
|
||
| // Query should be queued, not running | ||
| assertEquals(query1.getState(), QUEUED); | ||
| assertEquals(root.getQueuedQueries(), 1); | ||
| assertEquals(root.getRunningQueries(), 0); | ||
| } | ||
|
|
||
| // Tests that queued queries start when task limit is no longer exceeded | ||
| @Test(timeOut = 10_000) | ||
| public void testQueryStartsWhenTaskLimitClears() | ||
| { | ||
| RootInternalResourceGroup root = new RootInternalResourceGroup( | ||
| "root", | ||
| (group, export) -> {}, | ||
| directExecutor(), | ||
| ignored -> Optional.empty(), | ||
| rg -> false, | ||
| createNodeManager(), | ||
| createClusterResourceChecker(), | ||
| QueryPacingContext.NOOP); | ||
| root.setSoftMemoryLimit(new DataSize(1, MEGABYTE)); | ||
| root.setMaxQueuedQueries(10); | ||
| root.setHardConcurrencyLimit(10); | ||
|
|
||
| // Set task limit exceeded | ||
| root.setTaskLimitExceeded(true); | ||
|
|
||
| // Submit queries - they should be queued | ||
| MockManagedQueryExecution query1 = new MockManagedQueryExecution(0); | ||
| query1.startWaitingForPrerequisites(); | ||
| root.run(query1); | ||
| MockManagedQueryExecution query2 = new MockManagedQueryExecution(0); | ||
| query2.startWaitingForPrerequisites(); | ||
| root.run(query2); | ||
|
|
||
| assertEquals(query1.getState(), QUEUED); | ||
| assertEquals(query2.getState(), QUEUED); | ||
| assertEquals(root.getQueuedQueries(), 2); | ||
| assertEquals(root.getRunningQueries(), 0); | ||
|
|
||
| // Clear task limit | ||
| root.setTaskLimitExceeded(false); | ||
|
|
||
| // Process queued queries - they should now start | ||
| root.processQueuedQueries(); | ||
|
|
||
| assertEquals(query1.getState(), RUNNING); | ||
| assertEquals(query2.getState(), RUNNING); | ||
| assertEquals(root.getQueuedQueries(), 0); | ||
| assertEquals(root.getRunningQueries(), 2); | ||
| } | ||
|
|
||
| // Tests that queries in a subgroup hierarchy are properly queued and started when task limit changes | ||
| @Test(timeOut = 10_000) | ||
| public void testTaskLimitExceededWithSubgroups() | ||
| { | ||
| RootInternalResourceGroup root = new RootInternalResourceGroup( | ||
| "root", | ||
| (group, export) -> {}, | ||
| directExecutor(), | ||
| ignored -> Optional.empty(), | ||
| rg -> false, | ||
| createNodeManager(), | ||
| createClusterResourceChecker(), | ||
| QueryPacingContext.NOOP); | ||
| root.setSoftMemoryLimit(new DataSize(1, MEGABYTE)); | ||
| root.setMaxQueuedQueries(10); | ||
| root.setHardConcurrencyLimit(10); | ||
|
|
||
| InternalResourceGroup groupA = root.getOrCreateSubGroup("A", true); | ||
| groupA.setSoftMemoryLimit(new DataSize(1, MEGABYTE)); | ||
| groupA.setMaxQueuedQueries(10); | ||
| groupA.setHardConcurrencyLimit(10); | ||
|
|
||
| InternalResourceGroup groupG = groupA.getOrCreateSubGroup("G", true); | ||
| groupG.setSoftMemoryLimit(new DataSize(1, MEGABYTE)); | ||
| groupG.setMaxQueuedQueries(10); | ||
| groupG.setHardConcurrencyLimit(10); | ||
|
|
||
| // Set task limit exceeded | ||
| root.setTaskLimitExceeded(true); | ||
|
|
||
| // Submit a query to leaf group G - it should be queued | ||
| MockManagedQueryExecution query1 = new MockManagedQueryExecution(0); | ||
| query1.startWaitingForPrerequisites(); | ||
| groupG.run(query1); | ||
|
|
||
| assertEquals(query1.getState(), QUEUED); | ||
| assertEquals(groupG.getQueuedQueries(), 1); | ||
| assertEquals(groupG.getRunningQueries(), 0); | ||
|
|
||
| // Clear task limit and process queued queries | ||
| root.setTaskLimitExceeded(false); | ||
| root.processQueuedQueries(); | ||
|
|
||
| // Query should now be running | ||
| assertEquals(query1.getState(), RUNNING); | ||
| assertEquals(groupG.getQueuedQueries(), 0); | ||
| assertEquals(groupG.getRunningQueries(), 1); | ||
| } | ||
|
|
||
| // Tests that when task limit is exceeded, queries already running continue, but new ones are queued | ||
| @Test(timeOut = 10_000) | ||
| public void testTaskLimitExceededDoesNotAffectRunningQueries() | ||
| { | ||
| RootInternalResourceGroup root = new RootInternalResourceGroup( | ||
| "root", | ||
| (group, export) -> {}, | ||
| directExecutor(), | ||
| ignored -> Optional.empty(), | ||
| rg -> false, | ||
| createNodeManager(), | ||
| createClusterResourceChecker(), | ||
| QueryPacingContext.NOOP); | ||
| root.setSoftMemoryLimit(new DataSize(1, MEGABYTE)); | ||
| root.setMaxQueuedQueries(10); | ||
| root.setHardConcurrencyLimit(10); | ||
|
|
||
| // Submit a query before task limit is exceeded - it should run | ||
| MockManagedQueryExecution query1 = new MockManagedQueryExecution(0); | ||
| query1.startWaitingForPrerequisites(); | ||
| root.run(query1); | ||
| assertEquals(query1.getState(), RUNNING); | ||
|
|
||
| // Now set task limit exceeded | ||
| root.setTaskLimitExceeded(true); | ||
|
|
||
| // Submit another query - it should be queued | ||
| MockManagedQueryExecution query2 = new MockManagedQueryExecution(0); | ||
| query2.startWaitingForPrerequisites(); | ||
| root.run(query2); | ||
| assertEquals(query2.getState(), QUEUED); | ||
|
|
||
| // The first query should still be running | ||
| assertEquals(query1.getState(), RUNNING); | ||
| assertEquals(root.getRunningQueries(), 1); | ||
| assertEquals(root.getQueuedQueries(), 1); | ||
| } | ||
|
|
||
| // Tests that task limit transitions work correctly with multiple cycles | ||
| @Test(timeOut = 10_000) | ||
| public void testTaskLimitExceededMultipleCycles() | ||
| { | ||
| RootInternalResourceGroup root = new RootInternalResourceGroup( | ||
| "root", | ||
| (group, export) -> {}, | ||
| directExecutor(), | ||
| ignored -> Optional.empty(), | ||
| rg -> false, | ||
| createNodeManager(), | ||
| createClusterResourceChecker(), | ||
| QueryPacingContext.NOOP); | ||
| root.setSoftMemoryLimit(new DataSize(1, MEGABYTE)); | ||
| root.setMaxQueuedQueries(10); | ||
| root.setHardConcurrencyLimit(10); | ||
|
|
||
| // Cycle 1: Task limit exceeded, query queued | ||
| root.setTaskLimitExceeded(true); | ||
| MockManagedQueryExecution query1 = new MockManagedQueryExecution(0); | ||
| query1.startWaitingForPrerequisites(); | ||
| root.run(query1); | ||
| assertEquals(query1.getState(), QUEUED); | ||
|
|
||
| // Clear task limit, query starts | ||
| root.setTaskLimitExceeded(false); | ||
| root.processQueuedQueries(); | ||
| assertEquals(query1.getState(), RUNNING); | ||
|
|
||
| // Cycle 2: Task limit exceeded again, new query queued | ||
| root.setTaskLimitExceeded(true); | ||
| MockManagedQueryExecution query2 = new MockManagedQueryExecution(0); | ||
| query2.startWaitingForPrerequisites(); | ||
| root.run(query2); | ||
| assertEquals(query2.getState(), QUEUED); | ||
| assertEquals(query1.getState(), RUNNING); // query1 still running | ||
|
|
||
| // Complete query1, processQueuedQueries should not start query2 (task limit still exceeded) | ||
| query1.complete(); | ||
| root.processQueuedQueries(); | ||
| assertEquals(query2.getState(), QUEUED); // Still queued because task limit exceeded | ||
|
|
||
| // Clear task limit, query2 starts | ||
| root.setTaskLimitExceeded(false); | ||
| root.processQueuedQueries(); | ||
| assertEquals(query2.getState(), RUNNING); | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.