diff --git a/docs/changelog/142330.yaml b/docs/changelog/142330.yaml new file mode 100644 index 0000000000000..44bdc0f2d16d5 --- /dev/null +++ b/docs/changelog/142330.yaml @@ -0,0 +1,6 @@ +area: ES|QL +issues: + - 139883 +pr: 142330 +summary: Log compute engine server failures as ERROR +type: bug diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/operator/Driver.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/operator/Driver.java index 6a70fdbb76d57..1736f15ce9a4b 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/operator/Driver.java +++ b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/operator/Driver.java @@ -201,12 +201,12 @@ SubscribableListener run(TimeValue maxTime, int maxIterations, LongSupplie RestStatus status = ExceptionsHelper.status(e); // HTTP 4xx status codes (400-499) indicate client/user errors: malformed requests, authentication // failures, cancellations, etc. These are expected user actions, not system failures, so log at - // DEBUG level to reduce noise. HTTP 5xx status codes (500+) indicate server errors: internal - // failures, timeouts, etc. These are system failures that need attention, so log at WARN level. + // DEBUG level to reduce noise. HTTP 5xx status codes (500+) indicate internal server failures and + // potential bugs, so log at ERROR. if (status.getStatus() >= 400 && status.getStatus() < 500) { LOGGER.debug(Strings.format("User error running driver [%s]", shortDescription), e); } else { - LOGGER.warn(Strings.format("Error running driver [%s]", shortDescription), e); + LOGGER.error(Strings.format("Error running driver [%s]", shortDescription), e); } throw e; } finally { diff --git a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/DriverTests.java b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/DriverTests.java index 65f76e227df6c..eb32d34e24e6c 100644 --- a/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/DriverTests.java +++ b/x-pack/plugin/esql/compute/src/test/java/org/elasticsearch/compute/operator/DriverTests.java @@ -7,10 +7,13 @@ package org.elasticsearch.compute.operator; +import org.apache.logging.log4j.Level; +import org.elasticsearch.ElasticsearchStatusException; import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.ActionRunnable; import org.elasticsearch.action.support.PlainActionFuture; import org.elasticsearch.common.breaker.CircuitBreaker; +import org.elasticsearch.common.breaker.CircuitBreakingException; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.common.util.MockBigArrays; @@ -31,11 +34,16 @@ import org.elasticsearch.compute.test.TestDriverFactory; import org.elasticsearch.compute.test.TestResultPageSinkOperator; import org.elasticsearch.core.TimeValue; +import org.elasticsearch.rest.RestStatus; +import org.elasticsearch.tasks.TaskCancelledException; import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.MockLog; +import org.elasticsearch.test.junit.annotations.TestLogging; import org.elasticsearch.threadpool.FixedExecutorBuilder; import org.elasticsearch.threadpool.TestThreadPool; import org.elasticsearch.threadpool.ThreadPool; +import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -247,6 +255,59 @@ public void testUnchangedStatus() { } } + public void testServerErrorsAreLoggedAsError() { + assertFailureLogged( + new IllegalStateException("simulated compute bug"), + IllegalStateException.class, + Level.ERROR, + "*Error running driver [test-task]*" + ); + assertFailureLogged( + new ArrayIndexOutOfBoundsException("simulated broken invariant"), + ArrayIndexOutOfBoundsException.class, + Level.ERROR, + "*Error running driver [test-task]*" + ); + assertFailureLogged( + new RuntimeException(new IOException("io failure")), + RuntimeException.class, + Level.ERROR, + "*Error running driver [test-task]*" + ); + } + + @TestLogging(reason = "assert DEBUG user-error logging in Driver", value = "org.elasticsearch.compute.operator.Driver:DEBUG") + public void testClientErrorsAreLoggedAsDebug() { + assertFailureLogged( + new ElasticsearchStatusException("bad request", RestStatus.BAD_REQUEST), + ElasticsearchStatusException.class, + Level.DEBUG, + "*User error running driver [test-task]*" + ); + assertFailureLogged( + new IllegalArgumentException("bad argument"), + IllegalArgumentException.class, + Level.DEBUG, + "*User error running driver [test-task]*" + ); + assertFailureLogged( + new CircuitBreakingException("too many bytes", CircuitBreaker.Durability.PERMANENT), + CircuitBreakingException.class, + Level.DEBUG, + "*User error running driver [test-task]*" + ); + } + + @TestLogging(reason = "assert DEBUG cancellation logging in Driver", value = "org.elasticsearch.compute.operator.Driver:DEBUG") + public void testTaskCancellationIsLoggedAsDebug() { + assertFailureLogged( + new TaskCancelledException("cancelled"), + TaskCancelledException.class, + Level.DEBUG, + "*Cancelling running driver [test-task]*" + ); + } + private static Driver createDriver( long startEpoch, long startNanos, @@ -272,6 +333,51 @@ private static Driver createDriver( ); } + private static Driver createFailingDriver(DriverContext driverContext, RuntimeException failure) { + return TestDriverFactory.create( + driverContext, + new CannedSourceOperator(List.of(new Page(driverContext.blockFactory().newConstantIntBlockWith(1, 1))).iterator()), + List.of(new AbstractPageMappingOperator() { + @Override + protected Page process(Page page) { + throw failure; + } + + @Override + public String toString() { + return "failing_operator"; + } + }), + new TestResultPageSinkOperator(page -> fail("sink should not receive output")) + ); + } + + private void assertFailureLogged( + RuntimeException failure, + Class expectedExceptionClass, + Level expectedLogLevel, + String expectedMessagePattern + ) { + Driver driver = createFailingDriver(driverContext(), failure); + try { + MockLog.assertThatLogger( + () -> expectThrows( + expectedExceptionClass, + () -> driver.run(TimeValue.timeValueDays(1), Integer.MAX_VALUE, System::nanoTime) + ), + Driver.class, + new MockLog.SeenEventExpectation( + "expected log event for " + failure.getClass().getSimpleName(), + Driver.class.getCanonicalName(), + expectedLogLevel, + expectedMessagePattern + ) + ); + } finally { + driver.close(); + } + } + static class NowSupplier implements LongSupplier { private final long startNanos; private final long waitTime;