Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ Standard Helm options such as `replicaCount`, `image`, `imagePullSecrets`,
More detail about the chart are available in the [values
reference documentation](https://github.com/trinodb/charts/blob/main/charts/gateway/README.md)

### Health Checks
### Health checks on Trino clusters

The Trino Gateway periodically performs health checks and maintains
an in-memory TrinoStatus for each backend. If a backend fails a health check,
Expand Down
13 changes: 13 additions & 0 deletions docs/operation.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ documentation](https://trino.io/docs/current/admin/graceful-shutdown.html) for
more details.

## Query routing options

- The default router selects the backend randomly to route the queries.
- If you want to route the queries to the least loaded backend for a user
i.e. backend with the least number of queries running or queued from a particular user,
Expand Down Expand Up @@ -65,3 +66,15 @@ scrape_configs:
- targets:
- gateway1.example.com:8080
```

## Trino Gateway health endpoints

Trino Gateway provides two API endpoints to indicate the current status of the server:

* `/trino-gateway/livez` always returns status code 200, indicating the server is
Comment thread
oneonestar marked this conversation as resolved.
alive. However, it might not respond if the Trino Gateway is too busy, stuck, or
taking a long time for garbage collection.
* `/trino-gateway/readyz` returns status code 200, indicating the server has
completed initialization and is ready to serve requests. This means the initial
connection to the database and the first round of health check on Trino clusters
are completed. Otherwise, status code 503 is returned.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import io.trino.gateway.ha.module.RouterBaseModule;
import io.trino.gateway.ha.module.StochasticRoutingManagerProvider;
import io.trino.gateway.ha.resource.EntityEditorResource;
import io.trino.gateway.ha.resource.GatewayHealthCheckResource;
import io.trino.gateway.ha.resource.GatewayResource;
import io.trino.gateway.ha.resource.GatewayViewResource;
import io.trino.gateway.ha.resource.GatewayWebAppResource;
Expand Down Expand Up @@ -179,6 +180,7 @@ private static void registerResources(Binder binder)
jaxrsBinder(binder).bind(PublicResource.class);
jaxrsBinder(binder).bind(TrinoResource.class);
jaxrsBinder(binder).bind(WebUIStaticResource.class);
jaxrsBinder(binder).bind(GatewayHealthCheckResource.class);
}

private static void registerAuthFilters(Binder binder)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public class ActiveClusterMonitor
public static final int DEFAULT_THREAD_POOL_SIZE = 20;
private static final Logger log = Logger.get(ActiveClusterMonitor.class);

private volatile boolean isInitialized;
private final List<TrinoClusterStatsObserver> clusterStatsObservers;
private final GatewayBackendManager gatewayBackendManager;

Expand Down Expand Up @@ -83,6 +84,7 @@ public void start()
observer.observe(stats);
}
}
isInitialized = true;
}
catch (Exception e) {
log.error(e, "Error performing backend monitor tasks");
Expand All @@ -96,4 +98,9 @@ public void stop()
executorService.shutdownNow();
scheduledExecutor.shutdownNow();
}

public boolean isInitialized()
{
return isInitialized;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.gateway.ha.resource;

import com.google.inject.Inject;
import io.trino.gateway.ha.clustermonitor.ActiveClusterMonitor;
import jakarta.ws.rs.GET;
import jakarta.ws.rs.Path;
import jakarta.ws.rs.core.Response;

import static java.util.Objects.requireNonNull;

@Path("/trino-gateway")
Comment thread
oneonestar marked this conversation as resolved.
public class GatewayHealthCheckResource
{
private final ActiveClusterMonitor activeClusterMonitor;

@Inject
public GatewayHealthCheckResource(ActiveClusterMonitor activeClusterMonitor)
{
this.activeClusterMonitor = requireNonNull(activeClusterMonitor, "activeClusterMonitor is null");
Comment thread
mosabua marked this conversation as resolved.
}

@GET
@Path("/livez")
public Response liveness()
{
return Response.ok("ok").build();
Comment thread
mosabua marked this conversation as resolved.
}

@GET
@Path("/readyz")
public Response readiness()
{
if (!activeClusterMonitor.isInitialized()) {
return Response
.status(Response.Status.SERVICE_UNAVAILABLE)
.entity("Trino Gateway is still initializing")
.build();
}
return Response.ok("ok").build();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import static com.google.common.collect.MoreCollectors.onlyElement;
import static com.google.common.net.HttpHeaders.CONTENT_TYPE;
import static com.google.common.net.MediaType.JSON_UTF_8;
import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly;
import static org.assertj.core.api.Assertions.assertThat;
import static org.testcontainers.utility.MountableFile.forClasspathResource;

Expand Down Expand Up @@ -359,6 +360,30 @@ void testCookieSigning()
assertThat(callbackResponse.code()).isEqualTo(500);
}

@Test
void testHealthCheckEndpoints()
throws IOException
{
Request livenessCheck = new Request.Builder()
.url("http://localhost:" + routerPort + "/trino-gateway/livez")
.build();
Response livenessResponse = httpClient.newCall(livenessCheck).execute();
assertThat(livenessResponse.code()).isEqualTo(200);

Request readinessCheck = new Request.Builder()
.url("http://localhost:" + routerPort + "/trino-gateway/readyz")
.build();
for (int i = 0; i < 100; i++) {
try (Response readinessResponse = httpClient.newCall(readinessCheck).execute()) {
if (readinessResponse.code() == 200) {
return;
}
}
sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
Comment thread
oneonestar marked this conversation as resolved.
}
throw new IllegalStateException("Trino Gateway health check failed");
}

@AfterAll
void cleanup()
{
Expand Down