Skip to content

Commit

Permalink
Increase concurrent request of opening point-in-time (#96782)
Browse files Browse the repository at this point in the history
Today, we mistakenly throttle the opening point-in-time API to 1 request
per node. As a result, when attempting to open a point-in-time across
large clusters, it can take a significant amount of time and eventually
fails due to relocated target shards or deleted target indices managed
by ILM. Ideally, we should batch the requests per node and eliminate
this throttle completely. However, this requires all clusters to be on
the latest version.

~This PR increases the number of concurrent requests from 1 to 20. This
default is higher than search, which is 5, because opening point-in-time
is a lightweight operation, doesn't perform any I/O, and is executed
directly on the network threads.~

This PR increases the number of concurrent requests from 1 to 5, which
is the default of search.

Any suggestion are welcome.
  • Loading branch information
dnhatn authored Jun 20, 2023
1 parent 6cf467f commit a8fbd24
Show file tree
Hide file tree
Showing 9 changed files with 316 additions and 4 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/96782.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 96782
summary: Increase concurrent request of opening point-in-time
area: Search
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,20 @@
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.admin.indices.stats.CommonStats;
import org.elasticsearch.action.support.IndicesOptions;
import org.elasticsearch.action.support.PlainActionFuture;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.CollectionUtils;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.query.MatchAllQueryBuilder;
import org.elasticsearch.index.query.RangeQueryBuilder;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.SearchContextMissingException;
import org.elasticsearch.search.SearchHit;
Expand All @@ -33,10 +36,14 @@
import org.elasticsearch.search.sort.SortOrder;
import org.elasticsearch.tasks.TaskInfo;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.transport.MockTransportService;
import org.elasticsearch.transport.TransportService;

import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

Expand All @@ -54,6 +61,11 @@

public class PointInTimeIT extends ESIntegTestCase {

@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return CollectionUtils.appendToCopy(super.nodePlugins(), MockTransportService.TestPlugin.class);
}

@Override
protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
return Settings.builder()
Expand Down Expand Up @@ -430,6 +442,52 @@ public void testCloseInvalidPointInTime() {
assertThat(tasks, empty());
}

public void testOpenPITConcurrentShardRequests() throws Exception {
DiscoveryNode dataNode = randomFrom(clusterService().state().nodes().getDataNodes().values());
int numShards = randomIntBetween(5, 10);
int maxConcurrentRequests = randomIntBetween(2, 5);
assertAcked(
client().admin()
.indices()
.prepareCreate("test")
.setSettings(
Settings.builder()
.put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, numShards)
.put("index.routing.allocation.require._id", dataNode.getId())
.build()
)
);
var transportService = (MockTransportService) internalCluster().getInstance(TransportService.class, dataNode.getName());
try {
CountDownLatch sentLatch = new CountDownLatch(maxConcurrentRequests);
CountDownLatch readyLatch = new CountDownLatch(1);
transportService.addRequestHandlingBehavior(
TransportOpenPointInTimeAction.OPEN_SHARD_READER_CONTEXT_NAME,
(handler, request, channel, task) -> {
sentLatch.countDown();
Thread thread = new Thread(() -> {
try {
assertTrue(readyLatch.await(1, TimeUnit.MINUTES));
handler.messageReceived(request, channel, task);
} catch (Exception e) {
throw new AssertionError(e);
}
});
thread.start();
}
);
OpenPointInTimeRequest request = new OpenPointInTimeRequest("test").keepAlive(TimeValue.timeValueMinutes(1));
request.maxConcurrentShardRequests(maxConcurrentRequests);
PlainActionFuture<OpenPointInTimeResponse> future = new PlainActionFuture<>();
client().execute(OpenPointInTimeAction.INSTANCE, request, future);
assertTrue(sentLatch.await(1, TimeUnit.MINUTES));
readyLatch.countDown();
closePointInTime(future.actionGet().getPointInTimeId());
} finally {
transportService.clearAllRules();
}
}

@SuppressWarnings({ "rawtypes", "unchecked" })
private void assertPagination(PointInTimeBuilder pit, int expectedNumDocs, int size, SortBuilder<?>... sorts) throws Exception {
Set<String> seen = new HashSet<>();
Expand Down
3 changes: 2 additions & 1 deletion server/src/main/java/org/elasticsearch/TransportVersion.java
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,10 @@ private static TransportVersion registerTransportVersion(int id, String uniqueId
public static final TransportVersion V_8_500_014 = registerTransportVersion(8_500_014, "D115A2E1-1739-4A02-AB7B-64F6EA157EFB");
public static final TransportVersion V_8_500_015 = registerTransportVersion(8_500_015, "651216c9-d54f-4189-9fe1-48d82d276863");
public static final TransportVersion V_8_500_016 = registerTransportVersion(8_500_016, "492C94FB-AAEA-4C9E-8375-BDB67A398584");
public static final TransportVersion V_8_500_017 = registerTransportVersion(8_500_017, "0EDCB5BA-049C-443C-8AB1-5FA58FB996FB");

private static class CurrentHolder {
private static final TransportVersion CURRENT = findCurrent(V_8_500_016);
private static final TransportVersion CURRENT = findCurrent(V_8_500_017);

// finds the pluggable current version, or uses the given fallback
private static TransportVersion findCurrent(TransportVersion fallback) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

package org.elasticsearch.action.search;

import org.elasticsearch.TransportVersion;
import org.elasticsearch.action.ActionRequest;
import org.elasticsearch.action.ActionRequestValidationException;
import org.elasticsearch.action.IndicesRequest;
Expand All @@ -20,16 +21,18 @@
import org.elasticsearch.tasks.TaskId;

import java.io.IOException;
import java.util.Arrays;
import java.util.Map;
import java.util.Objects;

import static org.elasticsearch.action.ValidateActions.addValidationError;

public final class OpenPointInTimeRequest extends ActionRequest implements IndicesRequest.Replaceable {

private String[] indices;
private IndicesOptions indicesOptions = DEFAULT_INDICES_OPTIONS;
private TimeValue keepAlive;

private int maxConcurrentShardRequests = SearchRequest.DEFAULT_MAX_CONCURRENT_SHARD_REQUESTS;
@Nullable
private String routing;
@Nullable
Expand All @@ -48,6 +51,9 @@ public OpenPointInTimeRequest(StreamInput in) throws IOException {
this.keepAlive = in.readTimeValue();
this.routing = in.readOptionalString();
this.preference = in.readOptionalString();
if (in.getTransportVersion().onOrAfter(TransportVersion.V_8_500_017)) {
this.maxConcurrentShardRequests = in.readVInt();
}
}

@Override
Expand All @@ -58,6 +64,9 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeTimeValue(keepAlive);
out.writeOptionalString(routing);
out.writeOptionalString(preference);
if (out.getTransportVersion().onOrAfter(TransportVersion.V_8_500_017)) {
out.writeVInt(maxConcurrentShardRequests);
}
}

@Override
Expand Down Expand Up @@ -123,6 +132,27 @@ public OpenPointInTimeRequest preference(String preference) {
return this;
}

/**
* Similar to {@link SearchRequest#getMaxConcurrentShardRequests()}, this returns the number of shard requests that should be
* executed concurrently on a single node . This value should be used as a protection mechanism to reduce the number of shard
* requests fired per open point-in-time request. The default is {@code 5}
*/
public int maxConcurrentShardRequests() {
return maxConcurrentShardRequests;
}

/**
* Similar to {@link SearchRequest#setMaxConcurrentShardRequests(int)}, this sets the number of shard requests that should be
* executed concurrently on a single node. This value should be used as a protection mechanism to reduce the number of shard
* requests fired per open point-in-time request.
*/
public void maxConcurrentShardRequests(int maxConcurrentShardRequests) {
if (maxConcurrentShardRequests < 1) {
throw new IllegalArgumentException("maxConcurrentShardRequests must be >= 1");
}
this.maxConcurrentShardRequests = maxConcurrentShardRequests;
}

@Override
public boolean allowsRemoteIndices() {
return true;
Expand All @@ -138,8 +168,46 @@ public String getDescription() {
return "open search context: indices [" + String.join(",", indices) + "] keep_alive [" + keepAlive + "]";
}

@Override
public String toString() {
return "OpenPointInTimeRequest{"
+ "indices="
+ Arrays.toString(indices)
+ ", keepAlive="
+ keepAlive
+ ", maxConcurrentShardRequests="
+ maxConcurrentShardRequests
+ ", routing='"
+ routing
+ '\''
+ ", preference='"
+ preference
+ '\''
+ '}';
}

@Override
public Task createTask(long id, String type, String action, TaskId parentTaskId, Map<String, String> headers) {
return new SearchTask(id, type, action, this::getDescription, parentTaskId, headers);
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
OpenPointInTimeRequest that = (OpenPointInTimeRequest) o;
return maxConcurrentShardRequests == that.maxConcurrentShardRequests
&& Arrays.equals(indices, that.indices)
&& indicesOptions.equals(that.indicesOptions)
&& keepAlive.equals(that.keepAlive)
&& Objects.equals(routing, that.routing)
&& Objects.equals(preference, that.preference);
}

@Override
public int hashCode() {
int result = Objects.hash(indicesOptions, keepAlive, maxConcurrentShardRequests, routing, preference);
result = 31 * result + Arrays.hashCode(indices);
return result;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,13 @@ public RestChannelConsumer prepareRequest(final RestRequest request, final NodeC
openRequest.routing(request.param("routing"));
openRequest.preference(request.param("preference"));
openRequest.keepAlive(TimeValue.parseTimeValue(request.param("keep_alive"), null, "keep_alive"));
if (request.hasParam("max_concurrent_shard_requests")) {
final int maxConcurrentShardRequests = request.paramAsInt(
"max_concurrent_shard_requests",
openRequest.maxConcurrentShardRequests()
);
openRequest.maxConcurrentShardRequests(maxConcurrentShardRequests);
}
return channel -> client.execute(OpenPointInTimeAction.INSTANCE, openRequest, new RestToXContentListener<>(channel));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ public class SearchRequest extends ActionRequest implements IndicesRequest.Repla
private int batchedReduceSize = DEFAULT_BATCHED_REDUCE_SIZE;

private int maxConcurrentShardRequests = 0;
public static final int DEFAULT_MAX_CONCURRENT_SHARD_REQUESTS = 5;

private Integer preFilterShardSize;

Expand Down Expand Up @@ -716,7 +717,7 @@ public int getBatchedReduceSize() {
* cluster can be throttled with this number to reduce the cluster load. The default is {@code 5}
*/
public int getMaxConcurrentShardRequests() {
return maxConcurrentShardRequests == 0 ? 5 : maxConcurrentShardRequests;
return maxConcurrentShardRequests == 0 ? DEFAULT_MAX_CONCURRENT_SHARD_REQUESTS : maxConcurrentShardRequests;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ protected void doExecute(Task task, OpenPointInTimeRequest request, ActionListen
.preference(request.preference())
.routing(request.routing())
.allowPartialSearchResults(false);
searchRequest.setMaxConcurrentShardRequests(request.maxConcurrentShardRequests());
searchRequest.setCcsMinimizeRoundtrips(false);
transportSearchAction.executeRequest((SearchTask) task, searchRequest, listener.map(r -> {
assert r.pointInTimeId() != null : r;
Expand Down Expand Up @@ -117,6 +118,8 @@ public SearchPhase newSearchPhase(
ThreadPool threadPool,
SearchResponse.Clusters clusters
) {
assert searchRequest.getMaxConcurrentShardRequests() == pitRequest.maxConcurrentShardRequests()
: searchRequest.getMaxConcurrentShardRequests() + " != " + pitRequest.maxConcurrentShardRequests();
return new AbstractSearchAsyncAction<>(
actionName,
logger,
Expand All @@ -132,7 +135,7 @@ public SearchPhase newSearchPhase(
clusterState,
task,
new ArraySearchPhaseResults<>(shardIterators.size()),
1,
searchRequest.getMaxConcurrentShardRequests(),
clusters
) {
@Override
Expand Down
Loading

0 comments on commit a8fbd24

Please sign in to comment.