Skip to content

Commit f6206bc

Browse files
authored
Fix SnapshotBasedRecoveryIT#testSnapshotBasedRecovery (#78434)
Move the shard to a replica on a node running the older version when the primary is located on the upgraded node during the first rolling restart round. Closes #76595. Backport of #77134.
1 parent 7d61f2c commit f6206bc

File tree

1 file changed

+117
-11
lines changed

1 file changed

+117
-11
lines changed

qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SnapshotBasedRecoveryIT.java

Lines changed: 117 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,17 @@
88

99
package org.elasticsearch.upgrades;
1010

11+
import org.apache.http.client.methods.HttpGet;
1112
import org.apache.http.client.methods.HttpPost;
13+
import org.elasticsearch.Version;
1214
import org.elasticsearch.client.Request;
1315
import org.elasticsearch.client.Response;
1416
import org.elasticsearch.cluster.metadata.IndexMetadata;
17+
import org.elasticsearch.common.Strings;
1518
import org.elasticsearch.common.settings.Settings;
19+
import org.elasticsearch.common.xcontent.XContentBuilder;
1620
import org.elasticsearch.common.xcontent.support.XContentMapValues;
21+
import org.elasticsearch.core.Nullable;
1722
import org.elasticsearch.index.query.QueryBuilder;
1823
import org.elasticsearch.index.query.QueryBuilders;
1924
import org.elasticsearch.repositories.blobstore.BlobStoreRepository;
@@ -26,11 +31,13 @@
2631

2732
import static org.elasticsearch.cluster.routing.UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING;
2833
import static org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY;
29-
import static org.elasticsearch.upgrades.AbstractRollingTestCase.ClusterType.MIXED;
34+
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
3035
import static org.hamcrest.Matchers.allOf;
3136
import static org.hamcrest.Matchers.equalTo;
3237
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
38+
import static org.hamcrest.Matchers.is;
3339
import static org.hamcrest.Matchers.lessThan;
40+
import static org.hamcrest.Matchers.notNullValue;
3441

3542
public class SnapshotBasedRecoveryIT extends AbstractRollingTestCase {
3643
public void testSnapshotBasedRecovery() throws Exception {
@@ -66,17 +73,41 @@ public void testSnapshotBasedRecovery() throws Exception {
6673
break;
6774
case MIXED:
6875
case UPGRADED:
69-
// the following `if` for first round mixed was added as a selective test mute. Sometimes the primary shard ends
70-
// on the upgraded node. This causes issues when removing and adding replicas, since then we cannot allocate to
71-
// any of the old nodes. That is an issue only for the first mixed round, hence this check.
72-
// Ideally we would find the reason the primary ends on the upgraded node and fix that (or figure out that it
73-
// is all good).
74-
// @AwaitsFix(bugUrl = https://github.com/elastic/elasticsearch/issues/76595)
75-
if (CLUSTER_TYPE != MIXED || FIRST_MIXED_ROUND == false) {
76-
// Drop replicas
77-
updateIndexSettings(indexName, Settings.builder().put(IndexMetadata.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 0));
76+
if (FIRST_MIXED_ROUND) {
77+
String upgradedNodeId = getUpgradedNodeId();
78+
79+
if (upgradedNodeId != null) {
80+
updateIndexSettings(
81+
indexName,
82+
Settings.builder()
83+
.put("index.routing.allocation.exclude._id", upgradedNodeId)
84+
);
85+
}
86+
87+
String primaryNodeId = getPrimaryNodeIdOfShard(indexName, 0);
88+
Version primaryNodeVersion = getNodeVersion(primaryNodeId);
89+
90+
// Sometimes the primary shard ends up on the upgraded node (e.g. after a rebalance).
91+
// This causes issues when removing and adding replicas, since then we cannot allocate to any of the old nodes.
92+
// That is an issue only for the first mixed round.
93+
// In that case we exclude the upgraded node from the shard allocation and cancel the shard to force moving
94+
// the primary to a node in the old version, this allows adding replicas in the first mixed round.
95+
if (primaryNodeVersion.after(UPGRADE_FROM_VERSION)) {
96+
cancelShard(indexName, 0, primaryNodeId);
97+
98+
String currentPrimaryNodeId = getPrimaryNodeIdOfShard(indexName, 0);
99+
assertThat(getNodeVersion(currentPrimaryNodeId), is(equalTo(UPGRADE_FROM_VERSION)));
100+
}
101+
} else {
102+
updateIndexSettings(
103+
indexName,
104+
Settings.builder()
105+
.putNull("index.routing.allocation.exclude._id")
106+
);
78107
}
79-
ensureGreen(indexName);
108+
109+
// Drop replicas
110+
updateIndexSettings(indexName, Settings.builder().put(IndexMetadata.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 0));
80111

81112
updateIndexSettings(indexName, Settings.builder().put(IndexMetadata.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 1));
82113
ensureGreen(indexName);
@@ -88,6 +119,81 @@ public void testSnapshotBasedRecovery() throws Exception {
88119
}
89120
}
90121

122+
@Nullable
123+
private String getUpgradedNodeId() throws IOException {
124+
Request request = new Request(HttpGet.METHOD_NAME, "_nodes/_all");
125+
Response response = client().performRequest(request);
126+
Map<String, Object> responseMap = responseAsMap(response);
127+
Map<String, Map<String, Object>> nodes = extractValue(responseMap, "nodes");
128+
for (Map.Entry<String, Map<String, Object>> nodeInfoEntry : nodes.entrySet()) {
129+
Version nodeVersion = Version.fromString(extractValue(nodeInfoEntry.getValue(), "version"));
130+
if (nodeVersion.after(UPGRADE_FROM_VERSION)) {
131+
return nodeInfoEntry.getKey();
132+
}
133+
}
134+
return null;
135+
}
136+
137+
private Version getNodeVersion(String primaryNodeId) throws IOException {
138+
Request request = new Request(HttpGet.METHOD_NAME, "_nodes/" + primaryNodeId);
139+
Response response = client().performRequest(request);
140+
String nodeVersion = extractValue(responseAsMap(response), "nodes." + primaryNodeId + ".version");
141+
return Version.fromString(nodeVersion);
142+
}
143+
144+
private String getPrimaryNodeIdOfShard(String indexName, int shard) throws Exception {
145+
String primaryNodeId;
146+
try (XContentBuilder builder = jsonBuilder()) {
147+
builder.startObject();
148+
{
149+
builder.field("index", indexName);
150+
builder.field("shard", shard);
151+
builder.field("primary", true);
152+
}
153+
builder.endObject();
154+
155+
Request request = new Request(HttpGet.METHOD_NAME, "_cluster/allocation/explain");
156+
request.setJsonEntity(Strings.toString(builder));
157+
158+
Response response = client().performRequest(request);
159+
Map<String, Object> responseMap = responseAsMap(response);
160+
primaryNodeId = extractValue(responseMap, "current_node.id");
161+
}
162+
assertThat(primaryNodeId, is(notNullValue()));
163+
164+
return primaryNodeId;
165+
}
166+
167+
private void cancelShard(String indexName, int shard, String nodeName) throws IOException {
168+
try (XContentBuilder builder = jsonBuilder()) {
169+
builder.startObject();
170+
{
171+
builder.startArray("commands");
172+
{
173+
builder.startObject();
174+
{
175+
builder.startObject("cancel");
176+
{
177+
builder.field("index", indexName);
178+
builder.field("shard", shard);
179+
builder.field("node", nodeName);
180+
builder.field("allow_primary", true);
181+
}
182+
builder.endObject();
183+
}
184+
builder.endObject();
185+
}
186+
builder.endArray();
187+
}
188+
builder.endObject();
189+
190+
Request request = new Request(HttpPost.METHOD_NAME, "/_cluster/reroute");
191+
request.setJsonEntity(Strings.toString(builder));
192+
Response response = client().performRequest(request);
193+
assertOK(response);
194+
}
195+
}
196+
91197
private void assertMatchAllReturnsAllDocuments(String indexName, int numDocs) throws IOException {
92198
Map<String, Object> searchResults = search(indexName, QueryBuilders.matchAllQuery());
93199
List<Map<String, Object>> hits = extractValue(searchResults, "hits.hits");

0 commit comments

Comments
 (0)