Skip to content

Commit 879e26e

Browse files
committed
Describe STALE_STATE_CONFIG in ClusterFormationFailureHelper (#53878)
We mark cluster states persisted on master-ineligible nodes as potentially stale using the voting configuration `{STALE_STATE_CONFIG}`, which prevents these nodes from being elected as master if they are restarted as master-eligible. Today we do not handle this special voting configuration differently in the `ClusterFormationFailureHelper`, leading to a mysterious message `an election requires a node with id [STALE_STATE_CONFIG]` if the election does not succeed. This commit adds a special-case description for this situation to better explain why this node cannot win an election. Closes #53734
1 parent 0cfe6d9 commit 879e26e

File tree

3 files changed

+25
-2
lines changed

3 files changed

+25
-2
lines changed

server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import org.elasticsearch.common.transport.TransportAddress;
3232
import org.elasticsearch.common.unit.TimeValue;
3333
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
34+
import org.elasticsearch.gateway.GatewayMetaState;
3435
import org.elasticsearch.threadpool.ThreadPool;
3536
import org.elasticsearch.threadpool.ThreadPool.Names;
3637

@@ -210,7 +211,12 @@ private String describeQuorum(VotingConfiguration votingConfiguration) {
210211
assert requiredNodes <= realNodeIds.size() : nodeIds;
211212

212213
if (nodeIds.size() == 1) {
213-
return "a node with id " + realNodeIds;
214+
if (nodeIds.contains(GatewayMetaState.STALE_STATE_CONFIG_NODE_ID)) {
215+
return "one or more nodes that have already participated as master-eligible nodes in the cluster but this node was " +
216+
"not master-eligible the last time it joined the cluster";
217+
} else {
218+
return "a node with id " + realNodeIds;
219+
}
214220
} else if (nodeIds.size() == 2) {
215221
return "two nodes with ids " + realNodeIds;
216222
} else {

server/src/main/java/org/elasticsearch/gateway/GatewayMetaState.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@
8181
*/
8282
public class GatewayMetaState implements Closeable {
8383

84+
/**
85+
* Fake node ID for a voting configuration written by a master-ineligible data node to indicate that its on-disk state is potentially
86+
* stale (since it is written asynchronously after application, rather than before acceptance). This node ID means that if the node is
87+
* restarted as a master-eligible node then it does not win any elections until it has received a fresh cluster state.
88+
*/
89+
public static final String STALE_STATE_CONFIG_NODE_ID = "STALE_STATE_CONFIG";
90+
8491
// Set by calling start()
8592
private final SetOnce<PersistedState> persistedState = new SetOnce<>();
8693

@@ -425,7 +432,7 @@ protected void doRun() {
425432
}
426433

427434
static final CoordinationMetaData.VotingConfiguration staleStateConfiguration =
428-
new CoordinationMetaData.VotingConfiguration(Collections.singleton("STALE_STATE_CONFIG"));
435+
new CoordinationMetaData.VotingConfiguration(Collections.singleton(STALE_STATE_CONFIG_NODE_ID));
429436

430437
static ClusterState resetVotingConfiguration(ClusterState clusterState) {
431438
CoordinationMetaData newCoordinationMetaData = CoordinationMetaData.builder(clusterState.coordinationMetaData())

server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import org.elasticsearch.cluster.node.DiscoveryNodes;
3030
import org.elasticsearch.common.settings.Settings;
3131
import org.elasticsearch.common.transport.TransportAddress;
32+
import org.elasticsearch.gateway.GatewayMetaState;
3233
import org.elasticsearch.test.ESTestCase;
3334

3435
import java.util.Arrays;
@@ -412,5 +413,14 @@ public void testDescriptionAfterBootstrapping() {
412413
"have discovered [] which is not a quorum; " +
413414
"discovery will continue using [] from hosts providers and [" + otherMasterNode + ", " + localNode +
414415
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0")));
416+
417+
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, GatewayMetaState.STALE_STATE_CONFIG_NODE_ID), emptyList(),
418+
emptyList(), 0L, electionStrategy).getDescription(),
419+
is("master not discovered or elected yet, an election requires one or more nodes that have already participated as " +
420+
"master-eligible nodes in the cluster but this node was not master-eligible the last time it joined the cluster, " +
421+
"have discovered [] which is not a quorum; " +
422+
"discovery will continue using [] from hosts providers and [" + localNode +
423+
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
424+
415425
}
416426
}

0 commit comments

Comments
 (0)