Skip to content

BalancedShardsAllocator rebalancing might move shards but not improve the balance #88384

@idegtiarenko

Description

@idegtiarenko

Description

I have discovered a situation when rebalancing combined with cluster.routing.allocation.node_concurrent_incoming_recoveries=1 and cluster.routing.allocation.node_concurrent_outgoing_recoveries=1 after moving some shards might result in same exact balance as before.

For example with a following shard allocation:

index,   node-0, node-1, node-2
index-0  ****    ****    **
index-1  ****    ****    **
index-2  ***     ***     ****
index-3  ***     ***     ****
index-4  ****    ***     ***
index-5  ***     ****    ***
index-6  ***     ***     ****
total    ^24     ^24     ^22

The balancing will:

  • Move a shard of index-2 from node-1 to node-2 (as expected improving the balance)
  • Move a shard of index-4 from node-2 to node-1 resulting in the 4-4-2 shard per node distribution in another index.

This could be reproduced with a test below:

    public void testRebalance() {
        var discoveryNodesBuilder = DiscoveryNodes.builder();
        for (int node = 0; node < 3; node++) {
            discoveryNodesBuilder.add(createNode("node-" + node));
        }

        var metadataBuilder = Metadata.builder();
        var routingTableBuilder = RoutingTable.builder();

        addIndex(metadataBuilder, routingTableBuilder, "index-0", 10, Map.of("node-0", 4, "node-1", 4, "node-2", 2));
        addIndex(metadataBuilder, routingTableBuilder, "index-1", 10, Map.of("node-0", 4, "node-1", 4, "node-2", 2));
        addIndex(metadataBuilder, routingTableBuilder, "index-2", 10, Map.of("node-0", 3, "node-1", 3, "node-2", 4));
        addIndex(metadataBuilder, routingTableBuilder, "index-3", 10, Map.of("node-0", 3, "node-1", 3, "node-2", 4));
        addIndex(metadataBuilder, routingTableBuilder, "index-4", 10, Map.of("node-0", 4, "node-1", 3, "node-2", 3));
        addIndex(metadataBuilder, routingTableBuilder, "index-5", 10, Map.of("node-0", 3, "node-1", 4, "node-2", 3));
        addIndex(metadataBuilder, routingTableBuilder, "index-6", 10, Map.of("node-0", 3, "node-1", 3, "node-2", 4));

        var clusterState = ClusterState.builder(ClusterName.DEFAULT)
            .nodes(discoveryNodesBuilder)
            .metadata(metadataBuilder)
            .routingTable(routingTableBuilder)
            .build();

        var allocationService = createAllocationService(
            Settings.builder()
                .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(), 1)
                .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING.getKey(), 1)
                .build()
        );

        var reroutedState = allocationService.reroute(clusterState, "test");

        for (ShardRouting relocatingShard : RoutingNodesHelper.shardsWithState(reroutedState.getRoutingNodes(), RELOCATING)) {
            assertThat(
                "new allocation should not result in indexes with 2 shards per node",
                getTargetShardPerNodeCount(reroutedState.getRoutingTable().index(relocatingShard.index())).containsValue(2),
                equalTo(false)
            );
        }
    }

    private Map<String, Integer> getTargetShardPerNodeCount(IndexRoutingTable indexRoutingTable) {
        var counts = new HashMap<String, Integer>();
        for (int shardId = 0; shardId < indexRoutingTable.size(); shardId++) {
            var shard = indexRoutingTable.shard(shardId).primaryShard();
            counts.compute(
                shard.relocating() ? shard.relocatingNodeId() : shard.currentNodeId(),
                (nodeId, count) -> count != null ? count + 1 : 1
            );
        }
        return counts;
    }

    private DiscoveryNode createNode(String nodeId) {
        return new DiscoveryNode(
            nodeId,
            nodeId,
            UUIDs.randomBase64UUID(random()),
            buildNewFakeTransportAddress(),
            Map.of(),
            DiscoveryNodeRole.roles(),
            Version.CURRENT
        );
    }

    private void addIndex(
        Metadata.Builder metadataBuilder,
        RoutingTable.Builder routingTableBuilder,
        String name,
        int numberOfShards,
        Map<String, Integer> assignments
    ) {
        var inSyncIds = randomList(numberOfShards, numberOfShards, () -> UUIDs.randomBase64UUID(random()));
        var indexMetadataBuilder = IndexMetadata.builder(name)
            .settings(
                Settings.builder()
                    .put("index.number_of_shards", numberOfShards)
                    .put("index.number_of_replicas", 0)
                    .put("index.version.created", Version.CURRENT)
                    .build()
            );

        for (int shardId = 0; shardId < numberOfShards; shardId++) {
            indexMetadataBuilder.putInSyncAllocationIds(shardId, Set.of(inSyncIds.get(shardId)));
        }
        metadataBuilder.put(indexMetadataBuilder);

        var indexId = metadataBuilder.get(name).getIndex();
        var indexRoutingTableBuilder = IndexRoutingTable.builder(indexId);

        int shardId = 0;
        for (var assignment : assignments.entrySet()) {
            for (int i = 0; i < assignment.getValue(); i++) {
                indexRoutingTableBuilder.addShard(
                    TestShardRouting.newShardRouting(
                        new ShardId(indexId, shardId),
                        assignment.getKey(),
                        null,
                        true,
                        ShardRoutingState.STARTED,
                        AllocationId.newInitializing(inSyncIds.get(shardId))
                    )
                );
                shardId++;
            }
        }
        assertThat(shardId, equalTo(numberOfShards));
        routingTableBuilder.add(indexRoutingTableBuilder);
    }

Metadata

Metadata

Assignees

No one assigned

    Labels

    :Distributed Coordination/AllocationAll issues relating to the decision making around placing a shard (both master logic & on the nodes)>bugTeam:Distributed (Obsolete)Meta label for distributed team (obsolete). Replaced by Distributed Indexing/Coordination.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions