From 8339f96b334bd9bca6676d62c4f3821de7c74486 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 1 Mar 2024 15:37:51 -0800 Subject: [PATCH 01/43] WIP SCM changes for reconcile cli --- .../hadoop/hdds/scm/client/ScmClient.java | 8 ++++ .../hdds/scm/exceptions/SCMException.java | 3 +- .../StorageContainerLocationProtocol.java | 8 ++++ ...ocationProtocolClientSideTranslatorPB.java | 26 ++++++++++ .../src/main/proto/ScmAdminProtocol.proto | 16 +++++++ .../hadoop/hdds/scm/events/SCMEvents.java | 7 +++ .../scm/server/SCMClientProtocolServer.java | 31 ++++++++++++ .../apache/hadoop/ozone/audit/SCMAction.java | 3 +- .../scm/cli/ContainerOperationClient.java | 4 ++ .../cli/container/ReconcileSubcommand.java | 47 +++++++++++++++++++ 10 files changed, 151 insertions(+), 2 deletions(-) create mode 100644 hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java index fb5a2deee26d..2e56d141b363 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java @@ -454,4 +454,12 @@ DecommissionScmResponseProto decommissionScm( String scmId) throws IOException; String getMetrics(String query) throws IOException; + + /** + * Trigger a reconcile command to datanodes for the current container ID. + * + * @param containerID The ID of the container to reconcile. 
+ * @throws IOException On error + */ + void reconcileContainer(long containerID) throws IOException; } diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/exceptions/SCMException.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/exceptions/SCMException.java index 1cebd3296e34..fad6feca0be7 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/exceptions/SCMException.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/exceptions/SCMException.java @@ -139,6 +139,7 @@ public enum ResultCodes { CA_ROTATION_IN_PROGRESS, CA_ROTATION_IN_POST_PROGRESS, CONTAINER_ALREADY_CLOSED, - CONTAINER_ALREADY_CLOSING + CONTAINER_ALREADY_CLOSING, + UNSUPPORTED_OPERATION } } diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java index 663f317a3b3b..1d9210a7f766 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java @@ -476,4 +476,12 @@ DecommissionScmResponseProto decommissionScm( String scmId) throws IOException; String getMetrics(String query) throws IOException; + + /** + * Trigger a reconcile command to datanodes for the current container ID. + * + * @param containerID The ID of the container to reconcile. 
+ * @throws IOException On error + */ + void reconcileContainer(long containerID) throws IOException; } diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java index 109358c67bf6..e52851fa7958 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java @@ -103,6 +103,8 @@ import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StopContainerBalancerRequestProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ResetDeletedBlockRetryCountRequestProto; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerRequestProto; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type; import org.apache.hadoop.hdds.scm.DatanodeAdminError; import org.apache.hadoop.hdds.scm.ScmInfo; @@ -1154,4 +1156,28 @@ public String getMetrics(String query) throws IOException { String metricsJsonStr = response.getMetricsJson(); return metricsJsonStr; } + + @Override + public void reconcileContainer(long containerID) throws IOException { + ReconcileContainerRequestProto request = ReconcileContainerRequestProto.newBuilder() + .setContainerID(containerID) + .build(); + ReconcileContainerResponseProto response = submitRequest(Type.ReconcileContainer, + builder -> 
builder.setReconcileContainerRequest(request)).getReconcileContainerResponse(); + if (response.hasStatus()) { + switch (response.getStatus()) { + case OK: + break; + case CONTAINER_STILL_OPEN: + throw new IOException("Cannot reconcile an open container"); + break; + case UNSUPPORTED_CONTAINER_TYPE: + throw new IOException("Reconciliation is currently only supported on Ratis containers"); + break; + default: + throw new IOException("Reconciliation encountered an unknown error"); + break; + } + } + } } diff --git a/hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto b/hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto index e8b8d623942a..031596a8321d 100644 --- a/hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto +++ b/hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto @@ -84,6 +84,7 @@ message ScmContainerLocationRequest { optional SingleNodeQueryRequestProto singleNodeQueryRequest = 45; optional GetContainersOnDecomNodeRequestProto getContainersOnDecomNodeRequest = 46; optional GetMetricsRequestProto getMetricsRequest = 47; + optional ReconcileContainerRequestProto reconcileContainerRequest = 48; } message ScmContainerLocationResponse { @@ -139,6 +140,7 @@ message ScmContainerLocationResponse { optional SingleNodeQueryResponseProto singleNodeQueryResponse = 45; optional GetContainersOnDecomNodeResponseProto getContainersOnDecomNodeResponse = 46; optional GetMetricsResponseProto getMetricsResponse = 47; + optional ReconcileContainerResponseProto reconcileContainerResponse = 48; enum Status { OK = 1; @@ -193,6 +195,7 @@ enum Type { SingleNodeQuery = 41; GetContainersOnDecomNode = 42; GetMetrics = 43; + ReconcileContainer = 44; } /** @@ -629,6 +632,19 @@ message GetMetricsResponseProto { optional string metricsJson = 1; } +message ReconcileContainerRequestProto { + required int64 containerID = 1; +} + +message ReconcileContainerResponseProto { + enum Status { + OK = 1; + CONTAINER_STILL_OPEN = 2; + 
UNSUPPORTED_CONTAINER_TYPE = 3; + } + optional Status status = 1; +} + /** * Protocol used from an HDFS node to StorageContainerManager. See the request * and response messages for details of the RPC calls. diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java index 0cc205b2ffce..3b2a84f4f335 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java @@ -217,6 +217,13 @@ public final class SCMEvents { new TypedEvent<>(CRLStatusReportFromDatanode.class, "Crl_Status_Report"); + /** + * This event will be triggered whenever a datanode needs to reconcile its replica of a container with other + * replicas in the cluster. + */ + public static final TypedEvent + RECONCILE_CONTAINER = new TypedEvent<>(ContainerID.class, "Reconcile_Container"); + /** * Private Ctor. Never Constructed. 
*/ diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index 2df2a4847e36..57d1310407c2 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -1380,4 +1380,35 @@ public String getMetrics(String query) throws IOException { FetchMetrics fetchMetrics = new FetchMetrics(); return fetchMetrics.getMetrics(query); } + + @Override + public void reconcileContainer(long containerID) throws IOException { + getScm().checkAdminAccess(getRemoteUser(), false); + final UserGroupInformation remoteUser = getRemoteUser(); + final Map auditMap = Maps.newHashMap(); + auditMap.put("containerID", String.valueOf(containerID)); + auditMap.put("remoteUser", remoteUser.getUserName()); + try { + // Reconcile is not allowed on open containers. + ContainerInfo container = scm.getContainerManager().getContainer(ContainerID.valueOf(containerID)); + final HddsProtos.LifeCycleState state = container.getState(); + if (state.equals(HddsProtos.LifeCycleState.OPEN)) { + throw new SCMException("Cannot reconcile a " + state + " container.", ResultCodes.UNEXPECTED_CONTAINER_STATE); + } + // Reconcile on EC containers is not yet implemented. 
+ final HddsProtos.ReplicationType repType = container.getReplicationType(); + if (repType == HddsProtos.ReplicationType.EC) { + throw new SCMException("Reconciliation for erasure coded containers is not yet supported.", + ResultCodes.UNSUPPORTED_OPERATION); + } + scm.getEventQueue().fireEvent(SCMEvents.RECONCILE_CONTAINER, ContainerID.valueOf(containerID)); + AUDIT.logWriteSuccess(buildAuditMessageForSuccess( + SCMAction.RECONCILE_CONTAINER, auditMap)); + } catch (Exception ex) { + AUDIT.logWriteFailure(buildAuditMessageForFailure( + SCMAction.RECONCILE_CONTAINER, auditMap, ex)); + throw ex; + } + + } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/ozone/audit/SCMAction.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/ozone/audit/SCMAction.java index 4e1fe234ff01..2c9df2afb404 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/ozone/audit/SCMAction.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/ozone/audit/SCMAction.java @@ -52,7 +52,8 @@ public enum SCMAction implements AuditAction { GET_REPLICATION_MANAGER_REPORT, RESET_DELETED_BLOCK_RETRY_COUNT, TRANSFER_LEADERSHIP, - GET_FAILED_DELETED_BLOCKS_TRANSACTION; + GET_FAILED_DELETED_BLOCKS_TRANSACTION, + RECONCILE_CONTAINER; @Override public String getAction() { diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java index 499d58b1ff2a..f5facb003316 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java @@ -568,4 +568,8 @@ public String getMetrics(String query) throws IOException { return storageContainerLocationClient.getMetrics(query); } + @Override + public void reconcileContainer(long id) throws IOException { + 
storageContainerLocationClient.reconcileContainer(id); + } } diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java new file mode 100644 index 000000000000..2783d88753c3 --- /dev/null +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdds.scm.cli.container; + +import java.io.IOException; + +import org.apache.hadoop.hdds.cli.HddsVersionProvider; +import org.apache.hadoop.hdds.scm.cli.ScmSubcommand; +import org.apache.hadoop.hdds.scm.client.ScmClient; + +import picocli.CommandLine; +import picocli.CommandLine.Command; + +/** + * This is the handler that process container list command. + */ +@Command( + name = "reconcile", + description = "Reconcile container replicas", + mixinStandardHelpOptions = true, + versionProvider = HddsVersionProvider.class) +public class ReconcileSubcommand extends ScmSubcommand { + + @CommandLine.Parameters(description = "ID of the container to reconcile") + private long containerId; + + @Override + public void execute(ScmClient scmClient) throws IOException { + // TODO output a status message? 
+ scmClient.reconcileContainer(containerId); + } +} From 573ec30710625dc234e33e340a49268a4672c791 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 6 Mar 2024 06:08:07 -0800 Subject: [PATCH 02/43] Changes for SCM WIP pt2 Still working on best way to get errors back to the client --- .../hadoop/hdds/scm/client/ScmClient.java | 3 +- .../StorageContainerLocationProtocol.java | 3 +- .../commands/ReconcileContainerCommand.java | 71 +++++++++++++++++++ ...ocationProtocolClientSideTranslatorPB.java | 19 +---- .../ScmServerDatanodeHeartbeatProtocol.proto | 11 +++ .../hdds/scm/container/ContainerManager.java | 3 + .../scm/container/ContainerManagerImpl.java | 19 +++++ .../ReconcileContainerCommandHandler.java | 62 ++++++++++++++++ .../scm/server/SCMClientProtocolServer.java | 24 +++---- .../scm/cli/ContainerOperationClient.java | 5 +- .../cli/container/ReconcileSubcommand.java | 19 ++++- 11 files changed, 200 insertions(+), 39 deletions(-) create mode 100644 hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconcileContainerCommand.java create mode 100644 hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerCommandHandler.java diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java index 2e56d141b363..6f28d9fa19e0 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java @@ -22,6 +22,7 @@ import org.apache.hadoop.hdds.client.ReplicationConfig; import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto; import org.apache.hadoop.hdds.scm.DatanodeAdminError; @@ -461,5 +462,5 @@ DecommissionScmResponseProto decommissionScm( * @param containerID The ID of the container to reconcile. * @throws IOException On error */ - void reconcileContainer(long containerID) throws IOException; + ReconcileContainerResponseProto reconcileContainer(long containerID) throws IOException; } diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java index 1d9210a7f766..26fc382a1a07 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java @@ -23,6 +23,7 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type; import org.apache.hadoop.hdds.scm.DatanodeAdminError; @@ -483,5 +484,5 @@ DecommissionScmResponseProto decommissionScm( * @param containerID The ID of the container to reconcile. 
* @throws IOException On error */ - void reconcileContainer(long containerID) throws IOException; + ReconcileContainerResponseProto reconcileContainer(long containerID) throws IOException; } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconcileContainerCommand.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconcileContainerCommand.java new file mode 100644 index 000000000000..2dac0b920622 --- /dev/null +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconcileContainerCommand.java @@ -0,0 +1,71 @@ +package org.apache.hadoop.ozone.protocol.commands; + +import com.google.common.base.Preconditions; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ReconcileContainerCommandProto; +import org.apache.hadoop.hdds.scm.container.ContainerID; + +import java.util.List; +import java.util.stream.Collectors; + +import static java.util.Collections.emptyList; + +/** + * Asks datanodes to reconcile the specified container with other container replicas. + */ +public class ReconcileContainerCommand extends SCMCommand { + + private final List sourceDatanodes; + + public ReconcileContainerCommand(long containerID, List sourceDatanodes) { + // Container ID serves as command ID, since only one reconciliation should be in progress at a time. 
+ super(containerID); + this.sourceDatanodes = sourceDatanodes; + } + + + @Override + public SCMCommandProto.Type getType() { + return SCMCommandProto.Type.reconcileContainerCommand; + } + + @Override + public ReconcileContainerCommandProto getProto() { + ReconcileContainerCommandProto.Builder builder = ReconcileContainerCommandProto.newBuilder() + .setContainerID(getId()); + for (DatanodeDetails dd : sourceDatanodes) { + builder.addSources(dd.getProtoBufMessage()); + } + return builder.build(); + } + + public List getSourceDatanodes() { + return sourceDatanodes; + } + + public long getContainerID() { + return getId(); + } + + public static ReconcileContainerCommand getFromProtobuf(ReconcileContainerCommandProto protoMessage) { + Preconditions.checkNotNull(protoMessage); + + List sources = protoMessage.getSourcesList(); + List sourceNodes = !sources.isEmpty() + ? sources.stream() + .map(DatanodeDetails::getFromProtoBuf) + .collect(Collectors.toList()) + : emptyList(); + + return new ReconcileContainerCommand(protoMessage.getContainerID(), sourceNodes); + } + + @Override + public String toString() { + return getType() + + ": containerId=" + getContainerID() + + ", sourceNodes=" + sourceDatanodes; + } +} diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java index e52851fa7958..5b0fcb7bf7c6 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java @@ -1158,26 +1158,11 @@ public String getMetrics(String query) throws IOException { } @Override - public void reconcileContainer(long containerID) throws IOException { + 
public ReconcileContainerResponseProto reconcileContainer(long containerID) throws IOException { ReconcileContainerRequestProto request = ReconcileContainerRequestProto.newBuilder() .setContainerID(containerID) .build(); - ReconcileContainerResponseProto response = submitRequest(Type.ReconcileContainer, + return submitRequest(Type.ReconcileContainer, builder -> builder.setReconcileContainerRequest(request)).getReconcileContainerResponse(); - if (response.hasStatus()) { - switch (response.getStatus()) { - case OK: - break; - case CONTAINER_STILL_OPEN: - throw new IOException("Cannot reconcile an open container"); - break; - case UNSUPPORTED_CONTAINER_TYPE: - throw new IOException("Reconciliation is currently only supported on Ratis containers"); - break; - default: - throw new IOException("Reconciliation encountered an unknown error"); - break; - } - } } } diff --git a/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto b/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto index 2994073c0240..8363eba27742 100644 --- a/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto +++ b/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto @@ -328,6 +328,7 @@ message SCMCommandProto { finalizeNewLayoutVersionCommand = 9; refreshVolumeUsageInfo = 10; reconstructECContainersCommand = 11; + reconcileContainerCommand = 12; } // TODO: once we start using protoc 3.x, refactor this message using "oneof" required Type commandType = 1; @@ -343,6 +344,7 @@ message SCMCommandProto { finalizeNewLayoutVersionCommandProto = 10; optional RefreshVolumeUsageCommandProto refreshVolumeUsageCommandProto = 11; optional ReconstructECContainersCommandProto reconstructECContainersCommandProto = 12; + optional ReconcileContainerCommandProto reconcileContainerCommandProto = 13; // If running upon Ratis, holds term of underlying RaftServer iff current @@ -499,6 +501,15 @@ 
message FinalizeNewLayoutVersionCommandProto { required int64 cmdId = 3; } +/** +This command asks the datanode to replicate a container from specific sources. +*/ +message ReconcileContainerCommandProto { + required int64 containerID = 1; + repeated DatanodeDetailsProto sources = 2; + required int64 cmdId = 3; +} + /** * Protocol used from a datanode to StorageContainerManager. * diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java index 2a60e268ff4a..8406b97bd393 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java @@ -26,6 +26,7 @@ import org.apache.hadoop.hdds.client.ReplicationConfig; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.scm.pipeline.Pipeline; import org.apache.hadoop.hdds.utils.db.Table; import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException; @@ -204,6 +205,8 @@ ContainerInfo getMatchingContainer(long size, String owner, void deleteContainer(ContainerID containerID) throws IOException; + ReconcileContainerResponseProto.Status canReconcileContainer(ContainerID containerID) throws ContainerNotFoundException; + /** * Returns containerStateManger. 
* @return containerStateManger diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java index 8e1e881c44ea..4cbd93029399 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java @@ -40,6 +40,7 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ContainerInfoProto; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.scm.ScmConfigKeys; import org.apache.hadoop.hdds.scm.container.metrics.SCMContainerManagerMetrics; import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaPendingOps; @@ -421,6 +422,24 @@ public void deleteContainer(final ContainerID cid) } } + @Override + public ReconcileContainerResponseProto.Status canReconcileContainer(ContainerID containerID) + throws ContainerNotFoundException { + // Reconcile is not allowed on open containers. + ContainerInfo container = getContainer(containerID); + final HddsProtos.LifeCycleState state = container.getState(); + if (state.equals(HddsProtos.LifeCycleState.OPEN)) { + return ReconcileContainerResponseProto.Status.CONTAINER_STILL_OPEN; + } + // Reconcile on EC containers is not yet implemented. 
+ final HddsProtos.ReplicationType repType = container.getReplicationType(); + if (repType == HddsProtos.ReplicationType.EC) { + return ReconcileContainerResponseProto.Status.UNSUPPORTED_CONTAINER_TYPE; + } + + return ReconcileContainerResponseProto.Status.OK; + } + @Override public boolean containerExist(final ContainerID id) { return containerStateManager.contains(id); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerCommandHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerCommandHandler.java new file mode 100644 index 000000000000..98f36d72f632 --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerCommandHandler.java @@ -0,0 +1,62 @@ +package org.apache.hadoop.hdds.scm.container; + +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.scm.ha.SCMContext; +import org.apache.hadoop.hdds.server.events.EventHandler; +import org.apache.hadoop.hdds.server.events.EventPublisher; +import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode; +import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; +import org.apache.hadoop.ozone.protocol.commands.SCMCommand; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.stream.Collectors; + +import static org.apache.hadoop.hdds.scm.events.SCMEvents.DATANODE_COMMAND; + +/** + * SCM may trigger a reconcile container request when it sees multiple non-open containers whose hashes do not match. + * The reconcile command can also be triggered manually from the command line. + * This command will instruct datanodes to read blocks from their peers that also have replicas of the specified + * container to reach an agreement on its contents. 
+ */ +public class ReconcileContainerCommandHandler implements EventHandler { + + public static final Logger LOG = + LoggerFactory.getLogger(CloseContainerEventHandler.class); + + private final ContainerManager containerManager; + private final SCMContext scmContext; + + public ReconcileContainerCommandHandler(ContainerManager containerManager, SCMContext scmContext) { + this.containerManager = containerManager; + this.scmContext = scmContext; + } + + @Override + public void onMessage(ContainerID containerID, EventPublisher publisher) { + if (!scmContext.isLeader()) { + LOG.info("Skip command to reconcile container {} since the current SCM is not the leader.", + containerID); + return; + } + + try { + List nodesWithReplica = containerManager.getContainerReplicas(containerID) + .stream() + .map(ContainerReplica::getDatanodeDetails) + .collect(Collectors.toList()); + + // TODO fail if container recon not allowed + + // Datanodes will not reconcile with themselves even if they are listed as a source. + // Therefore, send the same source list to every datanode. 
+ SCMCommand reconcileCommand = new ReconcileContainerCommand(containerID.getId(), nodesWithReplica); + nodesWithReplica.forEach(node -> + publisher.fireEvent(DATANODE_COMMAND, new CommandForDatanode<>(node.getUuid(), reconcileCommand))); + } catch (ContainerNotFoundException ex) { + LOG.error("Cannot send reconcile command for unknown container {}", containerID); + } + } +} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index 57d1310407c2..efd644046ee4 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -37,6 +37,7 @@ import org.apache.hadoop.hdds.protocol.proto.ReconfigureProtocolProtos.ReconfigureProtocolService; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto.Builder; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto; @@ -1382,26 +1383,18 @@ public String getMetrics(String query) throws IOException { } @Override - public void reconcileContainer(long containerID) throws IOException { + public ReconcileContainerResponseProto reconcileContainer(long longContainerID) throws IOException { + ContainerID containerID = ContainerID.valueOf(longContainerID); getScm().checkAdminAccess(getRemoteUser(), false); 
final UserGroupInformation remoteUser = getRemoteUser(); final Map auditMap = Maps.newHashMap(); - auditMap.put("containerID", String.valueOf(containerID)); + auditMap.put("containerID", containerID.toString()); auditMap.put("remoteUser", remoteUser.getUserName()); try { - // Reconcile is not allowed on open containers. - ContainerInfo container = scm.getContainerManager().getContainer(ContainerID.valueOf(containerID)); - final HddsProtos.LifeCycleState state = container.getState(); - if (state.equals(HddsProtos.LifeCycleState.OPEN)) { - throw new SCMException("Cannot reconcile a " + state + " container.", ResultCodes.UNEXPECTED_CONTAINER_STATE); - } - // Reconcile on EC containers is not yet implemented. - final HddsProtos.ReplicationType repType = container.getReplicationType(); - if (repType == HddsProtos.ReplicationType.EC) { - throw new SCMException("Reconciliation for erasure coded containers is not yet supported.", - ResultCodes.UNSUPPORTED_OPERATION); - } - scm.getEventQueue().fireEvent(SCMEvents.RECONCILE_CONTAINER, ContainerID.valueOf(containerID)); + // TODO container manager should return status. 
+ ReconcileContainerResponseProto.Status status = scm.getContainerManager().canReconcileContainer(containerID); + + scm.getEventQueue().fireEvent(SCMEvents.RECONCILE_CONTAINER, containerID); AUDIT.logWriteSuccess(buildAuditMessageForSuccess( SCMAction.RECONCILE_CONTAINER, auditMap)); } catch (Exception ex) { @@ -1409,6 +1402,5 @@ public void reconcileContainer(long containerID) throws IOException { SCMAction.RECONCILE_CONTAINER, auditMap, ex)); throw ex; } - } } diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java index f5facb003316..ff8eb79153d9 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java @@ -29,6 +29,7 @@ import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ReadContainerResponseProto; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto; import org.apache.hadoop.hdds.scm.DatanodeAdminError; @@ -569,7 +570,7 @@ public String getMetrics(String query) throws IOException { } @Override - public void reconcileContainer(long id) throws IOException { - storageContainerLocationClient.reconcileContainer(id); + public ReconcileContainerResponseProto reconcileContainer(long id) throws IOException { + return storageContainerLocationClient.reconcileContainer(id); } } diff --git 
a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java index 2783d88753c3..dfc77329867a 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java @@ -20,6 +20,7 @@ import java.io.IOException; import org.apache.hadoop.hdds.cli.HddsVersionProvider; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.scm.cli.ScmSubcommand; import org.apache.hadoop.hdds.scm.client.ScmClient; @@ -41,7 +42,21 @@ public class ReconcileSubcommand extends ScmSubcommand { @Override public void execute(ScmClient scmClient) throws IOException { - // TODO output a status message? - scmClient.reconcileContainer(containerId); + ReconcileContainerResponseProto response = scmClient.reconcileContainer(containerId); + if (response.hasStatus()) { + switch (response.getStatus()) { + case OK: + System.out.println("Reconciliation has been triggered for container " + containerId); + System.out.println("Use \"ozone admin container info " + containerId + "\" to check the hashes of each " + + "container replica"); + break; + case CONTAINER_STILL_OPEN: + System.err.println("Cannot reconcile an open container"); + case UNSUPPORTED_CONTAINER_TYPE: + System.err.println("Reconciliation is currently only supported on Ratis containers"); + default: + System.err.println("Reconciliation encountered an unknown error"); + } + } } } From b259dae99111af6cae81202ad9d6edc5a5a23e48 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 2 Apr 2024 17:52:29 -0700 Subject: [PATCH 03/43] Use SCMException only for error handling --- .../hadoop/hdds/scm/client/ScmClient.java | 2 +- .../StorageContainerLocationProtocol.java | 2 +- 
...ocationProtocolClientSideTranslatorPB.java | 6 ++-- .../src/main/proto/ScmAdminProtocol.proto | 6 ---- .../hdds/scm/container/ContainerManager.java | 3 +- .../scm/container/ContainerManagerImpl.java | 20 +++--------- .../scm/server/SCMClientProtocolServer.java | 32 +++++++++++++------ .../scm/cli/ContainerOperationClient.java | 4 +-- .../cli/container/ReconcileSubcommand.java | 21 +++--------- 9 files changed, 40 insertions(+), 56 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java index 6f28d9fa19e0..0dce695153c0 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java @@ -462,5 +462,5 @@ DecommissionScmResponseProto decommissionScm( * @param containerID The ID of the container to reconcile. * @throws IOException On error */ - ReconcileContainerResponseProto reconcileContainer(long containerID) throws IOException; + void reconcileContainer(long containerID) throws IOException; } diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java index 26fc382a1a07..4856dc32054d 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java @@ -484,5 +484,5 @@ DecommissionScmResponseProto decommissionScm( * @param containerID The ID of the container to reconcile. 
* @throws IOException On error */ - ReconcileContainerResponseProto reconcileContainer(long containerID) throws IOException; + void reconcileContainer(long containerID) throws IOException; } diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java index 5b0fcb7bf7c6..3584a3ef6a8e 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java @@ -1158,11 +1158,11 @@ public String getMetrics(String query) throws IOException { } @Override - public ReconcileContainerResponseProto reconcileContainer(long containerID) throws IOException { + public void reconcileContainer(long containerID) throws IOException { ReconcileContainerRequestProto request = ReconcileContainerRequestProto.newBuilder() .setContainerID(containerID) .build(); - return submitRequest(Type.ReconcileContainer, - builder -> builder.setReconcileContainerRequest(request)).getReconcileContainerResponse(); + // TODO check error handling. 
+ submitRequest(Type.ReconcileContainer, builder -> builder.setReconcileContainerRequest(request)); } } diff --git a/hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto b/hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto index 031596a8321d..0560492e94fd 100644 --- a/hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto +++ b/hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto @@ -637,12 +637,6 @@ message ReconcileContainerRequestProto { } message ReconcileContainerResponseProto { - enum Status { - OK = 1; - CONTAINER_STILL_OPEN = 2; - UNSUPPORTED_CONTAINER_TYPE = 3; - } - optional Status status = 1; } /** diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java index 8406b97bd393..aba95560cb76 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; +import org.apache.hadoop.hdds.scm.exceptions.SCMException; import org.apache.hadoop.hdds.scm.pipeline.Pipeline; import org.apache.hadoop.hdds.utils.db.Table; import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException; @@ -205,7 +206,7 @@ ContainerInfo getMatchingContainer(long size, String owner, void deleteContainer(ContainerID containerID) throws IOException; - ReconcileContainerResponseProto.Status canReconcileContainer(ContainerID containerID) throws ContainerNotFoundException; + void reconcileContainer(ContainerID containerID); /** * Returns containerStateManger. 
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java index 4cbd93029399..bfcf83b4af5d 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java @@ -40,10 +40,11 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ContainerInfoProto; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.scm.ScmConfigKeys; import org.apache.hadoop.hdds.scm.container.metrics.SCMContainerManagerMetrics; import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaPendingOps; +import org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes; +import org.apache.hadoop.hdds.scm.exceptions.SCMException; import org.apache.hadoop.hdds.scm.ha.SCMHAManager; import org.apache.hadoop.hdds.scm.ha.SequenceIdGenerator; import org.apache.hadoop.hdds.scm.pipeline.Pipeline; @@ -423,21 +424,8 @@ public void deleteContainer(final ContainerID cid) } @Override - public ReconcileContainerResponseProto.Status canReconcileContainer(ContainerID containerID) - throws ContainerNotFoundException { - // Reconcile is not allowed on open containers. - ContainerInfo container = getContainer(containerID); - final HddsProtos.LifeCycleState state = container.getState(); - if (state.equals(HddsProtos.LifeCycleState.OPEN)) { - return ReconcileContainerResponseProto.Status.CONTAINER_STILL_OPEN; - } - // Reconcile on EC containers is not yet implemented. 
- final HddsProtos.ReplicationType repType = container.getReplicationType(); - if (repType == HddsProtos.ReplicationType.EC) { - return ReconcileContainerResponseProto.Status.UNSUPPORTED_CONTAINER_TYPE; - } - - return ReconcileContainerResponseProto.Status.OK; + public void reconcileContainer(ContainerID containerID) { + // TODO } @Override diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index efd644046ee4..e9bd14282848 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -1383,24 +1383,36 @@ public String getMetrics(String query) throws IOException { } @Override - public ReconcileContainerResponseProto reconcileContainer(long longContainerID) throws IOException { + public void reconcileContainer(long longContainerID) throws IOException { ContainerID containerID = ContainerID.valueOf(longContainerID); getScm().checkAdminAccess(getRemoteUser(), false); final UserGroupInformation remoteUser = getRemoteUser(); final Map auditMap = Maps.newHashMap(); auditMap.put("containerID", containerID.toString()); auditMap.put("remoteUser", remoteUser.getUserName()); - try { - // TODO container manager should return status. - ReconcileContainerResponseProto.Status status = scm.getContainerManager().canReconcileContainer(containerID); + ContainerInfo container = getContainer(longContainerID); + + SCMException exception = null; + // Reconcile is not allowed on open containers. 
+ final HddsProtos.LifeCycleState state = container.getState(); + if (state.equals(HddsProtos.LifeCycleState.OPEN)) { + exception = new SCMException("Cannot reconcile container in state " + state, + ResultCodes.UNEXPECTED_CONTAINER_STATE); + } + // Reconcile on EC containers is not yet implemented. + final HddsProtos.ReplicationType repType = container.getReplicationType(); + if (repType == HddsProtos.ReplicationType.EC) { + exception = new SCMException("Reconciliation is currently only supported for Ratis containers", + ResultCodes.UNSUPPORTED_OPERATION); + } + + if (exception == null) { scm.getEventQueue().fireEvent(SCMEvents.RECONCILE_CONTAINER, containerID); - AUDIT.logWriteSuccess(buildAuditMessageForSuccess( - SCMAction.RECONCILE_CONTAINER, auditMap)); - } catch (Exception ex) { - AUDIT.logWriteFailure(buildAuditMessageForFailure( - SCMAction.RECONCILE_CONTAINER, auditMap, ex)); - throw ex; + AUDIT.logWriteSuccess(buildAuditMessageForSuccess(SCMAction.RECONCILE_CONTAINER, auditMap)); + } else { + AUDIT.logWriteFailure(buildAuditMessageForFailure(SCMAction.RECONCILE_CONTAINER, auditMap, exception)); + throw exception; } } } diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java index ff8eb79153d9..39a5a3b8cece 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java @@ -570,7 +570,7 @@ public String getMetrics(String query) throws IOException { } @Override - public ReconcileContainerResponseProto reconcileContainer(long id) throws IOException { - return storageContainerLocationClient.reconcileContainer(id); + public void reconcileContainer(long id) throws IOException { + storageContainerLocationClient.reconcileContainer(id); } } diff --git 
a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java index dfc77329867a..6d9e86fccc53 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java @@ -42,21 +42,10 @@ public class ReconcileSubcommand extends ScmSubcommand { @Override public void execute(ScmClient scmClient) throws IOException { - ReconcileContainerResponseProto response = scmClient.reconcileContainer(containerId); - if (response.hasStatus()) { - switch (response.getStatus()) { - case OK: - System.out.println("Reconciliation has been triggered for container " + containerId); - System.out.println("Use \"ozone admin container info " + containerId + "\" to check the hashes of each " + - "container replica"); - break; - case CONTAINER_STILL_OPEN: - System.err.println("Cannot reconcile an open container"); - case UNSUPPORTED_CONTAINER_TYPE: - System.err.println("Reconciliation is currently only supported on Ratis containers"); - default: - System.err.println("Reconciliation encountered an unknown error"); - } - } + scmClient.reconcileContainer(containerId); + System.out.println("Reconciliation has been triggered for container " + containerId); + // TODO a better option to check status may be added later. 
+ System.out.println("Use \"ozone admin container info " + containerId + "\" to check the hashes of each container " + + "replica"); } } From 4a89c8ea5c8d52f052b5a7cc7652779640272b40 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 3 Apr 2024 12:38:04 -0700 Subject: [PATCH 04/43] Add SCM event handler for reconcile events --- .../ReconcileContainerCommandHandler.java | 62 --------------- .../ReconcileContainerEventHandler.java | 76 +++++++++++++++++++ .../scm/server/SCMClientProtocolServer.java | 1 + .../scm/server/StorageContainerManager.java | 5 ++ 4 files changed, 82 insertions(+), 62 deletions(-) delete mode 100644 hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerCommandHandler.java create mode 100644 hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerCommandHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerCommandHandler.java deleted file mode 100644 index 98f36d72f632..000000000000 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerCommandHandler.java +++ /dev/null @@ -1,62 +0,0 @@ -package org.apache.hadoop.hdds.scm.container; - -import org.apache.hadoop.hdds.protocol.DatanodeDetails; -import org.apache.hadoop.hdds.scm.ha.SCMContext; -import org.apache.hadoop.hdds.server.events.EventHandler; -import org.apache.hadoop.hdds.server.events.EventPublisher; -import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode; -import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; -import org.apache.hadoop.ozone.protocol.commands.SCMCommand; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.List; -import java.util.stream.Collectors; - -import static 
org.apache.hadoop.hdds.scm.events.SCMEvents.DATANODE_COMMAND; - -/** - * SCM may trigger a reconcile container request when it sees multiple non-open containers whose hashes do not match. - * The reconcile command can also be triggered manually from the command line. - * This command will instruct datanodes to read blocks from their peers that also have replicas of the specified - * container to reach an agreement on its contents. - */ -public class ReconcileContainerCommandHandler implements EventHandler { - - public static final Logger LOG = - LoggerFactory.getLogger(CloseContainerEventHandler.class); - - private final ContainerManager containerManager; - private final SCMContext scmContext; - - public ReconcileContainerCommandHandler(ContainerManager containerManager, SCMContext scmContext) { - this.containerManager = containerManager; - this.scmContext = scmContext; - } - - @Override - public void onMessage(ContainerID containerID, EventPublisher publisher) { - if (!scmContext.isLeader()) { - LOG.info("Skip command to reconcile container {} since the current SCM is not the leader.", - containerID); - return; - } - - try { - List nodesWithReplica = containerManager.getContainerReplicas(containerID) - .stream() - .map(ContainerReplica::getDatanodeDetails) - .collect(Collectors.toList()); - - // TODO fail if container recon not allowed - - // Datanodes will not reconcile with themselves even if they are listed as a source. - // Therefore, send the same source list to every datanode. 
- SCMCommand reconcileCommand = new ReconcileContainerCommand(containerID.getId(), nodesWithReplica); - nodesWithReplica.forEach(node -> - publisher.fireEvent(DATANODE_COMMAND, new CommandForDatanode<>(node.getUuid(), reconcileCommand))); - } catch (ContainerNotFoundException ex) { - LOG.error("Cannot send reconcile command for unknown container {}", containerID); - } - } -} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java new file mode 100644 index 000000000000..545d6079f472 --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java @@ -0,0 +1,77 @@ +package org.apache.hadoop.hdds.scm.container; + +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.scm.ha.SCMContext; +import org.apache.hadoop.hdds.server.events.EventHandler; +import org.apache.hadoop.hdds.server.events.EventPublisher; +import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode; +import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hadoop.hdds.scm.events.SCMEvents.DATANODE_COMMAND; + +/** + * When a reconcile container event is fired, this class will check if the container is eligible for reconciliation, + * and if so, send the reconcile request to all datanodes with a replica of that container. 
+ */ +public class ReconcileContainerEventHandler implements EventHandler { + public static final Logger LOG = + LoggerFactory.getLogger(ReconcileContainerEventHandler.class); + + private ContainerManager containerManager; + private SCMContext scmContext; + + public ReconcileContainerEventHandler( + final ContainerManager containerManager, + final SCMContext scmContext) { + this.containerManager = containerManager; + this.scmContext = scmContext; + } + + @Override + public void onMessage(ContainerID containerID, EventPublisher publisher) { + if (!scmContext.isLeader()) { + LOG.info("Skip reconciling container {} since current SCM is not leader.", containerID); + return; + } + + try { + ContainerInfo container = containerManager.getContainer(containerID); + final HddsProtos.LifeCycleState state = container.getState(); + if (state.equals(HddsProtos.LifeCycleState.OPEN)) { + LOG.error("Cannot reconcile container in state {}.", state); + return; + } + + // This restriction can be removed when reconciliation for EC containers is added. + final HddsProtos.ReplicationType repType = container.getReplicationType(); + if (repType == HddsProtos.ReplicationType.EC) { + LOG.error("Cannot reconcile container {} with replication type {}. Reconciliation is currently only supported" + + " for Ratis containers.", containerID, repType); + return; + } + + // create SCMCommand + Set replicas = containerManager.getContainerReplicas(containerID) + .stream() + .map(ContainerReplica::getDatanodeDetails) + .collect(Collectors.toSet()); + + for (DatanodeDetails replica: replicas) { + List otherReplicas = replicas.stream() + .filter(other -> !other.equals(replica)) + .collect(Collectors.toList()); + ReconcileContainerCommand command = new ReconcileContainerCommand(containerID.getId(), otherReplicas); + publisher.fireEvent(DATANODE_COMMAND, new CommandForDatanode<>(replica.getUuid(), command)); + } + } catch (ContainerNotFoundException ex) { + LOG.error("Failed to start reconciliation for container {}. 
Container not found.", containerID); + } + } +} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index e9bd14282848..137cf7b527a5 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -1391,6 +1391,7 @@ public void reconcileContainer(long longContainerID) throws IOException { auditMap.put("containerID", containerID.toString()); auditMap.put("remoteUser", remoteUser.getUserName()); + // TODO need to audit log failure if this happens ContainerInfo container = getContainer(longContainerID); SCMException exception = null; diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index 11fdc0d16d79..35c550f42f0e 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -49,6 +49,7 @@ import org.apache.hadoop.hdds.scm.container.ContainerManager; import org.apache.hadoop.hdds.scm.container.ContainerManagerImpl; import org.apache.hadoop.hdds.scm.PlacementPolicyValidateProxy; +import org.apache.hadoop.hdds.scm.container.ReconcileContainerEventHandler; import org.apache.hadoop.hdds.scm.container.balancer.MoveManager; import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaPendingOps; import org.apache.hadoop.hdds.scm.container.replication.DatanodeCommandCountUpdatedHandler; @@ -506,6 +507,9 @@ private void initializeEventHandlers() { CRLStatusReportHandler crlStatusReportHandler = new 
CRLStatusReportHandler(certificateStore, configuration); + ReconcileContainerEventHandler reconcileContainerEventHandler = + new ReconcileContainerEventHandler(containerManager, scmContext); + eventQueue.addHandler(SCMEvents.DATANODE_COMMAND, scmNodeManager); eventQueue.addHandler(SCMEvents.RETRIABLE_DATANODE_COMMAND, scmNodeManager); eventQueue.addHandler(SCMEvents.NODE_REPORT, nodeReportHandler); @@ -578,6 +582,7 @@ private void initializeEventHandlers() { eventQueue.addHandler(SCMEvents.PIPELINE_ACTIONS, pipelineActionHandler); eventQueue.addHandler(SCMEvents.PIPELINE_REPORT, pipelineReportHandler); eventQueue.addHandler(SCMEvents.CRL_STATUS_REPORT, crlStatusReportHandler); + eventQueue.addHandler(SCMEvents.RECONCILE_CONTAINER, reconcileContainerEventHandler); scmNodeManager.registerSendCommandNotify( SCMCommandProto.Type.deleteBlocksCommand, From f9d1bfd30fdfed3a1d1b9ecbcbe5b9d53f80f1bb Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 3 Apr 2024 16:59:01 -0700 Subject: [PATCH 05/43] Add datanode reconcile stub. Also renamed existing checksum field and methods in ContainerData. 
--- .../common/helpers/ContainerUtils.java | 6 +- .../container/common/impl/ContainerData.java | 41 +++++---- .../common/impl/ContainerDataYaml.java | 4 +- .../container/common/interfaces/Handler.java | 9 ++ .../statemachine/DatanodeStateMachine.java | 2 + .../ReconcileContainerCommandHandler.java | 85 +++++++++++++++++++ .../endpoint/HeartbeatEndpointTask.java | 6 ++ .../container/keyvalue/KeyValueContainer.java | 3 +- .../container/keyvalue/KeyValueHandler.java | 10 +++ .../ozoneimpl/ContainerController.java | 11 +++ .../TestSchemaOneBackwardsCompatibility.java | 2 +- .../replication/TestContainerImporter.java | 3 +- .../ScmServerDatanodeHeartbeatProtocol.proto | 4 +- 13 files changed, 160 insertions(+), 26 deletions(-) create mode 100644 hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java index b89ecff48c90..759b5edae3bb 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java @@ -201,14 +201,14 @@ public static void verifyChecksum(ContainerData containerData, HddsConfigKeys. 
HDDS_CONTAINER_CHECKSUM_VERIFICATION_ENABLED_DEFAULT); if (enabled) { - String storedChecksum = containerData.getChecksum(); + String storedChecksum = containerData.getContainerFileChecksum(); Yaml yaml = ContainerDataYaml.getYamlForContainerType( containerData.getContainerType(), containerData instanceof KeyValueContainerData && ((KeyValueContainerData)containerData).getReplicaIndex() > 0); - containerData.computeAndSetChecksum(yaml); - String computedChecksum = containerData.getChecksum(); + containerData.computeAndSetContainerFileChecksum(yaml); + String computedChecksum = containerData.getContainerFileChecksum(); if (storedChecksum == null || !storedChecksum.equals(computedChecksum)) { throw new StorageContainerException("Container checksum error for " + diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java index 3c202ba60a8a..48c6c2e33d15 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java @@ -99,7 +99,11 @@ public abstract class ContainerData { private HddsVolume volume; - private String checksum; + private String containerFileChecksum; + + // TODO This should have type Checksum once we decide on the checksum implementation to use. + // Currently this is just a placeholder to save data for testing. 
+ private String dataChecksum; private boolean isEmpty; @@ -112,7 +116,7 @@ public abstract class ContainerData { private transient Optional lastDataScanTime = Optional.empty(); public static final Charset CHARSET_ENCODING = StandardCharsets.UTF_8; - private static final String DUMMY_CHECKSUM = new String(new byte[64], + private static final String ZERO_CHECKSUM = new String(new byte[64], CHARSET_ENCODING); // Common Fields need to be stored in .container file. @@ -159,7 +163,8 @@ protected ContainerData(ContainerType type, long containerId, this.originPipelineId = originPipelineId; this.originNodeId = originNodeId; this.isEmpty = false; - setChecksumTo0ByteArray(); + this.containerFileChecksum = ZERO_CHECKSUM; + this.dataChecksum = ""; } protected ContainerData(ContainerData source) { @@ -571,16 +576,12 @@ public void setBlockCount(long count) { this.blockCount.set(count); } - public void setChecksumTo0ByteArray() { - this.checksum = DUMMY_CHECKSUM; - } - - public void setChecksum(String checkSum) { - this.checksum = checkSum; + public void setContainerFileChecksum(String checkSum) { + this.containerFileChecksum = checkSum; } - public String getChecksum() { - return this.checksum; + public String getContainerFileChecksum() { + return this.containerFileChecksum; } /** @@ -629,22 +630,30 @@ public String getOriginNodeId() { * on ContainerType) and set the checksum. * * Checksum of ContainerData is calculated by setting the - * {@link ContainerData#checksum} field to a 64-byte array with all 0's - - * {@link ContainerData#DUMMY_CHECKSUM}. After the checksum is calculated, + * {@link ContainerData#containerFileChecksum} field to a 64-byte array with all 0's - + * {@link ContainerData#ZERO_CHECKSUM}. After the checksum is calculated, * the checksum field is updated with this value. 
* * @param yaml Yaml for ContainerType to get the ContainerData as Yaml String * @throws IOException */ - public void computeAndSetChecksum(Yaml yaml) throws IOException { + public void computeAndSetContainerFileChecksum(Yaml yaml) throws IOException { // Set checksum to dummy value - 0 byte array, to calculate the checksum // of rest of the data. - setChecksumTo0ByteArray(); + this.containerFileChecksum = ZERO_CHECKSUM; // Dump yaml data into a string to compute its checksum String containerDataYamlStr = yaml.dump(this); - this.checksum = ContainerUtils.getChecksum(containerDataYamlStr); + this.containerFileChecksum = ContainerUtils.getChecksum(containerDataYamlStr); + } + + public void setDataChecksum(String checksum) { + dataChecksum = checksum; + } + + public String getDataChecksum() { + return dataChecksum; } /** diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerDataYaml.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerDataYaml.java index a4750b5fae01..140a462676b0 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerDataYaml.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerDataYaml.java @@ -98,7 +98,7 @@ public static void createContainerFile(ContainerType containerType, // Create Yaml for given container type Yaml yaml = getYamlForContainerType(containerType, withReplicaIndex); // Compute Checksum and update ContainerData - containerData.computeAndSetChecksum(yaml); + containerData.computeAndSetContainerFileChecksum(yaml); // Write the ContainerData with checksum to Yaml file. 
out = new FileOutputStream( @@ -312,7 +312,7 @@ public Object construct(Node node) { kvData.setChunksPath((String) nodes.get(OzoneConsts.CHUNKS_PATH)); Map meta = (Map) nodes.get(OzoneConsts.METADATA); kvData.setMetadata(meta); - kvData.setChecksum((String) nodes.get(OzoneConsts.CHECKSUM)); + kvData.setContainerFileChecksum((String) nodes.get(OzoneConsts.CHECKSUM)); Long timestamp = (Long) nodes.get(OzoneConsts.DATA_SCAN_TIMESTAMP); kvData.setDataScanTimestamp(timestamp); String state = (String) nodes.get(OzoneConsts.STATE); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java index 2ffb9d30d1f4..179274f2c024 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java @@ -21,8 +21,10 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.util.List; import org.apache.hadoop.hdds.conf.ConfigurationSource; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandRequestProto; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto; @@ -192,6 +194,13 @@ public abstract void closeContainer(Container container) public abstract void deleteContainer(Container container, boolean force) throws IOException; + /** + * Triggers reconciliation of this container replica's data with its peers. + * @param container container to be reconciled. + * @param peers The other datanodes with a copy of this container whose data should be checked. 
+ */ + public abstract void reconcileContainer(Container container, List peers) throws IOException; + /** * Deletes the given files associated with a block of the container. * diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java index 967714405491..9292dba5fdd7 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java @@ -56,6 +56,7 @@ import org.apache.hadoop.ozone.container.common.statemachine.commandhandler.DeleteBlocksCommandHandler; import org.apache.hadoop.ozone.container.common.statemachine.commandhandler.DeleteContainerCommandHandler; import org.apache.hadoop.ozone.container.common.statemachine.commandhandler.FinalizeNewLayoutVersionCommandHandler; +import org.apache.hadoop.ozone.container.common.statemachine.commandhandler.ReconcileContainerCommandHandler; import org.apache.hadoop.ozone.container.common.statemachine.commandhandler.ReconstructECContainersCommandHandler; import org.apache.hadoop.ozone.container.common.statemachine.commandhandler.RefreshVolumeUsageCommandHandler; import org.apache.hadoop.ozone.container.common.statemachine.commandhandler.ReplicateContainerCommandHandler; @@ -258,6 +259,7 @@ public DatanodeStateMachine(DatanodeDetails datanodeDetails, supervisor::nodeStateUpdated)) .addHandler(new FinalizeNewLayoutVersionCommandHandler()) .addHandler(new RefreshVolumeUsageCommandHandler()) + .addHandler(new ReconcileContainerCommandHandler(threadNamePrefix)) .setConnectionManager(connectionManager) .setContainer(container) .setContext(context) diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java new file mode 100644 index 000000000000..e95eefe029c1 --- /dev/null +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java @@ -0,0 +1,85 @@ +package org.apache.hadoop.ozone.container.common.statemachine.commandhandler; + +import com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ReconcileContainerCommandProto; +import org.apache.hadoop.ozone.container.common.statemachine.SCMConnectionManager; +import org.apache.hadoop.ozone.container.common.statemachine.StateContext; +import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer; +import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; +import org.apache.hadoop.ozone.protocol.commands.SCMCommand; +import org.apache.hadoop.util.Time; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +public class ReconcileContainerCommandHandler implements CommandHandler { + private static final Logger LOG = + LoggerFactory.getLogger(ReconcileContainerCommandHandler.class); + + private final AtomicLong invocationCount; + private final AtomicInteger queuedCount; + private final ExecutorService executor; 
+ private long totalTime; + + public ReconcileContainerCommandHandler(String threadNamePrefix) { + invocationCount = new AtomicLong(0); + queuedCount = new AtomicInteger(0); + // TODO Allow configurable thread pool size with a default value when the implementation is ready. + executor = Executors.newSingleThreadExecutor(new ThreadFactoryBuilder() + .setNameFormat(threadNamePrefix + "ReconcileContainerThread-%d") + .build()); + totalTime = 0; + } + + @Override + public void handle(SCMCommand command, OzoneContainer container, StateContext context, SCMConnectionManager connectionManager) { + queuedCount.incrementAndGet(); + CompletableFuture.runAsync(() -> { + queuedCount.incrementAndGet(); + long startTime = Time.monotonicNow(); + ReconcileContainerCommand reconcileCommand = (ReconcileContainerCommand) command; + try { + container.getController().reconcileContainer(reconcileCommand.getContainerID(), + reconcileCommand.getSourceDatanodes()); + } catch (IOException ex) { + LOG.error("Failed to reconcile container {}.", reconcileCommand.getContainerID(), ex); + } finally { + long endTime = Time.monotonicNow(); + totalTime += endTime - startTime; + } + }, executor).whenComplete((v, e) -> queuedCount.decrementAndGet()); + } + + @Override + public SCMCommandProto.Type getCommandType() { + return SCMCommandProto.Type.reconcileContainerCommand; + } + + @Override + public int getInvocationCount() { + return 0; + } + + @Override + public long getAverageRunTime() { + return 0; + } + + @Override + public long getTotalRunTime() { + return 0; + } + + @Override + public int getQueuedCount() { + return 0; + } +} diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java index 44f0eae49ead..b6ab4748fe30 100644 --- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/states/endpoint/HeartbeatEndpointTask.java @@ -55,6 +55,7 @@ import org.apache.hadoop.ozone.protocol.commands.DeleteBlocksCommand; import org.apache.hadoop.ozone.protocol.commands.DeleteContainerCommand; import org.apache.hadoop.ozone.protocol.commands.FinalizeNewLayoutVersionCommand; +import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; import org.apache.hadoop.ozone.protocol.commands.ReconstructECContainersCommand; import org.apache.hadoop.ozone.protocol.commands.RefreshVolumeUsageCommand; import org.apache.hadoop.ozone.protocol.commands.ReplicateContainerCommand; @@ -416,6 +417,11 @@ private void processResponse(SCMHeartbeatResponseProto response, commandResponseProto.getRefreshVolumeUsageCommandProto()); processCommonCommand(commandResponseProto, refreshVolumeUsageCommand); break; + case reconcileContainerCommand: + ReconcileContainerCommand reconcileContainerCommand = + ReconcileContainerCommand.getFromProtobuf(commandResponseProto.getReconcileContainerCommandProto()); + processCommonCommand(commandResponseProto, reconcileContainerCommand); + break; default: throw new IllegalArgumentException("Unknown response : " + commandResponseProto.getCommandType().name()); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainer.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainer.java index 838818266757..de43a29d9f6d 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainer.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainer.java @@ -885,7 +885,8 @@ public ContainerReplicaProto 
getContainerReport() .setDeleteTransactionId(containerData.getDeleteTransactionId()) .setBlockCommitSequenceId(containerData.getBlockCommitSequenceId()) .setOriginNodeId(containerData.getOriginNodeId()) - .setIsEmpty(containerData.isEmpty()); + .setIsEmpty(containerData.isEmpty()) + .setDataChecksum(containerData.getDataChecksum()); return ciBuilder.build(); } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java index e35c6345683f..ffa7e2e5e10f 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java @@ -41,6 +41,7 @@ import org.apache.hadoop.hdds.client.BlockID; import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.conf.StorageUnit; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandRequestProto; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto; @@ -1150,6 +1151,15 @@ public void deleteContainer(Container container, boolean force) deleteInternal(container, force); } + @Override + public void reconcileContainer(Container container, List peers) throws IOException { + // TODO Just a deterministic placeholder hash for testing until actual implementation is finished. 
+ ContainerData data = container.getContainerData(); + String dataChecksum = ContainerUtils.getChecksum(Long.toString(data.getContainerID())); + data.setDataChecksum(dataChecksum); + sendICR(container); + } + /** * Called by BlockDeletingService to delete all the chunks in a block * before proceeding to delete the block info from DB. diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerController.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerController.java index feb580538747..feb86f351975 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerController.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerController.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.ozone.container.ozoneimpl; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.datanode.proto .ContainerProtos.ContainerType; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos @@ -38,6 +39,7 @@ import java.io.OutputStream; import java.time.Instant; import java.util.Iterator; +import java.util.List; import java.util.Map; import java.util.Set; @@ -188,6 +190,15 @@ public void deleteContainer(final long containerId, boolean force) } } + public void reconcileContainer(long containerID, List peers) throws IOException { + Container container = containerSet.getContainer(containerID); + if (container == null) { + LOG.warn("Container {} to reconcile not found on this datanode.", containerID); + } else { + getHandler(container).reconcileContainer(container, peers); + } + } + /** * Given a container, returns its handler instance. 
* diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestSchemaOneBackwardsCompatibility.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestSchemaOneBackwardsCompatibility.java index 2235b23ce882..ad5ca482189b 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestSchemaOneBackwardsCompatibility.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestSchemaOneBackwardsCompatibility.java @@ -614,7 +614,7 @@ private KeyValueContainerData newKvData() throws IOException { Yaml yaml = ContainerDataYaml.getYamlForContainerType( kvData.getContainerType(), kvData.getReplicaIndex() > 0); - kvData.computeAndSetChecksum(yaml); + kvData.computeAndSetContainerFileChecksum(yaml); KeyValueContainerUtil.parseKVContainerData(kvData, conf); diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestContainerImporter.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestContainerImporter.java index 1b989e6bc7ff..a2a397ebc420 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestContainerImporter.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestContainerImporter.java @@ -152,8 +152,7 @@ public void testInconsistentChecksumContainerShouldThrowError() throws Exception KeyValueContainerData containerData = spy(new KeyValueContainerData(containerId, ContainerLayoutVersion.FILE_PER_BLOCK, 100, "test", "test")); // mock to return different checksum - when(containerData.getChecksum()).thenReturn("checksum1", "checksum2"); - doNothing().when(containerData).setChecksumTo0ByteArray(); + when(containerData.getContainerFileChecksum()).thenReturn("checksum1", "checksum2"); // create containerImporter object 
ContainerController controllerMock = mock(ContainerController.class); ContainerSet containerSet = new ContainerSet(0); diff --git a/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto b/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto index 8363eba27742..497441fe748c 100644 --- a/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto +++ b/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto @@ -230,12 +230,14 @@ message ContainerReplicaProto { optional int64 writeCount = 7; optional int64 readBytes = 8; optional int64 writeBytes = 9; - optional string finalhash = 10; + optional string finalhash = 10 [ deprecated = true ]; optional int64 deleteTransactionId = 11; optional uint64 blockCommitSequenceId = 12; optional string originNodeId = 13; optional int32 replicaIndex = 14; optional bool isEmpty = 15 [default = false]; + // TODO Should we create a Checksum type here like DatanodeClientProtocol has? 
+ optional string dataChecksum = 16; } message CommandStatusReportsProto { From ddf3ce8bde2ffe3deb2ad22c985178904664fc99 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Thu, 4 Apr 2024 16:24:10 -0700 Subject: [PATCH 06/43] Updates after reviewing diff --- .../ReconcileContainerCommandHandler.java | 18 ++++++--- .../commands/ReconcileContainerCommand.java | 25 ++++++------ .../ScmServerDatanodeHeartbeatProtocol.proto | 8 ++-- .../hdds/scm/container/ContainerManager.java | 2 - .../scm/container/ContainerManagerImpl.java | 5 --- .../ReconcileContainerEventHandler.java | 4 +- .../scm/server/SCMClientProtocolServer.java | 39 +++++++++---------- .../scm/cli/container/ContainerCommands.java | 4 +- .../cli/container/ReconcileSubcommand.java | 1 - 9 files changed, 53 insertions(+), 53 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java index e95eefe029c1..22766a26d2c3 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java @@ -2,7 +2,6 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ReconcileContainerCommandProto; import org.apache.hadoop.ozone.container.common.statemachine.SCMConnectionManager; import org.apache.hadoop.ozone.container.common.statemachine.StateContext; import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer; @@ 
-16,10 +15,12 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +/** + * Handles commands from SCM to reconcile a container replica on this datanode with the replicas on its peers. + */ public class ReconcileContainerCommandHandler implements CommandHandler { private static final Logger LOG = LoggerFactory.getLogger(ReconcileContainerCommandHandler.class); @@ -46,9 +47,11 @@ public void handle(SCMCommand command, OzoneContainer container, StateContext co -    queuedCount.incrementAndGet(); +    invocationCount.incrementAndGet(); long startTime = Time.monotonicNow(); ReconcileContainerCommand reconcileCommand = (ReconcileContainerCommand) command; + LOG.info("Processing reconcile container command for container {} with peers {}", + reconcileCommand.getContainerID(), reconcileCommand.getPeerDatanodes()); try { container.getController().reconcileContainer(reconcileCommand.getContainerID(), - reconcileCommand.getSourceDatanodes()); + reconcileCommand.getPeerDatanodes()); } catch (IOException ex) { LOG.error("Failed to reconcile container {}.", reconcileCommand.getContainerID(), ex); } finally { @@ -65,21 +68,24 @@ public SCMCommandProto.Type getCommandType() { @Override public int getInvocationCount() { - return 0; + return (int)invocationCount.get(); } @Override public long getAverageRunTime() { + if (invocationCount.get() > 0) { + return totalTime / invocationCount.get(); + } return 0; } @Override public long getTotalRunTime() { - return 0; + return totalTime; } @Override public int getQueuedCount() { - return 0; + return queuedCount.get(); } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconcileContainerCommand.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconcileContainerCommand.java index 2dac0b920622..3e4d47fff5af 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconcileContainerCommand.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconcileContainerCommand.java @@ -5,7 +5,6 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ReconcileContainerCommandProto; -import org.apache.hadoop.hdds.scm.container.ContainerID; import java.util.List; import java.util.stream.Collectors; @@ -17,12 +16,12 @@ */ public class ReconcileContainerCommand extends SCMCommand { - private final List sourceDatanodes; + private final List peerDatanodes; - public ReconcileContainerCommand(long containerID, List sourceDatanodes) { + public ReconcileContainerCommand(long containerID, List peerDatanodes) { // Container ID serves as command ID, since only one reconciliation should be in progress at a time. 
super(containerID); - this.sourceDatanodes = sourceDatanodes; + this.peerDatanodes = peerDatanodes; } @@ -35,14 +34,14 @@ public SCMCommandProto.Type getType() { public ReconcileContainerCommandProto getProto() { ReconcileContainerCommandProto.Builder builder = ReconcileContainerCommandProto.newBuilder() .setContainerID(getId()); - for (DatanodeDetails dd : sourceDatanodes) { - builder.addSources(dd.getProtoBufMessage()); + for (DatanodeDetails dd : peerDatanodes) { + builder.addPeers(dd.getProtoBufMessage()); } return builder.build(); } - public List getSourceDatanodes() { - return sourceDatanodes; + public List getPeerDatanodes() { + return peerDatanodes; } public long getContainerID() { @@ -52,20 +51,20 @@ public long getContainerID() { public static ReconcileContainerCommand getFromProtobuf(ReconcileContainerCommandProto protoMessage) { Preconditions.checkNotNull(protoMessage); - List sources = protoMessage.getSourcesList(); - List sourceNodes = !sources.isEmpty() - ? sources.stream() + List peers = protoMessage.getPeersList(); + List peerNodes = !peers.isEmpty() + ? 
peers.stream() .map(DatanodeDetails::getFromProtoBuf) .collect(Collectors.toList()) : emptyList(); - return new ReconcileContainerCommand(protoMessage.getContainerID(), sourceNodes); + return new ReconcileContainerCommand(protoMessage.getContainerID(), peerNodes); } @Override public String toString() { return getType() + ": containerId=" + getContainerID() + - ", sourceNodes=" + sourceDatanodes; + ", peerNodes=" + peerDatanodes; } } diff --git a/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto b/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto index 497441fe748c..187ee32e8390 100644 --- a/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto +++ b/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto @@ -236,7 +236,7 @@ message ContainerReplicaProto { optional string originNodeId = 13; optional int32 replicaIndex = 14; optional bool isEmpty = 15 [default = false]; - // TODO Should we create a Checksum type here like DatanodeClientProtocol has? + // TODO Leaving this as a string for now. We can define a checksum type like the client protocol has later. optional string dataChecksum = 16; } @@ -504,12 +504,12 @@ message FinalizeNewLayoutVersionCommandProto { } /** -This command asks the datanode to replicate a container from specific sources. +This command asks the datanode to reconcile its copy of a container with its peer datanodes that also have a copy of +the container. 
*/ message ReconcileContainerCommandProto { required int64 containerID = 1; - repeated DatanodeDetailsProto sources = 2; - required int64 cmdId = 3; + repeated DatanodeDetailsProto peers = 2; } /** diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java index aba95560cb76..2f4f1035d6ea 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java @@ -206,8 +206,6 @@ ContainerInfo getMatchingContainer(long size, String owner, void deleteContainer(ContainerID containerID) throws IOException; - void reconcileContainer(ContainerID containerID); - /** * Returns containerStateManger. * @return containerStateManger diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java index bfcf83b4af5d..f4d9b93e081b 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java @@ -423,11 +423,6 @@ public void deleteContainer(final ContainerID cid) } } - @Override - public void reconcileContainer(ContainerID containerID) { - // TODO - } - @Override public boolean containerExist(final ContainerID id) { return containerStateManager.contains(id); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java index 545d6079f472..c7b0ece41eeb 100644 --- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java @@ -54,14 +54,16 @@ public void onMessage(ContainerID containerID, EventPublisher publisher) { if (repType == HddsProtos.ReplicationType.EC) { LOG.error("Cannot reconcile container {} with replication type {}. Reconciliation is currently only supported" + " for Ratis containers.", containerID, repType); + return; } - // create SCMCommand Set replicas = containerManager.getContainerReplicas(containerID) .stream() .map(ContainerReplica::getDatanodeDetails) .collect(Collectors.toSet()); + LOG.info("Reconcile container event triggered for container {} with peers {}", containerID, replicas); + for (DatanodeDetails replica: replicas) { List otherReplicas = replicas.stream() .filter(other -> !other.equals(replica)) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index 137cf7b527a5..33cffd8e76ba 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -1391,29 +1391,28 @@ public void reconcileContainer(long longContainerID) throws IOException { auditMap.put("containerID", containerID.toString()); auditMap.put("remoteUser", remoteUser.getUserName()); - // TODO need to audit log failure if this happens - ContainerInfo container = getContainer(longContainerID); - - SCMException exception = null; - // Reconcile is not allowed on open containers. 
- final HddsProtos.LifeCycleState state = container.getState(); - if (state.equals(HddsProtos.LifeCycleState.OPEN)) { - exception = new SCMException("Cannot reconcile container in state " + state, - ResultCodes.UNEXPECTED_CONTAINER_STATE); - } - // Reconcile on EC containers is not yet implemented. - final HddsProtos.ReplicationType repType = container.getReplicationType(); - if (repType == HddsProtos.ReplicationType.EC) { - exception = new SCMException("Reconciliation is currently only supported for Ratis containers", - ResultCodes.UNSUPPORTED_OPERATION); - } + try { + // May throw ContainerNotFoundException, which will be caught, audited, and returned to the user. + ContainerInfo container = getContainer(longContainerID); + + // Reconcile is not allowed on open containers. + final HddsProtos.LifeCycleState state = container.getState(); + if (state.equals(HddsProtos.LifeCycleState.OPEN)) { + throw new SCMException("Cannot reconcile container in state " + state, + ResultCodes.UNEXPECTED_CONTAINER_STATE); + } + // Reconcile on EC containers is not yet implemented. 
+ final HddsProtos.ReplicationType repType = container.getReplicationType(); + if (repType == HddsProtos.ReplicationType.EC) { + throw new SCMException("Reconciliation is currently only supported for Ratis containers", + ResultCodes.UNSUPPORTED_OPERATION); + } - if (exception == null) { scm.getEventQueue().fireEvent(SCMEvents.RECONCILE_CONTAINER, containerID); AUDIT.logWriteSuccess(buildAuditMessageForSuccess(SCMAction.RECONCILE_CONTAINER, auditMap)); - } else { - AUDIT.logWriteFailure(buildAuditMessageForFailure(SCMAction.RECONCILE_CONTAINER, auditMap, exception)); - throw exception; + } catch (SCMException ex) { + AUDIT.logWriteFailure(buildAuditMessageForFailure(SCMAction.RECONCILE_CONTAINER, auditMap, ex)); + throw ex; } } } diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ContainerCommands.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ContainerCommands.java index 54c69273f0bc..ae273f1d1710 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ContainerCommands.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ContainerCommands.java @@ -43,7 +43,8 @@ CreateSubcommand.class, CloseSubcommand.class, ReportSubcommand.class, - UpgradeSubcommand.class + UpgradeSubcommand.class, + ReconcileSubcommand.class }) @MetaInfServices(SubcommandWithParent.class) public class ContainerCommands implements Callable, SubcommandWithParent { diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java
b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java index 6d9e86fccc53..a4e556e373d5 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java @@ -20,7 +20,6 @@ import java.io.IOException; import org.apache.hadoop.hdds.cli.HddsVersionProvider; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.scm.cli.ScmSubcommand; import org.apache.hadoop.hdds.scm.client.ScmClient; From 47f6c0658df6e7fd22a8faa6f7872e7ac7f320a8 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Thu, 4 Apr 2024 17:03:39 -0700 Subject: [PATCH 07/43] Fix checkstyle --- .../main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java | 1 - .../hdds/scm/protocol/StorageContainerLocationProtocol.java | 1 - .../commandhandler/ReconcileContainerCommandHandler.java | 3 ++- ...StorageContainerLocationProtocolClientSideTranslatorPB.java | 1 - .../org/apache/hadoop/hdds/scm/container/ContainerManager.java | 2 -- .../apache/hadoop/hdds/scm/container/ContainerManagerImpl.java | 2 -- .../apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java | 1 - .../apache/hadoop/hdds/scm/cli/ContainerOperationClient.java | 1 - 8 files changed, 2 insertions(+), 10 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java index 0dce695153c0..2e56d141b363 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java @@ -22,7 +22,6 @@ import org.apache.hadoop.hdds.client.ReplicationConfig; import org.apache.hadoop.hdds.protocol.DatanodeDetails; import 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto; import org.apache.hadoop.hdds.scm.DatanodeAdminError; diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java index 4856dc32054d..1d9210a7f766 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocol.java @@ -23,7 +23,6 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type; import org.apache.hadoop.hdds.scm.DatanodeAdminError; diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java index 22766a26d2c3..7c8112b1c09e 100644 --- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java @@ -41,7 +41,8 @@ public ReconcileContainerCommandHandler(String threadNamePrefix) { } @Override - public void handle(SCMCommand command, OzoneContainer container, StateContext context, SCMConnectionManager connectionManager) { + public void handle(SCMCommand command, OzoneContainer container, StateContext context, + SCMConnectionManager connectionManager) { queuedCount.incrementAndGet(); CompletableFuture.runAsync(() -> { queuedCount.incrementAndGet(); diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java index 3584a3ef6a8e..b383aa9008cd 100644 --- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java +++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java @@ -104,7 +104,6 @@ import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StopContainerBalancerRequestProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ResetDeletedBlockRetryCountRequestProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerRequestProto; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type; import 
org.apache.hadoop.hdds.scm.DatanodeAdminError; import org.apache.hadoop.hdds.scm.ScmInfo; diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java index 2f4f1035d6ea..2a60e268ff4a 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManager.java @@ -26,8 +26,6 @@ import org.apache.hadoop.hdds.client.ReplicationConfig; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; -import org.apache.hadoop.hdds.scm.exceptions.SCMException; import org.apache.hadoop.hdds.scm.pipeline.Pipeline; import org.apache.hadoop.hdds.utils.db.Table; import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException; diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java index f4d9b93e081b..8e1e881c44ea 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerManagerImpl.java @@ -43,8 +43,6 @@ import org.apache.hadoop.hdds.scm.ScmConfigKeys; import org.apache.hadoop.hdds.scm.container.metrics.SCMContainerManagerMetrics; import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaPendingOps; -import org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes; -import org.apache.hadoop.hdds.scm.exceptions.SCMException; import 
org.apache.hadoop.hdds.scm.ha.SCMHAManager; import org.apache.hadoop.hdds.scm.ha.SequenceIdGenerator; import org.apache.hadoop.hdds.scm.pipeline.Pipeline; diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index 33cffd8e76ba..bbba1746c429 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -37,7 +37,6 @@ import org.apache.hadoop.hdds.protocol.proto.ReconfigureProtocolProtos.ReconfigureProtocolService; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto.Builder; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto; diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java index 39a5a3b8cece..f5facb003316 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/ContainerOperationClient.java @@ -29,7 +29,6 @@ import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ReadContainerResponseProto; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; 
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto; import org.apache.hadoop.hdds.scm.DatanodeAdminError; From 9154e3cee64523cef69f66febecd54dbc1f169ec Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 9 Apr 2024 17:07:42 -0700 Subject: [PATCH 08/43] Basic reconcile scm <-> DN works --- .../hdds/scm/container/ContainerReplicaInfo.java | 13 ++++++++++++- .../interface-client/src/main/proto/hdds.proto | 1 + .../container/AbstractContainerReportHandler.java | 1 + .../hdds/scm/container/ContainerReplica.java | 14 ++++++++++++++ .../container/ReconcileContainerEventHandler.java | 6 +++++- ...inerLocationProtocolServerSideTranslatorPB.java | 13 +++++++++++++ .../hdds/scm/server/SCMClientProtocolServer.java | 4 +++- .../hdds/scm/server/SCMDatanodeProtocolServer.java | 8 ++++++++ .../hdds/scm/cli/container/ContainerCommands.java | 3 +-- .../scm/cli/container/ReconcileSubcommand.java | 2 +- 10 files changed, 59 insertions(+), 6 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java index 5a81f6bb47a1..1cde385b67a5 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java @@ -35,6 +35,7 @@ public final class ContainerReplicaInfo { private long keyCount; private long bytesUsed; private int replicaIndex = -1; + private String dataChecksum; public static ContainerReplicaInfo 
fromProto( HddsProtos.SCMContainerReplicaProto proto) { @@ -48,7 +49,8 @@ public static ContainerReplicaInfo fromProto( .setKeyCount(proto.getKeyCount()) .setBytesUsed(proto.getBytesUsed()) .setReplicaIndex( - proto.hasReplicaIndex() ? (int)proto.getReplicaIndex() : -1); + proto.hasReplicaIndex() ? (int)proto.getReplicaIndex() : -1) + .setDataChecksum(proto.getDataChecksum()); return builder.build(); } @@ -87,6 +89,10 @@ public int getReplicaIndex() { return replicaIndex; } + public String getDataChecksum() { + return dataChecksum; + } + /** * Builder for ContainerReplicaInfo class. */ @@ -134,6 +140,11 @@ public Builder setReplicaIndex(int replicaIndex) { return this; } + public Builder setDataChecksum(String dataChecksum) { + subject.dataChecksum = dataChecksum; + return this; + } + public ContainerReplicaInfo build() { return subject; } diff --git a/hadoop-hdds/interface-client/src/main/proto/hdds.proto b/hadoop-hdds/interface-client/src/main/proto/hdds.proto index 3f346300b3ed..7c80adb7ee1d 100644 --- a/hadoop-hdds/interface-client/src/main/proto/hdds.proto +++ b/hadoop-hdds/interface-client/src/main/proto/hdds.proto @@ -431,6 +431,7 @@ message SCMContainerReplicaProto { required int64 keyCount = 6; required int64 bytesUsed = 7; optional int64 replicaIndex = 8; + optional string dataChecksum = 9; } message KeyContainerIDList { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java index 7e163ac306f8..db00d6842d28 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java @@ -381,6 +381,7 @@ private void updateContainerReplica(final DatanodeDetails datanodeDetails, 
.setReplicaIndex(replicaProto.getReplicaIndex()) .setBytesUsed(replicaProto.getUsed()) .setEmpty(replicaProto.getIsEmpty()) + .setDataChecksum(replicaProto.getDataChecksum()) .build(); if (replica.getState().equals(State.DELETED)) { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java index 78ebfd311dd1..05afbb30e28a 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java @@ -27,6 +27,7 @@ import org.apache.commons.lang3.builder.CompareToBuilder; import org.apache.commons.lang3.builder.EqualsBuilder; import org.apache.commons.lang3.builder.HashCodeBuilder; +import org.apache.hadoop.ozone.container.common.interfaces.Container; /** * In-memory state of a container replica. @@ -43,6 +44,8 @@ public final class ContainerReplica implements Comparable { private final long keyCount; private final long bytesUsed; private final boolean isEmpty; + // TODO Use a dedicated checksum class for this if required later. + private final String dataChecksum; private ContainerReplica(ContainerReplicaBuilder b) { containerID = b.containerID; @@ -54,6 +57,7 @@ private ContainerReplica(ContainerReplicaBuilder b) { replicaIndex = b.replicaIndex; isEmpty = b.isEmpty; sequenceId = b.sequenceId; + dataChecksum = b.dataChecksum; } /** @@ -114,6 +118,10 @@ public boolean isEmpty() { return isEmpty; } + public String getDataChecksum() { + return dataChecksum; + } + @Override public int hashCode() { return new HashCodeBuilder(61, 71) @@ -201,6 +209,7 @@ public static class ContainerReplicaBuilder { private long keyCount; private int replicaIndex; private boolean isEmpty; + private String dataChecksum; /** * Set Container Id. 
@@ -275,6 +284,11 @@ public ContainerReplicaBuilder setEmpty(boolean empty) { return this; } + public ContainerReplicaBuilder setDataChecksum(String dataChecksum) { + this.dataChecksum = dataChecksum; + return this; + } + /** * Constructs new ContainerReplicaBuilder. * diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java index c7b0ece41eeb..72af28a2ab37 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java @@ -7,6 +7,7 @@ import org.apache.hadoop.hdds.server.events.EventPublisher; import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode; import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; +import org.apache.ratis.protocol.exceptions.NotLeaderException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -64,15 +65,18 @@ public void onMessage(ContainerID containerID, EventPublisher publisher) { LOG.info("Reconcile container event triggered for container {} with peers {}", containerID, replicas); - for (DatanodeDetails replica: replicas) { + for (DatanodeDetails replica : replicas) { List otherReplicas = replicas.stream() .filter(other -> !other.equals(replica)) .collect(Collectors.toList()); ReconcileContainerCommand command = new ReconcileContainerCommand(containerID.getId(), otherReplicas); + command.setTerm(scmContext.getTermOfLeader()); publisher.fireEvent(DATANODE_COMMAND, new CommandForDatanode<>(replica.getUuid(), command)); } } catch (ContainerNotFoundException ex) { LOG.error("Failed to start reconciliation for container {}. 
Container not found.", containerID); + } catch (NotLeaderException nle) { + LOG.info("Skip reconciling container {} since current SCM is not leader.", containerID); } } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java index a44536bf4463..dd68d6a713dc 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/protocol/StorageContainerLocationProtocolServerSideTranslatorPB.java @@ -30,6 +30,8 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos.TransferLeadershipResponseProto; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.UpgradeFinalizationStatus; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerRequestProto; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ReconcileContainerResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ActivatePipelineRequestProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ActivatePipelineResponseProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ClosePipelineRequestProto; @@ -722,6 +724,12 @@ public ScmContainerLocationResponse processRequest( .setStatus(Status.OK) .setGetMetricsResponse(getMetrics(request.getGetMetricsRequest())) .build(); + case ReconcileContainer: + return ScmContainerLocationResponse.newBuilder() + .setCmdType(request.getCmdType()) + .setStatus(Status.OK) + 
.setReconcileContainerResponse(reconcileContainer(request.getReconcileContainerRequest())) + .build(); default: throw new IllegalArgumentException( "Unknown command type: " + request.getCmdType()); @@ -1299,4 +1307,9 @@ public DecommissionScmResponseProto decommissionScm( public GetMetricsResponseProto getMetrics(GetMetricsRequestProto request) throws IOException { return GetMetricsResponseProto.newBuilder().setMetricsJson(impl.getMetrics(request.getQuery())).build(); } + + public ReconcileContainerResponseProto reconcileContainer(ReconcileContainerRequestProto request) throws IOException { + impl.reconcileContainer(request.getContainerID()); + return ReconcileContainerResponseProto.getDefaultInstance(); + } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index bbba1746c429..9b1784e9bf12 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -336,7 +336,9 @@ public List getContainerReplicas( .setPlaceOfBirth(r.getOriginDatanodeId().toString()) .setKeyCount(r.getKeyCount()) .setSequenceID(r.getSequenceId()) - .setReplicaIndex(r.getReplicaIndex()).build() + .setReplicaIndex(r.getReplicaIndex()) + .setDataChecksum(r.getDataChecksum()) + .build() ); } return results; diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeProtocolServer.java index 3d864d4ea212..98a7aa22f3e6 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeProtocolServer.java @@ 
-73,6 +73,7 @@ import org.apache.hadoop.ozone.protocol.commands.DeleteBlocksCommand; import org.apache.hadoop.ozone.protocol.commands.DeleteContainerCommand; import org.apache.hadoop.ozone.protocol.commands.FinalizeNewLayoutVersionCommand; +import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; import org.apache.hadoop.ozone.protocol.commands.RefreshVolumeUsageCommand; import org.apache.hadoop.ozone.protocol.commands.RegisteredCommand; import org.apache.hadoop.ozone.protocol.commands.ReplicateContainerCommand; @@ -94,6 +95,7 @@ import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type.deleteBlocksCommand; import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type.deleteContainerCommand; import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type.finalizeNewLayoutVersionCommand; +import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type.reconcileContainerCommand; import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type.reconstructECContainersCommand; import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type.refreshVolumeUsageInfo; import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type.replicateContainerCommand; @@ -407,6 +409,12 @@ public static SCMCommandProto getCommandResponse(SCMCommand cmd, .setRefreshVolumeUsageCommandProto( ((RefreshVolumeUsageCommand)cmd).getProto()) .build(); + case reconcileContainerCommand: + return builder + .setCommandType(reconcileContainerCommand) + .setReconcileContainerCommandProto( + ((ReconcileContainerCommand)cmd).getProto()) + .build(); default: throw new IllegalArgumentException("Scm command " + diff --git 
a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ContainerCommands.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ContainerCommands.java index ae273f1d1710..9f93c56f2db2 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ContainerCommands.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ContainerCommands.java @@ -24,7 +24,6 @@ import org.apache.hadoop.hdds.cli.OzoneAdmin; import org.apache.hadoop.hdds.cli.SubcommandWithParent; -import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; import org.kohsuke.MetaInfServices; import picocli.CommandLine.Command; import picocli.CommandLine.Model.CommandSpec; @@ -45,7 +44,7 @@ CloseSubcommand.class, ReportSubcommand.class, UpgradeSubcommand.class, - ReconcileContainerCommand.class + ReconcileSubcommand.class }) @MetaInfServices(SubcommandWithParent.class) public class ContainerCommands implements Callable, SubcommandWithParent { diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java index a4e556e373d5..343077b39013 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java @@ -44,7 +44,7 @@ public void execute(ScmClient scmClient) throws IOException { scmClient.reconcileContainer(containerId); System.out.println("Reconciliation has been triggered for container " + containerId); // TODO a better option to check status may be added later. 
- System.out.println("Use \"ozone admin container info " + containerId + "\" to check the hashes of each container" + + System.out.println("Use \"ozone admin container info " + containerId + "\" to check the hashes of each container " + "replica"); } } From d03622957441d094ca43757e5c7753151d8ec79f Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 9 Apr 2024 18:24:31 -0700 Subject: [PATCH 09/43] Improve error handling --- .../ReconcileContainerEventHandler.java | 17 +++++++++++++---- .../scm/server/SCMClientProtocolServer.java | 15 +++++++++++---- .../scm/cli/container/ReconcileSubcommand.java | 4 ++-- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java index 72af28a2ab37..79a32d5d8d62 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java @@ -1,7 +1,9 @@ package org.apache.hadoop.hdds.scm.container; +import org.apache.hadoop.hdds.client.ReplicationConfig; import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.scm.exceptions.SCMException; import org.apache.hadoop.hdds.scm.ha.SCMContext; import org.apache.hadoop.hdds.server.events.EventHandler; import org.apache.hadoop.hdds.server.events.EventPublisher; @@ -50,12 +52,19 @@ public void onMessage(ContainerID containerID, EventPublisher publisher) { return; } - // This restriction can be removed when reconciliation for EC containers is added. 
- final HddsProtos.ReplicationType repType = container.getReplicationType(); - if (repType == HddsProtos.ReplicationType.EC) { + // Reconcile on EC containers is not yet implemented. + ReplicationConfig repConfig = container.getReplicationConfig(); + HddsProtos.ReplicationType repType = repConfig.getReplicationType(); + if (repConfig.getReplicationType() != HddsProtos.ReplicationType.RATIS) { LOG.error("Cannot reconcile container {} with replication type {}. Reconciliation is currently only supported" + " for Ratis containers.", containerID, repType); - return; + } + + // Reconciliation requires multiple replicas to reconcile. + int requiredNodes = repConfig.getRequiredNodes(); + if (requiredNodes <= 1) { + LOG.error("Cannot reconcile container {} with {} required nodes. Reconciliation is only supported for " + + "containers with more than 1 required node.", containerID, requiredNodes); } Set replicas = containerManager.getContainerReplicas(containerID) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index 9b1784e9bf12..0b2d4f569378 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -1394,21 +1394,28 @@ public void reconcileContainer(long longContainerID) throws IOException { try { // May throw ContainerNotFoundException, which will be caught, audited, and returned to the user. - ContainerInfo container = getContainer(longContainerID); + ContainerInfo container = getScm().getContainerManager().getContainer(containerID); // Reconcile is not allowed on open containers. 
- final HddsProtos.LifeCycleState state = container.getState(); + HddsProtos.LifeCycleState state = container.getState(); if (state.equals(HddsProtos.LifeCycleState.OPEN)) { throw new SCMException("Cannot reconcile container in state " + state, ResultCodes.UNEXPECTED_CONTAINER_STATE); } + // Reconcile on EC containers is not yet implemented. - final HddsProtos.ReplicationType repType = container.getReplicationType(); - if (repType == HddsProtos.ReplicationType.EC) { + ReplicationConfig repConfig = container.getReplicationConfig(); + if (repConfig.getReplicationType() != HddsProtos.ReplicationType.RATIS) { throw new SCMException("Reconciliation is currently only supported for Ratis containers", ResultCodes.UNSUPPORTED_OPERATION); } + // Reconciliation requires multiple replicas to reconcile. + if (repConfig.getRequiredNodes() <= 1) { + throw new SCMException("Reconciliation is only supported for containers with more than one required node.", + ResultCodes.UNSUPPORTED_OPERATION); + } + scm.getEventQueue().fireEvent(SCMEvents.RECONCILE_CONTAINER, containerID); AUDIT.logWriteSuccess(buildAuditMessageForSuccess(SCMAction.RECONCILE_CONTAINER, auditMap)); } catch (SCMException ex) { diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java index 343077b39013..e6893013f956 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java @@ -44,7 +44,7 @@ public void execute(ScmClient scmClient) throws IOException { scmClient.reconcileContainer(containerId); System.out.println("Reconciliation has been triggered for container " + containerId); // TODO a better option to check status may be added later. 
- System.out.println("Use \"ozone admin container info " + containerId + "\" to check the hashes of each container " + - "replica"); + System.out.println("Use \"ozone admin container info --json " + containerId + "\" to check the hashes of each " + + "container replica"); } } From 9aecbcf2d5e95797e2954e2644bc62d0353f4723 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 12 Apr 2024 15:08:49 -0700 Subject: [PATCH 10/43] Add DN side unit tests --- .../ReconcileContainerCommandHandler.java | 2 +- .../common/TestKeyValueContainerData.java | 3 + .../common/statemachine/TestStateContext.java | 4 + .../TestReconcileContainerCommandHandler.java | 173 ++++++++++++++++++ .../endpoint/TestHeartbeatEndpointTask.java | 38 ++++ .../keyvalue/TestKeyValueHandler.java | 40 ++++ 6 files changed, 259 insertions(+), 1 deletion(-) create mode 100644 hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java index 7c8112b1c09e..8bf18576104d 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java @@ -45,7 +45,7 @@ public void handle(SCMCommand command, OzoneContainer container, StateContext co SCMConnectionManager connectionManager) { queuedCount.incrementAndGet(); CompletableFuture.runAsync(() -> { - queuedCount.incrementAndGet(); + invocationCount.incrementAndGet(); long startTime = Time.monotonicNow(); 
ReconcileContainerCommand reconcileCommand = (ReconcileContainerCommand) command; LOG.info("Processing reconcile container command for container {} with peers {}", diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestKeyValueContainerData.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestKeyValueContainerData.java index b3b0f5b43771..c06ec651c672 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestKeyValueContainerData.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestKeyValueContainerData.java @@ -84,6 +84,7 @@ public void testKeyValueData(ContainerTestVersionInfo versionInfo) { assertEquals(val.get(), kvData.getBlockCount()); assertEquals(val.get(), kvData.getNumPendingDeletionBlocks()); assertEquals(MAXSIZE, kvData.getMaxSize()); + assertEquals("", kvData.getDataChecksum()); kvData.setState(state); kvData.setContainerDBType(containerDBType); @@ -98,6 +99,7 @@ public void testKeyValueData(ContainerTestVersionInfo versionInfo) { kvData.incrPendingDeletionBlocks(1); kvData.setSchemaVersion( VersionedDatanodeFeatures.SchemaV3.chooseSchemaVersion(conf)); + kvData.setDataChecksum("1234"); assertEquals(state, kvData.getState()); assertEquals(containerDBType, kvData.getContainerDBType()); @@ -114,6 +116,7 @@ public void testKeyValueData(ContainerTestVersionInfo versionInfo) { assertEquals(datanodeId.toString(), kvData.getOriginNodeId()); assertEquals(VersionedDatanodeFeatures.SchemaV3.chooseSchemaVersion(conf), kvData.getSchemaVersion()); + assertEquals("1234", kvData.getDataChecksum()); KeyValueContainerData newKvData = new KeyValueContainerData(kvData); assertEquals(kvData.getReplicaIndex(), newKvData.getReplicaIndex()); diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/TestStateContext.java 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/TestStateContext.java index 7f2cdcc6e532..d337b3a5f25a 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/TestStateContext.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/TestStateContext.java @@ -65,6 +65,7 @@ import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer; import org.apache.hadoop.ozone.protocol.commands.CloseContainerCommand; import org.apache.hadoop.ozone.protocol.commands.ClosePipelineCommand; +import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; import org.apache.hadoop.ozone.protocol.commands.ReplicateContainerCommand; import org.apache.hadoop.ozone.protocol.commands.SCMCommand; import org.apache.ozone.test.LambdaTestUtils; @@ -707,6 +708,7 @@ public void testCommandQueueSummary() throws IOException { ctx.addCommand(ReplicateContainerCommand.forTest(3)); ctx.addCommand(new ClosePipelineCommand(PipelineID.randomId())); ctx.addCommand(new CloseContainerCommand(1, PipelineID.randomId())); + ctx.addCommand(new ReconcileContainerCommand(4, Collections.emptyList())); Map summary = ctx.getCommandQueueSummary(); assertEquals(3, @@ -715,6 +717,8 @@ public void testCommandQueueSummary() throws IOException { summary.get(SCMCommandProto.Type.closePipelineCommand).intValue()); assertEquals(1, summary.get(SCMCommandProto.Type.closeContainerCommand).intValue()); + assertEquals(1, + summary.get(SCMCommandProto.Type.reconcileContainerCommand).intValue()); } @Test diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java new file mode 100644 index 
000000000000..92f99ec3c754 --- /dev/null +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -0,0 +1,173 @@ +package org.apache.hadoop.ozone.container.common.statemachine.commandhandler; + +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto; +import org.apache.hadoop.hdds.scm.container.ContainerID; +import org.apache.hadoop.hdds.scm.pipeline.PipelineID; +import org.apache.hadoop.ozone.container.common.ContainerTestUtils; +import org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics; +import org.apache.hadoop.ozone.container.common.helpers.ContainerUtils; +import org.apache.hadoop.ozone.container.common.impl.ContainerLayoutVersion; +import org.apache.hadoop.ozone.container.common.impl.ContainerSet; +import org.apache.hadoop.ozone.container.common.interfaces.Container; +import org.apache.hadoop.ozone.container.common.interfaces.Handler; +import org.apache.hadoop.ozone.container.common.report.IncrementalReportSender; +import org.apache.hadoop.ozone.container.common.statemachine.StateContext; +import org.apache.hadoop.ozone.container.common.volume.VolumeSet; +import org.apache.hadoop.ozone.container.keyvalue.ContainerLayoutTestInfo; +import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer; +import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData; +import org.apache.hadoop.ozone.container.keyvalue.KeyValueHandler; +import org.apache.hadoop.ozone.container.ozoneimpl.ContainerController; +import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer; +import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; +import org.apache.ozone.test.GenericTestUtils; +import 
org.junit.jupiter.api.Assertions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + +import static java.util.Collections.singletonMap; +import static org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails; +import static org.apache.hadoop.ozone.OzoneConsts.GB; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class TestReconcileContainerCommandHandler { + public static final Logger LOG = LoggerFactory.getLogger(TestReconcileContainerCommandHandler.class); + + private static final long CONTAINER_ID = 123L; + + private OzoneContainer ozoneContainer; + private StateContext context; + private Container container; + private Handler containerHandler; + private ContainerController controller; + private ContainerSet containerSet; + private ReconcileContainerCommandHandler subject; + // Used to block ICR sending so that queue metrics can be checked before the reconcile task completes. + private CountDownLatch icrLatch; + + private ContainerLayoutVersion layoutVersion; + + // As data hashes are calculated during the test, they are written back here. 
+ private final Map containerReportsSent = new HashMap<>(); + + public void initLayoutVersion(ContainerLayoutVersion layout) + throws Exception { + this.layoutVersion = layout; + init(); + } + + private void init() throws Exception { + OzoneConfiguration conf = new OzoneConfiguration(); + DatanodeDetails dnDetails = randomDatanodeDetails(); + subject = new ReconcileContainerCommandHandler(""); + context = ContainerTestUtils.getMockContext(dnDetails, conf); + + KeyValueContainerData data = new KeyValueContainerData(CONTAINER_ID, layoutVersion, GB, + PipelineID.randomId().toString(), randomDatanodeDetails().getUuidString()); + container = new KeyValueContainer(data, conf); + containerSet = new ContainerSet(1000); + containerSet.addContainer(container); + + icrLatch = new CountDownLatch(1); + IncrementalReportSender icrSender = c -> { + try { + containerReportsSent.put(ContainerID.valueOf(c.getContainerData().getContainerID()), c.getContainerReport()); + + // Block the caller until the latch is counted down. + // Caller can check queue metrics in the meantime. + LOG.info("ICR sender waiting for latch"); + Assertions.assertTrue(icrLatch.await(30, TimeUnit.SECONDS)); + LOG.info("ICR sender proceeding after latch"); + // Reset the latch for the next iteration. + // This assumes requests are executed by a single thread reading the latch. 
+ icrLatch = new CountDownLatch(1); + } catch (Exception ex) { + LOG.error("ICR sender failed", ex); + } + }; + + containerHandler = new KeyValueHandler(new OzoneConfiguration(), dnDetails.getUuidString(), containerSet, + mock(VolumeSet.class), mock(ContainerMetrics.class), icrSender); + controller = new ContainerController(containerSet, + singletonMap(ContainerProtos.ContainerType.KeyValueContainer, containerHandler)); + ozoneContainer = mock(OzoneContainer.class); + when(ozoneContainer.getController()).thenReturn(controller); + when(ozoneContainer.getContainerSet()).thenReturn(containerSet); + } + + // TODO test is flaky on the second container layout run only. + @ContainerLayoutTestInfo.ContainerTest + public void testReconcileContainerCommandHandled(ContainerLayoutVersion layout) throws Exception { + initLayoutVersion(layout); + + ReconcileContainerCommand cmd = new ReconcileContainerCommand(CONTAINER_ID, Collections.emptyList()); + // Queue two commands for processing. + // Handler is blocked until we count down the ICR latch. + subject.handle(cmd, ozoneContainer, context, null); + subject.handle(cmd, ozoneContainer, context, null); + + // The first command was invoked when submitted, and is now blocked in the ICR sender. + // Since neither command has finished, they both count towards queue count. + Assertions.assertEquals(1, subject.getInvocationCount()); + Assertions.assertEquals(2, subject.getQueuedCount()); + Assertions.assertEquals(0, subject.getTotalRunTime()); + Assertions.assertEquals(0, subject.getAverageRunTime()); + + // Wait this long before unblocking the ICR sender. This is the lower bound on simulated execution time. + long minExecTimeMillis = 500; + Thread.sleep(minExecTimeMillis); + icrLatch.countDown(); + + // Decrementing queue count indicates the task completed. + waitForQueueCount(1); + // The other command is invoked but blocked in the ICR sender. 
+ Assertions.assertEquals(2, subject.getInvocationCount()); + long firstTotalRunTime = subject.getTotalRunTime(); + long firstAvgRunTime = subject.getAverageRunTime(); + Assertions.assertTrue(firstTotalRunTime >= minExecTimeMillis, + "Total run time " + firstTotalRunTime + "ms was not larger than min exec time " + minExecTimeMillis + "ms"); + + // Wait a little longer before firing the second command. + Thread.sleep(minExecTimeMillis + 100); + icrLatch.countDown(); + // Decrementing queue count indicates the task completed. + waitForQueueCount(0); + Assertions.assertEquals(2, subject.getInvocationCount()); + long secondTotalRunTime = subject.getTotalRunTime(); + long secondAvgRunTime = subject.getAverageRunTime(); + Assertions.assertTrue(secondTotalRunTime >= firstTotalRunTime + minExecTimeMillis); + Assertions.assertTrue(secondAvgRunTime >= minExecTimeMillis); + // We slept the thread a little longer on the second invocation, which should have increased the average run time + // from the first run. + Assertions.assertTrue(secondAvgRunTime >= firstAvgRunTime); + + verifyContainerReportsSent(); + } + + private void waitForQueueCount(int expectedQueueCount) throws Exception { + GenericTestUtils.waitFor(() -> { + int qCount = subject.getQueuedCount(); + LOG.info("Waiting for queued command count to reach " + expectedQueueCount + ". 
Currently at " + qCount); + return qCount == expectedQueueCount; + }, + 500, 3000); + } + + private void verifyContainerReportsSent() throws Exception { + for (ContainerID id: containerReportsSent.keySet()) { + String sentDataChecksum = containerReportsSent.get(id).getDataChecksum(); + String expectedDataChecksum = ContainerUtils.getChecksum(Long.toString(id.getId())); + Assertions.assertEquals(expectedDataChecksum, sentDataChecksum, "Checksum mismatch in report of container " + id); + } + } +} diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/states/endpoint/TestHeartbeatEndpointTask.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/states/endpoint/TestHeartbeatEndpointTask.java index 09fa8a991770..ea05ab2f79a4 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/states/endpoint/TestHeartbeatEndpointTask.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/states/endpoint/TestHeartbeatEndpointTask.java @@ -19,6 +19,7 @@ package org.apache.hadoop.ozone.container.common.states.endpoint; import static java.util.Collections.emptyList; +import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type.reconcileContainerCommand; import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type.reconstructECContainersCommand; import static org.apache.hadoop.hdds.upgrade.HDDSLayoutVersionManager.maxLayoutVersion; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -56,6 +57,7 @@ import org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine.DatanodeStates; import org.apache.hadoop.ozone.container.common.statemachine.EndpointStateMachine; import org.apache.hadoop.ozone.container.common.statemachine.StateContext; +import 
org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; import org.apache.hadoop.ozone.protocol.commands.ReconstructECContainersCommand; import org.apache.hadoop.ozone.protocolPB.StorageContainerDatanodeProtocolClientSideTranslatorPB; @@ -109,6 +111,42 @@ public void handlesReconstructContainerCommand() throws Exception { .get(reconstructECContainersCommand).intValue()); } + @Test + public void testHandlesReconcileContainerCommand() throws Exception { + StorageContainerDatanodeProtocolClientSideTranslatorPB scm = + mock(StorageContainerDatanodeProtocolClientSideTranslatorPB.class); + + List peerDNs = new ArrayList<>(); + peerDNs.add(MockDatanodeDetails.randomDatanodeDetails()); + peerDNs.add(MockDatanodeDetails.randomDatanodeDetails()); + ReconcileContainerCommand cmd = new ReconcileContainerCommand(1, peerDNs); + + when(scm.sendHeartbeat(any())) + .thenAnswer(invocation -> + SCMHeartbeatResponseProto.newBuilder() + .setDatanodeUUID( + ((SCMHeartbeatRequestProto)invocation.getArgument(0)) + .getDatanodeDetails().getUuid()) + .addCommands(SCMCommandProto.newBuilder() + .setCommandType(reconcileContainerCommand) + .setReconcileContainerCommandProto(cmd.getProto()) + .build()) + .build()); + + OzoneConfiguration conf = new OzoneConfiguration(); + DatanodeStateMachine datanodeStateMachine = mock(DatanodeStateMachine.class); + StateContext context = new StateContext(conf, DatanodeStates.RUNNING, + datanodeStateMachine, ""); + + // WHEN + HeartbeatEndpointTask task = getHeartbeatEndpointTask(conf, context, scm); + task.call(); + + // THEN + assertEquals(1, context.getCommandQueueSummary() + .get(reconcileContainerCommand).intValue()); + } + @Test public void testheartbeatWithoutReports() throws Exception { final long termInSCM = 42; diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java index b9c8feae16ce..c1b4269c1680 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java @@ -37,10 +37,13 @@ import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandRequestProto; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerType; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto; import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; +import org.apache.hadoop.hdds.scm.pipeline.PipelineID; import org.apache.hadoop.hdds.security.token.TokenVerifier; import org.apache.hadoop.ozone.container.common.ContainerTestUtils; import org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics; +import org.apache.hadoop.ozone.container.common.helpers.ContainerUtils; import org.apache.hadoop.ozone.container.common.impl.ContainerLayoutVersion; import org.apache.hadoop.ozone.container.common.impl.ContainerSet; import org.apache.hadoop.ozone.container.common.impl.HddsDispatcher; @@ -56,7 +59,9 @@ import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_DATANODE_VOLUME_CHOOSING_POLICY; import static org.apache.hadoop.hdds.HddsConfigKeys.OZONE_METADATA_DIRS; +import static org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.HDDS_DATANODE_DIR_KEY; +import static org.apache.hadoop.ozone.OzoneConsts.GB; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -64,6 
+69,8 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.mockito.Mockito.any; +import org.checkerframework.checker.units.qual.A; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Timeout; @@ -433,6 +440,39 @@ public void testDeleteContainer() throws IOException { } } + @ContainerLayoutTestInfo.ContainerTest + public void testReconcileContainer(ContainerLayoutVersion layoutVersion) throws Exception { + OzoneConfiguration conf = new OzoneConfiguration(); + + KeyValueContainerData data = new KeyValueContainerData(123L, layoutVersion, GB, + PipelineID.randomId().toString(), randomDatanodeDetails().getUuidString()); + + Container container = new KeyValueContainer(data, conf); + ContainerSet containerSet = new ContainerSet(1000); + containerSet.addContainer(container); + + // Allows checking the invocation count of the lambda. + AtomicInteger icrCount = new AtomicInteger(0); + KeyValueHandler keyValueHandler = new KeyValueHandler(conf, randomDatanodeDetails().getUuidString(), containerSet, + mock(MutableVolumeSet.class), mock(ContainerMetrics.class), c -> { + // Check that the ICR contains expected info about the container. + ContainerReplicaProto report = c.getContainerReport(); + long reportedID = report.getContainerID(); + Assertions.assertEquals(container.getContainerData().getContainerID(), reportedID); + + String reportDataChecksum = report.getDataChecksum(); + String expectedDataChecksum = ContainerUtils.getChecksum(Long.toString(reportedID)); + Assertions.assertEquals(expectedDataChecksum, reportDataChecksum, + "Checksum mismatch in report of container " + reportedID); + icrCount.incrementAndGet(); + }); + + Assertions.assertEquals(0, icrCount.get()); + // This should trigger container report validation in the ICR handler above. 
+ keyValueHandler.reconcileContainer(container, Collections.emptyList()); + Assertions.assertEquals(1, icrCount.get()); + } + private static ContainerCommandRequestProto createContainerRequest( String datanodeId, long containerID) { return ContainerCommandRequestProto.newBuilder() From 277580725e619a5cf05978f89c28d4925b8e9968 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 16 Apr 2024 15:16:37 -0700 Subject: [PATCH 11/43] Test with two containers --- .../container/TestContainerReportHandler.java | 142 +++++++++++++++++- 1 file changed, 141 insertions(+), 1 deletion(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java index 695c88d11a3c..1e7e9e42c0f3 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java @@ -56,16 +56,19 @@ import java.time.Clock; import java.time.ZoneId; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.concurrent.TimeoutException; import java.util.stream.Collectors; import java.util.stream.Stream; import static org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; import static org.mockito.Mockito.doAnswer; @@ -979,6 +982,137 @@ public void testStaleReplicaOfDeletedContainer() throws NodeNotFoundException, containerOne.containerID()).size()); } + @Test + public void testContainerDataChecksumUpdated() throws Exception { + final ContainerReportHandler reportHandler = new 
ContainerReportHandler(nodeManager, containerManager); + + // Create 3 datanodes for testing. + final Iterator nodeIterator = nodeManager.getNodes( + NodeStatus.inServiceHealthy()).iterator(); + final DatanodeDetails datanodeOne = nodeIterator.next(); + final DatanodeDetails datanodeTwo = nodeIterator.next(); + final DatanodeDetails datanodeThree = nodeIterator.next(); + + // Create two containers, and put one replica of each container on each datanode. + final ContainerInfo containerOne = getContainer(LifeCycleState.CLOSED); + final ContainerInfo containerTwo = getContainer(LifeCycleState.CLOSED); + final Set containerIDSet = Stream.of( + containerOne.containerID(), containerTwo.containerID()) + .collect(Collectors.toSet()); + + nodeManager.setContainers(datanodeOne, containerIDSet); + nodeManager.setContainers(datanodeTwo, containerIDSet); + nodeManager.setContainers(datanodeThree, containerIDSet); + + containerStateManager.addContainer(containerOne.getProtobuf()); + containerStateManager.addContainer(containerTwo.getProtobuf()); + + getReplicas(containerOne.containerID(), + ContainerReplicaProto.State.CLOSED, + datanodeOne, datanodeTwo, datanodeThree) + .forEach(r -> containerStateManager.updateContainerReplica( + containerOne.containerID(), r)); + + getReplicas(containerTwo.containerID(), + ContainerReplicaProto.State.CLOSED, + datanodeOne, datanodeTwo, datanodeThree) + .forEach(r -> containerStateManager.updateContainerReplica( + containerTwo.containerID(), r)); + + // Container manager should now be aware of 3 replicas of each container. + assertEquals(3, containerManager.getContainerReplicas( + containerOne.containerID()).size()); + assertEquals(3, containerManager.getContainerReplicas( + containerTwo.containerID()).size()); + + // Create a report from datanode one with a replica of container one and two. 
+ final ContainerReportsProto dn1ReportProto = getContainerReportsProto( + containerOne.containerID(), ContainerReplicaProto.State.CLOSED, + datanodeOne.getUuidString()).toBuilder() + .addReports(getReplica(containerTwo)).build(); + final ContainerReportFromDatanode dn1Report = + new ContainerReportFromDatanode(datanodeOne, dn1ReportProto); + // Create a report from datanode two with a replica of container one and two. + final ContainerReportsProto dn2ReportProto = getContainerReportsProto( + containerOne.containerID(), ContainerReplicaProto.State.CLOSED, + datanodeOne.getUuidString()).toBuilder() + .addReports(getReplica(containerTwo)).build(); + final ContainerReportFromDatanode dn2Report = + new ContainerReportFromDatanode(datanodeOne, dn1ReportProto); + + reportHandler.onMessage(dn1Report, publisher); + reportHandler.onMessage(dn2Report, publisher); + + // A container hash has not been added to the replicas yet. + boolean contOneDataChecksumsEmpty = containerManager.getContainerReplicas(containerOne.containerID()).stream() + .allMatch(r -> r.getDataChecksum().isEmpty()); + assertTrue(contOneDataChecksumsEmpty, "Replicas of container one should not yet have any data checksums."); + boolean contTwoDataChecksumsEmpty = containerManager.getContainerReplicas(containerTwo.containerID()).stream() + .allMatch(r -> r.getDataChecksum().isEmpty()); + assertTrue(contTwoDataChecksumsEmpty, "Replicas of container two should not yet have any data checksums."); + + // TODO Add a container data checksum to the reports to simulate the datanodes' container scanner filling in this + // value. 
+ final ContainerReportsProto dnOneReport = getContainerReportsProto( + containerTwo.containerID(), ContainerReplicaProto.State.CLOSED, + datanodeOne.getUuidString()); + final ContainerReportFromDatanode containerReportFromDatanode = + new ContainerReportFromDatanode(datanodeOne, dnOneReport); + reportHandler.onMessage(containerReportFromDatanode, publisher); + + // Check that all reports contain the expected data checksums, and that the values are unique across replicas + // since that is how they were initialized. + int numReportsChecked = 0; + for (ContainerID contID: containerIDSet) { + for (ContainerReplica replica : containerManager.getContainerReplicas(contID)) { + String expectedHash = createDifferentDataChecksumsForReplicas(contID, + replica.getDatanodeDetails().getUuidString()); + assertEquals(expectedHash, replica.getDataChecksum(), "Incorrect replica data hash in container report."); + numReportsChecked++; + } + } + // Should have checked reports from 3 datanodes with 2 replicas each. + assertEquals(6, numReportsChecked); + + // One the next round of container reports, datanodes will report that all hashes match. This simulates + // reconciliation running and finishing. + // TODO Add a container data checksum to the reports to simulate the datanodes' container scanner filling in this + // value. + final ContainerReportsProto dnOneReport = getContainerReportsProto( + containerTwo.containerID(), ContainerReplicaProto.State.CLOSED, + datanodeOne.getUuidString()); + final ContainerReportFromDatanode containerReportFromDatanode = + new ContainerReportFromDatanode(datanodeOne, dnOneReport); + reportHandler.onMessage(containerReportFromDatanode, publisher); + + // Check that all reports contain the expected data checksums, and that the values are unique across replicas + // since that is how they were initialized. 
+ numReportsChecked = 0; + for (ContainerID contID: containerIDSet) { + for (ContainerReplica replica : containerManager.getContainerReplicas(contID)) { + String expectedHash = createMatchingDataChecksumsForReplicas(contID); + assertEquals(expectedHash, replica.getDataChecksum(), "Incorrect replica data hash in container report."); + numReportsChecked++; + } + } + // Should have checked reports from 3 datanodes with 2 replicas each. + assertEquals(6, numReportsChecked); + } + + /** + * Generates a placeholder data checksum for testing that is specific to a container replica. + */ + private static String createDifferentDataChecksumsForReplicas(ContainerID containerID, String datanodeID) { + return Integer.toString((datanodeID + containerID).hashCode()); + } + + /** + * Generates a placeholder data checksum for testing that is specific to a container replica. + */ + private static String createMatchingDataChecksumsForReplicas(ContainerID containerID) { + return Integer.toString(Objects.hashCode(containerID)); + } + private ContainerReportFromDatanode getContainerReportFromDatanode( ContainerID containerId, ContainerReplicaProto.State state, DatanodeDetails dn, long bytesUsed, long keyCount) { @@ -1021,7 +1155,6 @@ protected static ContainerReportsProto getContainerReportsProto( .setContainerID(containerId.getId()) .setState(state) .setOriginNodeId(originNodeId) - .setFinalhash("e16cc9d6024365750ed8dbd194ea46d2") .setSize(5368709120L) .setUsed(usedBytes) .setKeyCount(keyCount) @@ -1036,4 +1169,11 @@ protected static ContainerReportsProto getContainerReportsProto( return crBuilder.addReports(replicaProto).build(); } + private ContainerReplicaProto getReplica(ContainerInfo cont) { + return ContainerReplicaProto.newBuilder() + .setContainerID(cont.containerID().getId()) + .setState(ContainerReplicaProto.State.CLOSED) + .build(); + } + } From 600174fd05828ba71576462fc77e5c5665cda462 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 16 Apr 2024 16:46:56 -0700 Subject: 
[PATCH 12/43] Add container report handler tests --- .../hdds/scm/container/ContainerReplica.java | 2 +- .../container/TestContainerReportHandler.java | 218 +++++++++--------- 2 files changed, 115 insertions(+), 105 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java index 05afbb30e28a..f1d13497d867 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java @@ -57,7 +57,7 @@ private ContainerReplica(ContainerReplicaBuilder b) { replicaIndex = b.replicaIndex; isEmpty = b.isEmpty; sequenceId = b.sequenceId; - dataChecksum = b.dataChecksum; + dataChecksum = Optional.ofNullable(b.dataChecksum).orElse(""); } /** diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java index 1e7e9e42c0f3..b2ba447bacc1 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java @@ -983,133 +983,151 @@ public void testStaleReplicaOfDeletedContainer() throws NodeNotFoundException, } @Test - public void testContainerDataChecksumUpdated() throws Exception { + public void testWithNoContainerDataChecksum() throws Exception { final ContainerReportHandler reportHandler = new ContainerReportHandler(nodeManager, containerManager); // Create 3 datanodes for testing. 
final Iterator nodeIterator = nodeManager.getNodes( NodeStatus.inServiceHealthy()).iterator(); - final DatanodeDetails datanodeOne = nodeIterator.next(); - final DatanodeDetails datanodeTwo = nodeIterator.next(); - final DatanodeDetails datanodeThree = nodeIterator.next(); - - // Create two containers, and put one replica of each container on each datanode. - final ContainerInfo containerOne = getContainer(LifeCycleState.CLOSED); - final ContainerInfo containerTwo = getContainer(LifeCycleState.CLOSED); - final Set containerIDSet = Stream.of( - containerOne.containerID(), containerTwo.containerID()) - .collect(Collectors.toSet()); + final DatanodeDetails dn1 = nodeIterator.next(); + final DatanodeDetails dn2 = nodeIterator.next(); + final DatanodeDetails dn3 = nodeIterator.next(); - nodeManager.setContainers(datanodeOne, containerIDSet); - nodeManager.setContainers(datanodeTwo, containerIDSet); - nodeManager.setContainers(datanodeThree, containerIDSet); + // Create a container and put one replica on each datanode. 
+ final ContainerInfo container = getContainer(LifeCycleState.CLOSED); + ContainerID contID = container.containerID(); + final Set containerIDSet = Stream.of(contID).collect(Collectors.toSet()); - containerStateManager.addContainer(containerOne.getProtobuf()); - containerStateManager.addContainer(containerTwo.getProtobuf()); + nodeManager.setContainers(dn1, containerIDSet); + nodeManager.setContainers(dn2, containerIDSet); + nodeManager.setContainers(dn3, containerIDSet); - getReplicas(containerOne.containerID(), - ContainerReplicaProto.State.CLOSED, - datanodeOne, datanodeTwo, datanodeThree) - .forEach(r -> containerStateManager.updateContainerReplica( - containerOne.containerID(), r)); + containerStateManager.addContainer(container.getProtobuf()); - getReplicas(containerTwo.containerID(), - ContainerReplicaProto.State.CLOSED, - datanodeOne, datanodeTwo, datanodeThree) - .forEach(r -> containerStateManager.updateContainerReplica( - containerTwo.containerID(), r)); + getReplicas(contID, ContainerReplicaProto.State.CLOSED, dn1, dn2, dn3) + .forEach(r -> containerStateManager.updateContainerReplica(contID, r)); // Container manager should now be aware of 3 replicas of each container. - assertEquals(3, containerManager.getContainerReplicas( - containerOne.containerID()).size()); - assertEquals(3, containerManager.getContainerReplicas( - containerTwo.containerID()).size()); - - // Create a report from datanode one with a replica of container one and two. - final ContainerReportsProto dn1ReportProto = getContainerReportsProto( - containerOne.containerID(), ContainerReplicaProto.State.CLOSED, - datanodeOne.getUuidString()).toBuilder() - .addReports(getReplica(containerTwo)).build(); - final ContainerReportFromDatanode dn1Report = - new ContainerReportFromDatanode(datanodeOne, dn1ReportProto); - // Create a report from datanode two with a replica of container one and two. 
- final ContainerReportsProto dn2ReportProto = getContainerReportsProto( - containerOne.containerID(), ContainerReplicaProto.State.CLOSED, - datanodeOne.getUuidString()).toBuilder() - .addReports(getReplica(containerTwo)).build(); - final ContainerReportFromDatanode dn2Report = - new ContainerReportFromDatanode(datanodeOne, dn1ReportProto); + assertEquals(3, containerManager.getContainerReplicas(contID).size()); - reportHandler.onMessage(dn1Report, publisher); - reportHandler.onMessage(dn2Report, publisher); - - // A container hash has not been added to the replicas yet. - boolean contOneDataChecksumsEmpty = containerManager.getContainerReplicas(containerOne.containerID()).stream() + // All replicas should start with an empty data checksum in SCM. + boolean contOneDataChecksumsEmpty = containerManager.getContainerReplicas(contID).stream() .allMatch(r -> r.getDataChecksum().isEmpty()); assertTrue(contOneDataChecksumsEmpty, "Replicas of container one should not yet have any data checksums."); - boolean contTwoDataChecksumsEmpty = containerManager.getContainerReplicas(containerTwo.containerID()).stream() + + // Send a container report to SCM from each datanode; none of them has a data checksum yet. + for (DatanodeDetails dn: Arrays.asList(dn1, dn2, dn3)) { + final ContainerReportsProto dnReportProto = getContainerReportsProto( + contID, ContainerReplicaProto.State.CLOSED, dn.getUuidString()); + final ContainerReportFromDatanode dnReport = new ContainerReportFromDatanode(dn, dnReportProto); + reportHandler.onMessage(dnReport, publisher); + } + + // Regardless of which datanode sent the report, none of them have checksums, so all replicas' data checksums + // should remain empty.
+ boolean containerDataChecksumEmpty = containerManager.getContainerReplicas(contID).stream() .allMatch(r -> r.getDataChecksum().isEmpty()); - assertTrue(contTwoDataChecksumsEmpty, "Replicas of container two should not yet have any data checksums."); + assertTrue(containerDataChecksumEmpty, "Replicas of the container should not have any data checksums."); + } - // TODO Add a container data checksum to the reports to simulate the datanodes' container scanner filling in this - // value. - final ContainerReportsProto dnOneReport = getContainerReportsProto( - containerTwo.containerID(), ContainerReplicaProto.State.CLOSED, - datanodeOne.getUuidString()); - final ContainerReportFromDatanode containerReportFromDatanode = - new ContainerReportFromDatanode(datanodeOne, dnOneReport); - reportHandler.onMessage(containerReportFromDatanode, publisher); + @Test + public void testWithContainerDataChecksum() throws Exception { + final ContainerReportHandler reportHandler = new ContainerReportHandler(nodeManager, containerManager); + + // Create 3 datanodes for testing. + final Iterator nodeIterator = nodeManager.getNodes( + NodeStatus.inServiceHealthy()).iterator(); + final DatanodeDetails dn1 = nodeIterator.next(); + final DatanodeDetails dn2 = nodeIterator.next(); + final DatanodeDetails dn3 = nodeIterator.next(); + + // Create a container and put one replica on each datanode. + final ContainerInfo container = getContainer(LifeCycleState.CLOSED); + ContainerID contID = container.containerID(); + final Set containerIDSet = Stream.of(container.containerID()).collect(Collectors.toSet()); + + nodeManager.setContainers(dn1, containerIDSet); + nodeManager.setContainers(dn2, containerIDSet); + nodeManager.setContainers(dn3, containerIDSet); + + containerStateManager.addContainer(container.getProtobuf()); - // Check that all reports contain the expected data checksums, and that the values are unique across replicas - // since that is how they were initialized. 
- int numReportsChecked = 0; - for (ContainerID contID: containerIDSet) { - for (ContainerReplica replica : containerManager.getContainerReplicas(contID)) { - String expectedHash = createDifferentDataChecksumsForReplicas(contID, - replica.getDatanodeDetails().getUuidString()); - assertEquals(expectedHash, replica.getDataChecksum(), "Incorrect replica data hash in container report."); - numReportsChecked++; - } + getReplicas(contID, ContainerReplicaProto.State.CLOSED, dn1, dn2, dn3) + .forEach(r -> containerStateManager.updateContainerReplica(contID, r)); + + // Container manager should now be aware of 3 replicas of each container. + assertEquals(3, containerManager.getContainerReplicas(contID).size()); + + // All replicas should start with an empty data checksum in SCM. + boolean dataChecksumsEmpty = containerManager.getContainerReplicas(contID).stream() + .allMatch(r -> r.getDataChecksum().isEmpty()); + assertTrue(dataChecksumsEmpty, "Replicas of container one should not yet have any data checksums."); + + // For each datanode, send a container report with a mismatched checksum. + for (DatanodeDetails dn: Arrays.asList(dn1, dn2, dn3)) { + ContainerReportsProto dnReportProto = getContainerReportsProto( + contID, ContainerReplicaProto.State.CLOSED, dn.getUuidString()); + ContainerReplicaProto replicaWithChecksum = dnReportProto.getReports(0).toBuilder() + .setDataChecksum(createUniqueDataChecksumForReplica(contID, dn.getUuidString())) + .build(); + ContainerReportsProto reportWithChecksum = dnReportProto.toBuilder() + .clearReports() + .addReports(replicaWithChecksum) + .build(); + final ContainerReportFromDatanode dnReport = new ContainerReportFromDatanode(dn, reportWithChecksum); + reportHandler.onMessage(dnReport, publisher); } - // Should have checked reports from 3 datanodes with 2 replicas each. - assertEquals(6, numReportsChecked); - - // One the next round of container reports, datanodes will report that all hashes match. 
This simulates - // reconciliation running and finishing. - // TODO Add a container data checksum to the reports to simulate the datanodes' container scanner filling in this - // value. - final ContainerReportsProto dnOneReport = getContainerReportsProto( - containerTwo.containerID(), ContainerReplicaProto.State.CLOSED, - datanodeOne.getUuidString()); - final ContainerReportFromDatanode containerReportFromDatanode = - new ContainerReportFromDatanode(datanodeOne, dnOneReport); - reportHandler.onMessage(containerReportFromDatanode, publisher); - // Check that all reports contain the expected data checksums, and that the values are unique across replicas - // since that is how they were initialized. - numReportsChecked = 0; - for (ContainerID contID: containerIDSet) { - for (ContainerReplica replica : containerManager.getContainerReplicas(contID)) { - String expectedHash = createMatchingDataChecksumsForReplicas(contID); - assertEquals(expectedHash, replica.getDataChecksum(), "Incorrect replica data hash in container report."); - numReportsChecked++; - } + // All the replicas should have different checksums. + // Since the containers don't have any data in this test, different checksums are based on container ID and + // datanode ID. + int numReplicasChecked = 0; + for (ContainerReplica replica: containerManager.getContainerReplicas(contID)) { + String expectedChecksum = createUniqueDataChecksumForReplica( + contID, replica.getDatanodeDetails().getUuidString()); + assertEquals(expectedChecksum, replica.getDataChecksum()); + numReplicasChecked++; } - // Should have checked reports from 3 datanodes with 2 replicas each. - assertEquals(6, numReportsChecked); + assertEquals(3, numReplicasChecked); + + // For each datanode, send a container report with a matching checksum. + // This simulates reconciliation running. 
+ for (DatanodeDetails dn: Arrays.asList(dn1, dn2, dn3)) { + ContainerReportsProto dnReportProto = getContainerReportsProto( + contID, ContainerReplicaProto.State.CLOSED, dn.getUuidString()); + ContainerReplicaProto replicaWithChecksum = dnReportProto.getReports(0).toBuilder() + .setDataChecksum(createMatchingDataChecksumForReplica(contID)) + .build(); + ContainerReportsProto reportWithChecksum = dnReportProto.toBuilder() + .clearReports() + .addReports(replicaWithChecksum) + .build(); + final ContainerReportFromDatanode dnReport = new ContainerReportFromDatanode(dn, reportWithChecksum); + reportHandler.onMessage(dnReport, publisher); + } + + // All the replicas should now have matching checksums. + // Since the containers don't have any data in this test, the matching checksums are based on container ID only. + numReplicasChecked = 0; + for (ContainerReplica replica: containerManager.getContainerReplicas(contID)) { + String expectedChecksum = createMatchingDataChecksumForReplica(contID); + assertEquals(expectedChecksum, replica.getDataChecksum()); + numReplicasChecked++; + } + assertEquals(3, numReplicasChecked); } /** * Generates a placeholder data checksum for testing that is specific to a container replica. */ - private static String createDifferentDataChecksumsForReplicas(ContainerID containerID, String datanodeID) { + private static String createUniqueDataChecksumForReplica(ContainerID containerID, String datanodeID) { return Integer.toString((datanodeID + containerID).hashCode()); } /** * Generates a placeholder data checksum for testing that is specific to a container replica. 
*/ - private static String createMatchingDataChecksumsForReplicas(ContainerID containerID) { + private static String createMatchingDataChecksumForReplica(ContainerID containerID) { return Integer.toString(Objects.hashCode(containerID)); } @@ -1168,12 +1186,4 @@ protected static ContainerReportsProto getContainerReportsProto( .build(); return crBuilder.addReports(replicaProto).build(); } - - private ContainerReplicaProto getReplica(ContainerInfo cont) { - return ContainerReplicaProto.newBuilder() - .setContainerID(cont.containerID().getId()) - .setState(ContainerReplicaProto.State.CLOSED) - .build(); - } - } From 6ec2acb160a12fa740e5a9504ae33547e2f1313d Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 16 Apr 2024 17:21:43 -0700 Subject: [PATCH 13/43] Add ICR tests, improve FCR tests --- .../container/TestContainerReportHandler.java | 59 ++++--- ...TestIncrementalContainerReportHandler.java | 146 +++++++++++++++++- 2 files changed, 174 insertions(+), 31 deletions(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java index b2ba447bacc1..f1aea0aa6cd0 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java @@ -986,29 +986,27 @@ public void testStaleReplicaOfDeletedContainer() throws NodeNotFoundException, public void testWithNoContainerDataChecksum() throws Exception { final ContainerReportHandler reportHandler = new ContainerReportHandler(nodeManager, containerManager); - // Create 3 datanodes for testing. 
- final Iterator nodeIterator = nodeManager.getNodes( - NodeStatus.inServiceHealthy()).iterator(); - final DatanodeDetails dn1 = nodeIterator.next(); - final DatanodeDetails dn2 = nodeIterator.next(); - final DatanodeDetails dn3 = nodeIterator.next(); + final int numNodes = 3; + List datanodes = nodeManager.getNodes(NodeStatus.inServiceHealthy()).stream() + .limit(numNodes) + .collect(Collectors.toList()); // Create a container and put one replica on each datanode. final ContainerInfo container = getContainer(LifeCycleState.CLOSED); ContainerID contID = container.containerID(); final Set containerIDSet = Stream.of(contID).collect(Collectors.toSet()); - nodeManager.setContainers(dn1, containerIDSet); - nodeManager.setContainers(dn2, containerIDSet); - nodeManager.setContainers(dn3, containerIDSet); + for (DatanodeDetails dn: datanodes) { + nodeManager.setContainers(dn, containerIDSet); + } containerStateManager.addContainer(container.getProtobuf()); - getReplicas(contID, ContainerReplicaProto.State.CLOSED, dn1, dn2, dn3) + getReplicas(contID, ContainerReplicaProto.State.CLOSED, 0, datanodes) .forEach(r -> containerStateManager.updateContainerReplica(contID, r)); // Container manager should now be aware of 3 replicas of each container. - assertEquals(3, containerManager.getContainerReplicas(contID).size()); + assertEquals(numNodes, containerManager.getContainerReplicas(contID).size()); // All replicas should start with an empty data checksum in SCM. boolean contOneDataChecksumsEmpty = containerManager.getContainerReplicas(contID).stream() @@ -1016,12 +1014,15 @@ public void testWithNoContainerDataChecksum() throws Exception { assertTrue(contOneDataChecksumsEmpty, "Replicas of container one should not yet have any data checksums."); // Send a report to SCM from one datanode that still does not have a data checksum. 
- for (DatanodeDetails dn: Arrays.asList(dn1, dn2, dn3)) { + int numReportsSent = 0; + for (DatanodeDetails dn: datanodes) { final ContainerReportsProto dnReportProto = getContainerReportsProto( contID, ContainerReplicaProto.State.CLOSED, dn.getUuidString()); final ContainerReportFromDatanode dnReport = new ContainerReportFromDatanode(dn, dnReportProto); reportHandler.onMessage(dnReport, publisher); + numReportsSent++; } + assertEquals(numNodes, numReportsSent); // Regardless of which datanode sent the report, none of them have checksums, so all replica's data checksums // should remain empty. @@ -1034,29 +1035,27 @@ public void testWithNoContainerDataChecksum() throws Exception { public void testWithContainerDataChecksum() throws Exception { final ContainerReportHandler reportHandler = new ContainerReportHandler(nodeManager, containerManager); - // Create 3 datanodes for testing. - final Iterator nodeIterator = nodeManager.getNodes( - NodeStatus.inServiceHealthy()).iterator(); - final DatanodeDetails dn1 = nodeIterator.next(); - final DatanodeDetails dn2 = nodeIterator.next(); - final DatanodeDetails dn3 = nodeIterator.next(); + final int numNodes = 3; + List datanodes = nodeManager.getNodes(NodeStatus.inServiceHealthy()).stream() + .limit(numNodes) + .collect(Collectors.toList()); // Create a container and put one replica on each datanode. 
final ContainerInfo container = getContainer(LifeCycleState.CLOSED); ContainerID contID = container.containerID(); - final Set containerIDSet = Stream.of(container.containerID()).collect(Collectors.toSet()); + final Set containerIDSet = Stream.of(contID).collect(Collectors.toSet()); - nodeManager.setContainers(dn1, containerIDSet); - nodeManager.setContainers(dn2, containerIDSet); - nodeManager.setContainers(dn3, containerIDSet); + for (DatanodeDetails dn: datanodes) { + nodeManager.setContainers(dn, containerIDSet); + } containerStateManager.addContainer(container.getProtobuf()); - getReplicas(contID, ContainerReplicaProto.State.CLOSED, dn1, dn2, dn3) + getReplicas(contID, ContainerReplicaProto.State.CLOSED, 0, datanodes) .forEach(r -> containerStateManager.updateContainerReplica(contID, r)); // Container manager should now be aware of 3 replicas of each container. - assertEquals(3, containerManager.getContainerReplicas(contID).size()); + assertEquals(numNodes, containerManager.getContainerReplicas(contID).size()); // All replicas should start with an empty data checksum in SCM. boolean dataChecksumsEmpty = containerManager.getContainerReplicas(contID).stream() @@ -1064,7 +1063,7 @@ public void testWithContainerDataChecksum() throws Exception { assertTrue(dataChecksumsEmpty, "Replicas of container one should not yet have any data checksums."); // For each datanode, send a container report with a mismatched checksum. 
- for (DatanodeDetails dn: Arrays.asList(dn1, dn2, dn3)) { + for (DatanodeDetails dn: datanodes) { ContainerReportsProto dnReportProto = getContainerReportsProto( contID, ContainerReplicaProto.State.CLOSED, dn.getUuidString()); ContainerReplicaProto replicaWithChecksum = dnReportProto.getReports(0).toBuilder() @@ -1088,11 +1087,11 @@ public void testWithContainerDataChecksum() throws Exception { assertEquals(expectedChecksum, replica.getDataChecksum()); numReplicasChecked++; } - assertEquals(3, numReplicasChecked); + assertEquals(numNodes, numReplicasChecked); // For each datanode, send a container report with a matching checksum. // This simulates reconciliation running. - for (DatanodeDetails dn: Arrays.asList(dn1, dn2, dn3)) { + for (DatanodeDetails dn: datanodes) { ContainerReportsProto dnReportProto = getContainerReportsProto( contID, ContainerReplicaProto.State.CLOSED, dn.getUuidString()); ContainerReplicaProto replicaWithChecksum = dnReportProto.getReports(0).toBuilder() @@ -1114,20 +1113,20 @@ public void testWithContainerDataChecksum() throws Exception { assertEquals(expectedChecksum, replica.getDataChecksum()); numReplicasChecked++; } - assertEquals(3, numReplicasChecked); + assertEquals(numNodes, numReplicasChecked); } /** * Generates a placeholder data checksum for testing that is specific to a container replica. */ - private static String createUniqueDataChecksumForReplica(ContainerID containerID, String datanodeID) { + protected static String createUniqueDataChecksumForReplica(ContainerID containerID, String datanodeID) { return Integer.toString((datanodeID + containerID).hashCode()); } /** * Generates a placeholder data checksum for testing that is specific to a container replica. 
*/ - private static String createMatchingDataChecksumForReplica(ContainerID containerID) { + protected static String createMatchingDataChecksumForReplica(ContainerID containerID) { return Integer.toString(Objects.hashCode(containerID)); } diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java index dbcccce598c9..10eb1762d54b 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java @@ -38,6 +38,7 @@ import org.apache.hadoop.hdds.scm.net.NetworkTopology; import org.apache.hadoop.hdds.scm.net.NetworkTopologyImpl; import org.apache.hadoop.hdds.scm.node.NodeManager; +import org.apache.hadoop.hdds.scm.node.NodeStatus; import org.apache.hadoop.hdds.scm.node.SCMNodeManager; import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; import org.apache.hadoop.hdds.scm.pipeline.MockPipelineManager; @@ -66,6 +67,8 @@ import java.time.Clock; import java.time.ZoneId; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; @@ -76,6 +79,7 @@ import java.util.concurrent.TimeoutException; import java.util.stream.Collectors; import java.util.stream.IntStream; +import java.util.stream.Stream; import static org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails; import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.CLOSED; @@ -83,6 +87,9 @@ import static org.apache.hadoop.hdds.scm.HddsTestUtils.getContainer; import static org.apache.hadoop.hdds.scm.HddsTestUtils.getECContainer; import static 
org.apache.hadoop.hdds.scm.HddsTestUtils.getReplicas; +import static org.apache.hadoop.hdds.scm.container.TestContainerReportHandler.createMatchingDataChecksumForReplica; +import static org.apache.hadoop.hdds.scm.container.TestContainerReportHandler.createUniqueDataChecksumForReplica; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; import static org.mockito.Mockito.doAnswer; @@ -576,6 +583,144 @@ public void testICRFCRRace() throws IOException, NodeNotFoundException, } } + @Test + public void testWithNoContainerDataChecksum() throws Exception { + final IncrementalContainerReportHandler reportHandler = new IncrementalContainerReportHandler(nodeManager, + containerManager, scmContext); + + final int numNodes = 3; + + // Create a container which will have one replica on each datanode. + final ContainerInfo container = getContainer(LifeCycleState.CLOSED); + ContainerID contID = container.containerID(); + final Set containerIDSet = Stream.of(contID).collect(Collectors.toSet()); + + List datanodes = new ArrayList<>(); + for (int i = 0; i < numNodes; i++) { + DatanodeDetails dn = randomDatanodeDetails(); + nodeManager.register(dn, null, null); + nodeManager.setContainers(dn, containerIDSet); + datanodes.add(dn); + } + + containerStateManager.addContainer(container.getProtobuf()); + + getReplicas(contID, ContainerReplicaProto.State.CLOSED, 0, datanodes) + .forEach(r -> containerStateManager.updateContainerReplica(contID, r)); + + // Container manager should now be aware of 3 replicas of each container. + assertEquals(numNodes, containerManager.getContainerReplicas(contID).size()); + + // All replicas should start with an empty data checksum in SCM. 
+ boolean contOneDataChecksumsEmpty = containerManager.getContainerReplicas(contID).stream()
+ .allMatch(r -> r.getDataChecksum().isEmpty());
+ assertTrue(contOneDataChecksumsEmpty, "Replicas of container one should not yet have any data checksums.");
+
+ // Send a report to SCM from each datanode; none of the reports carry a data checksum yet.
+ for (DatanodeDetails dn: datanodes) {
+ final IncrementalContainerReportProto dnReportProto = getIncrementalContainerReportProto(
+ contID, ContainerReplicaProto.State.CLOSED, dn.getUuidString());
+ final IncrementalContainerReportFromDatanode dnReport = new IncrementalContainerReportFromDatanode(dn,
+ dnReportProto);
+ reportHandler.onMessage(dnReport, publisher);
+ }
+
+ // Regardless of which datanode sent the report, none of them have checksums, so all replicas' data checksums
+ // should remain empty.
+ boolean containerDataChecksumEmpty = containerManager.getContainerReplicas(contID).stream()
+ .allMatch(r -> r.getDataChecksum().isEmpty());
+ assertTrue(containerDataChecksumEmpty, "Replicas of the container should not have any data checksums.");
+ }
+
+ @Test
+ public void testWithContainerDataChecksum() throws Exception {
+ final IncrementalContainerReportHandler reportHandler = new IncrementalContainerReportHandler(nodeManager,
+ containerManager, scmContext);
+
+ final int numNodes = 3;
+
+ // Create a container which will have one replica on each datanode. 
+ final ContainerInfo container = getContainer(LifeCycleState.CLOSED);
+ ContainerID contID = container.containerID();
+ final Set containerIDSet = Stream.of(contID).collect(Collectors.toSet());
+
+ List datanodes = new ArrayList<>();
+ for (int i = 0; i < numNodes; i++) {
+ DatanodeDetails dn = randomDatanodeDetails();
+ nodeManager.register(dn, null, null);
+ nodeManager.setContainers(dn, containerIDSet);
+ datanodes.add(dn);
+ }
+
+ containerStateManager.addContainer(container.getProtobuf());
+
+ getReplicas(contID, ContainerReplicaProto.State.CLOSED, 0, datanodes)
+ .forEach(r -> containerStateManager.updateContainerReplica(contID, r));
+
+ // Container manager should now be aware of numNodes replicas of the container.
+ assertEquals(numNodes, containerManager.getContainerReplicas(contID).size());
+
+ // All replicas should start with an empty data checksum in SCM.
+ boolean dataChecksumsEmpty = containerManager.getContainerReplicas(contID).stream()
+ .allMatch(r -> r.getDataChecksum().isEmpty());
+ assertTrue(dataChecksumsEmpty, "Replicas of container one should not yet have any data checksums.");
+
+ // For each datanode, send a container report with a mismatched checksum.
+ for (DatanodeDetails dn: datanodes) {
+ IncrementalContainerReportProto dnReportProto = getIncrementalContainerReportProto(
+ contID, ContainerReplicaProto.State.CLOSED, dn.getUuidString());
+ ContainerReplicaProto replicaWithChecksum = dnReportProto.getReport(0).toBuilder()
+ .setDataChecksum(createUniqueDataChecksumForReplica(contID, dn.getUuidString()))
+ .build();
+ IncrementalContainerReportProto reportWithChecksum = dnReportProto.toBuilder()
+ .clearReport()
+ .addReport(replicaWithChecksum)
+ .build();
+ final IncrementalContainerReportFromDatanode dnReport = new IncrementalContainerReportFromDatanode(dn,
+ reportWithChecksum);
+ reportHandler.onMessage(dnReport, publisher);
+ }
+
+ // All the replicas should have different checksums. 
+ // Since the containers don't have any data in this test, different checksums are based on container ID and + // datanode ID. + int numReplicasChecked = 0; + for (ContainerReplica replica: containerManager.getContainerReplicas(contID)) { + String expectedChecksum = createUniqueDataChecksumForReplica( + contID, replica.getDatanodeDetails().getUuidString()); + assertEquals(expectedChecksum, replica.getDataChecksum()); + numReplicasChecked++; + } + assertEquals(numNodes, numReplicasChecked); + + // For each datanode, send a container report with a matching checksum. + // This simulates reconciliation running. + for (DatanodeDetails dn: datanodes) { + IncrementalContainerReportProto dnReportProto = getIncrementalContainerReportProto( + contID, ContainerReplicaProto.State.CLOSED, dn.getUuidString()); + ContainerReplicaProto replicaWithChecksum = dnReportProto.getReport(0).toBuilder() + .setDataChecksum(createMatchingDataChecksumForReplica(contID)) + .build(); + IncrementalContainerReportProto reportWithChecksum = dnReportProto.toBuilder() + .clearReport() + .addReport(replicaWithChecksum) + .build(); + IncrementalContainerReportFromDatanode dnReport = new IncrementalContainerReportFromDatanode(dn, + reportWithChecksum); + reportHandler.onMessage(dnReport, publisher); + } + + // All the replicas should now have matching checksums. + // Since the containers don't have any data in this test, the matching checksums are based on container ID only. 
+ numReplicasChecked = 0; + for (ContainerReplica replica: containerManager.getContainerReplicas(contID)) { + String expectedChecksum = createMatchingDataChecksumForReplica(contID); + assertEquals(expectedChecksum, replica.getDataChecksum()); + numReplicasChecked++; + } + assertEquals(numNodes, numReplicasChecked); + } + private static IncrementalContainerReportProto getIncrementalContainerReportProto(ContainerReplicaProto replicaProto) { final IncrementalContainerReportProto.Builder crBuilder = @@ -595,7 +740,6 @@ public void testICRFCRRace() throws IOException, NodeNotFoundException, .setContainerID(containerId.getId()) .setState(state) .setOriginNodeId(originNodeId) - .setFinalhash("e16cc9d6024365750ed8dbd194ea46d2") .setSize(5368709120L) .setUsed(2000000000L) .setKeyCount(100000000L) From 4841c8f3b619f32f3b5321dfbe9c87cac2f42539 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 17 Apr 2024 17:39:37 -0700 Subject: [PATCH 14/43] Remove duplicate line from report handler test --- .../hadoop/hdds/scm/container/TestContainerReportHandler.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java index f1aea0aa6cd0..41304fa28b2b 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java @@ -103,7 +103,6 @@ void setup() throws IOException, InvalidStateTransitionException { dbStore = DBStoreBuilder.createDBStore( conf, new SCMDBDefinition()); scmhaManager = SCMHAManagerStub.getInstance(true); - nodeManager = new MockNodeManager(true, 10); pipelineManager = new MockPipelineManager(dbStore, scmhaManager, nodeManager); containerStateManager = ContainerStateManagerImpl.newBuilder() From 
8e7e8f73b48e25c5d2efba4f31fd79b9780f7304 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 17 Apr 2024 17:39:52 -0700 Subject: [PATCH 15/43] Add (currently failing) test for scm event handler Some request blocking based on container states is not yet implemented. --- .../TestReconcileContainerEventHandler.java | 246 ++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReconcileContainerEventHandler.java diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReconcileContainerEventHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReconcileContainerEventHandler.java new file mode 100644 index 000000000000..122a31eaa340 --- /dev/null +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReconcileContainerEventHandler.java @@ -0,0 +1,246 @@ +package org.apache.hadoop.hdds.scm.container; + +import org.apache.commons.lang3.stream.Streams; +import org.apache.hadoop.hdds.client.ECReplicationConfig; +import org.apache.hadoop.hdds.client.RatisReplicationConfig; +import org.apache.hadoop.hdds.client.ReplicationConfig; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ReconcileContainerCommandProto; +import org.apache.hadoop.hdds.scm.HddsTestUtils; +import org.apache.hadoop.hdds.scm.ha.SCMContext; +import org.apache.hadoop.hdds.server.events.EventPublisher; +import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode; +import org.apache.hadoop.ozone.protocol.commands.SCMCommand; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import 
org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.mockito.ArgumentCaptor; +import org.mockito.Captor; +import org.mockito.MockitoAnnotations; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; + +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor.ONE; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor.THREE; +import static org.apache.hadoop.hdds.scm.events.SCMEvents.DATANODE_COMMAND; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class TestReconcileContainerEventHandler { + private ContainerManager containerManager; + private EventPublisher eventPublisher; + private ReconcileContainerEventHandler eventHandler; + private SCMContext scmContext; + + private static final ContainerID CONTAINER_ID = ContainerID.valueOf(123L); + private static final long LEADER_TERM = 3L; + + private static final ReplicationConfig RATIS_THREE_REP = RatisReplicationConfig.getInstance(THREE); + private static final ReplicationConfig RATIS_ONE_REP = RatisReplicationConfig.getInstance(ONE); + private static final ReplicationConfig EC_REP = new ECReplicationConfig(3, 2); + + @Captor + private ArgumentCaptor> commandCaptor; + + @BeforeEach + public void setup() throws Exception { + // TODO for command captor? 
+ MockitoAnnotations.initMocks(this); + containerManager = mock(ContainerManager.class); + scmContext = mock(SCMContext.class); + when(scmContext.isLeader()).thenReturn(true); + when(scmContext.getTermOfLeader()).thenReturn(LEADER_TERM); + eventPublisher = mock(EventPublisher.class); + eventHandler = new ReconcileContainerEventHandler(containerManager, scmContext); + } + + /** + * EC containers are not yet supported for reconciliation. + */ + @Test + public void testReconcileECContainer() throws Exception { + addContainer(CONTAINER_ID, EC_REP, LifeCycleState.CLOSED); + addReplicasToContainer(CONTAINER_ID, 5); + eventHandler.onMessage(CONTAINER_ID, eventPublisher); + verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), any()); + } + + /** + * Ratis 1 containers are not currently supported for reconciliation. + */ + @Test + public void testReconcileRatisOneContainer() throws Exception { + addContainer(CONTAINER_ID, RATIS_ONE_REP, LifeCycleState.CLOSED); + addReplicasToContainer(CONTAINER_ID, 1); + eventHandler.onMessage(CONTAINER_ID, eventPublisher); + verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), any()); + } + + @Test + public void testReconcileWhenNotLeader() throws Exception { + addContainer(CONTAINER_ID, RATIS_THREE_REP, LifeCycleState.CLOSED); + addReplicasToContainer(CONTAINER_ID, 3); + when(scmContext.isLeader()).thenReturn(false); + eventHandler.onMessage(CONTAINER_ID, eventPublisher); + verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), any()); + } + + @Test + public void testReconcileNonexistentContainer() throws Exception { + // The step of adding the container to the mocked ContainerManager is intentionally skipped to simulate a + // nonexistent container. + // No exceptions should be thrown out of this test method when this happens. If they are, they will be propagated + // and the test will fail. 
+ when(containerManager.getContainer(any())).thenThrow(new ContainerNotFoundException());
+ eventHandler.onMessage(CONTAINER_ID, eventPublisher);
+ verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), any());
+ }
+
+ @Test
+ public void testReconcileMissingContainer() throws Exception {
+ addContainer(CONTAINER_ID, RATIS_THREE_REP, LifeCycleState.CLOSED);
+ assertTrue(containerManager.getContainerReplicas(CONTAINER_ID).isEmpty(),
+ "Expected no replicas for this container");
+ eventHandler.onMessage(CONTAINER_ID, eventPublisher);
+ verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), any());
+ }
+
+ @ParameterizedTest
+ @EnumSource(LifeCycleState.class)
+ public void testReconcileWithContainerStates(LifeCycleState state) throws Exception {
+ addContainer(CONTAINER_ID, RATIS_THREE_REP, state);
+ addReplicasToContainer(CONTAINER_ID, 3);
+ eventHandler.onMessage(CONTAINER_ID, eventPublisher);
+ switch (state) {
+ case OPEN:
+ case CLOSING:
+ case DELETING:
+ case DELETED:
+ case RECOVERING:
+ verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), commandCaptor.capture());
+ break;
+ default:
+ verify(eventPublisher, times(3)).fireEvent(eq(DATANODE_COMMAND), commandCaptor.capture());
+ break;
+ }
+
+
+ // NOTE(review): dropped a redundant duplicate check whose else branch passed
+ // ArgumentMatchers.any() where a Mockito VerificationMode is required (fails at runtime).
+ if (state == LifeCycleState.OPEN || state == LifeCycleState.CLOSING) {
+ verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), any());
+ }
+ }
+
+ @Test
+ public void testReconcileSentToAllPeers() throws Exception {
+ addContainer(CONTAINER_ID, RATIS_THREE_REP, LifeCycleState.CLOSED);
+ Set replicas = addReplicasToContainer(CONTAINER_ID, 3);
+ Set allNodeIDs = replicas.stream()
+ .map(r -> r.getDatanodeDetails().getUuid())
+ .collect(Collectors.toSet());
+
+ eventHandler.onMessage(CONTAINER_ID, eventPublisher);
+ assertEquals(3, replicas.size());
+ assertEquals(allNodeIDs.size(), replicas.size());
+ verify(eventPublisher, 
times(replicas.size())).fireEvent(eq(DATANODE_COMMAND), commandCaptor.capture()); + + // Check each reconcile command sent for correctness. + Set nodesReceivingCommands = new HashSet<>(); + for (CommandForDatanode dnCommand: commandCaptor.getAllValues()) { + SCMCommand reconcileCommand = dnCommand.getCommand(); + ReconcileContainerCommandProto reconcileProto = reconcileCommand.getProto(); + // All commands should use the latest term of SCM so the datanode does not drop them. + assertEquals(LEADER_TERM, reconcileCommand.getTerm()); + // All commands should have the same container ID. + assertEquals(CONTAINER_ID, ContainerID.valueOf(reconcileProto.getContainerID())); + // Container ID is also used as the command's identifier. + assertEquals(CONTAINER_ID, ContainerID.valueOf(reconcileCommand.getId())); + + // Every node should receive exactly one reconcile command. + UUID targetNodeID = dnCommand.getDatanodeId(); + assertTrue(nodesReceivingCommands.add(targetNodeID), "Duplicate reconcile command sent to datanode."); + // All commands should have correctly constructed peer lists that exclude the node receiving the command. + Set expectedPeerIDs = allNodeIDs.stream() + .filter(id -> id != targetNodeID) + .collect(Collectors.toSet()); + Set actualPeerIDs = reconcileProto.getPeersList().stream() + .map(dn -> UUID.fromString(dn.getUuid())) + .collect(Collectors.toSet()); + assertEquals(replicas.size() - 1, actualPeerIDs.size()); + assertEquals(expectedPeerIDs, actualPeerIDs); + } + + assertEquals(allNodeIDs, nodesReceivingCommands); + } + + @ParameterizedTest + @EnumSource(State.class) + public void testReconcileFailsWithOpenReplicas(State replicaState) throws Exception { + // Overall container state is eligible for reconciliation, but some replicas may not be. + // This means the container will not be considered eligible. + addContainer(CONTAINER_ID, RATIS_THREE_REP, LifeCycleState.CLOSED); + // Only one replica is in a different state. 
+ addReplicasToContainer(CONTAINER_ID, replicaState, State.CLOSED, State.CLOSED); + eventHandler.onMessage(CONTAINER_ID, eventPublisher); + switch (replicaState) { + case OPEN: + case INVALID: + case DELETED: + case CLOSING: + verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), commandCaptor.capture()); + break; + default: + verify(eventPublisher, times(3)).fireEvent(eq(DATANODE_COMMAND), commandCaptor.capture()); + break; + } + } + + private ContainerInfo addContainer(ContainerID id, ReplicationConfig repConfig, LifeCycleState state) throws Exception { + ContainerInfo container = new ContainerInfo.Builder() + .setContainerID(id.getId()) +// .setOwner("Ozone") +// .setPipelineID(pipelineID) + .setReplicationConfig(repConfig) + .setState(state) + .build(); + when(containerManager.getContainer(id)).thenReturn(container); + return container; + } + + private Set addReplicasToContainer(ContainerID id, int count) throws Exception { + State[] replicaStates = new State[count]; + Arrays.fill(replicaStates, State.CLOSED); + return addReplicasToContainer(id, replicaStates); + } + + private Set addReplicasToContainer(ContainerID id, State... replicaStates) throws Exception { + // Add one container replica for each replica state specified. + // If no states are specified, replica list will be empty. 
+ Set replicas = new HashSet<>(); + try (MockNodeManager nodeManager = new MockNodeManager(true, replicaStates.length)) { + List nodes = nodeManager.getAllNodes(); + for (int i = 0; i < replicaStates.length; i++) { + replicas.addAll(HddsTestUtils.getReplicas(id, replicaStates[i], nodes.get(i))); + } + } + when(containerManager.getContainerReplicas(id)).thenReturn(replicas); + + return replicas; + } +} From 74bb00a987fba4137ad282fe2730bb3482874e19 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Thu, 18 Apr 2024 17:16:15 -0700 Subject: [PATCH 16/43] Refactor container eligibility, SCM event handler tests pass Also move some reconciliation specific classes to their own package. --- .../ReconcileContainerEventHandler.java | 91 ----------- .../ReconcileContainerEventHandler.java | 101 +++++++++++++ .../ReconciliationEligibilityHandler.java | 141 ++++++++++++++++++ .../reconciliation/package-info.java | 21 +++ .../scm/server/SCMClientProtocolServer.java | 41 +++-- .../scm/server/StorageContainerManager.java | 2 +- .../TestReconcileContainerEventHandler.java | 122 ++++++++++----- 7 files changed, 368 insertions(+), 151 deletions(-) delete mode 100644 hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java create mode 100644 hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconcileContainerEventHandler.java create mode 100644 hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconciliationEligibilityHandler.java create mode 100644 hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/package-info.java rename hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/{ => reconciliation}/TestReconcileContainerEventHandler.java (66%) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java deleted file mode 100644 index 79a32d5d8d62..000000000000 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReconcileContainerEventHandler.java +++ /dev/null @@ -1,91 +0,0 @@ -package org.apache.hadoop.hdds.scm.container; - -import org.apache.hadoop.hdds.client.ReplicationConfig; -import org.apache.hadoop.hdds.protocol.DatanodeDetails; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos; -import org.apache.hadoop.hdds.scm.exceptions.SCMException; -import org.apache.hadoop.hdds.scm.ha.SCMContext; -import org.apache.hadoop.hdds.server.events.EventHandler; -import org.apache.hadoop.hdds.server.events.EventPublisher; -import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode; -import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; -import org.apache.ratis.protocol.exceptions.NotLeaderException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -import static org.apache.hadoop.hdds.scm.events.SCMEvents.DATANODE_COMMAND; - -/** - * When a reconcile container event is fired, this class will check if the container is eligible for reconciliation, - * and if so, send the reconcile request to all datanodes with a replica of that container. 
- */ -public class ReconcileContainerEventHandler implements EventHandler { - public static final Logger LOG = - LoggerFactory.getLogger(ReconcileContainerEventHandler.class); - - private ContainerManager containerManager; - private SCMContext scmContext; - - public ReconcileContainerEventHandler( - final ContainerManager containerManager, - final SCMContext scmContext) { - this.containerManager = containerManager; - this.scmContext = scmContext; - } - - @Override - public void onMessage(ContainerID containerID, EventPublisher publisher) { - if (!scmContext.isLeader()) { - LOG.info("Skip reconciling container {} since current SCM is not leader.", containerID); - return; - } - - try { - ContainerInfo container = containerManager.getContainer(containerID); - final HddsProtos.LifeCycleState state = container.getState(); - if (state.equals(HddsProtos.LifeCycleState.OPEN)) { - LOG.error("Cannot reconcile container in state {}.", state); - return; - } - - // Reconcile on EC containers is not yet implemented. - ReplicationConfig repConfig = container.getReplicationConfig(); - HddsProtos.ReplicationType repType = repConfig.getReplicationType(); - if (repConfig.getReplicationType() != HddsProtos.ReplicationType.RATIS) { - LOG.error("Cannot reconcile container {} with replication type {}. Reconciliation is currently only supported" + - " for Ratis containers.", containerID, repType); - } - - // Reconciliation requires multiple replicas to reconcile. - int requiredNodes = repConfig.getRequiredNodes(); - if (requiredNodes <= 1) { - LOG.error("Cannot reconcile container {} with {} required nodes. 
Reconciliation is only supported for " + - "containers with more than 1 required node.", containerID, requiredNodes); - } - - Set replicas = containerManager.getContainerReplicas(containerID) - .stream() - .map(ContainerReplica::getDatanodeDetails) - .collect(Collectors.toSet()); - - LOG.info("Reconcile container event triggered for container {} with peers {}", containerID, replicas); - - for (DatanodeDetails replica : replicas) { - List otherReplicas = replicas.stream() - .filter(other -> !other.equals(replica)) - .collect(Collectors.toList()); - ReconcileContainerCommand command = new ReconcileContainerCommand(containerID.getId(), otherReplicas); - command.setTerm(scmContext.getTermOfLeader()); - publisher.fireEvent(DATANODE_COMMAND, new CommandForDatanode<>(replica.getUuid(), command)); - } - } catch (ContainerNotFoundException ex) { - LOG.error("Failed to start reconciliation for container {}. Container not found.", containerID); - } catch (NotLeaderException nle) { - LOG.info("Skip reconciling container {} since current SCM is not leader.", containerID); - } - } -} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconcileContainerEventHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconcileContainerEventHandler.java new file mode 100644 index 000000000000..adec444b2a53 --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconcileContainerEventHandler.java @@ -0,0 +1,101 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.hadoop.hdds.scm.container.reconciliation; + +import org.apache.hadoop.hdds.client.ReplicationConfig; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State; +import org.apache.hadoop.hdds.scm.container.ContainerID; +import org.apache.hadoop.hdds.scm.container.ContainerInfo; +import org.apache.hadoop.hdds.scm.container.ContainerReplica; +import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; +import org.apache.hadoop.hdds.scm.container.reconciliation.ReconciliationEligibilityHandler.EligibilityResult; +import org.apache.hadoop.hdds.scm.ha.SCMContext; +import org.apache.hadoop.hdds.scm.node.NodeManager; +import org.apache.hadoop.hdds.server.events.EventHandler; +import org.apache.hadoop.hdds.server.events.EventPublisher; +import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode; +import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; +import org.apache.ratis.protocol.exceptions.NotLeaderException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.EnumSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hadoop.hdds.scm.events.SCMEvents.DATANODE_COMMAND; + +/** + * When a reconcile container event is fired, this class will check if the container is eligible for 
reconciliation, + * and if so, send the reconcile request to all datanodes with a replica of that container. + */ +public class ReconcileContainerEventHandler implements EventHandler { + public static final Logger LOG = + LoggerFactory.getLogger(ReconcileContainerEventHandler.class); + + private final ContainerManager containerManager; + private final SCMContext scmContext; + + public ReconcileContainerEventHandler(ContainerManager containerManager, SCMContext scmContext) { + this.containerManager = containerManager; + this.scmContext = scmContext; + } + + @Override + public void onMessage(ContainerID containerID, EventPublisher publisher) { + if (!scmContext.isLeader()) { + LOG.info("Skip reconciling container {} since current SCM is not leader.", containerID); + return; + } + + EligibilityResult result = ReconciliationEligibilityHandler.isEligibleForReconciliation(containerID, + containerManager); + if (!result.isOk()) { + LOG.error("{}", result); + return; + } + + try { + // TODO HDDS-10714 restriction peer and target nodes based on node status. + Set allReplicaNodes = containerManager.getContainerReplicas(containerID) + .stream() + .map(ContainerReplica::getDatanodeDetails) + .collect(Collectors.toSet()); + + LOG.info("Reconcile container event triggered for container {} with peers {}", containerID, allReplicaNodes); + + for (DatanodeDetails replica : allReplicaNodes) { + List otherReplicas = allReplicaNodes.stream() + .filter(other -> !other.equals(replica)) + .collect(Collectors.toList()); + ReconcileContainerCommand command = new ReconcileContainerCommand(containerID.getId(), otherReplicas); + command.setTerm(scmContext.getTermOfLeader()); + publisher.fireEvent(DATANODE_COMMAND, new CommandForDatanode<>(replica.getUuid(), command)); + } + } catch (ContainerNotFoundException ex) { + LOG.error("Failed to start reconciliation for container {}. 
Container not found.", containerID); + } catch (NotLeaderException nle) { + LOG.info("Skip reconciling container {} since current SCM is not leader.", containerID); + } + } +} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconciliationEligibilityHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconciliationEligibilityHandler.java new file mode 100644 index 000000000000..75e15ed93315 --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconciliationEligibilityHandler.java @@ -0,0 +1,141 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.hadoop.hdds.scm.container.reconciliation; + + +import org.apache.hadoop.hdds.client.ReplicationConfig; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State; +import org.apache.hadoop.hdds.scm.container.ContainerID; +import org.apache.hadoop.hdds.scm.container.ContainerInfo; +import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; +import org.apache.hadoop.hdds.scm.container.ContainerReplica; + +import java.util.EnumSet; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Determines whether a container is eligible for reconciliation based on its state, replica states, replication + * type, and replication factor. + */ +public final class ReconciliationEligibilityHandler { + public static final Set ELIGIBLE_CONTAINER_STATES = + EnumSet.of(HddsProtos.LifeCycleState.CLOSED, HddsProtos.LifeCycleState.QUASI_CLOSED); + public static final Set ELIGIBLE_REPLICA_STATES = + EnumSet.of(State.CLOSED, State.QUASI_CLOSED, State.UNHEALTHY); + + /** + * Utility class only. 
+ */ + private ReconciliationEligibilityHandler() { } + + public static EligibilityResult isEligibleForReconciliation( + ContainerID containerID, ContainerManager containerManager) { + ContainerInfo container; + Set replicas; + try { + container = containerManager.getContainer(containerID); + replicas = containerManager.getContainerReplicas(containerID); + } catch (ContainerNotFoundException ex) { + return new EligibilityResult(Result.CONTAINER_NOT_FOUND, + String.format("Container %s not found for reconciliation.", containerID)); + } + + if (!ELIGIBLE_CONTAINER_STATES.contains(container.getState())) { + return new EligibilityResult(Result.INELIGIBLE_CONTAINER_STATE, + String.format("Cannot reconcile container %d in state %s.", container.getContainerID(), + container.getState())); + } + + if (replicas.isEmpty()) { + return new EligibilityResult(Result.NO_REPLICAS_FOUND, + String.format("Cannot reconcile container %d because no replicas could be found.", + container.getContainerID())); + } + + boolean replicasValid = replicas.stream() + .map(ContainerReplica::getState) + .allMatch(ELIGIBLE_REPLICA_STATES::contains); + if (!replicasValid) { + return new EligibilityResult(Result.INELIGIBLE_REPLICA_STATES, + String.format("Cannot reconcile container %s in state %s with replica states: %s", containerID, + container.getState(), replicas.stream() + .map(r -> r.getState().toString()) + .collect(Collectors.joining(", ")))); + } + + // Reconcile on EC containers is not yet implemented. + ReplicationConfig repConfig = container.getReplicationConfig(); + if (repConfig.getReplicationType() != HddsProtos.ReplicationType.RATIS) { + return new EligibilityResult(Result.INELIGIBLE_REPLICATION_TYPE, + String.format("Cannot reconcile container %s with replication type %s. Reconciliation is currently only " + + "supported for Ratis containers.", containerID, repConfig.getReplicationType())); + } + + // Reconciliation requires multiple replicas to reconcile. 
+ int requiredNodes = repConfig.getRequiredNodes(); + if (requiredNodes <= 1) { + return new EligibilityResult(Result.NOT_ENOUGH_REQUIRED_NODES, + String.format("Cannot reconcile container %s with %d required nodes. Reconciliation is only supported for " + + "containers with more than 1 required node.", containerID, repConfig.getRequiredNodes())); + } + + return new EligibilityResult(Result.OK, String.format("Container %s is eligible for reconciliation.", containerID)); + } + + /** + * Defines the reasons a container may not be eligible for reconciliation. + */ + public enum Result { + OK, + CONTAINER_NOT_FOUND, + INELIGIBLE_CONTAINER_STATE, + INELIGIBLE_REPLICA_STATES, + INELIGIBLE_REPLICATION_TYPE, + NOT_ENOUGH_REQUIRED_NODES, + NO_REPLICAS_FOUND + } + + /** + * Provides a status and message indicating whether a container is eligible for reconciliation. + */ + public static final class EligibilityResult { + private final Result result; + private final String message; + + private EligibilityResult(Result result, String message) { + this.result = result; + this.message = message; + } + + public Result getResult() { + return result; + } + + public boolean isOk() { + return result == Result.OK; + } + + @Override + public String toString() { + return message; + } + } +} \ No newline at end of file diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/package-info.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/package-info.java new file mode 100644 index 000000000000..602d7fb69c0c --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/package-info.java @@ -0,0 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. 
The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package org.apache.hadoop.hdds.scm.container.reconciliation; +/** + * This package contains classes related to container reconciliation. + */ diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java index 0b2d4f569378..285468efe859 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMClientProtocolServer.java @@ -56,6 +56,8 @@ import org.apache.hadoop.hdds.scm.container.balancer.IllegalContainerBalancerStateException; import org.apache.hadoop.hdds.scm.container.balancer.InvalidContainerBalancerConfigurationException; import org.apache.hadoop.hdds.scm.container.common.helpers.ContainerWithPipeline; +import org.apache.hadoop.hdds.scm.container.reconciliation.ReconciliationEligibilityHandler; +import org.apache.hadoop.hdds.scm.container.reconciliation.ReconciliationEligibilityHandler.EligibilityResult; import org.apache.hadoop.hdds.scm.events.SCMEvents; import org.apache.hadoop.hdds.scm.exceptions.SCMException; import org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes; @@ -1393,27 +1395,24 @@ public void reconcileContainer(long longContainerID) throws IOException { 
auditMap.put("remoteUser", remoteUser.getUserName()); try { - // May throw ContainerNotFoundException, which will be caught, audited, and returned to the user. - ContainerInfo container = getScm().getContainerManager().getContainer(containerID); - - // Reconcile is not allowed on open containers. - HddsProtos.LifeCycleState state = container.getState(); - if (state.equals(HddsProtos.LifeCycleState.OPEN)) { - throw new SCMException("Cannot reconcile container in state " + state, - ResultCodes.UNEXPECTED_CONTAINER_STATE); - } - - // Reconcile on EC containers is not yet implemented. - ReplicationConfig repConfig = container.getReplicationConfig(); - if (repConfig.getReplicationType() != HddsProtos.ReplicationType.RATIS) { - throw new SCMException("Reconciliation is currently only supported for Ratis containers", - ResultCodes.UNSUPPORTED_OPERATION); - } - - // Reconciliation requires multiple replicas to reconcile. - if (repConfig.getRequiredNodes() <= 1) { - throw new SCMException("Reconciliation is only supported for containers with more than one required node.", - ResultCodes.UNSUPPORTED_OPERATION); + EligibilityResult result = ReconciliationEligibilityHandler.isEligibleForReconciliation(containerID, + getScm().getContainerManager()); + if (!result.isOk()) { + switch (result.getResult()) { + case OK: + break; + case CONTAINER_NOT_FOUND: + throw new ContainerNotFoundException(result.toString()); + case INELIGIBLE_CONTAINER_STATE: + throw new SCMException(result.toString(), ResultCodes.UNEXPECTED_CONTAINER_STATE); + case INELIGIBLE_REPLICA_STATES: + case INELIGIBLE_REPLICATION_TYPE: + case NOT_ENOUGH_REQUIRED_NODES: + case NO_REPLICAS_FOUND: + throw new SCMException(result.toString(), ResultCodes.UNSUPPORTED_OPERATION); + default: + throw new SCMException("Unknown reconciliation eligibility result " + result, ResultCodes.INTERNAL_ERROR); + } } scm.getEventQueue().fireEvent(SCMEvents.RECONCILE_CONTAINER, containerID); diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index 35c550f42f0e..f0327e533432 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -49,7 +49,7 @@ import org.apache.hadoop.hdds.scm.container.ContainerManager; import org.apache.hadoop.hdds.scm.container.ContainerManagerImpl; import org.apache.hadoop.hdds.scm.PlacementPolicyValidateProxy; -import org.apache.hadoop.hdds.scm.container.ReconcileContainerEventHandler; +import org.apache.hadoop.hdds.scm.container.reconciliation.ReconcileContainerEventHandler; import org.apache.hadoop.hdds.scm.container.balancer.MoveManager; import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaPendingOps; import org.apache.hadoop.hdds.scm.container.replication.DatanodeCommandCountUpdatedHandler; diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReconcileContainerEventHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/reconciliation/TestReconcileContainerEventHandler.java similarity index 66% rename from hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReconcileContainerEventHandler.java rename to hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/reconciliation/TestReconcileContainerEventHandler.java index 122a31eaa340..3b46473fd8e3 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReconcileContainerEventHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/reconciliation/TestReconcileContainerEventHandler.java @@ -1,6 +1,5 @@ -package org.apache.hadoop.hdds.scm.container; +package 
org.apache.hadoop.hdds.scm.container.reconciliation; -import org.apache.commons.lang3.stream.Streams; import org.apache.hadoop.hdds.client.ECReplicationConfig; import org.apache.hadoop.hdds.client.RatisReplicationConfig; import org.apache.hadoop.hdds.client.ReplicationConfig; @@ -9,6 +8,14 @@ import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State; import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ReconcileContainerCommandProto; import org.apache.hadoop.hdds.scm.HddsTestUtils; +import org.apache.hadoop.hdds.scm.container.ContainerID; +import org.apache.hadoop.hdds.scm.container.ContainerInfo; +import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; +import org.apache.hadoop.hdds.scm.container.ContainerReplica; +import org.apache.hadoop.hdds.scm.container.MockNodeManager; +import org.apache.hadoop.hdds.scm.container.reconciliation.ReconciliationEligibilityHandler.EligibilityResult; +import org.apache.hadoop.hdds.scm.container.reconciliation.ReconciliationEligibilityHandler.Result; import org.apache.hadoop.hdds.scm.ha.SCMContext; import org.apache.hadoop.hdds.server.events.EventPublisher; import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode; @@ -18,8 +25,6 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; import org.mockito.ArgumentCaptor; -import org.mockito.Captor; -import org.mockito.MockitoAnnotations; import java.util.Arrays; import java.util.HashSet; @@ -32,6 +37,7 @@ import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor.THREE; import static org.apache.hadoop.hdds.scm.events.SCMEvents.DATANODE_COMMAND; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import static 
org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; @@ -54,13 +60,11 @@ public class TestReconcileContainerEventHandler { private static final ReplicationConfig RATIS_ONE_REP = RatisReplicationConfig.getInstance(ONE); private static final ReplicationConfig EC_REP = new ECReplicationConfig(3, 2); - @Captor private ArgumentCaptor> commandCaptor; @BeforeEach public void setup() throws Exception { - // TODO for command captor? - MockitoAnnotations.initMocks(this); + commandCaptor = ArgumentCaptor.forClass(CommandForDatanode.class); containerManager = mock(ContainerManager.class); scmContext = mock(SCMContext.class); when(scmContext.isLeader()).thenReturn(true); @@ -74,8 +78,14 @@ public void setup() throws Exception { */ @Test public void testReconcileECContainer() throws Exception { - addContainer(CONTAINER_ID, EC_REP, LifeCycleState.CLOSED); - addReplicasToContainer(CONTAINER_ID, 5); + addContainer(EC_REP, LifeCycleState.CLOSED); + addReplicasToContainer(5); + + EligibilityResult result = + ReconciliationEligibilityHandler.isEligibleForReconciliation(CONTAINER_ID, containerManager); + assertFalse(result.isOk()); + assertEquals(Result.INELIGIBLE_REPLICATION_TYPE, result.getResult()); + eventHandler.onMessage(CONTAINER_ID, eventPublisher); verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), any()); } @@ -85,17 +95,30 @@ public void testReconcileECContainer() throws Exception { */ @Test public void testReconcileRatisOneContainer() throws Exception { - addContainer(CONTAINER_ID, RATIS_ONE_REP, LifeCycleState.CLOSED); - addReplicasToContainer(CONTAINER_ID, 1); + addContainer(RATIS_ONE_REP, LifeCycleState.CLOSED); + addReplicasToContainer(1); + + EligibilityResult result = + ReconciliationEligibilityHandler.isEligibleForReconciliation(CONTAINER_ID, containerManager); + assertFalse(result.isOk()); + assertEquals(Result.NOT_ENOUGH_REQUIRED_NODES, result.getResult()); + eventHandler.onMessage(CONTAINER_ID, eventPublisher); 
verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), any()); } @Test public void testReconcileWhenNotLeader() throws Exception { - addContainer(CONTAINER_ID, RATIS_THREE_REP, LifeCycleState.CLOSED); - addReplicasToContainer(CONTAINER_ID, 3); + addContainer(RATIS_THREE_REP, LifeCycleState.CLOSED); + addReplicasToContainer(3); when(scmContext.isLeader()).thenReturn(false); + + // Container is eligible for reconciliation, but the request will not go through because this SCM is not the leader. + EligibilityResult result = + ReconciliationEligibilityHandler.isEligibleForReconciliation(CONTAINER_ID, containerManager); + assertTrue(result.isOk()); + assertEquals(Result.OK, result.getResult()); + eventHandler.onMessage(CONTAINER_ID, eventPublisher); verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), any()); } @@ -107,15 +130,27 @@ public void testReconcileNonexistentContainer() throws Exception { // No exceptions should be thrown out of this test method when this happens. If they are, they will be propagated // and the test will fail. 
when(containerManager.getContainer(any())).thenThrow(new ContainerNotFoundException()); + + EligibilityResult result = + ReconciliationEligibilityHandler.isEligibleForReconciliation(CONTAINER_ID, containerManager); + assertFalse(result.isOk()); + assertEquals(Result.CONTAINER_NOT_FOUND, result.getResult()); + eventHandler.onMessage(CONTAINER_ID, eventPublisher); verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), any()); } @Test public void testReconcileMissingContainer() throws Exception { - addContainer(CONTAINER_ID, RATIS_THREE_REP, LifeCycleState.CLOSED); + addContainer(RATIS_THREE_REP, LifeCycleState.CLOSED); assertTrue(containerManager.getContainerReplicas(CONTAINER_ID).isEmpty(), "Expected no replicas for this container"); + + EligibilityResult result = + ReconciliationEligibilityHandler.isEligibleForReconciliation(CONTAINER_ID, containerManager); + assertFalse(result.isOk()); + assertEquals(Result.NO_REPLICAS_FOUND, result.getResult()); + eventHandler.onMessage(CONTAINER_ID, eventPublisher); verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), any()); } @@ -123,8 +158,10 @@ public void testReconcileMissingContainer() throws Exception { @ParameterizedTest @EnumSource(LifeCycleState.class) public void testReconcileWithContainerStates(LifeCycleState state) throws Exception { - addContainer(CONTAINER_ID, RATIS_THREE_REP, state); - addReplicasToContainer(CONTAINER_ID, 3); + addContainer(RATIS_THREE_REP, state); + addReplicasToContainer(3); + EligibilityResult result = + ReconciliationEligibilityHandler.isEligibleForReconciliation(CONTAINER_ID, containerManager); eventHandler.onMessage(CONTAINER_ID, eventPublisher); switch (state) { case OPEN: @@ -132,29 +169,32 @@ public void testReconcileWithContainerStates(LifeCycleState state) throws Except case DELETING: case DELETED: case RECOVERING: + assertFalse(result.isOk()); + assertEquals(Result.INELIGIBLE_CONTAINER_STATE, result.getResult()); verify(eventPublisher, 
never()).fireEvent(eq(DATANODE_COMMAND), commandCaptor.capture()); break; default: + assertTrue(result.isOk()); + assertEquals(Result.OK, result.getResult()); verify(eventPublisher, times(3)).fireEvent(eq(DATANODE_COMMAND), commandCaptor.capture()); break; } - - - if (state == LifeCycleState.OPEN || state == LifeCycleState.CLOSING) { - verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), any()); - } else { - verify(eventPublisher, any()).fireEvent(eq(DATANODE_COMMAND), any()); - } } + // TODO HDDS-10714 will change which datanodes are eligible to participate in reconciliation. @Test public void testReconcileSentToAllPeers() throws Exception { - addContainer(CONTAINER_ID, RATIS_THREE_REP, LifeCycleState.CLOSED); - Set replicas = addReplicasToContainer(CONTAINER_ID, 3); + addContainer(RATIS_THREE_REP, LifeCycleState.CLOSED); + Set replicas = addReplicasToContainer(3); Set allNodeIDs = replicas.stream() .map(r -> r.getDatanodeDetails().getUuid()) .collect(Collectors.toSet()); + EligibilityResult result = + ReconciliationEligibilityHandler.isEligibleForReconciliation(CONTAINER_ID, containerManager); + assertTrue(result.isOk()); + assertEquals(Result.OK, result.getResult()); + eventHandler.onMessage(CONTAINER_ID, eventPublisher); assertEquals(3, replicas.size()); assertEquals(allNodeIDs.size(), replicas.size()); @@ -191,55 +231,61 @@ public void testReconcileSentToAllPeers() throws Exception { @ParameterizedTest @EnumSource(State.class) - public void testReconcileFailsWithOpenReplicas(State replicaState) throws Exception { + public void testReconcileFailsWithIneligibleReplicas(State replicaState) throws Exception { // Overall container state is eligible for reconciliation, but some replicas may not be. // This means the container will not be considered eligible. - addContainer(CONTAINER_ID, RATIS_THREE_REP, LifeCycleState.CLOSED); + addContainer(RATIS_THREE_REP, LifeCycleState.CLOSED); // Only one replica is in a different state. 
- addReplicasToContainer(CONTAINER_ID, replicaState, State.CLOSED, State.CLOSED); + addReplicasToContainer(replicaState, State.CLOSED, State.CLOSED); + + EligibilityResult result = + ReconciliationEligibilityHandler.isEligibleForReconciliation(CONTAINER_ID, containerManager); + eventHandler.onMessage(CONTAINER_ID, eventPublisher); switch (replicaState) { case OPEN: case INVALID: case DELETED: case CLOSING: + assertFalse(result.isOk()); + assertEquals(Result.INELIGIBLE_REPLICA_STATES, result.getResult()); verify(eventPublisher, never()).fireEvent(eq(DATANODE_COMMAND), commandCaptor.capture()); break; default: + assertTrue(result.isOk()); + assertEquals(Result.OK, result.getResult()); verify(eventPublisher, times(3)).fireEvent(eq(DATANODE_COMMAND), commandCaptor.capture()); break; } } - private ContainerInfo addContainer(ContainerID id, ReplicationConfig repConfig, LifeCycleState state) throws Exception { + private ContainerInfo addContainer(ReplicationConfig repConfig, LifeCycleState state) throws Exception { ContainerInfo container = new ContainerInfo.Builder() - .setContainerID(id.getId()) -// .setOwner("Ozone") -// .setPipelineID(pipelineID) + .setContainerID(CONTAINER_ID.getId()) .setReplicationConfig(repConfig) .setState(state) .build(); - when(containerManager.getContainer(id)).thenReturn(container); + when(containerManager.getContainer(CONTAINER_ID)).thenReturn(container); return container; } - private Set addReplicasToContainer(ContainerID id, int count) throws Exception { + private Set addReplicasToContainer(int count) throws Exception { State[] replicaStates = new State[count]; Arrays.fill(replicaStates, State.CLOSED); - return addReplicasToContainer(id, replicaStates); + return addReplicasToContainer(replicaStates); } - private Set addReplicasToContainer(ContainerID id, State... replicaStates) throws Exception { + private Set addReplicasToContainer(State... replicaStates) throws Exception { // Add one container replica for each replica state specified. 
// If no states are specified, replica list will be empty. Set replicas = new HashSet<>(); try (MockNodeManager nodeManager = new MockNodeManager(true, replicaStates.length)) { List nodes = nodeManager.getAllNodes(); for (int i = 0; i < replicaStates.length; i++) { - replicas.addAll(HddsTestUtils.getReplicas(id, replicaStates[i], nodes.get(i))); + replicas.addAll(HddsTestUtils.getReplicas(CONTAINER_ID, replicaStates[i], nodes.get(i))); } } - when(containerManager.getContainerReplicas(id)).thenReturn(replicas); + when(containerManager.getContainerReplicas(CONTAINER_ID)).thenReturn(replicas); return replicas; } From 2effd781ea8cabcc8ef5b01fe2610b0388076cee Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Thu, 18 Apr 2024 17:25:58 -0700 Subject: [PATCH 17/43] Checkstyle --- .../TestReconcileContainerCommandHandler.java | 12 +++++++----- .../container/keyvalue/TestKeyValueHandler.java | 1 - .../container/replication/TestContainerImporter.java | 1 - .../hadoop/hdds/scm/container/ContainerReplica.java | 1 - .../ReconcileContainerEventHandler.java | 7 ------- .../ReconciliationEligibilityHandler.java | 2 +- .../scm/container/TestContainerReportHandler.java | 1 - .../TestIncrementalContainerReportHandler.java | 3 --- .../TestReconcileContainerEventHandler.java | 4 ++++ 9 files changed, 12 insertions(+), 20 deletions(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java index 92f99ec3c754..892b29f972a5 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java +++ 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -40,6 +40,9 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +/** + * Tests Datanode handling of reconcile container commands. + */ public class TestReconcileContainerCommandHandler { public static final Logger LOG = LoggerFactory.getLogger(TestReconcileContainerCommandHandler.class); @@ -156,11 +159,10 @@ public void testReconcileContainerCommandHandled(ContainerLayoutVersion layout) private void waitForQueueCount(int expectedQueueCount) throws Exception { GenericTestUtils.waitFor(() -> { - int qCount = subject.getQueuedCount(); - LOG.info("Waiting for queued command count to reach " + expectedQueueCount + ". Currently at " + qCount); - return qCount == expectedQueueCount; - }, - 500, 3000); + int qCount = subject.getQueuedCount(); + LOG.info("Waiting for queued command count to reach " + expectedQueueCount + ". 
Currently at " + qCount); + return qCount == expectedQueueCount; + }, 500, 3000); } private void verifyContainerReportsSent() throws Exception { diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java index c1b4269c1680..accf94977b74 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java @@ -69,7 +69,6 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.mockito.Mockito.any; -import org.checkerframework.checker.units.qual.A; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestContainerImporter.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestContainerImporter.java index a2a397ebc420..6680a467b12b 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestContainerImporter.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestContainerImporter.java @@ -56,7 +56,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.spy; diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java index f1d13497d867..bf6c2736d4a0 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java @@ -27,7 +27,6 @@ import org.apache.commons.lang3.builder.CompareToBuilder; import org.apache.commons.lang3.builder.EqualsBuilder; import org.apache.commons.lang3.builder.HashCodeBuilder; -import org.apache.hadoop.ozone.container.common.interfaces.Container; /** * In-memory state of a container replica. diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconcileContainerEventHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconcileContainerEventHandler.java index adec444b2a53..f13b37f3ee23 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconcileContainerEventHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconcileContainerEventHandler.java @@ -17,19 +17,13 @@ package org.apache.hadoop.hdds.scm.container.reconciliation; -import org.apache.hadoop.hdds.client.ReplicationConfig; import org.apache.hadoop.hdds.protocol.DatanodeDetails; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State; import org.apache.hadoop.hdds.scm.container.ContainerID; -import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerReplica; import org.apache.hadoop.hdds.scm.container.ContainerManager; import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; import 
org.apache.hadoop.hdds.scm.container.reconciliation.ReconciliationEligibilityHandler.EligibilityResult; import org.apache.hadoop.hdds.scm.ha.SCMContext; -import org.apache.hadoop.hdds.scm.node.NodeManager; import org.apache.hadoop.hdds.server.events.EventHandler; import org.apache.hadoop.hdds.server.events.EventPublisher; import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode; @@ -38,7 +32,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.EnumSet; import java.util.List; import java.util.Set; import java.util.stream.Collectors; diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconciliationEligibilityHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconciliationEligibilityHandler.java index 75e15ed93315..cdc08556d2ca 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconciliationEligibilityHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/ReconciliationEligibilityHandler.java @@ -138,4 +138,4 @@ public String toString() { return message; } } -} \ No newline at end of file +} diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java index 41304fa28b2b..3da4b46beb01 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java @@ -56,7 +56,6 @@ import java.time.Clock; import java.time.ZoneId; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java index 10eb1762d54b..f67b186c0308 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java @@ -38,7 +38,6 @@ import org.apache.hadoop.hdds.scm.net.NetworkTopology; import org.apache.hadoop.hdds.scm.net.NetworkTopologyImpl; import org.apache.hadoop.hdds.scm.node.NodeManager; -import org.apache.hadoop.hdds.scm.node.NodeStatus; import org.apache.hadoop.hdds.scm.node.SCMNodeManager; import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; import org.apache.hadoop.hdds.scm.pipeline.MockPipelineManager; @@ -67,8 +66,6 @@ import java.time.Clock; import java.time.ZoneId; import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/reconciliation/TestReconcileContainerEventHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/reconciliation/TestReconcileContainerEventHandler.java index 3b46473fd8e3..5cdd8354e9ca 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/reconciliation/TestReconcileContainerEventHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/reconciliation/TestReconcileContainerEventHandler.java @@ -47,6 +47,10 @@ import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; +/** + * Tests that the ReconcileContainerEventHandler properly accepts and rejects reconciliation events based on + * container state, and dispatches 
commands to datanodes accordingly. + */ public class TestReconcileContainerEventHandler { private ContainerManager containerManager; private EventPublisher eventPublisher; From d055094103fa8096e618b7dd2eb40c3ce92f6561 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 19 Apr 2024 19:41:22 -0700 Subject: [PATCH 18/43] Add robot test that may not pass yet --- .../main/smoketest/admincli/container.robot | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot index ea10fb98d874..d26503c65de2 100644 --- a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot +++ b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot @@ -85,6 +85,7 @@ Incomplete command Should contain ${output} info Should contain ${output} create Should contain ${output} close + Should contain ${output} reconcile #List containers on unknown host # ${output} = Execute And Ignore Error ozone admin --verbose container list --scm unknown-host @@ -96,5 +97,32 @@ Cannot close container without admin privilege Cannot create container without admin privilege Requires admin privilege ozone admin container create +Cannot reconcile container without admin privilege + Requires admin privilege ozone admin container reconcile "${CONTAINER}" + +Cannot reconcile open container + ${output} = Execute ozone admin container create + Should contain ${output} is created + # The newly created container should still be open. + ${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -n1 + Execute and check rc ozone admin container reconcile "${container}" 255 + +Reconcile closed container + # Get a currently open container to use for this test. 
+ ${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -n1 + # The container should not yet have any replica checksums. + ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.dataChecksum' | head -n1 + Should be empty ${data_checksum} + # Close the container to it can be reconciled. + Execute ozone admin container close ${container} + # Check info still does not show replica checksums + ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.dataChecksum' | head -n1 + Should be empty ${data_checksum} + # Reconcile, and checksums should show up. + Execute ozone admin container reconcile ${container} + # When reconciliation finishes, replica checksums should be shown. + ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.dataChecksum' | head -n1 + Wait until keyword succeeds 1min 5sec Should not be empty ${data_checksum} + Reset user Run Keyword if '${SECURITY_ENABLED}' == 'true' Kinit test user testuser testuser.keytab From 3586cc3d46cbf6ba0b060ae284edc2244a80b4a4 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 19 Apr 2024 19:59:00 -0700 Subject: [PATCH 19/43] Rat --- .../ReconcileContainerCommandHandler.java | 18 ++++++++++++++++++ .../commands/ReconcileContainerCommand.java | 18 ++++++++++++++++++ .../TestReconcileContainerCommandHandler.java | 18 ++++++++++++++++++ .../TestReconcileContainerEventHandler.java | 18 ++++++++++++++++++ 4 files changed, 72 insertions(+) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java index 8bf18576104d..9a4110c7dfcb 100644 --- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.hadoop.ozone.container.common.statemachine.commandhandler; import com.google.common.util.concurrent.ThreadFactoryBuilder; diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconcileContainerCommand.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconcileContainerCommand.java index 3e4d47fff5af..cdd4522cc691 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconcileContainerCommand.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconcileContainerCommand.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.hadoop.ozone.protocol.commands; import com.google.common.base.Preconditions; diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java index 892b29f972a5..b83e983f99f6 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.hadoop.ozone.container.common.statemachine.commandhandler; import org.apache.hadoop.hdds.conf.OzoneConfiguration; diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/reconciliation/TestReconcileContainerEventHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/reconciliation/TestReconcileContainerEventHandler.java index 5cdd8354e9ca..ffc96217b441 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/reconciliation/TestReconcileContainerEventHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/reconciliation/TestReconcileContainerEventHandler.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.hadoop.hdds.scm.container.reconciliation; import org.apache.hadoop.hdds.client.ECReplicationConfig; From ac6ff0edf634385bf84e8be79ec2157adb37f45d Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 22 Apr 2024 13:28:12 -0700 Subject: [PATCH 20/43] Test repeat run with fixed acc test workflow --- .github/workflows/repeat-acceptance.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/repeat-acceptance.yml b/.github/workflows/repeat-acceptance.yml index 7269a9c417a6..3105758b3602 100644 --- a/.github/workflows/repeat-acceptance.yml +++ b/.github/workflows/repeat-acceptance.yml @@ -81,6 +81,8 @@ jobs: steps: - name: Checkout project uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.ref }} - name: Cache for npm dependencies uses: actions/cache@v4 with: From d84d3eff25c5ead71e1664dbecaa775302f2c80c Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 22 Apr 2024 18:39:02 -0700 Subject: [PATCH 21/43] Some acceptance test fixes --- .../dist/src/main/smoketest/admincli/container.robot | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot index d26503c65de2..73ecd52e139c 100644 --- a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot +++ b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot @@ -100,6 +100,9 @@ Cannot create container without admin privilege Cannot reconcile container without admin privilege Requires admin privilege ozone admin container reconcile "${CONTAINER}" +Reset user + Run Keyword if 
'${SECURITY_ENABLED}' == 'true' Kinit test user testuser testuser.keytab + Cannot reconcile open container ${output} = Execute ozone admin container create Should contain ${output} is created @@ -111,18 +114,15 @@ Reconcile closed container # Get a currently open container to use for this test. ${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -n1 # The container should not yet have any replica checksums. - ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.dataChecksum' | head -n1 + ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 Should be empty ${data_checksum} # Close the container to it can be reconciled. Execute ozone admin container close ${container} # Check info still does not show replica checksums - ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.dataChecksum' | head -n1 + ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 Should be empty ${data_checksum} # Reconcile, and checksums should show up. Execute ozone admin container reconcile ${container} # When reconciliation finishes, replica checksums should be shown. 
- ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.dataChecksum' | head -n1 + ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 Wait until keyword succeeds 1min 5sec Should not be empty ${data_checksum} - -Reset user - Run Keyword if '${SECURITY_ENABLED}' == 'true' Kinit test user testuser testuser.keytab From a4a0a1c1f75684fd25aebbb1bddfdf7bacf87c0c Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 29 Apr 2024 14:10:53 -0700 Subject: [PATCH 22/43] Update comment --- .../hadoop/hdds/scm/container/TestContainerReportHandler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java index 3da4b46beb01..b49adbb0f771 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java @@ -1122,7 +1122,7 @@ protected static String createUniqueDataChecksumForReplica(ContainerID container } /** - * Generates a placeholder data checksum for testing that is specific to a container replica. + * Generates a placeholder data checksum for testing that is the same for all container replicas. 
*/ protected static String createMatchingDataChecksumForReplica(ContainerID containerID) { return Integer.toString(Objects.hashCode(containerID)); From 6f2c7e405ead3c212db6052d3f05634f2faf870c Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 29 Apr 2024 15:59:25 -0700 Subject: [PATCH 23/43] findbugs --- .../commandhandler/TestReconcileContainerCommandHandler.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java index b83e983f99f6..a55c9a18579a 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -184,8 +184,9 @@ private void waitForQueueCount(int expectedQueueCount) throws Exception { } private void verifyContainerReportsSent() throws Exception { - for (ContainerID id: containerReportsSent.keySet()) { - String sentDataChecksum = containerReportsSent.get(id).getDataChecksum(); + for (Map.Entry entry: containerReportsSent.entrySet()) { + ContainerID id = entry.getKey(); + String sentDataChecksum = entry.getValue().getDataChecksum(); String expectedDataChecksum = ContainerUtils.getChecksum(Long.toString(id.getId())); Assertions.assertEquals(expectedDataChecksum, sentDataChecksum, "Checksum mismatch in report of container " + id); } From 9c94f1c5fa50624d469385fd590f163a5b916d8d Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 29 Apr 2024 16:44:31 -0700 Subject: [PATCH 24/43] Separate container handler test for metrics Still not done --- 
.../TestReconcileContainerCommandHandler.java | 103 +++++++++++------- 1 file changed, 63 insertions(+), 40 deletions(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java index a55c9a18579a..72ffe5016f14 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -18,6 +18,7 @@ package org.apache.hadoop.ozone.container.common.statemachine.commandhandler; +import org.apache.commons.lang3.stream.Streams; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; @@ -42,19 +43,22 @@ import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer; import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; import org.apache.ozone.test.GenericTestUtils; -import org.junit.jupiter.api.Assertions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Collections; import java.util.HashMap; import java.util.Map; +import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; import static java.util.Collections.singletonMap; import static org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails; import static org.apache.hadoop.ozone.OzoneConsts.GB; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; import static 
org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -73,21 +77,16 @@ public class TestReconcileContainerCommandHandler { private ContainerController controller; private ContainerSet containerSet; private ReconcileContainerCommandHandler subject; - // Used to block ICR sending so that queue metrics can be checked before the reconcile task completes. - private CountDownLatch icrLatch; private ContainerLayoutVersion layoutVersion; - // As data hashes are calculated during the test, they are written back here. - private final Map containerReportsSent = new HashMap<>(); - - public void initLayoutVersion(ContainerLayoutVersion layout) + public void initLayoutVersion(ContainerLayoutVersion layout, IncrementalReportSender icrSender) throws Exception { this.layoutVersion = layout; - init(); + init(icrSender); } - private void init() throws Exception { + private void init(IncrementalReportSender icrSender) throws Exception { OzoneConfiguration conf = new OzoneConfiguration(); DatanodeDetails dnDetails = randomDatanodeDetails(); subject = new ReconcileContainerCommandHandler(""); @@ -99,7 +98,39 @@ private void init() throws Exception { containerSet = new ContainerSet(1000); containerSet.addContainer(container); - icrLatch = new CountDownLatch(1); + containerHandler = new KeyValueHandler(new OzoneConfiguration(), dnDetails.getUuidString(), containerSet, + mock(VolumeSet.class), mock(ContainerMetrics.class), icrSender); + controller = new ContainerController(containerSet, + singletonMap(ContainerProtos.ContainerType.KeyValueContainer, containerHandler)); + ozoneContainer = mock(OzoneContainer.class); + when(ozoneContainer.getController()).thenReturn(controller); + when(ozoneContainer.getContainerSet()).thenReturn(containerSet); + } + + @ContainerLayoutTestInfo.ContainerTest + public void testReconcileContainerCommandReports(ContainerLayoutVersion layout) throws Exception { + Map containerReportsSent = new HashMap<>(); + IncrementalReportSender icrSender = c -> 
{ + try { + containerReportsSent.put(ContainerID.valueOf(c.getContainerData().getContainerID()), c.getContainerReport()); + } catch (Exception ex) { + LOG.error("ICR sender failed", ex); + } + }; + initLayoutVersion(layout, icrSender); + + ReconcileContainerCommand cmd = new ReconcileContainerCommand(CONTAINER_ID, Collections.emptyList()); + subject.handle(cmd, ozoneContainer, context, null); + + verifyContainerReportsSent(containerReportsSent, Collections.singleton(CONTAINER_ID)); + } + + // TODO test is flaky on the second container layout run only. + @ContainerLayoutTestInfo.ContainerTest + public void testReconcileContainerCommandMetrics(ContainerLayoutVersion layout) throws Exception { + // Used to block ICR sending so that queue metrics can be checked before the reconcile task completes. + CountDownLatch icrLatch = new CountDownLatch(1); + Map containerReportsSent = new HashMap<>(); IncrementalReportSender icrSender = c -> { try { containerReportsSent.put(ContainerID.valueOf(c.getContainerData().getContainerID()), c.getContainerReport()); @@ -107,29 +138,14 @@ private void init() throws Exception { // Block the caller until the latch is counted down. // Caller can check queue metrics in the meantime. LOG.info("ICR sender waiting for latch"); - Assertions.assertTrue(icrLatch.await(30, TimeUnit.SECONDS)); + assertTrue(icrLatch.await(30, TimeUnit.SECONDS)); LOG.info("ICR sender proceeding after latch"); - // Reset the latch for the next iteration. - // This assumes requests are executed by a single thread reading the latch. 
- icrLatch = new CountDownLatch(1); } catch (Exception ex) { LOG.error("ICR sender failed", ex); } }; - containerHandler = new KeyValueHandler(new OzoneConfiguration(), dnDetails.getUuidString(), containerSet, - mock(VolumeSet.class), mock(ContainerMetrics.class), icrSender); - controller = new ContainerController(containerSet, - singletonMap(ContainerProtos.ContainerType.KeyValueContainer, containerHandler)); - ozoneContainer = mock(OzoneContainer.class); - when(ozoneContainer.getController()).thenReturn(controller); - when(ozoneContainer.getContainerSet()).thenReturn(containerSet); - } - - // TODO test is flaky on the second container layout run only. - @ContainerLayoutTestInfo.ContainerTest - public void testReconcileContainerCommandHandled(ContainerLayoutVersion layout) throws Exception { - initLayoutVersion(layout); + initLayoutVersion(layout, icrSender); ReconcileContainerCommand cmd = new ReconcileContainerCommand(CONTAINER_ID, Collections.emptyList()); // Queue two commands for processing. @@ -139,10 +155,10 @@ public void testReconcileContainerCommandHandled(ContainerLayoutVersion layout) // The first command was invoked when submitted, and is now blocked in the ICR sender. // Since neither command has finished, they both count towards queue count. - Assertions.assertEquals(1, subject.getInvocationCount()); - Assertions.assertEquals(2, subject.getQueuedCount()); - Assertions.assertEquals(0, subject.getTotalRunTime()); - Assertions.assertEquals(0, subject.getAverageRunTime()); + assertEquals(1, subject.getInvocationCount()); + assertEquals(2, subject.getQueuedCount()); + assertEquals(0, subject.getTotalRunTime()); + assertEquals(0, subject.getAverageRunTime()); // Wait this long before unblocking the ICR sender. This is the lower bound on simulated execution time. long minExecTimeMillis = 500; @@ -152,10 +168,10 @@ public void testReconcileContainerCommandHandled(ContainerLayoutVersion layout) // Decrementing queue count indicates the task completed. 
waitForQueueCount(1); // The other command is invoked but blocked in the ICR sender. - Assertions.assertEquals(2, subject.getInvocationCount()); + assertEquals(2, subject.getInvocationCount()); long firstTotalRunTime = subject.getTotalRunTime(); long firstAvgRunTime = subject.getAverageRunTime(); - Assertions.assertTrue(firstTotalRunTime >= minExecTimeMillis, + assertTrue(firstTotalRunTime >= minExecTimeMillis, "Total run time " + firstTotalRunTime + "ms was not larger than min exec time " + minExecTimeMillis + "ms"); // Wait a little longer before firing the second command. @@ -163,14 +179,14 @@ public void testReconcileContainerCommandHandled(ContainerLayoutVersion layout) icrLatch.countDown(); // Decrementing queue count indicates the task completed. waitForQueueCount(0); - Assertions.assertEquals(2, subject.getInvocationCount()); + assertEquals(2, subject.getInvocationCount()); long secondTotalRunTime = subject.getTotalRunTime(); long secondAvgRunTime = subject.getAverageRunTime(); - Assertions.assertTrue(secondTotalRunTime >= firstTotalRunTime + minExecTimeMillis); - Assertions.assertTrue(secondAvgRunTime >= minExecTimeMillis); + assertTrue(secondTotalRunTime >= firstTotalRunTime + minExecTimeMillis); + assertTrue(secondAvgRunTime >= minExecTimeMillis); // We slept the thread a little longer on the second invocation, which should have increased the average run time // from the first run. 
- Assertions.assertTrue(secondAvgRunTime >= firstAvgRunTime); + assertTrue(secondAvgRunTime >= firstAvgRunTime); verifyContainerReportsSent(); } @@ -183,12 +199,19 @@ private void waitForQueueCount(int expectedQueueCount) throws Exception { }, 500, 3000); } - private void verifyContainerReportsSent() throws Exception { - for (Map.Entry entry: containerReportsSent.entrySet()) { + private void verifyContainerReportsSent(Map reportsSent, + Set expectedContainerIDs) throws Exception { + + assertEquals(expectedContainerIDs.size(), reportsSent.size()); + + for (Map.Entry entry: reportsSent.entrySet()) { ContainerID id = entry.getKey(); + assertTrue(expectedContainerIDs.contains(id.getId())); + String sentDataChecksum = entry.getValue().getDataChecksum(); + // Current implementation is incomplete, and uses this as a mocked checksum. String expectedDataChecksum = ContainerUtils.getChecksum(Long.toString(id.getId())); - Assertions.assertEquals(expectedDataChecksum, sentDataChecksum, "Checksum mismatch in report of container " + id); + assertEquals(expectedDataChecksum, sentDataChecksum, "Checksum mismatch in report of container " + id); } } } From e42324e8c2cf04d5c542f5b02b31e4a10ec8cefc Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 29 Apr 2024 17:38:31 -0700 Subject: [PATCH 25/43] Almost finished separated metrics and report tests --- .../TestReconcileContainerCommandHandler.java | 89 ++++++++++--------- 1 file changed, 45 insertions(+), 44 deletions(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java index 72ffe5016f14..d4d09d44d0e8 100644 --- 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -18,7 +18,6 @@ package org.apache.hadoop.ozone.container.common.statemachine.commandhandler; -import org.apache.commons.lang3.stream.Streams; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; @@ -46,14 +45,17 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; +import java.util.concurrent.TimeoutException; +import static java.util.Collections.min; import static java.util.Collections.singletonMap; import static org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails; import static org.apache.hadoop.ozone.OzoneConsts.GB; @@ -80,13 +82,10 @@ public class TestReconcileContainerCommandHandler { private ContainerLayoutVersion layoutVersion; - public void initLayoutVersion(ContainerLayoutVersion layout, IncrementalReportSender icrSender) + public void init(ContainerLayoutVersion layout, IncrementalReportSender icrSender) throws Exception { this.layoutVersion = layout; - init(icrSender); - } - private void init(IncrementalReportSender icrSender) throws Exception { OzoneConfiguration conf = new OzoneConfiguration(); DatanodeDetails dnDetails = randomDatanodeDetails(); subject = new ReconcileContainerCommandHandler(""); @@ -112,17 +111,27 @@ public void testReconcileContainerCommandReports(ContainerLayoutVersion layout) Map 
containerReportsSent = new HashMap<>(); IncrementalReportSender icrSender = c -> { try { - containerReportsSent.put(ContainerID.valueOf(c.getContainerData().getContainerID()), c.getContainerReport()); + ContainerID id = ContainerID.valueOf(c.getContainerData().getContainerID()); + containerReportsSent.put(id, c.getContainerReport()); + LOG.info("Added container report for container {}", id); } catch (Exception ex) { LOG.error("ICR sender failed", ex); } }; - initLayoutVersion(layout, icrSender); + init(layout, icrSender); + // These two commands are for a container existing in the datanode. ReconcileContainerCommand cmd = new ReconcileContainerCommand(CONTAINER_ID, Collections.emptyList()); subject.handle(cmd, ozoneContainer, context, null); + subject.handle(cmd, ozoneContainer, context, null); + + // This container was + ReconcileContainerCommand cmd2 = new ReconcileContainerCommand(CONTAINER_ID + 1, Collections.emptyList()); + subject.handle(cmd2, ozoneContainer, context, null); - verifyContainerReportsSent(containerReportsSent, Collections.singleton(CONTAINER_ID)); + waitForAllCommandsToFinish(); + + verifyContainerReportsSent(containerReportsSent, new HashSet<>(Arrays.asList(CONTAINER_ID, CONTAINER_ID + 1))); } // TODO test is flaky on the second container layout run only. @@ -130,72 +139,64 @@ public void testReconcileContainerCommandReports(ContainerLayoutVersion layout) public void testReconcileContainerCommandMetrics(ContainerLayoutVersion layout) throws Exception { // Used to block ICR sending so that queue metrics can be checked before the reconcile task completes. CountDownLatch icrLatch = new CountDownLatch(1); - Map containerReportsSent = new HashMap<>(); + // Wait this long before completing the task. + // This provides a lower bound on execution time. 
+ final long minExecTimeMillis = 500; + IncrementalReportSender icrSender = c -> { try { - containerReportsSent.put(ContainerID.valueOf(c.getContainerData().getContainerID()), c.getContainerReport()); - // Block the caller until the latch is counted down. // Caller can check queue metrics in the meantime. LOG.info("ICR sender waiting for latch"); assertTrue(icrLatch.await(30, TimeUnit.SECONDS)); LOG.info("ICR sender proceeding after latch"); + + Thread.sleep(minExecTimeMillis); } catch (Exception ex) { LOG.error("ICR sender failed", ex); } }; - initLayoutVersion(layout, icrSender); + init(layout, icrSender); ReconcileContainerCommand cmd = new ReconcileContainerCommand(CONTAINER_ID, Collections.emptyList()); // Queue two commands for processing. - // Handler is blocked until we count down the ICR latch. + // Both commands will be blocked until the latch is counted down. subject.handle(cmd, ozoneContainer, context, null); subject.handle(cmd, ozoneContainer, context, null); // The first command was invoked when submitted, and is now blocked in the ICR sender. - // Since neither command has finished, they both count towards queue count. - assertEquals(1, subject.getInvocationCount()); + // The second command is blocked on the first since handling is single threaded in the current implementation. + // Since neither command has finished they both count towards queue count, which is incremented synchronously. assertEquals(2, subject.getQueuedCount()); assertEquals(0, subject.getTotalRunTime()); assertEquals(0, subject.getAverageRunTime()); - // Wait this long before unblocking the ICR sender. This is the lower bound on simulated execution time. - long minExecTimeMillis = 500; - Thread.sleep(minExecTimeMillis); + // This will resume handling of the two tasks. icrLatch.countDown(); + // Two tasks were fired, and each one should have taken at least minExecTime. 
+ final long expectedTotalMinExecTimeMillis = minExecTimeMillis * 2; - // Decrementing queue count indicates the task completed. - waitForQueueCount(1); - // The other command is invoked but blocked in the ICR sender. - assertEquals(2, subject.getInvocationCount()); - long firstTotalRunTime = subject.getTotalRunTime(); - long firstAvgRunTime = subject.getAverageRunTime(); - assertTrue(firstTotalRunTime >= minExecTimeMillis, - "Total run time " + firstTotalRunTime + "ms was not larger than min exec time " + minExecTimeMillis + "ms"); + waitForAllCommandsToFinish(); - // Wait a little longer before firing the second command. - Thread.sleep(minExecTimeMillis + 100); - icrLatch.countDown(); - // Decrementing queue count indicates the task completed. - waitForQueueCount(0); assertEquals(2, subject.getInvocationCount()); - long secondTotalRunTime = subject.getTotalRunTime(); - long secondAvgRunTime = subject.getAverageRunTime(); - assertTrue(secondTotalRunTime >= firstTotalRunTime + minExecTimeMillis); - assertTrue(secondAvgRunTime >= minExecTimeMillis); - // We slept the thread a little longer on the second invocation, which should have increased the average run time - // from the first run. 
- assertTrue(secondAvgRunTime >= firstAvgRunTime); - - verifyContainerReportsSent(); + long totalRunTime = subject.getTotalRunTime(); + assertTrue(totalRunTime >= expectedTotalMinExecTimeMillis, + "Total run time " + totalRunTime + "ms was not larger than the minimum total exec time " + + expectedTotalMinExecTimeMillis + "ms"); + long avgRunTime = subject.getAverageRunTime(); + assertTrue(avgRunTime >= minExecTimeMillis, + "Average run time " + avgRunTime + "ms was not larger than the minimum per task exec time " + + minExecTimeMillis + "ms"); } - private void waitForQueueCount(int expectedQueueCount) throws Exception { + private void waitForAllCommandsToFinish() throws Exception { + // Queue count should be decremented only after the task completes, so the other metrics should be consistent when + // it reaches zero. GenericTestUtils.waitFor(() -> { int qCount = subject.getQueuedCount(); - LOG.info("Waiting for queued command count to reach " + expectedQueueCount + ". Currently at " + qCount); - return qCount == expectedQueueCount; + LOG.info("Waiting for queued command count to reach 0. 
Currently at " + qCount); + return qCount == 0; }, 500, 3000); } From 3cb04162852762ffa626af928ecfb5bbde6a14e8 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 29 Apr 2024 18:09:23 -0700 Subject: [PATCH 26/43] TestReconcileContainerCommandHandler complete and passing --- .../TestReconcileContainerCommandHandler.java | 82 ++++++++----------- 1 file changed, 34 insertions(+), 48 deletions(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java index d4d09d44d0e8..8693d7bf7f8f 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -45,21 +45,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Arrays; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.Map; -import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; import static java.util.Collections.min; import static java.util.Collections.singletonMap; import static org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails; import static org.apache.hadoop.ozone.OzoneConsts.GB; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -70,36 +67,33 @@ public class 
TestReconcileContainerCommandHandler { public static final Logger LOG = LoggerFactory.getLogger(TestReconcileContainerCommandHandler.class); - private static final long CONTAINER_ID = 123L; + private static final int NUM_CONTAINERS = 3; + ContainerSet containerSet; private OzoneContainer ozoneContainer; private StateContext context; - private Container container; - private Handler containerHandler; - private ContainerController controller; - private ContainerSet containerSet; private ReconcileContainerCommandHandler subject; - private ContainerLayoutVersion layoutVersion; - public void init(ContainerLayoutVersion layout, IncrementalReportSender icrSender) throws Exception { - this.layoutVersion = layout; OzoneConfiguration conf = new OzoneConfiguration(); DatanodeDetails dnDetails = randomDatanodeDetails(); subject = new ReconcileContainerCommandHandler(""); context = ContainerTestUtils.getMockContext(dnDetails, conf); - KeyValueContainerData data = new KeyValueContainerData(CONTAINER_ID, layoutVersion, GB, - PipelineID.randomId().toString(), randomDatanodeDetails().getUuidString()); - container = new KeyValueContainer(data, conf); containerSet = new ContainerSet(1000); - containerSet.addContainer(container); + for (int id = 1; id <= NUM_CONTAINERS; id++) { + KeyValueContainerData data = new KeyValueContainerData(id, layout, GB, + PipelineID.randomId().toString(), randomDatanodeDetails().getUuidString()); + containerSet.addContainer(new KeyValueContainer(data, conf)); + } + + assertEquals(NUM_CONTAINERS, containerSet.containerCount()); - containerHandler = new KeyValueHandler(new OzoneConfiguration(), dnDetails.getUuidString(), containerSet, + Handler containerHandler = new KeyValueHandler(new OzoneConfiguration(), dnDetails.getUuidString(), containerSet, mock(VolumeSet.class), mock(ContainerMetrics.class), icrSender); - controller = new ContainerController(containerSet, + ContainerController controller = new ContainerController(containerSet, 
singletonMap(ContainerProtos.ContainerType.KeyValueContainer, containerHandler)); ozoneContainer = mock(OzoneContainer.class); when(ozoneContainer.getController()).thenReturn(controller); @@ -120,21 +114,20 @@ public void testReconcileContainerCommandReports(ContainerLayoutVersion layout) }; init(layout, icrSender); - // These two commands are for a container existing in the datanode. - ReconcileContainerCommand cmd = new ReconcileContainerCommand(CONTAINER_ID, Collections.emptyList()); - subject.handle(cmd, ozoneContainer, context, null); - subject.handle(cmd, ozoneContainer, context, null); + for (int id = 1; id <= NUM_CONTAINERS; id++) { + ReconcileContainerCommand cmd = new ReconcileContainerCommand(id, Collections.emptyList()); + subject.handle(cmd, ozoneContainer, context, null); + } - // This container was - ReconcileContainerCommand cmd2 = new ReconcileContainerCommand(CONTAINER_ID + 1, Collections.emptyList()); - subject.handle(cmd2, ozoneContainer, context, null); + // An unknown container should not trigger a container report being sent. + ReconcileContainerCommand unknownContainerCmd = new ReconcileContainerCommand(NUM_CONTAINERS + 1, + Collections.emptyList()); + subject.handle(unknownContainerCmd, ozoneContainer, context, null); waitForAllCommandsToFinish(); - - verifyContainerReportsSent(containerReportsSent, new HashSet<>(Arrays.asList(CONTAINER_ID, CONTAINER_ID + 1))); + verifyAllContainerReports(containerReportsSent); } - // TODO test is flaky on the second container layout run only. @ContainerLayoutTestInfo.ContainerTest public void testReconcileContainerCommandMetrics(ContainerLayoutVersion layout) throws Exception { // Used to block ICR sending so that queue metrics can be checked before the reconcile task completes. @@ -142,6 +135,8 @@ public void testReconcileContainerCommandMetrics(ContainerLayoutVersion layout) // Wait this long before completing the task. // This provides a lower bound on execution time. 
final long minExecTimeMillis = 500; + // This is the lower bound on execution time of all the commands combined. + final long expectedTotalMinExecTimeMillis = minExecTimeMillis * NUM_CONTAINERS; IncrementalReportSender icrSender = c -> { try { @@ -159,27 +154,20 @@ public void testReconcileContainerCommandMetrics(ContainerLayoutVersion layout) init(layout, icrSender); - ReconcileContainerCommand cmd = new ReconcileContainerCommand(CONTAINER_ID, Collections.emptyList()); - // Queue two commands for processing. - // Both commands will be blocked until the latch is counted down. - subject.handle(cmd, ozoneContainer, context, null); - subject.handle(cmd, ozoneContainer, context, null); - - // The first command was invoked when submitted, and is now blocked in the ICR sender. - // The second command is blocked on the first since handling is single threaded in the current implementation. - // Since neither command has finished they both count towards queue count, which is incremented synchronously. - assertEquals(2, subject.getQueuedCount()); + // All commands submitted will be blocked until the latch is counted down. + for (int id = 1; id <= NUM_CONTAINERS; id++) { + ReconcileContainerCommand cmd = new ReconcileContainerCommand(id, Collections.emptyList()); + subject.handle(cmd, ozoneContainer, context, null); + } + assertEquals(NUM_CONTAINERS, subject.getQueuedCount()); assertEquals(0, subject.getTotalRunTime()); assertEquals(0, subject.getAverageRunTime()); - // This will resume handling of the two tasks. + // This will resume handling of the tasks. icrLatch.countDown(); - // Two tasks were fired, and each one should have taken at least minExecTime. 
- final long expectedTotalMinExecTimeMillis = minExecTimeMillis * 2; - waitForAllCommandsToFinish(); - assertEquals(2, subject.getInvocationCount()); + assertEquals(NUM_CONTAINERS, subject.getInvocationCount()); long totalRunTime = subject.getTotalRunTime(); assertTrue(totalRunTime >= expectedTotalMinExecTimeMillis, "Total run time " + totalRunTime + "ms was not larger than the minimum total exec time " + @@ -200,14 +188,12 @@ private void waitForAllCommandsToFinish() throws Exception { }, 500, 3000); } - private void verifyContainerReportsSent(Map reportsSent, - Set expectedContainerIDs) throws Exception { - - assertEquals(expectedContainerIDs.size(), reportsSent.size()); + private void verifyAllContainerReports(Map reportsSent) throws Exception { + assertEquals(NUM_CONTAINERS, reportsSent.size()); for (Map.Entry entry: reportsSent.entrySet()) { ContainerID id = entry.getKey(); - assertTrue(expectedContainerIDs.contains(id.getId())); + assertNotNull(containerSet.getContainer(id.getId())); String sentDataChecksum = entry.getValue().getDataChecksum(); // Current implementation is incomplete, and uses this as a mocked checksum. 
From 18f4f2ad75e30dafc2c0807b83a8b0822ce35c63 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 30 Apr 2024 12:06:55 -0700 Subject: [PATCH 27/43] Checkstyle --- .../commandhandler/TestReconcileContainerCommandHandler.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java index 8693d7bf7f8f..5dc71bf30208 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -51,7 +51,6 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; -import static java.util.Collections.min; import static java.util.Collections.singletonMap; import static org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails; import static org.apache.hadoop.ozone.OzoneConsts.GB; @@ -69,7 +68,7 @@ public class TestReconcileContainerCommandHandler { private static final int NUM_CONTAINERS = 3; - ContainerSet containerSet; + private ContainerSet containerSet; private OzoneContainer ozoneContainer; private StateContext context; private ReconcileContainerCommandHandler subject; From 2163008ba8f78ee89640ad4a08e87555cc031bdc Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 30 Apr 2024 16:24:42 -0700 Subject: [PATCH 28/43] Fix TestSCMExceptionResultCodes --- .../interface-server/src/main/proto/ScmServerProtocol.proto | 1 + 1 file changed, 1 insertion(+) diff --git a/hadoop-hdds/interface-server/src/main/proto/ScmServerProtocol.proto 
b/hadoop-hdds/interface-server/src/main/proto/ScmServerProtocol.proto index 307c23a56202..781ef8d1f1f7 100644 --- a/hadoop-hdds/interface-server/src/main/proto/ScmServerProtocol.proto +++ b/hadoop-hdds/interface-server/src/main/proto/ScmServerProtocol.proto @@ -137,6 +137,7 @@ enum Status { CA_ROTATION_IN_POST_PROGRESS = 44; CONTAINER_ALREADY_CLOSED = 45; CONTAINER_ALREADY_CLOSING = 46; + UNSUPPORTED_OPERATION = 47; } /** From 13a335efbcb26d7d0dc78e20623bc14e45c3113c Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 30 Apr 2024 18:20:29 -0700 Subject: [PATCH 29/43] Might have fixed acceptance test --- .../main/smoketest/admincli/container.robot | 36 +++++++++---------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot index 73ecd52e139c..6bd730856493 100644 --- a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot +++ b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot @@ -71,13 +71,6 @@ Verbose container info ${output} = Execute ozone admin --verbose container info "${CONTAINER}" Should contain ${output} Pipeline Info -Close container - ${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -1 - Execute ozone admin container close "${container}" - ${output} = Execute ozone admin container info "${container}" - Should contain ${output} CLOS - Wait until keyword succeeds 1min 10sec Container is closed ${container} - Incomplete command ${output} = Execute And Ignore Error ozone admin container Should contain ${output} Incomplete command @@ -104,25 +97,28 @@ Reset user Run Keyword if '${SECURITY_ENABLED}' == 'true' Kinit test user testuser testuser.keytab Cannot reconcile open container - ${output} = Execute ozone admin container create - Should contain ${output} is created - # The newly created container should 
still be open. + # At this point we should have an open Ratis Three container. ${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -n1 Execute and check rc ozone admin container reconcile "${container}" 255 - -Reconcile closed container - # Get a currently open container to use for this test. - ${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -n1 # The container should not yet have any replica checksums. + # TODO When the scanner is computing checksums automatically, this test may need to be updated. ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 Should be empty ${data_checksum} - # Close the container to it can be reconciled. - Execute ozone admin container close ${container} - # Check info still does not show replica checksums + +Close container + ${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -1 + Execute ozone admin container close "${container}" + ${output} = Execute ozone admin container info "${container}" + Should contain ${output} CLOS + Wait until keyword succeeds 1min 10sec Container is closed ${container} + +Reconcile closed container + # Check info does not show replica checksums, since manual reconciliation has not yet been triggered. + # TODO When the scanner is computing checksums automatically, this test may need to be updated. 
+ ${container} = Execute ozone admin container list --state CLOSED | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -1 ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 Should be empty ${data_checksum} - # Reconcile, and checksums should show up. - Execute ozone admin container reconcile ${container} # When reconciliation finishes, replica checksums should be shown. + Execute ozone admin container reconcile ${container} ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 - Wait until keyword succeeds 1min 5sec Should not be empty ${data_checksum} + Wait until keyword succeeds 1min 5sec Should not be empty ${data_checksum} From b03c4fcf41edba0f3ffa60295ab4b7eae32a20ca Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Thu, 2 May 2024 15:50:14 -0700 Subject: [PATCH 30/43] Use long as checksum representation --- .../scm/container/ContainerReplicaInfo.java | 6 +++--- .../container/common/impl/ContainerData.java | 10 ++++------ .../container/keyvalue/KeyValueHandler.java | 9 +++++++-- .../common/TestKeyValueContainerData.java | 7 ++++--- .../TestReconcileContainerCommandHandler.java | 9 +++++---- .../keyvalue/TestKeyValueHandler.java | 7 +++---- .../src/main/proto/hdds.proto | 2 +- .../ScmServerDatanodeHeartbeatProtocol.proto | 3 +-- .../hdds/scm/container/ContainerReplica.java | 11 +++++------ .../container/TestContainerReportHandler.java | 19 +++++++++---------- ...TestIncrementalContainerReportHandler.java | 12 ++++++------ 11 files changed, 48 insertions(+), 47 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java index 1cde385b67a5..29ce0b9a37e5 100644 --- 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java @@ -35,7 +35,7 @@ public final class ContainerReplicaInfo { private long keyCount; private long bytesUsed; private int replicaIndex = -1; - private String dataChecksum; + private long dataChecksum; public static ContainerReplicaInfo fromProto( HddsProtos.SCMContainerReplicaProto proto) { @@ -89,7 +89,7 @@ public int getReplicaIndex() { return replicaIndex; } - public String getDataChecksum() { + public long getDataChecksum() { return dataChecksum; } @@ -140,7 +140,7 @@ public Builder setReplicaIndex(int replicaIndex) { return this; } - public Builder setDataChecksum(String dataChecksum) { + public Builder setDataChecksum(long dataChecksum) { subject.dataChecksum = dataChecksum; return this; } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java index 48c6c2e33d15..9f29db172331 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java @@ -101,9 +101,7 @@ public abstract class ContainerData { private String containerFileChecksum; - // TODO This should have type Checksum once we decide on the checksum implementation to use. - // Currently this is just a placeholder to save data for testing. 
- private String dataChecksum; + private long dataChecksum; private boolean isEmpty; @@ -164,7 +162,7 @@ protected ContainerData(ContainerType type, long containerId, this.originNodeId = originNodeId; this.isEmpty = false; this.containerFileChecksum = ZERO_CHECKSUM; - this.dataChecksum = ""; + this.dataChecksum = 0; } protected ContainerData(ContainerData source) { @@ -648,11 +646,11 @@ public void computeAndSetContainerFileChecksum(Yaml yaml) throws IOException { this.containerFileChecksum = ContainerUtils.getChecksum(containerDataYamlStr); } - public void setDataChecksum(String checksum) { + public void setDataChecksum(long checksum) { dataChecksum = checksum; } - public String getDataChecksum() { + public long getDataChecksum() { return dataChecksum; } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java index ffa7e2e5e10f..9ec8a1876419 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java @@ -57,6 +57,8 @@ import org.apache.hadoop.hdds.utils.HddsServerUtil; import org.apache.hadoop.ozone.OzoneConfigKeys; import org.apache.hadoop.ozone.common.Checksum; +import org.apache.hadoop.ozone.common.ChecksumByteBuffer; +import org.apache.hadoop.ozone.common.ChecksumByteBufferFactory; import org.apache.hadoop.ozone.common.ChunkBuffer; import org.apache.hadoop.ozone.common.OzoneChecksumException; import org.apache.hadoop.ozone.common.utils.BufferUtils; @@ -1155,8 +1157,11 @@ public void deleteContainer(Container container, boolean force) public void reconcileContainer(Container container, List peers) throws IOException { // TODO Just a deterministic placeholder hash for testing until actual implementation is finished. 
ContainerData data = container.getContainerData(); - String dataChecksum = ContainerUtils.getChecksum(Long.toString(data.getContainerID())); - data.setDataChecksum(dataChecksum); + long id = data.getContainerID(); + ByteBuffer byteBuffer = ByteBuffer.allocate(Long.BYTES); + ChecksumByteBuffer checksumImpl = ChecksumByteBufferFactory.crc32Impl(); + checksumImpl.update(byteBuffer.putLong(id)); + data.setDataChecksum(checksumImpl.getValue()); sendICR(container); } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestKeyValueContainerData.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestKeyValueContainerData.java index c06ec651c672..4360243f0be8 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestKeyValueContainerData.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/TestKeyValueContainerData.java @@ -84,7 +84,7 @@ public void testKeyValueData(ContainerTestVersionInfo versionInfo) { assertEquals(val.get(), kvData.getBlockCount()); assertEquals(val.get(), kvData.getNumPendingDeletionBlocks()); assertEquals(MAXSIZE, kvData.getMaxSize()); - assertEquals("", kvData.getDataChecksum()); + assertEquals(0, kvData.getDataChecksum()); kvData.setState(state); kvData.setContainerDBType(containerDBType); @@ -99,7 +99,8 @@ public void testKeyValueData(ContainerTestVersionInfo versionInfo) { kvData.incrPendingDeletionBlocks(1); kvData.setSchemaVersion( VersionedDatanodeFeatures.SchemaV3.chooseSchemaVersion(conf)); - kvData.setDataChecksum("1234"); + long expectedDataHash = 1234L; + kvData.setDataChecksum(expectedDataHash); assertEquals(state, kvData.getState()); assertEquals(containerDBType, kvData.getContainerDBType()); @@ -116,7 +117,7 @@ public void testKeyValueData(ContainerTestVersionInfo versionInfo) { assertEquals(datanodeId.toString(), kvData.getOriginNodeId()); 
assertEquals(VersionedDatanodeFeatures.SchemaV3.chooseSchemaVersion(conf), kvData.getSchemaVersion()); - assertEquals("1234", kvData.getDataChecksum()); + assertEquals(expectedDataHash, kvData.getDataChecksum()); KeyValueContainerData newKvData = new KeyValueContainerData(kvData); assertEquals(kvData.getReplicaIndex(), newKvData.getReplicaIndex()); diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java index 5dc71bf30208..b05b770f308f 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -55,6 +55,7 @@ import static org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails; import static org.apache.hadoop.ozone.OzoneConsts.GB; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.mock; @@ -194,10 +195,10 @@ private void verifyAllContainerReports(Map r ContainerID id = entry.getKey(); assertNotNull(containerSet.getContainer(id.getId())); - String sentDataChecksum = entry.getValue().getDataChecksum(); - // Current implementation is incomplete, and uses this as a mocked checksum. 
- String expectedDataChecksum = ContainerUtils.getChecksum(Long.toString(id.getId())); - assertEquals(expectedDataChecksum, sentDataChecksum, "Checksum mismatch in report of container " + id); + long sentDataChecksum = entry.getValue().getDataChecksum(); + // Current implementation is incomplete, and uses a mocked checksum. + assertNotEquals(0, sentDataChecksum, "Report of container " + id + + " should have a non-zero checksum"); } } } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java index accf94977b74..ca7c1a3c14d5 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java @@ -459,10 +459,9 @@ public void testReconcileContainer(ContainerLayoutVersion layoutVersion) throws long reportedID = report.getContainerID(); Assertions.assertEquals(container.getContainerData().getContainerID(), reportedID); - String reportDataChecksum = report.getDataChecksum(); - String expectedDataChecksum = ContainerUtils.getChecksum(Long.toString(reportedID)); - Assertions.assertEquals(expectedDataChecksum, reportDataChecksum, - "Checksum mismatch in report of container " + reportedID); + long reportDataChecksum = report.getDataChecksum(); + Assertions.assertNotEquals(0, reportDataChecksum, + "Container report should have populated the checksum field with a non-zero value."); icrCount.incrementAndGet(); }); diff --git a/hadoop-hdds/interface-client/src/main/proto/hdds.proto b/hadoop-hdds/interface-client/src/main/proto/hdds.proto index 7c80adb7ee1d..e13f1a96d30c 100644 --- a/hadoop-hdds/interface-client/src/main/proto/hdds.proto +++ b/hadoop-hdds/interface-client/src/main/proto/hdds.proto @@ -431,7 +431,7 @@ message 
SCMContainerReplicaProto { required int64 keyCount = 6; required int64 bytesUsed = 7; optional int64 replicaIndex = 8; - optional string dataChecksum = 9; + optional int64 dataChecksum = 9; } message KeyContainerIDList { diff --git a/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto b/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto index 187ee32e8390..d5600880e66f 100644 --- a/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto +++ b/hadoop-hdds/interface-server/src/main/proto/ScmServerDatanodeHeartbeatProtocol.proto @@ -236,8 +236,7 @@ message ContainerReplicaProto { optional string originNodeId = 13; optional int32 replicaIndex = 14; optional bool isEmpty = 15 [default = false]; - // TODO Leaving this as a string for now. We can define a checksum type like the client protocol has later. - optional string dataChecksum = 16; + optional int64 dataChecksum = 16; } message CommandStatusReportsProto { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java index bf6c2736d4a0..d008e24f1c14 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplica.java @@ -43,8 +43,7 @@ public final class ContainerReplica implements Comparable { private final long keyCount; private final long bytesUsed; private final boolean isEmpty; - // TODO Use a dedicated checksum class for this if required later. 
- private final String dataChecksum; + private final long dataChecksum; private ContainerReplica(ContainerReplicaBuilder b) { containerID = b.containerID; @@ -56,7 +55,7 @@ private ContainerReplica(ContainerReplicaBuilder b) { replicaIndex = b.replicaIndex; isEmpty = b.isEmpty; sequenceId = b.sequenceId; - dataChecksum = Optional.ofNullable(b.dataChecksum).orElse(""); + dataChecksum = b.dataChecksum; } /** @@ -117,7 +116,7 @@ public boolean isEmpty() { return isEmpty; } - public String getDataChecksum() { + public long getDataChecksum() { return dataChecksum; } @@ -208,7 +207,7 @@ public static class ContainerReplicaBuilder { private long keyCount; private int replicaIndex; private boolean isEmpty; - private String dataChecksum; + private long dataChecksum; /** * Set Container Id. @@ -283,7 +282,7 @@ public ContainerReplicaBuilder setEmpty(boolean empty) { return this; } - public ContainerReplicaBuilder setDataChecksum(String dataChecksum) { + public ContainerReplicaBuilder setDataChecksum(long dataChecksum) { this.dataChecksum = dataChecksum; return this; } diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java index b49adbb0f771..8b77e300137b 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java @@ -1008,7 +1008,7 @@ public void testWithNoContainerDataChecksum() throws Exception { // All replicas should start with an empty data checksum in SCM. 
boolean contOneDataChecksumsEmpty = containerManager.getContainerReplicas(contID).stream() - .allMatch(r -> r.getDataChecksum().isEmpty()); + .allMatch(r -> r.getDataChecksum() == 0); assertTrue(contOneDataChecksumsEmpty, "Replicas of container one should not yet have any data checksums."); // Send a report to SCM from one datanode that still does not have a data checksum. @@ -1025,7 +1025,7 @@ public void testWithNoContainerDataChecksum() throws Exception { // Regardless of which datanode sent the report, none of them have checksums, so all replica's data checksums // should remain empty. boolean containerDataChecksumEmpty = containerManager.getContainerReplicas(contID).stream() - .allMatch(r -> r.getDataChecksum().isEmpty()); + .allMatch(r -> r.getDataChecksum() == 0); assertTrue(containerDataChecksumEmpty, "Replicas of the container should not have any data checksums."); } @@ -1057,7 +1057,7 @@ public void testWithContainerDataChecksum() throws Exception { // All replicas should start with an empty data checksum in SCM. boolean dataChecksumsEmpty = containerManager.getContainerReplicas(contID).stream() - .allMatch(r -> r.getDataChecksum().isEmpty()); + .allMatch(r -> r.getDataChecksum() == 0); assertTrue(dataChecksumsEmpty, "Replicas of container one should not yet have any data checksums."); // For each datanode, send a container report with a mismatched checksum. @@ -1080,8 +1080,7 @@ public void testWithContainerDataChecksum() throws Exception { // datanode ID. 
int numReplicasChecked = 0; for (ContainerReplica replica: containerManager.getContainerReplicas(contID)) { - String expectedChecksum = createUniqueDataChecksumForReplica( - contID, replica.getDatanodeDetails().getUuidString()); + long expectedChecksum = createUniqueDataChecksumForReplica(contID, replica.getDatanodeDetails().getUuidString()); assertEquals(expectedChecksum, replica.getDataChecksum()); numReplicasChecked++; } @@ -1107,7 +1106,7 @@ public void testWithContainerDataChecksum() throws Exception { // Since the containers don't have any data in this test, the matching checksums are based on container ID only. numReplicasChecked = 0; for (ContainerReplica replica: containerManager.getContainerReplicas(contID)) { - String expectedChecksum = createMatchingDataChecksumForReplica(contID); + long expectedChecksum = createMatchingDataChecksumForReplica(contID); assertEquals(expectedChecksum, replica.getDataChecksum()); numReplicasChecked++; } @@ -1117,15 +1116,15 @@ public void testWithContainerDataChecksum() throws Exception { /** * Generates a placeholder data checksum for testing that is specific to a container replica. */ - protected static String createUniqueDataChecksumForReplica(ContainerID containerID, String datanodeID) { - return Integer.toString((datanodeID + containerID).hashCode()); + protected static long createUniqueDataChecksumForReplica(ContainerID containerID, String datanodeID) { + return (datanodeID + containerID).hashCode(); } /** * Generates a placeholder data checksum for testing that is the same for all container replicas. 
*/ - protected static String createMatchingDataChecksumForReplica(ContainerID containerID) { - return Integer.toString(Objects.hashCode(containerID)); + protected static long createMatchingDataChecksumForReplica(ContainerID containerID) { + return Objects.hashCode(containerID); } private ContainerReportFromDatanode getContainerReportFromDatanode( diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java index f67b186c0308..9abbda819340 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java @@ -610,7 +610,7 @@ public void testWithNoContainerDataChecksum() throws Exception { // All replicas should start with an empty data checksum in SCM. boolean contOneDataChecksumsEmpty = containerManager.getContainerReplicas(contID).stream() - .allMatch(r -> r.getDataChecksum().isEmpty()); + .allMatch(r -> r.getDataChecksum() == 0); assertTrue(contOneDataChecksumsEmpty, "Replicas of container one should not yet have any data checksums."); // Send a report to SCM from one datanode that still does not have a data checksum. @@ -625,7 +625,7 @@ public void testWithNoContainerDataChecksum() throws Exception { // Regardless of which datanode sent the report, none of them have checksums, so all replica's data checksums // should remain empty. 
boolean containerDataChecksumEmpty = containerManager.getContainerReplicas(contID).stream() - .allMatch(r -> r.getDataChecksum().isEmpty()); + .allMatch(r -> r.getDataChecksum() == 0); assertTrue(containerDataChecksumEmpty, "Replicas of the container should not have any data checksums."); } @@ -657,9 +657,9 @@ public void testWithContainerDataChecksum() throws Exception { // Container manager should now be aware of 3 replicas of each container. assertEquals(3, containerManager.getContainerReplicas(contID).size()); - // All replicas should start with an empty data checksum in SCM. + // All replicas should start with a zero data checksum in SCM. boolean dataChecksumsEmpty = containerManager.getContainerReplicas(contID).stream() - .allMatch(r -> r.getDataChecksum().isEmpty()); + .allMatch(r -> r.getDataChecksum() == 0); assertTrue(dataChecksumsEmpty, "Replicas of container one should not yet have any data checksums."); // For each datanode, send a container report with a mismatched checksum. @@ -683,7 +683,7 @@ public void testWithContainerDataChecksum() throws Exception { // datanode ID. int numReplicasChecked = 0; for (ContainerReplica replica: containerManager.getContainerReplicas(contID)) { - String expectedChecksum = createUniqueDataChecksumForReplica( + long expectedChecksum = createUniqueDataChecksumForReplica( contID, replica.getDatanodeDetails().getUuidString()); assertEquals(expectedChecksum, replica.getDataChecksum()); numReplicasChecked++; @@ -711,7 +711,7 @@ public void testWithContainerDataChecksum() throws Exception { // Since the containers don't have any data in this test, the matching checksums are based on container ID only. 
numReplicasChecked = 0; for (ContainerReplica replica: containerManager.getContainerReplicas(contID)) { - String expectedChecksum = createMatchingDataChecksumForReplica(contID); + long expectedChecksum = createMatchingDataChecksumForReplica(contID); assertEquals(expectedChecksum, replica.getDataChecksum()); numReplicasChecked++; } From bc434ec12155022c27da149ffc1e04dbf952825b Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 3 May 2024 10:27:50 -0700 Subject: [PATCH 31/43] Apparently snakeyaml is coupled to Java variable names --- .../container/common/impl/ContainerData.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java index 9f29db172331..9cc526b4ee6b 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java @@ -99,8 +99,10 @@ public abstract class ContainerData { private HddsVolume volume; - private String containerFileChecksum; + // Checksum of just the container file. + private String checksum; + // Checksum of the data within the container. 
private long dataChecksum; private boolean isEmpty; @@ -161,7 +163,7 @@ protected ContainerData(ContainerType type, long containerId, this.originPipelineId = originPipelineId; this.originNodeId = originNodeId; this.isEmpty = false; - this.containerFileChecksum = ZERO_CHECKSUM; + this.checksum = ZERO_CHECKSUM; this.dataChecksum = 0; } @@ -575,11 +577,11 @@ public void setBlockCount(long count) { } public void setContainerFileChecksum(String checkSum) { - this.containerFileChecksum = checkSum; + this.checksum = checkSum; } public String getContainerFileChecksum() { - return this.containerFileChecksum; + return this.checksum; } /** @@ -628,7 +630,7 @@ public String getOriginNodeId() { * on ContainerType) and set the checksum. * * Checksum of ContainerData is calculated by setting the - * {@link ContainerData#containerFileChecksum} field to a 64-byte array with all 0's - + * {@link ContainerData#checksum} field to a 64-byte array with all 0's - * {@link ContainerData#ZERO_CHECKSUM}. After the checksum is calculated, * the checksum field is updated with this value. * @@ -638,12 +640,12 @@ public String getOriginNodeId() { public void computeAndSetContainerFileChecksum(Yaml yaml) throws IOException { // Set checksum to dummy value - 0 byte array, to calculate the checksum // of rest of the data. 
- this.containerFileChecksum = ZERO_CHECKSUM; + this.checksum = ZERO_CHECKSUM; // Dump yaml data into a string to compute its checksum String containerDataYamlStr = yaml.dump(this); - this.containerFileChecksum = ContainerUtils.getChecksum(containerDataYamlStr); + this.checksum = ContainerUtils.getChecksum(containerDataYamlStr); } public void setDataChecksum(long checksum) { From 290fbb78195c8d54a1a4f4aa16a17581b913fe87 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 3 May 2024 17:22:52 -0700 Subject: [PATCH 32/43] checkstyle --- .../hadoop/ozone/container/common/impl/ContainerData.java | 4 ++-- .../commandhandler/TestReconcileContainerCommandHandler.java | 1 - .../hadoop/ozone/container/keyvalue/TestKeyValueHandler.java | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java index 9cc526b4ee6b..14ef0210ff9b 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java @@ -648,8 +648,8 @@ public void computeAndSetContainerFileChecksum(Yaml yaml) throws IOException { this.checksum = ContainerUtils.getChecksum(containerDataYamlStr); } - public void setDataChecksum(long checksum) { - dataChecksum = checksum; + public void setDataChecksum(long dataChecksum) { + this.dataChecksum = dataChecksum; } public long getDataChecksum() { diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java index 
b05b770f308f..d6be667f41bc 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -26,7 +26,6 @@ import org.apache.hadoop.hdds.scm.pipeline.PipelineID; import org.apache.hadoop.ozone.container.common.ContainerTestUtils; import org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics; -import org.apache.hadoop.ozone.container.common.helpers.ContainerUtils; import org.apache.hadoop.ozone.container.common.impl.ContainerLayoutVersion; import org.apache.hadoop.ozone.container.common.impl.ContainerSet; import org.apache.hadoop.ozone.container.common.interfaces.Container; diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java index ca7c1a3c14d5..2ce6eabe3944 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandler.java @@ -43,7 +43,6 @@ import org.apache.hadoop.hdds.security.token.TokenVerifier; import org.apache.hadoop.ozone.container.common.ContainerTestUtils; import org.apache.hadoop.ozone.container.common.helpers.ContainerMetrics; -import org.apache.hadoop.ozone.container.common.helpers.ContainerUtils; import org.apache.hadoop.ozone.container.common.impl.ContainerLayoutVersion; import org.apache.hadoop.ozone.container.common.impl.ContainerSet; import org.apache.hadoop.ozone.container.common.impl.HddsDispatcher; From afd604351ee8e405deff3e419b7c5609c5ece648 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 3 May 2024 17:25:36 
-0700 Subject: [PATCH 33/43] Rename two existing container file checksum methods for clarity --- .../container/common/helpers/ContainerUtils.java | 4 ++-- .../ozone/container/common/impl/ContainerData.java | 2 +- .../container/keyvalue/KeyValueContainerCheck.java | 2 +- .../keyvalue/helpers/KeyValueContainerUtil.java | 2 +- .../container/replication/ContainerImporter.java | 2 +- .../common/impl/TestContainerDataYaml.java | 14 +++++++------- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java index 759b5edae3bb..22a7408642b5 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerUtils.java @@ -194,7 +194,7 @@ public static synchronized DatanodeDetails readDatanodeDetailsFrom(File path) * Verify that the checksum stored in containerData is equal to the * computed checksum. 
*/ - public static void verifyChecksum(ContainerData containerData, + public static void verifyContainerFileChecksum(ContainerData containerData, ConfigurationSource conf) throws IOException { boolean enabled = conf.getBoolean( HddsConfigKeys.HDDS_CONTAINER_CHECKSUM_VERIFICATION_ENABLED, @@ -225,7 +225,7 @@ public static void verifyChecksum(ContainerData containerData, * @param containerDataYamlStr ContainerData as a Yaml String * @return Checksum of the container data */ - public static String getChecksum(String containerDataYamlStr) + public static String getContainerFileChecksum(String containerDataYamlStr) throws StorageContainerException { MessageDigest sha; try { diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java index 14ef0210ff9b..4e3f2a7d53be 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerData.java @@ -645,7 +645,7 @@ public void computeAndSetContainerFileChecksum(Yaml yaml) throws IOException { // Dump yaml data into a string to compute its checksum String containerDataYamlStr = yaml.dump(this); - this.checksum = ContainerUtils.getChecksum(containerDataYamlStr); + this.checksum = ContainerUtils.getContainerFileChecksum(containerDataYamlStr); } public void setDataChecksum(long dataChecksum) { diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java index f0713469e61f..4f3a7d5ff886 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java +++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java @@ -177,7 +177,7 @@ private ScanResult checkContainerFile(File containerFile) { .checkState(onDiskContainerData != null, "Container File not loaded"); try { - ContainerUtils.verifyChecksum(onDiskContainerData, checkConfig); + ContainerUtils.verifyContainerFileChecksum(onDiskContainerData, checkConfig); } catch (IOException ex) { return ScanResult.unhealthy(ScanResult.FailureType.CORRUPT_CONTAINER_FILE, containerFile, ex); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/helpers/KeyValueContainerUtil.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/helpers/KeyValueContainerUtil.java index 90ee356ab59d..53def2586090 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/helpers/KeyValueContainerUtil.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/helpers/KeyValueContainerUtil.java @@ -210,7 +210,7 @@ public static void parseKVContainerData(KeyValueContainerData kvContainerData, long containerID = kvContainerData.getContainerID(); // Verify Checksum - ContainerUtils.verifyChecksum(kvContainerData, config); + ContainerUtils.verifyContainerFileChecksum(kvContainerData, config); if (kvContainerData.getSchemaVersion() == null) { // If this container has not specified a schema version, it is in the old diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ContainerImporter.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ContainerImporter.java index 1929c16089b0..3ee18197458a 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ContainerImporter.java +++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ContainerImporter.java @@ -123,7 +123,7 @@ public void importContainer(long containerID, Path tarFilePath, packer.unpackContainerDescriptor(input); containerData = getKeyValueContainerData(containerDescriptorYaml); } - ContainerUtils.verifyChecksum(containerData, conf); + ContainerUtils.verifyContainerFileChecksum(containerData, conf); containerData.setVolume(targetVolume); try (FileInputStream input = new FileInputStream(tarFilePath.toFile())) { diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestContainerDataYaml.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestContainerDataYaml.java index ec78398824e7..2b4a3290836b 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestContainerDataYaml.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestContainerDataYaml.java @@ -218,7 +218,7 @@ void testCheckBackWardCompatibilityOfContainerFile( File file = new File(classLoader.getResource(containerFile).getFile()); KeyValueContainerData kvData = (KeyValueContainerData) ContainerDataYaml .readContainerFile(file); - ContainerUtils.verifyChecksum(kvData, conf); + ContainerUtils.verifyContainerFileChecksum(kvData, conf); //Checking the Container file data is consistent or not assertEquals(ContainerProtos.ContainerDataProto.State.CLOSED, kvData @@ -236,7 +236,7 @@ void testCheckBackWardCompatibilityOfContainerFile( } /** - * Test to verify {@link ContainerUtils#verifyChecksum(ContainerData,ConfigurationSource)}. + * Test to verify {@link ContainerUtils#verifyContainerFileChecksum(ContainerData,ConfigurationSource)}. 
*/ @ContainerLayoutTestInfo.ContainerTest public void testChecksumInContainerFile(ContainerLayoutVersion layout) throws IOException { @@ -247,13 +247,13 @@ public void testChecksumInContainerFile(ContainerLayoutVersion layout) throws IO // Read from .container file, and verify data. KeyValueContainerData kvData = (KeyValueContainerData) ContainerDataYaml.readContainerFile(containerFile); - ContainerUtils.verifyChecksum(kvData, conf); + ContainerUtils.verifyContainerFileChecksum(kvData, conf); cleanup(); } /** - * Test to verify {@link ContainerUtils#verifyChecksum(ContainerData,ConfigurationSource)}. + * Test to verify {@link ContainerUtils#verifyContainerFileChecksum(ContainerData,ConfigurationSource)}. */ @ContainerLayoutTestInfo.ContainerTest public void testChecksumInContainerFileWithReplicaIndex( @@ -266,7 +266,7 @@ public void testChecksumInContainerFileWithReplicaIndex( // Read from .container file, and verify data. KeyValueContainerData kvData = (KeyValueContainerData) ContainerDataYaml .readContainerFile(containerFile); - ContainerUtils.verifyChecksum(kvData, conf); + ContainerUtils.verifyContainerFileChecksum(kvData, conf); cleanup(); } @@ -287,7 +287,7 @@ public void testIncorrectChecksum(ContainerLayoutVersion layout) { setLayoutVersion(layout); Exception ex = assertThrows(Exception.class, () -> { KeyValueContainerData kvData = getKeyValueContainerData(); - ContainerUtils.verifyChecksum(kvData, conf); + ContainerUtils.verifyContainerFileChecksum(kvData, conf); }); assertThat(ex).hasMessageStartingWith("Container checksum error for ContainerID:"); @@ -303,6 +303,6 @@ public void testDisabledChecksum(ContainerLayoutVersion layout) KeyValueContainerData kvData = getKeyValueContainerData(); conf.setBoolean(HddsConfigKeys. 
HDDS_CONTAINER_CHECKSUM_VERIFICATION_ENABLED, false); - ContainerUtils.verifyChecksum(kvData, conf); + ContainerUtils.verifyContainerFileChecksum(kvData, conf); } } From d8e86bddff01a5c0216937319a9b576ef476b54c Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 3 May 2024 17:32:16 -0700 Subject: [PATCH 34/43] Test that container data checksum is not written to .container file --- .../common/impl/TestContainerDataYaml.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestContainerDataYaml.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestContainerDataYaml.java index 2b4a3290836b..2057c4400a45 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestContainerDataYaml.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestContainerDataYaml.java @@ -87,6 +87,7 @@ private File createContainerFile(long containerID, int replicaIndex) keyValueContainerData.setSchemaVersion( VersionedDatanodeFeatures.SchemaV2.chooseSchemaVersion()); keyValueContainerData.setReplicaIndex(replicaIndex); + keyValueContainerData.setDataChecksum(12345); File containerFile = new File(testRoot, containerPath); @@ -252,6 +253,25 @@ public void testChecksumInContainerFile(ContainerLayoutVersion layout) throws IO cleanup(); } + /** + * The container's data checksum is stored in a separate file with its Merkle hash tree. It should not be persisted + * to the .container file. + */ + @ContainerLayoutTestInfo.ContainerTest + public void testDataChecksumNotInContainerFile(ContainerLayoutVersion layout) throws IOException { + setLayoutVersion(layout); + long containerID = testContainerID++; + + File containerFile = createContainerFile(containerID, 0); + + // Read from .container file. 
The kvData object should not have a data hash because it was not persisted in this + // file. + KeyValueContainerData kvData = (KeyValueContainerData) ContainerDataYaml.readContainerFile(containerFile); + assertEquals(0, kvData.getDataChecksum()); + + cleanup(); + } + /** * Test to verify {@link ContainerUtils#verifyContainerFileChecksum(ContainerData,ConfigurationSource)}. */ From 990735f3e0e4a7f2ed0bcfd4a2cca85fcb3264c0 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 3 May 2024 19:37:55 -0700 Subject: [PATCH 35/43] Fix simple acceptance test issue --- .../src/main/smoketest/admincli/container.robot | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot index 6bd730856493..e0c6ac1114de 100644 --- a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot +++ b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot @@ -34,6 +34,12 @@ Container is closed ${output} = Execute ozone admin container info "${container}" Should contain ${output} CLOSED +Reconciliation complete + [arguments] ${container} + ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 + Should not be empty ${data_checksum} + Should not be equal as strings 0 ${data_checksum} + *** Test Cases *** Create container ${output} = Execute ozone admin container create @@ -103,7 +109,8 @@ Cannot reconcile open container # The container should not yet have any replica checksums. # TODO When the scanner is computing checksums automatically, this test may need to be updated. ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 - Should be empty ${data_checksum} + # 0 is the hex value of an empty checksum. 
+ Should Be Equal As Strings 0 ${data_checksum} Close container ${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -1 @@ -113,12 +120,12 @@ Close container Wait until keyword succeeds 1min 10sec Container is closed ${container} Reconcile closed container - # Check info does not show replica checksums, since manual reconciliation has not yet been triggered. + # Check that info does not show replica checksums, since manual reconciliation has not yet been triggered. # TODO When the scanner is computing checksums automatically, this test may need to be updated. ${container} = Execute ozone admin container list --state CLOSED | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -1 ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 - Should be empty ${data_checksum} + # 0 is the hex value of an empty checksum. + Should Be Equal As Strings 0 ${data_checksum} # When reconciliation finishes, replica checksums should be shown. 
Execute ozone admin container reconcile ${container} - ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 - Wait until keyword succeeds 1min 5sec Should not be empty ${data_checksum} + Wait until keyword succeeds 1min 5sec Reconciliation complete ${container} From f4ace2d94e2e506fb8525d5360c45652dc2b8755 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 3 May 2024 19:38:25 -0700 Subject: [PATCH 36/43] Print data checksum as hex string in container info output --- .../hdds/scm/container/ContainerReplicaInfo.java | 15 +++++++++++++++ .../scm/cli/container/ReconcileSubcommand.java | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java index 29ce0b9a37e5..2144992d0b55 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java @@ -17,9 +17,16 @@ */ package org.apache.hadoop.hdds.scm.container; +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.JsonSerializer; +import com.fasterxml.jackson.databind.SerializerProvider; +import com.fasterxml.jackson.databind.annotation.JsonSerialize; +import com.fasterxml.jackson.databind.ser.std.StdKeySerializer; import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import java.io.IOException; +import java.math.BigInteger; import java.util.UUID; /** @@ -35,6 +42,7 @@ public final class ContainerReplicaInfo { private long keyCount; private long bytesUsed; private int replicaIndex = -1; + @JsonSerialize(using = LongToHexJsonSerializer.class) private long dataChecksum; public static ContainerReplicaInfo fromProto( @@ 
-93,6 +101,13 @@ public long getDataChecksum() { return dataChecksum; } + private static class LongToHexJsonSerializer extends JsonSerializer { + @Override + public void serialize(Long value, JsonGenerator gen, SerializerProvider provider) throws IOException { + gen.writeString(Long.toHexString(value)); + } + } + /** * Builder for ContainerReplicaInfo class. */ diff --git a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java index e6893013f956..e747455a8823 100644 --- a/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java +++ b/hadoop-hdds/tools/src/main/java/org/apache/hadoop/hdds/scm/cli/container/ReconcileSubcommand.java @@ -44,7 +44,7 @@ public void execute(ScmClient scmClient) throws IOException { scmClient.reconcileContainer(containerId); System.out.println("Reconciliation has been triggered for container " + containerId); // TODO a better option to check status may be added later. - System.out.println("Use \"ozone admin container info --json " + containerId + "\" to check the hashes of each " + + System.out.println("Use \"ozone admin container info --json " + containerId + "\" to see the checksums of each " + "container replica"); } } From 36f92ee4d72e0cce8a3ee89e4f9c9fd16f8c8446 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 3 May 2024 19:38:53 -0700 Subject: [PATCH 37/43] Add log of placeholder checksum generated It's coming back as 0 always for some reason. 
--- .../hadoop/ozone/container/keyvalue/KeyValueHandler.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java index 9ec8a1876419..96f65f5ed030 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java @@ -1161,7 +1161,9 @@ public void reconcileContainer(Container container, List pee ByteBuffer byteBuffer = ByteBuffer.allocate(Long.BYTES); ChecksumByteBuffer checksumImpl = ChecksumByteBufferFactory.crc32Impl(); checksumImpl.update(byteBuffer.putLong(id)); - data.setDataChecksum(checksumImpl.getValue()); + long dataChecksum = checksumImpl.getValue(); + LOG.info("Generated data checksum of container {} for testing: {}", id, dataChecksum); + data.setDataChecksum(dataChecksum); sendICR(container); } From f26a40286b44ca480b94e9db3a6b418b56ac029a Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 3 May 2024 19:46:35 -0700 Subject: [PATCH 38/43] checkstyle --- .../apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java index 2144992d0b55..a239cbfdba96 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReplicaInfo.java @@ -21,12 +21,10 @@ import com.fasterxml.jackson.databind.JsonSerializer; import com.fasterxml.jackson.databind.SerializerProvider; import 
com.fasterxml.jackson.databind.annotation.JsonSerialize; -import com.fasterxml.jackson.databind.ser.std.StdKeySerializer; import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import java.io.IOException; -import java.math.BigInteger; import java.util.UUID; /** From c8bdffc7f48adb6b5e3ec9db2b2c3eaf81f86a22 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 6 May 2024 12:05:51 -0700 Subject: [PATCH 39/43] putLong increments position, need to rewind. --- .../hadoop/ozone/container/keyvalue/KeyValueHandler.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java index 96f65f5ed030..698d71d7f368 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java @@ -1158,9 +1158,12 @@ public void reconcileContainer(Container container, List pee // TODO Just a deterministic placeholder hash for testing until actual implementation is finished. 
ContainerData data = container.getContainerData(); long id = data.getContainerID(); - ByteBuffer byteBuffer = ByteBuffer.allocate(Long.BYTES); + ByteBuffer byteBuffer = ByteBuffer.allocate(Long.BYTES) + .putLong(id) + .asReadOnlyBuffer(); + byteBuffer.rewind(); ChecksumByteBuffer checksumImpl = ChecksumByteBufferFactory.crc32Impl(); - checksumImpl.update(byteBuffer.putLong(id)); + checksumImpl.update(byteBuffer); long dataChecksum = checksumImpl.getValue(); LOG.info("Generated data checksum of container {} for testing: {}", id, dataChecksum); data.setDataChecksum(dataChecksum); From f862836efe28e1c467bcacd9f272c6c68285b923 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 6 May 2024 20:30:43 -0700 Subject: [PATCH 40/43] Undo acceptance workflow change from master --- .github/workflows/repeat-acceptance.yml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/.github/workflows/repeat-acceptance.yml b/.github/workflows/repeat-acceptance.yml index 3105758b3602..74ef6b87c2fc 100644 --- a/.github/workflows/repeat-acceptance.yml +++ b/.github/workflows/repeat-acceptance.yml @@ -93,9 +93,11 @@ jobs: restore-keys: | ${{ runner.os }}-pnpm- - name: Cache for maven dependencies - uses: actions/cache@v4 + uses: actions/cache/restore@v4 with: - path: ~/.m2/repository + path: | + ~/.m2/repository/*/*/* + !~/.m2/repository/org/apache/ozone key: maven-repo-${{ hashFiles('**/pom.xml') }}-${{ env.JAVA_VERSION }} restore-keys: | maven-repo-${{ hashFiles('**/pom.xml') }} @@ -117,12 +119,6 @@ jobs: hadoop-ozone/dist/target/ozone-*.tar.gz !hadoop-ozone/dist/target/ozone-*-src.tar.gz retention-days: 1 - - name: Delete temporary build artifacts before caching - run: | - #Never cache local artifacts - rm -rf ~/.m2/repository/org/apache/ozone/hdds* - rm -rf ~/.m2/repository/org/apache/ozone/ozone* - if: always() acceptance: needs: - prepare-job From 8a2427b76c447b3335573433dbd581eca03a5e6e Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 21 May 2024 11:08:59 
-0700 Subject: [PATCH 41/43] Undo accidental gh actions change --- .github/workflows/repeat-acceptance.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/repeat-acceptance.yml b/.github/workflows/repeat-acceptance.yml index 74ef6b87c2fc..7269a9c417a6 100644 --- a/.github/workflows/repeat-acceptance.yml +++ b/.github/workflows/repeat-acceptance.yml @@ -81,8 +81,6 @@ jobs: steps: - name: Checkout project uses: actions/checkout@v4 - with: - ref: ${{ github.event.inputs.ref }} - name: Cache for npm dependencies uses: actions/cache@v4 with: @@ -93,11 +91,9 @@ jobs: restore-keys: | ${{ runner.os }}-pnpm- - name: Cache for maven dependencies - uses: actions/cache/restore@v4 + uses: actions/cache@v4 with: - path: | - ~/.m2/repository/*/*/* - !~/.m2/repository/org/apache/ozone + path: ~/.m2/repository key: maven-repo-${{ hashFiles('**/pom.xml') }}-${{ env.JAVA_VERSION }} restore-keys: | maven-repo-${{ hashFiles('**/pom.xml') }} @@ -119,6 +115,12 @@ jobs: hadoop-ozone/dist/target/ozone-*.tar.gz !hadoop-ozone/dist/target/ozone-*-src.tar.gz retention-days: 1 + - name: Delete temporary build artifacts before caching + run: | + #Never cache local artifacts + rm -rf ~/.m2/repository/org/apache/ozone/hdds* + rm -rf ~/.m2/repository/org/apache/ozone/ozone* + if: always() acceptance: needs: - prepare-job From 31e60a950166bc621905f85f99b7aa5249e8f737 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 21 May 2024 11:19:10 -0700 Subject: [PATCH 42/43] Fix comment typo --- .../main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java index 2e56d141b363..345e4ba7d216 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java +++ 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java @@ -456,7 +456,7 @@ DecommissionScmResponseProto decommissionScm( String getMetrics(String query) throws IOException; /** - * Trigger a reconcile command to datanodes for the current container ID. + * Trigger a reconcile command to datanodes for a container ID. * * @param containerID The ID of the container to reconcile. * @throws IOException On error From 28b88623ac833567dd108f69afe57688d577bf81 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 28 May 2024 14:40:39 -0700 Subject: [PATCH 43/43] Address review comments, fix error after merge commit --- .../hdds/scm/container/reconciliation/package-info.java | 2 +- .../dist/src/main/smoketest/admincli/container.robot | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/package-info.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/package-info.java index 602d7fb69c0c..fa1e355fd174 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/package-info.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/reconciliation/package-info.java @@ -15,7 +15,7 @@ * the License. */ -package org.apache.hadoop.hdds.scm.container.reconiliation; +package org.apache.hadoop.hdds.scm.container.reconciliation; /** * This package contains classes related to container reconciliation. 
*/ diff --git a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot index 7dbb43af912e..fae08991781f 100644 --- a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot +++ b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot @@ -101,13 +101,6 @@ Report containers as JSON Should contain ${output} stats Should contain ${output} samples -Close container - ${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -1 - Execute ozone admin container close "${container}" - ${output} = Execute ozone admin container info "${container}" - Should contain ${output} CLOS - Wait until keyword succeeds 1min 10sec Container is closed ${container} - #List containers on unknown host # ${output} = Execute And Ignore Error ozone admin --verbose container list --scm unknown-host # Should contain ${output} Invalid host name @@ -137,6 +130,8 @@ Cannot reconcile open container Close container ${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -1 Execute ozone admin container close "${container}" + # The container may either be in CLOSED or CLOSING state at this point. Once we have verified this, we will wait + # for it to progress to CLOSED. ${output} = Execute ozone admin container info "${container}" Should contain ${output} CLOS Wait until keyword succeeds 1min 10sec Container is closed ${container}