HddsDispatcher.java
@@ -32,6 +32,7 @@
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.HddsUtils;
import org.apache.hadoop.hdds.client.BlockID;
@@ -44,6 +45,7 @@
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerType;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Type;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerAction;
import org.apache.hadoop.hdds.scm.container.common.helpers.ContainerNotOpenException;
import org.apache.hadoop.hdds.scm.container.common.helpers.InvalidContainerStateException;
@@ -110,6 +112,8 @@ public class HddsDispatcher implements ContainerDispatcher, Auditor {
private ContainerMetrics metrics;
private final TokenVerifier tokenVerifier;
private long slowOpThresholdNs;
private AtomicLong fullVolumeLastHeartbeatTriggerMs;
private long fullVolumeHeartbeatThrottleIntervalMs;

/**
* Constructs an OzoneContainer that receives calls from
@@ -130,6 +134,10 @@ public HddsDispatcher(ConfigurationSource config, ContainerSet contSet,
this.tokenVerifier = tokenVerifier != null ? tokenVerifier
: new NoopTokenVerifier();
this.slowOpThresholdNs = getSlowOpThresholdMs(conf) * 1000000;
fullVolumeLastHeartbeatTriggerMs = new AtomicLong(-1);
long heartbeatInterval =
config.getTimeDuration("hdds.heartbeat.interval", 30000, TimeUnit.MILLISECONDS);
ChenSammi (Contributor) commented on May 27, 2025:

Can we call HddsServerUtil#getScmHeartbeatInterval instead?

Also, there is HDDS_NODE_REPORT_INTERVAL for the node report. Shall we use the node report property instead of the heartbeat property?

Author (Contributor):

HDDS_NODE_REPORT_INTERVAL is 1 minute; might that be too long?

ChenSammi (Contributor) commented on May 30, 2025:

1m or 3s doesn't matter, because the first heartbeat is always sent out immediately. This 1m is only used to control the throttling, right?

Author (Contributor):

Yes, it's for throttling.
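A minimal sketch of the two alternatives raised in this thread; the HddsServerUtil signature and the node-report key/default are assumptions of this sketch, not verified against the Ozone codebase:

    // Option 1 (sketch, signature assumed): reuse the existing helper instead of the raw key.
    long heartbeatIntervalMs = HddsServerUtil.getScmHeartbeatInterval(config);

    // Option 2 (sketch, key and default assumed): throttle on the node report interval,
    // which defaults to 1 minute per this thread.
    long nodeReportIntervalMs = config.getTimeDuration(
        "hdds.node.report.interval", 60000, TimeUnit.MILLISECONDS);

    // Either way, the throttle window stays capped at 30 seconds.
    fullVolumeHeartbeatThrottleIntervalMs = Math.min(nodeReportIntervalMs, 30000);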

fullVolumeHeartbeatThrottleIntervalMs = Math.min(heartbeatInterval, 30000);

protocolMetrics =
new ProtocolMessageMetrics<>(
@@ -335,7 +343,15 @@ && getMissingContainerSet().contains(containerID)) {
// Small performance optimization. We check if the operation is of type
// write before trying to send CloseContainerAction.
if (!HddsUtils.isReadOnly(msg)) {
sendCloseContainerActionIfNeeded(container);
boolean isFull = isVolumeFull(container);
sendCloseContainerActionIfNeeded(container, isFull);
if (isFull) {
try {
handleFullVolume(container.getContainerData().getVolume());
} catch (StorageContainerException e) {
ContainerUtils.logAndReturnError(LOG, e, msg);
Reviewer (Contributor):

Are we going to return here?

Author (Contributor):

Good catch, but I'm not sure. There was an exception in getting the node report, but does that mean we should fail the write? Maybe we should still let the write continue here; otherwise, because of an intermittent or non-severe exception, we could keep failing writes. What do you think?

Reviewer (Contributor):

It's OK not to return here, but instead of calling ContainerUtils.logAndReturnError, you can probably just log the failure message.

Author (Contributor):

To test whether the logging is proper, I added a new test that throws an exception. Here's what the logs look like:

2025-05-30 16:01:08,027 [main] WARN  impl.HddsDispatcher (HddsDispatcher.java:dispatchRequest(354)) - Failed to handle full volume while handling request: cmdType: WriteChunk
containerID: 1
datanodeUuid: "c6842f19-cbc5-47ca-bce0-f5bc859ef807"
writeChunk {
  blockID {
    containerID: 1
    localID: 1
    blockCommitSequenceId: 0
  }
  chunkData {
    chunkName: "36b4d6b58215a7da96e3bf71a602e3ea_stream_1_chunk_1"
    offset: 0
    len: 36
    checksumData {
      type: NONE
      bytesPerChecksum: 0
    }
  }
  data: "b0bc4858-a308-417d-b363-0631e07b97ec"
}

org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException: Failed to create node report when handling full volume /var/folders/jp/39hcmgjx4yb_kry3ydxb3c7r0000gn/T/junit-110499014917526916. Volume Report: { id=DS-db481691-4055-404b-8790-f375e6d41215 dir=/var/folders/jp/39hcmgjx4yb_kry3ydxb3c7r0000gn/T/junit-110499014917526916/hdds type=DISK capacity=499 used=390 available=109 minFree=100 committed=50 }
	at org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.handleFullVolume(HddsDispatcher.java:481)
	at org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.dispatchRequest(HddsDispatcher.java:352)
	at org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.lambda$dispatch$1(HddsDispatcher.java:199)
	at org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:87)
	at org.apache.hadoop.ozone.container.common.impl.HddsDispatcher.dispatch(HddsDispatcher.java:198)
	at org.apache.hadoop.ozone.container.common.impl.TestHddsDispatcher.testExceptionHandlingWhenVolumeFull(TestHddsDispatcher.java:430)
...
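A minimal sketch of the reviewer's suggestion (log only, let the write continue) at this call site; the WARN message mirrors the log line shown above, and this is only an illustration, not the merged code:

    } catch (StorageContainerException e) {
      // Sketch only: log and fall through instead of returning an error response,
      // so an intermittent node-report failure does not fail the write itself.
      LOG.warn("Failed to handle full volume while handling request: {}", msg, e);
    }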

}
}
}
Handler handler = getHandler(containerType);
if (handler == null) {
@@ -403,7 +419,7 @@ && getMissingContainerSet().contains(containerID)) {
// in any case, the in memory state of the container should be unhealthy
Preconditions.checkArgument(
container.getContainerData().getState() == State.UNHEALTHY);
sendCloseContainerActionIfNeeded(container);
sendCloseContainerActionIfNeeded(container, isVolumeFull(container));
}
if (cmdType == Type.CreateContainer
&& result == Result.SUCCESS && dispatcherContext != null) {
@@ -435,6 +451,37 @@ && getMissingContainerSet().contains(containerID)) {
}
}

/**
* If the volume is full, we need to inform SCM about the latest volume usage stats and send the close container
* action for this container immediately. {@link HddsDispatcher#sendCloseContainerActionIfNeeded(Container, boolean)}
* just adds the action to the heartbeat. Here, we get the latest storage statistics for this node, add them to the
* heartbeat, and then send the heartbeat (including container close action) immediately.
* @param volume the volume being written to
*/
private void handleFullVolume(HddsVolume volume) throws StorageContainerException {
long current = System.currentTimeMillis();
long last = fullVolumeLastHeartbeatTriggerMs.get();
Reviewer (Contributor):

Consider the case where different volumes get full: for example, at P0 /data1 gets full, at P1 /data2 gets full, and (P1 - P0) < interval. Do we expect two emergency container reports, or one report?

Author (Contributor):

Currently we will only send one report. I think this is fine because the report includes info about all the volumes. However, there's a discussion going on here: #8460 (comment).

Reviewer (Contributor):

I don't have a good answer for this after thinking about it for a while. Ideally, if we want to send an immediate heartbeat when one volume is full, we should respect each volume and send a heartbeat for each volume when it becomes full, but considering the complexity introduced to achieve that, I doubt it's worth doing.

Apart from the heartbeat sent here, there are regular node reports with storage info every 60s. If we only send one report regardless of which volume, then we probably only need to send the first one and let the regular periodic node reports do the rest.

Author (Contributor):

OK, let's stick to the current implementation then. I'll change the interval to the node report interval instead of the heartbeat interval.

Another reviewer (Contributor):

I think the purpose of sending a full-volume report is to avoid pipeline and container creation. Now the node report is throttled, and hence the close-container action is implicitly throttled too. The initial purpose was to close the container immediately, to avoid new block allocation during the heartbeat interval (i.e. 30 seconds).

This may be similar to sending the DN heartbeat; the only advantage here is that the first failure within 1 minute is handled immediately, while all later failures are throttled.

For the node report, there is a new configuration discovered at SCM to avoid new container allocation, "hdds.datanode.storage.utilization.critical.threshold". We need to recheck the overall target of the problem to solve and optimize the configuration / fix the inconsistency.

cc: @ChenSammi

Author (Contributor), replying to the note about "hdds.datanode.storage.utilization.critical.threshold":

As discussed, this is dead code in Ozone and is not used anywhere.
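For reference, the per-volume variant discussed above (and set aside as not worth the complexity) might look roughly like the following; this is a hypothetical sketch of the rejected alternative, not code from this PR, and it assumes java.util.Map and java.util.concurrent.ConcurrentHashMap are available:

    // Hypothetical sketch: throttle per volume instead of one timestamp per datanode.
    private final Map<HddsVolume, Long> fullVolumeLastTriggerMs = new ConcurrentHashMap<>();

    private boolean shouldTriggerHeartbeatFor(HddsVolume volume, long nowMs) {
      Long last = fullVolumeLastTriggerMs.get(volume);
      if (last == null) {
        // First full-volume event for this volume; only one racing thread wins.
        return fullVolumeLastTriggerMs.putIfAbsent(volume, nowMs) == null;
      }
      if (nowMs - last < fullVolumeHeartbeatThrottleIntervalMs) {
        return false;  // this volume already triggered a heartbeat recently
      }
      // Atomically claim the new trigger time; losers of the race skip the heartbeat.
      return fullVolumeLastTriggerMs.replace(volume, last, nowMs);
    }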

boolean isFirstTrigger = last == -1;
boolean allowedToTrigger = (current - fullVolumeHeartbeatThrottleIntervalMs) >= last;
if (isFirstTrigger || allowedToTrigger) {
if (fullVolumeLastHeartbeatTriggerMs.compareAndSet(last, current)) {
StorageContainerDatanodeProtocolProtos.NodeReportProto nodeReport;
try {
nodeReport = context.getParent().getContainer().getNodeReport();
context.refreshFullReport(nodeReport);
context.getParent().triggerHeartbeat();
LOG.info("Triggering heartbeat for full volume {}, with node report: {}.", volume, nodeReport);
Author (Contributor):

This is on the write path, so we must be extra careful about performance. An info log will reduce performance, but I wonder if it's OK in this case because this won't happen often? What do others think?

Author (Contributor):

Moreover, the future plan is to fail the write anyway if the size exceeds the min-free and reserved-space boundary.
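A hypothetical sketch of that future plan, to make the intent concrete; the Result code and the two-argument StorageContainerException constructor are assumptions here, and this is not part of this PR:

    // Sketch only: reject the write outright once the volume is past its
    // min-free / reserved-space boundary, instead of merely closing the container.
    if (isVolumeFull(container)) {
      HddsVolume volume = container.getContainerData().getVolume();
      throw new StorageContainerException(
          "Volume " + volume.getVolumeRootDir() + " is out of space; rejecting write",
          Result.DISK_OUT_OF_SPACE);
    }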

} catch (IOException e) {
String volumePath = volume.getVolumeRootDir();
StorageLocationReport volumeReport = volume.getReport();
String error = String.format(
"Failed to create node report when handling full volume %s. Volume Report: %s", volumePath, volumeReport);
throw new StorageContainerException(error, e, Result.IO_EXCEPTION);
}
}
}
}

private long getSlowOpThresholdMs(ConfigurationSource config) {
return config.getTimeDuration(
HddsConfigKeys.HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_KEY,
@@ -578,9 +625,9 @@ public void validateContainerCommand(
* marked unhealthy we send Close ContainerAction to SCM.
* @param container current state of container
*/
private void sendCloseContainerActionIfNeeded(Container container) {
private void sendCloseContainerActionIfNeeded(Container container, boolean isVolumeFull) {
// We have to find a more efficient way to close a container.
boolean isSpaceFull = isContainerFull(container) || isVolumeFull(container);
boolean isSpaceFull = isContainerFull(container) || isVolumeFull;
boolean shouldClose = isSpaceFull || isContainerUnhealthy(container);
if (shouldClose) {
ContainerData containerData = container.getContainerData();
TestHddsDispatcher.java
@@ -28,6 +28,7 @@
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.any;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;
@@ -63,6 +64,7 @@
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerType;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.WriteChunkRequestProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerAction;
import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException;
import org.apache.hadoop.hdds.security.token.TokenVerifier;
@@ -79,6 +81,7 @@
import org.apache.hadoop.ozone.container.common.interfaces.VolumeChoosingPolicy;
import org.apache.hadoop.ozone.container.common.report.IncrementalReportSender;
import org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration;
import org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine;
import org.apache.hadoop.ozone.container.common.statemachine.StateContext;
import org.apache.hadoop.ozone.container.common.transport.server.ratis.DispatcherContext;
import org.apache.hadoop.ozone.container.common.transport.server.ratis.DispatcherContext.Op;
@@ -93,6 +96,7 @@
import org.apache.hadoop.ozone.container.keyvalue.ContainerLayoutTestInfo;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer;
import org.apache.hadoop.security.token.Token;
import org.apache.ozone.test.GenericTestUtils.LogCapturer;
import org.apache.ratis.thirdparty.com.google.protobuf.ByteString;
@@ -177,6 +181,9 @@ public void testContainerCloseActionWhenFull(
verify(context, times(1))
.addContainerActionIfAbsent(any(ContainerAction.class));

// since the volume is not full, context.refreshFullReport(NodeReportProto) should not be called
verify(context, times(0)).refreshFullReport(any());

} finally {
volumeSet.shutdown();
ContainerMetrics.remove();
@@ -276,6 +283,16 @@ public void testContainerCloseActionWhenVolumeFull(
UUID scmId = UUID.randomUUID();
ContainerSet containerSet = newContainerSet();
StateContext context = ContainerTestUtils.getMockContext(dd, conf);

// empty report object for testing that an immediate heartbeat is triggered
StorageContainerDatanodeProtocolProtos.NodeReportProto.Builder nrb
= StorageContainerDatanodeProtocolProtos.
NodeReportProto.newBuilder();
StorageContainerDatanodeProtocolProtos.NodeReportProto reportProto = nrb.build();
DatanodeStateMachine stateMachine = context.getParent();
OzoneContainer ozoneContainer = mock(OzoneContainer.class);
doReturn(ozoneContainer).when(stateMachine).getContainer();
doReturn(reportProto).when(ozoneContainer).getNodeReport();
// create a 50 byte container
// available (160) > 100 (min free space) + 50 (container size)
KeyValueContainerData containerData = new KeyValueContainerData(1L,
@@ -308,6 +325,15 @@ && getMissingContainerSet().contains(containerID)) {
response.getResult());
verify(context, times(1))
.addContainerActionIfAbsent(any(ContainerAction.class));
// verify that node report is refreshed and heartbeat is triggered
verify(context, times(1)).refreshFullReport(eq(reportProto));
verify(stateMachine, times(1)).triggerHeartbeat();

// the volume is past the min free space boundary but this time the heartbeat should not be triggered because
// of throttling
hddsDispatcher.dispatch(getWriteChunkRequest(dd.getUuidString(), 1L, 2L), null);
verify(context, times(1)).refreshFullReport(eq(reportProto)); // was called once before
verify(stateMachine, times(1)).triggerHeartbeat(); // was called once before

// try creating another container now as the volume used has crossed
// threshold