diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMDeletedBlockTransactionStatusManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMDeletedBlockTransactionStatusManager.java index f3ca531cc2ad..ad747c48411d 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMDeletedBlockTransactionStatusManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMDeletedBlockTransactionStatusManager.java @@ -89,7 +89,7 @@ public SCMDeletedBlockTransactionStatusManager( this.transactionToDNsCommitMap = new ConcurrentHashMap<>(); this.transactionToRetryCountMap = new ConcurrentHashMap<>(); this.scmDeleteBlocksCommandStatusManager = - new SCMDeleteBlocksCommandStatusManager(); + new SCMDeleteBlocksCommandStatusManager(metrics); } /** @@ -104,8 +104,11 @@ protected static class SCMDeleteBlocksCommandStatusManager { private static final CmdStatus DEFAULT_STATUS = TO_BE_SENT; private static final Set STATUSES_REQUIRING_TIMEOUT = Collections.singleton(SENT); - public SCMDeleteBlocksCommandStatusManager() { + private ScmBlockDeletingServiceMetrics metrics; + + public SCMDeleteBlocksCommandStatusManager(ScmBlockDeletingServiceMetrics metrics) { this.scmCmdStatusRecord = new ConcurrentHashMap<>(); + this.metrics = metrics; } /** @@ -314,6 +317,7 @@ private void removeTimeoutScmCommand(DatanodeID dnId, if (updateTime != null && Duration.between(updateTime, now).toMillis() > timeoutMs) { CmdStatusData state = removeScmCommand(dnId, scmCmdId); + metrics.incrDNCommandsTimeout(dnId, 1); LOG.warn("SCM BlockDeletionCommand {} for Datanode: {} was removed after {}ms without update", state, dnId, timeoutMs); } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/ScmBlockDeletingServiceMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/ScmBlockDeletingServiceMetrics.java index 495d9bdf10cb..d7d4d781876f 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/ScmBlockDeletingServiceMetrics.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/ScmBlockDeletingServiceMetrics.java @@ -179,6 +179,11 @@ public void incrDNCommandsFailure(DatanodeID id, long delta) { .incrCommandsFailure(delta); } + public void incrDNCommandsTimeout(DatanodeID id, long delta) { + numCommandsDatanode.computeIfAbsent(id, k -> new DatanodeCommandCounts()) + .incrCommandsTimeout(delta); + } + public long getNumBlockDeletionCommandSent() { return numBlockDeletionCommandSent.value(); } @@ -248,7 +253,9 @@ public void getMetrics(MetricsCollector metricsCollector, boolean all) { .addGauge(DatanodeCommandCounts.COMMANDS_SUCCESSFUL_EXECUTION_BY_DN, e.getValue().getCommandsSuccess()) .addGauge(DatanodeCommandCounts.COMMANDS_FAILED_EXECUTION_BY_DN, - e.getValue().getCommandsFailure()); + e.getValue().getCommandsFailure()) + .addGauge(DatanodeCommandCounts.COMMANDS_TIMEOUT_BY_DN, + e.getValue().getCommandsTimeout()); } recordBuilder.endRecord(); } @@ -260,6 +267,7 @@ public static final class DatanodeCommandCounts { private long commandsSent; private long commandsSuccess; private long commandsFailure; + private long commandsTimeout; private static final MetricsInfo COMMANDS_SENT_TO_DN = Interns.info( "CommandsSent", @@ -270,11 +278,17 @@ public static final class DatanodeCommandCounts { private static final MetricsInfo COMMANDS_FAILED_EXECUTION_BY_DN = Interns.info( "CommandsFailed", "Number of commands sent from SCM to the datanode for deletion for which execution failed."); + + private static final MetricsInfo COMMANDS_TIMEOUT_BY_DN = Interns.info( + "CommandsTimeout", + "Number of commands timeout from SCM to DN" + ); public DatanodeCommandCounts() { this.commandsSent = 0; this.commandsSuccess = 0; this.commandsFailure = 0; + this.commandsTimeout = 0; } public void incrCommandsSent(long delta) { @@ -288,6 +302,10 @@ public void incrCommandsSuccess(long delta) { public void incrCommandsFailure(long delta) { this.commandsFailure += delta; } + + public void incrCommandsTimeout(long delta) { + this.commandsTimeout += delta; + } public long getCommandsSent() { return commandsSent; @@ -300,10 +318,15 @@ public long getCommandsSuccess() { public long getCommandsFailure() { return commandsFailure; } + + public long getCommandsTimeout() { + return commandsTimeout; + } @Override public String toString() { - return "Sent=" + commandsSent + ", Success=" + commandsSuccess + ", Failed=" + commandsFailure; + return "Sent=" + commandsSent + ", Success=" + commandsSuccess + ", Failed=" + commandsFailure + + ", Timeout=" + commandsTimeout; } } diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/block/TestSCMDeleteBlocksCommandStatusManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/block/TestSCMDeleteBlocksCommandStatusManager.java index a2f969914951..64d5c0377b79 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/block/TestSCMDeleteBlocksCommandStatusManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/block/TestSCMDeleteBlocksCommandStatusManager.java @@ -24,6 +24,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; +import static org.mockito.Mockito.mock; import java.util.Arrays; import java.util.Collections; @@ -42,6 +43,7 @@ public class TestSCMDeleteBlocksCommandStatusManager { private SCMDeleteBlocksCommandStatusManager manager; + private ScmBlockDeletingServiceMetrics metrics; private DatanodeID dnId1; private DatanodeID dnId2; private long scmCmdId1; @@ -55,7 +57,8 @@ public class TestSCMDeleteBlocksCommandStatusManager { @BeforeEach public void setup() throws Exception { - manager = new SCMDeleteBlocksCommandStatusManager(); + metrics = mock(ScmBlockDeletingServiceMetrics.class); + manager = new SCMDeleteBlocksCommandStatusManager(metrics); // Create test data dnId1 = DatanodeID.randomID(); dnId2 = DatanodeID.randomID();