From b2c04e29b5e5baa821a5f62a522b212a810006de Mon Sep 17 00:00:00 2001 From: Penumudy Tanvi Date: Wed, 19 Mar 2025 13:58:06 +0530 Subject: [PATCH] HDDS-12463. Add detailed SCM metrics for allocate block code path --- .../placement/metrics/SCMPerformanceMetrics.java | 14 ++++++++++++++ .../hdds/scm/pipeline/PipelineManagerImpl.java | 3 +++ .../hdds/scm/pipeline/SCMPipelineMetrics.java | 8 ++++++++ .../hdds/scm/server/SCMBlockProtocolServer.java | 5 +++++ 4 files changed, 30 insertions(+) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/placement/metrics/SCMPerformanceMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/placement/metrics/SCMPerformanceMetrics.java index cd7a94ec6e7c..b2d142dc3d46 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/placement/metrics/SCMPerformanceMetrics.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/placement/metrics/SCMPerformanceMetrics.java @@ -51,6 +51,10 @@ public final class SCMPerformanceMetrics implements MetricsSource { private MutableRate deleteKeyFailureLatencyNs; @Metric(about = "Latency for deleteKey success in nanoseconds") private MutableRate deleteKeySuccessLatencyNs; + @Metric(about = "Latency for a successful allocateBlock call in nanoseconds") + private MutableRate allocateBlockSuccessLatencyNs; + @Metric(about = "Latency for a failed allocateBlock call in nanoseconds") + private MutableRate allocateBlockFailureLatencyNs; public SCMPerformanceMetrics() { this.registry = new MetricsRegistry(SOURCE_NAME); @@ -78,6 +82,16 @@ public void getMetrics(MetricsCollector collector, boolean all) { deleteKeySuccessLatencyNs.snapshot(recordBuilder, true); deleteKeyFailure.snapshot(recordBuilder, true); deleteKeyFailureLatencyNs.snapshot(recordBuilder, true); + allocateBlockSuccessLatencyNs.snapshot(recordBuilder, true); + allocateBlockFailureLatencyNs.snapshot(recordBuilder, true); + } + + public void updateAllocateBlockSuccessLatencyNs(long startNanos) { + allocateBlockSuccessLatencyNs.add(Time.monotonicNowNanos() - startNanos); + } + + public void updateAllocateBlockFailureLatencyNs(long startNanos) { + allocateBlockFailureLatencyNs.add(Time.monotonicNowNanos() - startNanos); } public void updateDeleteKeySuccessStats(long startNanos) { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/PipelineManagerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/PipelineManagerImpl.java index 2dcf2d56f2e3..5d660064c633 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/PipelineManagerImpl.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/PipelineManagerImpl.java @@ -60,6 +60,7 @@ import org.apache.hadoop.metrics2.util.MBeans; import org.apache.hadoop.ozone.ClientVersion; import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException; +import org.apache.hadoop.util.Time; import org.apache.ratis.protocol.exceptions.NotLeaderException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -416,6 +417,7 @@ public int getNumberOfContainers(PipelineID pipelineID) throws IOException { @Override public void openPipeline(PipelineID pipelineId) throws IOException { + long startNanos = Time.monotonicNowNanos(); HddsProtos.PipelineID pipelineIdProtobuf = pipelineId.getProtobuf(); acquireWriteLock(); final Pipeline pipeline; @@ -431,6 +433,7 @@ public void openPipeline(PipelineID pipelineId) } finally { releaseWriteLock(); } + metrics.updatePipelineCreationLatencyNs(startNanos); metrics.incNumPipelineCreated(); metrics.createPerPipelineMetrics(pipeline); } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/SCMPipelineMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/SCMPipelineMetrics.java index 817bad4469ee..65d781943b8c 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/SCMPipelineMetrics.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/SCMPipelineMetrics.java @@ -31,7 +31,9 @@ import org.apache.hadoop.metrics2.lib.Interns; import org.apache.hadoop.metrics2.lib.MetricsRegistry; import org.apache.hadoop.metrics2.lib.MutableCounterLong; +import org.apache.hadoop.metrics2.lib.MutableRate; import org.apache.hadoop.ozone.OzoneConsts; +import org.apache.hadoop.util.Time; /** * This class maintains Pipeline related metrics. @@ -54,6 +56,7 @@ public final class SCMPipelineMetrics implements MetricsSource { private @Metric MutableCounterLong numPipelineReportProcessed; private @Metric MutableCounterLong numPipelineReportProcessingFailed; private @Metric MutableCounterLong numPipelineContainSameDatanodes; + private @Metric MutableRate pipelineCreationLatencyNs; private final Map numBlocksAllocated; private final Map numBytesWritten; @@ -100,6 +103,7 @@ public void getMetrics(MetricsCollector collector, boolean all) { numPipelineReportProcessed.snapshot(recordBuilder, true); numPipelineReportProcessingFailed.snapshot(recordBuilder, true); numPipelineContainSameDatanodes.snapshot(recordBuilder, true); + pipelineCreationLatencyNs.snapshot(recordBuilder, true); numBytesWritten .forEach((pid, metric) -> metric.snapshot(recordBuilder, true)); numBlocksAllocated @@ -208,4 +212,8 @@ void incNumPipelineReportProcessingFailed() { void incNumPipelineContainSameDatanodes() { numPipelineContainSameDatanodes.incr(); } + + public void updatePipelineCreationLatencyNs(long startNanos) { + pipelineCreationLatencyNs.add(Time.monotonicNowNanos() - startNanos); + } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMBlockProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMBlockProtocolServer.java index e81569dd2a17..48fd51b3c106 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMBlockProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMBlockProtocolServer.java @@ -191,6 +191,7 @@ public List allocateBlock( String owner, ExcludeList excludeList, String clientMachine ) throws IOException { + long startNanos = Time.monotonicNowNanos(); Map auditMap = Maps.newHashMap(); auditMap.put("size", String.valueOf(size)); auditMap.put("num", String.valueOf(num)); @@ -234,17 +235,21 @@ public List allocateBlock( AUDIT.logWriteFailure(buildAuditMessageForFailure( SCMAction.ALLOCATE_BLOCK, auditMap, null) ); + perfMetrics.updateAllocateBlockFailureLatencyNs(startNanos); } else { AUDIT.logWriteSuccess(buildAuditMessageForSuccess( SCMAction.ALLOCATE_BLOCK, auditMap)); + perfMetrics.updateAllocateBlockSuccessLatencyNs(startNanos); } return blocks; } catch (TimeoutException ex) { + perfMetrics.updateAllocateBlockFailureLatencyNs(startNanos); AUDIT.logWriteFailure(buildAuditMessageForFailure( SCMAction.ALLOCATE_BLOCK, auditMap, ex)); throw new IOException(ex); } catch (Exception ex) { + perfMetrics.updateAllocateBlockFailureLatencyNs(startNanos); AUDIT.logWriteFailure(buildAuditMessageForFailure( SCMAction.ALLOCATE_BLOCK, auditMap, ex)); throw ex;