diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientMetrics.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientMetrics.java index 399c520fb05a..96db6d13fea5 100644 --- a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientMetrics.java +++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientMetrics.java @@ -39,6 +39,8 @@ public class XceiverClientMetrics { private @Metric MutableCounterLong pendingOps; private @Metric MutableCounterLong totalOps; + private @Metric MutableCounterLong ecReconstructionTotal; + private @Metric MutableCounterLong ecReconstructionFailsTotal; private MutableCounterLong[] pendingOpsArray; private MutableCounterLong[] opsArray; private MutableRate[] containerOpsLatency; @@ -100,6 +102,14 @@ public long getPendingContainerOpCountMetrics(ContainerProtos.Type type) { return pendingOpsArray[type.ordinal()].value(); } + public void incECReconstructionTotal() { + ecReconstructionTotal.incr(); + } + + public void incECReconstructionFailsTotal() { + ecReconstructionFailsTotal.incr(); + } + @VisibleForTesting public long getTotalOpCount() { return totalOps.value(); diff --git a/hadoop-hdds/client/src/main/java/org/apache/hadoop/ozone/client/io/ECBlockInputStreamProxy.java b/hadoop-hdds/client/src/main/java/org/apache/hadoop/ozone/client/io/ECBlockInputStreamProxy.java index 5427e300cec3..7a8b0d3e8eea 100644 --- a/hadoop-hdds/client/src/main/java/org/apache/hadoop/ozone/client/io/ECBlockInputStreamProxy.java +++ b/hadoop-hdds/client/src/main/java/org/apache/hadoop/ozone/client/io/ECBlockInputStreamProxy.java @@ -21,6 +21,7 @@ import org.apache.hadoop.hdds.client.ECReplicationConfig; import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.scm.XceiverClientFactory; +import org.apache.hadoop.hdds.scm.XceiverClientManager; import org.apache.hadoop.hdds.scm.pipeline.Pipeline; import 
org.apache.hadoop.hdds.scm.storage.BlockExtendedInputStream; import org.apache.hadoop.hdds.scm.storage.BlockLocationInfo; @@ -117,6 +118,10 @@ private synchronized void setReaderType() { } private void createBlockReader() { + if (reconstructionReader) { + XceiverClientManager.getXceiverClientMetrics() + .incECReconstructionTotal(); + } blockReader = ecBlockInputStreamFactory.create(reconstructionReader, failedLocations, repConfig, blockInfo, verifyChecksum, xceiverClientFactory, refreshFunction); @@ -162,6 +167,8 @@ public synchronized int read(ByteBuffer buf) throws IOException { // If we get an error from the reconstruction reader, there // is nothing left to try. It will re-try until it has insufficient // locations internally, so if an error comes here, just re-throw it. + XceiverClientManager.getXceiverClientMetrics() + .incECReconstructionFailsTotal(); throw e; } if (e instanceof BadDataLocationException) { diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java index cc05511b58aa..31864e2868c8 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java @@ -54,6 +54,7 @@ import org.apache.hadoop.ozone.container.common.statemachine.commandhandler.ReplicateContainerCommandHandler; import org.apache.hadoop.ozone.container.common.statemachine.commandhandler.SetNodeOperationalStateCommandHandler; import org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionCoordinator; +import org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionMetrics; import org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionSupervisor; import 
org.apache.hadoop.ozone.container.keyvalue.TarContainerPacker; import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer; @@ -116,6 +117,7 @@ public class DatanodeStateMachine implements Closeable { private final ReadWriteLock constructionLock = new ReentrantReadWriteLock(); private final MeasuredReplicator replicatorMetrics; private final ReplicationSupervisorMetrics replicationSupervisorMetrics; + private final ECReconstructionMetrics ecReconstructionMetrics; /** * Constructs a datanode state machine. @@ -182,8 +184,11 @@ public DatanodeStateMachine(DatanodeDetails datanodeDetails, replicationSupervisorMetrics = ReplicationSupervisorMetrics.create(supervisor); + ecReconstructionMetrics = ECReconstructionMetrics.create(); + ECReconstructionCoordinator ecReconstructionCoordinator = - new ECReconstructionCoordinator(conf, certClient); + new ECReconstructionCoordinator(conf, certClient, + ecReconstructionMetrics); ecReconstructionSupervisor = new ECReconstructionSupervisor(container.getContainerSet(), context, replicationConfig.getReplicationMaxStreams(), @@ -378,6 +383,7 @@ public void close() throws IOException { } context.setState(DatanodeStates.getLastState()); replicationSupervisorMetrics.unRegister(); + ecReconstructionMetrics.unRegister(); executorService.shutdown(); try { if (!executorService.awaitTermination(5, TimeUnit.SECONDS)) { diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java index b9da5cba5f11..3fb5361d261c 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinator.java @@ -102,10 +102,11 @@ public class 
ECReconstructionCoordinator implements Closeable { private final BlockInputStreamFactory blockInputStreamFactory; private final TokenHelper tokenHelper; private final ContainerClientMetrics clientMetrics; + private final ECReconstructionMetrics metrics; public ECReconstructionCoordinator(ConfigurationSource conf, - CertificateClient certificateClient) - throws IOException { + CertificateClient certificateClient, + ECReconstructionMetrics metrics) throws IOException { this.containerOperationClient = new ECContainerOperationClient(conf, certificateClient); this.byteBufferPool = new ElasticByteBufferPool(); @@ -121,6 +122,7 @@ public ECReconstructionCoordinator(ConfigurationSource conf, .getInstance(byteBufferPool, () -> ecReconstructExecutor); tokenHelper = new TokenHelper(conf, certificateClient); this.clientMetrics = ContainerClientMetrics.acquire(); + this.metrics = metrics; } public void reconstructECContainerGroup(long containerID, @@ -162,8 +164,13 @@ public void reconstructECContainerGroup(long containerID, containerOperationClient .closeContainer(containerID, dn, repConfig, containerToken); } + metrics.incReconstructionTotal(); + metrics.incBlockGroupReconstructionTotal(blockLocationInfoMap.size()); } catch (Exception e) { // Any exception let's delete the recovering containers. + metrics.incReconstructionFailsTotal(); + metrics.incBlockGroupReconstructionFailsTotal( + blockLocationInfoMap.size()); LOG.warn( "Exception while reconstructing the container {}. Cleaning up" + " all the recovering containers in the reconstruction process.", @@ -445,4 +452,8 @@ private long calcEffectiveBlockGroupLen(BlockData[] blockGroup, } return blockGroupLen == Long.MAX_VALUE ? 
0 : blockGroupLen; } + + public ECReconstructionMetrics getECReconstructionMetrics() { + return this.metrics; + } } \ No newline at end of file diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionMetrics.java new file mode 100644 index 000000000000..91442c65f74f --- /dev/null +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionMetrics.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ozone.container.ec.reconstruction;
+
+import org.apache.hadoop.hdds.annotation.InterfaceAudience;
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.annotation.Metric;
+import org.apache.hadoop.metrics2.annotation.Metrics;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.lib.MutableCounterLong;
+import org.apache.hadoop.ozone.OzoneConsts;
+
+/**
+ * Counters describing EC reconstruction work, published through the Hadoop
+ * metrics system.
+ *
+ * <p>Four counters are exposed: reconstructions and block groups, each split
+ * into a "total" and a "fails total" counter. This class only stores the
+ * values; the caller decides when each counter is incremented.
+ *
+ * <p>Instances are obtained via {@link #create()}, which registers the new
+ * object with {@link DefaultMetricsSystem}, and should be released with
+ * {@link #unRegister()}.
+ *
+ * <p>NOTE(review): every instance registers under the same source name
+ * (the simple class name). Confirm how the metrics system handles a second
+ * {@code create()} call made before the first instance is unregistered.
+ */
+@InterfaceAudience.Private
+@Metrics(about = "EC Reconstruction Coordinator Metrics",
+    context = OzoneConsts.OZONE)
+public final class ECReconstructionMetrics {
+  /** Name under which this source is registered and unregistered. */
+  private static final String SOURCE =
+      ECReconstructionMetrics.class.getSimpleName();
+
+  // Field names are kept as-is: with a bare @Metric annotation the metrics
+  // framework derives the published metric name from the field.
+  private @Metric MutableCounterLong blockGroupReconstructionTotal;
+  private @Metric MutableCounterLong blockGroupReconstructionFailsTotal;
+  private @Metric MutableCounterLong reconstructionTotal;
+  private @Metric MutableCounterLong reconstructionFailsTotal;
+
+  /** Instances are created only through {@link #create()}. */
+  private ECReconstructionMetrics() {
+  }
+
+  /**
+   * Creates a new metrics object and registers it with the default metrics
+   * system under {@link #SOURCE}.
+   *
+   * @return the object returned by the metrics system for the registration
+   */
+  public static ECReconstructionMetrics create() {
+    MetricsSystem ms = DefaultMetricsSystem.instance();
+    return ms.register(SOURCE, "EC Reconstruction Coordinator Metrics",
+        new ECReconstructionMetrics());
+  }
+
+  /**
+   * Removes the {@link #SOURCE} registration from the default metrics
+   * system.
+   */
+  public void unRegister() {
+    MetricsSystem ms = DefaultMetricsSystem.instance();
+    ms.unregisterSource(SOURCE);
+  }
+
+  /**
+   * Adds {@code count} to the block-group reconstruction counter.
+   *
+   * @param count number of block groups to add
+   */
+  public void incBlockGroupReconstructionTotal(long count) {
+    blockGroupReconstructionTotal.incr(count);
+  }
+
+  /**
+   * Adds {@code count} to the failed block-group reconstruction counter.
+   *
+   * @param count number of block groups to add
+   */
+  public void incBlockGroupReconstructionFailsTotal(long count) {
+    blockGroupReconstructionFailsTotal.incr(count);
+  }
+
+  /** Increments the reconstruction counter by one. */
+  public void incReconstructionTotal() {
+    reconstructionTotal.incr();
+  }
+
+  /** Increments the failed reconstruction counter by one. */
+  public void incReconstructionFailsTotal() {
+    reconstructionFailsTotal.incr();
+  }
+
+  /**
+   * @return current value of the reconstruction counter
+   */
+  public long getReconstructionTotal() {
+    return reconstructionTotal.value();
+  }
+
+  /**
+   * @return current value of the failed reconstruction counter
+   */
+  public long getReconstructionFailsTotal() {
+    return reconstructionFailsTotal.value();
+  }
+
+  /**
+   * @return current value of the block-group reconstruction counter
+   */
+  public long getBlockGroupReconstructionTotal() {
+    return blockGroupReconstructionTotal.value();
+  }
+
+  /**
+   * @return current value of the failed block-group reconstruction counter
+   */
+  public long getBlockGroupReconstructionFailsTotal() {
+    return blockGroupReconstructionFailsTotal.value();
+  }
+}
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ec/reconstruction/TestECReconstructionSupervisor.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ec/reconstruction/TestECReconstructionSupervisor.java
index b98eef7f2f07..c40ceb2ea36c 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ec/reconstruction/TestECReconstructionSupervisor.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ec/reconstruction/TestECReconstructionSupervisor.java
@@ -42,7 +42,8 @@ public void testAddTaskShouldExecuteTheGivenTask()
final CountDownLatch holdProcessing = new CountDownLatch(1);
ECReconstructionSupervisor supervisor =
new ECReconstructionSupervisor(null, null, 5,
- new ECReconstructionCoordinator(new OzoneConfiguration(), null) {
+ new ECReconstructionCoordinator(new OzoneConfiguration(), null,
+ ECReconstructionMetrics.create()) {
@Override
public void reconstructECContainerGroup(long containerID,
ECReplicationConfig repConfig,
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
index 8713dc2a17b8..df8ee103f6df 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
@@ -66,6 +66,7 @@
import org.apache.hadoop.ozone.container.ContainerTestHelper;
import org.apache.hadoop.ozone.container.ec.reconstruction.ECContainerOperationClient;
import org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionCoordinator;
+import org.apache.hadoop.ozone.container.ec.reconstruction.ECReconstructionMetrics;
import org.apache.hadoop.ozone.om.OzoneManager;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.TokenIdentifier;
@@ -390,8 +391,10 @@ private void testECReconstructionCoordinator(List