diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java index 939c6d08b336..46dc4aa0ba2d 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java @@ -16,6 +16,7 @@ */ package org.apache.hadoop.ozone.container.checksum; +import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; import org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration; import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData; @@ -34,6 +35,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.hadoop.util.MetricUtil.captureLatencyNs; + /** * This class coordinates reading and writing Container checksum information for all containers. */ @@ -44,12 +47,15 @@ public class ContainerChecksumTreeManager { // Used to coordinate reads and writes to each container's checksum file. // Each container ID is mapped to a stripe. private final Striped fileLock; + private final ContainerMerkleTreeMetrics metrics; /** * Creates one instance that should be used to coordinate all container checksum info within a datanode. */ public ContainerChecksumTreeManager(DatanodeConfiguration dnConf) { fileLock = SimpleStriped.readWriteLock(dnConf.getContainerChecksumLockStripes(), true); + // TODO: TO unregister metrics on stop. + metrics = ContainerMerkleTreeMetrics.create(); } /** @@ -63,7 +69,7 @@ public void writeContainerDataTree(KeyValueContainerData data, ContainerMerkleTr writeLock.lock(); try { ContainerProtos.ContainerChecksumInfo newChecksumInfo = read(data).toBuilder() - .setContainerMerkleTree(tree.toProto()) + .setContainerMerkleTree(captureLatencyNs(metrics.getCreateMerkleTreeLatencyNS(), tree::toProto)) .build(); write(data, newChecksumInfo); LOG.debug("Data merkle tree for container {} updated", data.getContainerID()); @@ -99,7 +105,7 @@ public void markBlocksAsDeleted(KeyValueContainerData data, SortedSet dele } } - public ContainerDiff diff(KeyValueContainerData thisContainer, File otherContainerTree) + public ContainerDiff diff(KeyValueContainerData thisContainer, ContainerProtos.ContainerChecksumInfo otherInfo) throws IOException { // TODO HDDS-10928 compare the checksum info of the two containers and return a summary. // Callers can act on this summary to repair their container replica using the peer's replica. @@ -107,13 +113,6 @@ public ContainerDiff diff(KeyValueContainerData thisContainer, File otherContain return new ContainerDiff(); } - /** - * Returns the container checksum tree file for the specified container without deserializing it. - */ - public File getContainerChecksumFile(KeyValueContainerData data) { - return new File(data.getMetadataPath(), data.getContainerID() + ".tree"); - } - private Lock getReadLock(long containerID) { return fileLock.get(containerID).readLock(); } @@ -139,8 +138,13 @@ private ContainerProtos.ContainerChecksumInfo read(KeyValueContainerData data) t .build(); } try (FileInputStream inStream = new FileInputStream(checksumFile)) { - return ContainerProtos.ContainerChecksumInfo.parseFrom(inStream); + return captureLatencyNs(metrics.getReadContainerMerkleTreeLatencyNS(), + () -> ContainerProtos.ContainerChecksumInfo.parseFrom(inStream)); } + } catch (IOException ex) { + metrics.incrementMerkleTreeReadFailures(); + throw new IOException("Error occurred when reading container merkle tree for containerID " + + data.getContainerID(), ex); } finally { readLock.unlock(); } @@ -151,12 +155,26 @@ private void write(KeyValueContainerData data, ContainerProtos.ContainerChecksum Lock writeLock = getWriteLock(data.getContainerID()); writeLock.lock(); try (FileOutputStream outStream = new FileOutputStream(getContainerChecksumFile(data))) { - checksumInfo.writeTo(outStream); + captureLatencyNs(metrics.getWriteContainerMerkleTreeLatencyNS(), + () -> checksumInfo.writeTo(outStream)); + } catch (IOException ex) { + metrics.incrementMerkleTreeWriteFailures(); + throw new IOException("Error occurred when writing container merkle tree for containerID " + + data.getContainerID(), ex); } finally { writeLock.unlock(); } } + public File getContainerChecksumFile(KeyValueContainerData data) { + return new File(data.getMetadataPath(), data.getContainerID() + ".tree"); + } + + @VisibleForTesting + public ContainerMerkleTreeMetrics getMetrics() { + return this.metrics; + } + /** * This class represents the difference between our replica of a container and a peer's replica of a container. * It summarizes the operations we need to do to reconcile our replica with the peer replica it was compared to. diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeMetrics.java new file mode 100644 index 000000000000..a288e15f6bd2 --- /dev/null +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeMetrics.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.ozone.container.checksum; + +import org.apache.hadoop.metrics2.MetricsSystem; +import org.apache.hadoop.metrics2.annotation.Metric; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.lib.MutableCounterLong; +import org.apache.hadoop.metrics2.lib.MutableRate; + +/** + * Class to collect metrics related to container merkle tree. + */ +public class ContainerMerkleTreeMetrics { + private static final String METRICS_SOURCE_NAME = ContainerMerkleTreeMetrics.class.getSimpleName(); + + public static ContainerMerkleTreeMetrics create() { + MetricsSystem ms = DefaultMetricsSystem.instance(); + return ms.register(METRICS_SOURCE_NAME, "Container Merkle Tree Metrics", + new ContainerMerkleTreeMetrics()); + } + + public void unregister() { + MetricsSystem ms = DefaultMetricsSystem.instance(); + ms.unregisterSource(METRICS_SOURCE_NAME); + } + + @Metric(about = "Number of Merkle tree write failure") + private MutableCounterLong numMerkleTreeWriteFailure; + + @Metric(about = "Number of Merkle tree read failure") + private MutableCounterLong numMerkleTreeReadFailure; + + @Metric(about = "Merkle tree write latency") + private MutableRate merkleTreeWriteLatencyNS; + + @Metric(about = "Merkle tree read latency") + private MutableRate merkleTreeReadLatencyNS; + + @Metric(about = "Merkle tree creation latency") + private MutableRate merkleTreeCreateLatencyNS; + + public void incrementMerkleTreeWriteFailures() { + this.numMerkleTreeWriteFailure.incr(); + } + + public void incrementMerkleTreeReadFailures() { + this.numMerkleTreeReadFailure.incr(); + } + + public MutableRate getWriteContainerMerkleTreeLatencyNS() { + return this.merkleTreeWriteLatencyNS; + } + + public MutableRate getReadContainerMerkleTreeLatencyNS() { + return this.merkleTreeReadLatencyNS; + } + + public MutableRate getCreateMerkleTreeLatencyNS() { + return this.merkleTreeCreateLatencyNS; + } +} diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerChecksumTreeManager.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerChecksumTreeManager.java index 767eed8a73d3..56a5dbfd55f7 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerChecksumTreeManager.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerChecksumTreeManager.java @@ -47,6 +47,7 @@ class TestContainerChecksumTreeManager { private KeyValueContainerData container; private File checksumFile; private ContainerChecksumTreeManager checksumManager; + private ContainerMerkleTreeMetrics metrics; @BeforeEach public void init() { @@ -55,11 +56,17 @@ public void init() { when(container.getMetadataPath()).thenReturn(testDir.getAbsolutePath()); checksumFile = new File(testDir, CONTAINER_ID + ".tree"); checksumManager = new ContainerChecksumTreeManager(new DatanodeConfiguration()); + metrics = checksumManager.getMetrics(); } @Test public void testWriteEmptyTreeToFile() throws Exception { + assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 0); + assertEquals(metrics.getCreateMerkleTreeLatencyNS().lastStat().total(), 0); checksumManager.writeContainerDataTree(container, new ContainerMerkleTree()); + assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() > 0); + assertTrue(metrics.getCreateMerkleTreeLatencyNS().lastStat().total() > 0); + ContainerProtos.ContainerChecksumInfo checksumInfo = readFile(); assertEquals(CONTAINER_ID, checksumInfo.getContainerID()); @@ -71,7 +78,10 @@ public void testWriteEmptyTreeToFile() throws Exception { @Test public void testWriteEmptyBlockListToFile() throws Exception { + assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 0); checksumManager.markBlocksAsDeleted(container, new TreeSet<>()); + assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() > 0); + ContainerProtos.ContainerChecksumInfo checksumInfo = readFile(); assertEquals(CONTAINER_ID, checksumInfo.getContainerID()); @@ -83,11 +93,14 @@ public void testWriteEmptyBlockListToFile() throws Exception { @Test public void testWriteOnlyTreeToFile() throws Exception { + assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 0); + assertEquals(metrics.getCreateMerkleTreeLatencyNS().lastStat().total(), 0); ContainerMerkleTree tree = buildTestTree(); checksumManager.writeContainerDataTree(container, tree); - + assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() > 0); ContainerProtos.ContainerChecksumInfo checksumInfo = readFile(); + assertTrue(metrics.getCreateMerkleTreeLatencyNS().lastStat().total() > 0); assertEquals(CONTAINER_ID, checksumInfo.getContainerID()); assertTrue(checksumInfo.getDeletedBlocksList().isEmpty()); // TestContainerMerkleTree verifies that going from ContainerMerkleTree to its proto is consistent. @@ -97,8 +110,10 @@ public void testWriteOnlyTreeToFile() throws Exception { @Test public void testWriteOnlyDeletedBlocksToFile() throws Exception { + assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 0); List expectedBlocksToDelete = Arrays.asList(1L, 2L, 3L); checksumManager.markBlocksAsDeleted(container, new TreeSet<>(expectedBlocksToDelete)); + assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().changed()); ContainerProtos.ContainerChecksumInfo checksumInfo = readFile(); @@ -111,13 +126,20 @@ public void testWriteOnlyDeletedBlocksToFile() throws Exception { @Test public void testDeletedBlocksPreservedOnTreeWrite() throws Exception { + assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 0); + assertEquals(metrics.getCreateMerkleTreeLatencyNS().lastStat().total(), 0); + assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(), 0); List expectedBlocksToDelete = Arrays.asList(1L, 2L, 3L); checksumManager.markBlocksAsDeleted(container, new TreeSet<>(expectedBlocksToDelete)); + assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(), 0); ContainerMerkleTree tree = buildTestTree(); checksumManager.writeContainerDataTree(container, tree); + assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() > 0); + assertTrue(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total() > 0); ContainerProtos.ContainerChecksumInfo checksumInfo = readFile(); + assertTrue(metrics.getCreateMerkleTreeLatencyNS().lastStat().total() > 0); assertEquals(CONTAINER_ID, checksumInfo.getContainerID()); assertEquals(expectedBlocksToDelete, checksumInfo.getDeletedBlocksList()); assertTreesSortedAndMatch(tree.toProto(), checksumInfo.getContainerMerkleTree()); @@ -125,21 +147,41 @@ public void testDeletedBlocksPreservedOnTreeWrite() throws Exception { @Test public void testTreePreservedOnDeletedBlocksWrite() throws Exception { + assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 0); + assertEquals(metrics.getCreateMerkleTreeLatencyNS().lastStat().total(), 0); + assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(), 0); ContainerMerkleTree tree = buildTestTree(); checksumManager.writeContainerDataTree(container, tree); + assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(), 0); List expectedBlocksToDelete = Arrays.asList(1L, 2L, 3L); checksumManager.markBlocksAsDeleted(container, new TreeSet<>(expectedBlocksToDelete)); + assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() > 0); + assertTrue(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total() > 0); ContainerProtos.ContainerChecksumInfo checksumInfo = readFile(); + assertTrue(metrics.getCreateMerkleTreeLatencyNS().lastStat().total() > 0); assertEquals(CONTAINER_ID, checksumInfo.getContainerID()); assertEquals(expectedBlocksToDelete, checksumInfo.getDeletedBlocksList()); assertTreesSortedAndMatch(tree.toProto(), checksumInfo.getContainerMerkleTree()); } + @Test + public void testReadContainerMerkleTreeMetric() throws Exception { + assertEquals(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total(), 0); + assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(), 0); + ContainerMerkleTree tree = buildTestTree(); + checksumManager.writeContainerDataTree(container, tree); + assertEquals(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total(), 0); + checksumManager.writeContainerDataTree(container, tree); + assertTrue(metrics.getWriteContainerMerkleTreeLatencyNS().lastStat().total() > 0); + assertTrue(metrics.getReadContainerMerkleTreeLatencyNS().lastStat().total() > 0); + } + @Test public void testChecksumTreeFilePath() { - assertEquals(checksumFile.getAbsolutePath(), checksumManager.getContainerChecksumFile(container).getAbsolutePath()); + assertEquals(checksumFile.getAbsolutePath(), + checksumManager.getContainerChecksumFile(container).getAbsolutePath()); } private ContainerMerkleTree buildTestTree() throws Exception {