ContainerChecksumTreeManager.java
@@ -0,0 +1,171 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.hadoop.ozone.container.checksum;

import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
import org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.Lock;

import com.google.common.util.concurrent.Striped;
import org.apache.hadoop.hdds.utils.SimpleStriped;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* This class coordinates reading and writing Container checksum information for all containers.
*/
public class ContainerChecksumTreeManager {

private static final Logger LOG = LoggerFactory.getLogger(ContainerChecksumTreeManager.class);

// Used to coordinate reads and writes to each container's checksum file.
// Each container ID is mapped to a stripe.
private final Striped<ReadWriteLock> fileLock;

/**
* Creates one instance that should be used to coordinate all container checksum info within a datanode.
*/
public ContainerChecksumTreeManager(DatanodeConfiguration dnConf) {
fileLock = SimpleStriped.readWriteLock(dnConf.getContainerChecksumLockStripes(), true);
}

/**
* Writes the specified container merkle tree to the specified container's checksum file.
* The data merkle tree within the file is replaced with the {@code tree} parameter, but all other content of the
* file remains unchanged.
* Concurrent writes to the same file are coordinated internally.
*/
public void writeContainerDataTree(KeyValueContainerData data, ContainerMerkleTree tree) throws IOException {
Lock writeLock = getWriteLock(data.getContainerID());
writeLock.lock();
try {
ContainerProtos.ContainerChecksumInfo newChecksumInfo = read(data).toBuilder()
.setContainerMerkleTree(tree.toProto())
.build();
write(data, newChecksumInfo);
LOG.debug("Data merkle tree for container {} updated", data.getContainerID());
} finally {
writeLock.unlock();
}
}

/**
* Adds the specified blocks to the list of deleted blocks specified in the container's checksum file.
* All other content of the file remains unchanged.
* Concurrent writes to the same file are coordinated internally.
*/
public void markBlocksAsDeleted(KeyValueContainerData data, SortedSet<Long> deletedBlockIDs) throws IOException {
Lock writeLock = getWriteLock(data.getContainerID());
writeLock.lock();
try {
ContainerProtos.ContainerChecksumInfo.Builder checksumInfoBuilder = read(data).toBuilder();
// Although the persisted block list should already be sorted, we will sort it here to make sure.
// This will automatically fix any bugs in the persisted order that may show up.
SortedSet<Long> sortedDeletedBlockIDs = new TreeSet<>(checksumInfoBuilder.getDeletedBlocksList());
// Merge the newly deleted block IDs into the existing sorted set.
sortedDeletedBlockIDs.addAll(deletedBlockIDs);

checksumInfoBuilder
.clearDeletedBlocks()
.addAllDeletedBlocks(sortedDeletedBlockIDs);
write(data, checksumInfoBuilder.build());
LOG.debug("Deleted block list for container {} updated", data.getContainerID());
} finally {
writeLock.unlock();
}
}

public ContainerDiff diff(KeyValueContainerData thisContainer, File otherContainerTree)
throws IOException {
// TODO HDDS-10928 compare the checksum info of the two containers and return a summary.
// Callers can act on this summary to repair their container replica using the peer's replica.
// This method will use the read lock, which is unused in the current implementation.
return new ContainerDiff();
}

/**
* Returns the container checksum tree file for the specified container without deserializing it.
*/
public File getContainerChecksumFile(KeyValueContainerData data) {
return new File(data.getMetadataPath(), data.getContainerID() + ".tree");
}

private Lock getReadLock(long containerID) {
return fileLock.get(containerID).readLock();
}

private Lock getWriteLock(long containerID) {
return fileLock.get(containerID).writeLock();
}

private ContainerProtos.ContainerChecksumInfo read(KeyValueContainerData data) throws IOException {
long containerID = data.getContainerID();
Lock readLock = getReadLock(containerID);
readLock.lock();
try {
File checksumFile = getContainerChecksumFile(data);
// If the checksum file has not been created yet, return an empty instance.
// Since all writes happen as part of an atomic read-modify-write cycle that requires a write lock, two empty
// instances for the same container obtained only under the read lock will not conflict.
if (!checksumFile.exists()) {
LOG.debug("No checksum file currently exists for container {} at the path {}. Returning an empty instance.",
containerID, checksumFile);
return ContainerProtos.ContainerChecksumInfo.newBuilder()
.setContainerID(containerID)
.build();
}
try (FileInputStream inStream = new FileInputStream(checksumFile)) {
return ContainerProtos.ContainerChecksumInfo.parseFrom(inStream);
}
} finally {
readLock.unlock();
}
}

private void write(KeyValueContainerData data, ContainerProtos.ContainerChecksumInfo checksumInfo)
throws IOException {
Lock writeLock = getWriteLock(data.getContainerID());
writeLock.lock();
try (FileOutputStream outStream = new FileOutputStream(getContainerChecksumFile(data))) {
checksumInfo.writeTo(outStream);
} finally {
writeLock.unlock();
}
}

/**
* This class represents the difference between our replica of a container and a peer's replica of a container.
* It summarizes the operations we need to do to reconcile our replica with the peer replica it was compared to.
*
* TODO HDDS-10928
*/
public static class ContainerDiff {
public ContainerDiff() {

}
}
}
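For orientation while reading the diff, here is a minimal usage sketch of the manager above. It is not part of this PR; the container data, block ID, chunk list, and deleted-block set are assumed to be supplied by the datanode's existing write and delete paths.

// Hypothetical helper illustrating the intended call pattern of ContainerChecksumTreeManager.
// Nothing in this snippet is part of the PR; all parameters come from existing datanode code.
void updateContainerChecksums(ContainerChecksumTreeManager checksumManager,
    KeyValueContainerData containerData,
    long blockID,
    Collection<ChunkInfo> chunks,
    SortedSet<Long> deletedBlockIDs) throws IOException {
  ContainerMerkleTree tree = new ContainerMerkleTree();
  tree.addChunks(blockID, chunks);
  // Rewrites the data merkle tree inside the <containerID>.tree file under the container's
  // metadata path, leaving the rest of the persisted ContainerChecksumInfo untouched.
  checksumManager.writeContainerDataTree(containerData, tree);
  // Merges these IDs into the sorted deleted-block list persisted in the same file.
  checksumManager.markBlocksAsDeleted(containerData, deletedBlockIDs);
}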
ContainerMerkleTree.java
@@ -0,0 +1,180 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.hadoop.ozone.container.checksum;

import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
import org.apache.hadoop.ozone.common.ChecksumByteBuffer;
import org.apache.hadoop.ozone.common.ChecksumByteBufferFactory;
import org.apache.hadoop.ozone.container.common.helpers.ChunkInfo;
import org.apache.ratis.thirdparty.com.google.protobuf.ByteString;

import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.SortedMap;
import java.util.TreeMap;

/**
* This class represents a Merkle tree that provides one checksum for all data within a container.
*
* As the leaves of the tree, a checksum for each chunk is computed by taking a checksum of all checksums within that
* chunk. The chunk checksums within a block are then checksummed together to generate the block level checksum.
* Finally, the checksums of all blocks are checksummed together to create a container level checksum.
* Note that checksums are order dependent. Chunk checksums are sorted by their
* offset within a block, and block checksums are sorted by their block ID.
*
* This class can be used to construct a consistent and completely filled {@link ContainerProtos.ContainerMerkleTree}
* object. It allows building a container merkle tree from scratch by incrementally adding chunks.
* The final checksums at higher levels of the tree are not calculated until
* {@link ContainerMerkleTree#toProto} is called.
*/
public class ContainerMerkleTree {

private final SortedMap<Long, BlockMerkleTree> id2Block;

/**
* Constructs an empty Container merkle tree object.
*/
public ContainerMerkleTree() {
id2Block = new TreeMap<>();
}

/**
* Adds chunks to a block in the tree. The block entry will be created if it is the first time adding chunks to it.
* If the block entry already exists, the chunks will be added to the existing chunks for that block.
*
* @param blockID The ID of the block that these chunks belong to.
* @param chunks A list of chunks to add to this block. The chunks will be sorted internally by their offset.
*/
public void addChunks(long blockID, Collection<ChunkInfo> chunks) {
id2Block.computeIfAbsent(blockID, BlockMerkleTree::new).addChunks(chunks);
}

/**
* Uses chunk hashes to compute all remaining hashes in the tree, and returns it as a protobuf object. No checksum
* computation for the tree happens outside of this method.
*
* @return A complete protobuf object representation of this tree.
*/
public ContainerProtos.ContainerMerkleTree toProto() {
// Compute checksums and return the result.
ContainerProtos.ContainerMerkleTree.Builder containerTreeBuilder = ContainerProtos.ContainerMerkleTree.newBuilder();
ChecksumByteBuffer checksumImpl = ChecksumByteBufferFactory.crc32Impl();
ByteBuffer containerChecksumBuffer = ByteBuffer.allocate(Long.BYTES * id2Block.size());

for (BlockMerkleTree blockTree: id2Block.values()) {
ContainerProtos.BlockMerkleTree blockTreeProto = blockTree.toProto();
containerTreeBuilder.addBlockMerkleTree(blockTreeProto);
// Add the block's checksum to the buffer that will be used to calculate the container checksum.
containerChecksumBuffer.putLong(blockTreeProto.getBlockChecksum());
}
containerChecksumBuffer.flip();
checksumImpl.update(containerChecksumBuffer);
Contributor: It should be possible to take the longs of the blocks and just call update with the bits shifted to calculate the checksum.

Contributor Author: Yeah, I can update this to use Java's CRC32, which has an update(int) method, instead of our own ByteBuffer-based implementation.

Contributor Author: After some offline discussion, we are going to handle any optimizations in checksum computation in HDDS-11077.
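A rough sketch of the alternative the first comment describes, feeding each block checksum into java.util.zip.CRC32 via its update(int) method instead of staging the longs in a ByteBuffer. This is illustrative only; blockChecksums stands in for the values gathered in the loop above, and the PR keeps the ByteBuffer-based approach, deferring any optimization to HDDS-11077.

// Hypothetical alternative: push each block checksum long into CRC32 one byte at a time,
// most significant byte first, matching the big-endian order of putLong above.
java.util.zip.CRC32 crc = new java.util.zip.CRC32();
for (long blockChecksum : blockChecksums) {
  for (int shift = Long.SIZE - Byte.SIZE; shift >= 0; shift -= Byte.SIZE) {
    crc.update((int) (blockChecksum >>> shift) & 0xFF);
  }
}
long dataChecksum = crc.getValue();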


return containerTreeBuilder
.setDataChecksum(checksumImpl.getValue())
.build();
}

/**
* Represents a merkle tree for a single block within a container.
*/
private static class BlockMerkleTree {
// Map of each offset within the block to its chunk info.
// Chunk order in the checksum is determined by their offset.
private final SortedMap<Long, ChunkMerkleTree> offset2Chunk;
private final long blockID;

BlockMerkleTree(long blockID) {
this.blockID = blockID;
this.offset2Chunk = new TreeMap<>();
}

/**
* Adds the specified chunks to this block. The offset value of the chunk must be unique within the block,
* otherwise it will overwrite the previous value at that offset.
*
* @param chunks A list of chunks to add to this block.
*/
public void addChunks(Collection<ChunkInfo> chunks) {
for (ChunkInfo chunk: chunks) {
offset2Chunk.put(chunk.getOffset(), new ChunkMerkleTree(chunk));
}
}

/**
* Uses chunk hashes to compute a block hash for this tree, and returns it as a protobuf object. All block checksum
* computation for the tree happens within this method.
*
* @return A complete protobuf object representation of this block tree.
*/
public ContainerProtos.BlockMerkleTree toProto() {
ContainerProtos.BlockMerkleTree.Builder blockTreeBuilder = ContainerProtos.BlockMerkleTree.newBuilder();
ChecksumByteBuffer checksumImpl = ChecksumByteBufferFactory.crc32Impl();
ByteBuffer blockChecksumBuffer = ByteBuffer.allocate(Long.BYTES * offset2Chunk.size());

for (ChunkMerkleTree chunkTree: offset2Chunk.values()) {
// Ordering of checksums within a chunk is assumed to be in the order they are written.
// This assumption is already built in to the code that reads and writes the values (see
// ChunkInputStream#validateChunk for an example on the client read path).
// There is no other value we can use to sort these checksums, so we assume the stored proto has them in the
// correct order.
ContainerProtos.ChunkMerkleTree chunkTreeProto = chunkTree.toProto();
blockTreeBuilder.addChunkMerkleTree(chunkTreeProto);
blockChecksumBuffer.putLong(chunkTreeProto.getChunkChecksum());
}
blockChecksumBuffer.flip();
checksumImpl.update(blockChecksumBuffer);

return blockTreeBuilder
.setBlockID(blockID)
.setBlockChecksum(checksumImpl.getValue())
.build();
}
}

/**
* Represents a merkle tree for a single chunk within a container.
* Each chunk has multiple checksums within it at each "bytesPerChecksum" interval.
* This class computes one checksum for the whole chunk by aggregating these.
*/
private static class ChunkMerkleTree {
private final ChunkInfo chunk;

ChunkMerkleTree(ChunkInfo chunk) {
this.chunk = chunk;
}

/**
* Computes a single hash for this ChunkInfo object. All chunk level checksum computation happens within this
* method.
*
* @return A complete protobuf representation of this chunk as a leaf in the container merkle tree.
*/
public ContainerProtos.ChunkMerkleTree toProto() {
ChecksumByteBuffer checksumImpl = ChecksumByteBufferFactory.crc32Impl();
for (ByteString checksum: chunk.getChecksumData().getChecksums()) {
checksumImpl.update(checksum.asReadOnlyByteBuffer());
}

return ContainerProtos.ChunkMerkleTree.newBuilder()
.setOffset(chunk.getOffset())
.setLength(chunk.getLen())
.setChunkChecksum(checksumImpl.getValue())
.build();
}
}
}
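To make the bottom-up aggregation described in the class javadoc concrete, a hypothetical caller might build and serialize a tree as below. The ChunkInfo lists are assumed to already carry their offsets, lengths, and per-interval checksums; none of this snippet is part of the PR.

// Sketch only: two blocks are added incrementally, then toProto() computes every checksum
// bottom-up (chunk -> block -> container) in a single pass.
ContainerMerkleTree tree = new ContainerMerkleTree();
tree.addChunks(1L, block1Chunks);   // leaf level: one checksum per chunk, ordered by offset
tree.addChunks(2L, block2Chunks);   // block checksums are ordered by block ID
ContainerProtos.ContainerMerkleTree treeProto = tree.toProto();
long containerChecksum = treeProto.getDataChecksum();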
package-info.java
@@ -0,0 +1,21 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* This package contains classes handling container level checksums.
*/
package org.apache.hadoop.ozone.container.checksum;