diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java index 8a1afc5a6be3..584f7d87cd02 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java @@ -29,7 +29,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; -import java.nio.file.NoSuchFileException; import java.util.Collection; import java.util.List; import java.util.Map; @@ -409,7 +408,7 @@ private SortedSet getDeletedBlockIDs(ContainerProtos.ContainerChecksumInfo public ByteString getContainerChecksumInfo(KeyValueContainerData data) throws IOException { File checksumFile = getContainerChecksumFile(data); if (!checksumFile.exists()) { - throw new NoSuchFileException("Checksum file does not exist for container #" + data.getContainerID()); + throw new FileNotFoundException("Checksum file does not exist for container #" + data.getContainerID()); } try (InputStream inStream = Files.newInputStream(checksumFile.toPath())) { diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java index b79c0b780506..0b83e34f6d2b 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java @@ -63,6 +63,7 @@ import com.google.common.base.Preconditions; import com.google.common.util.concurrent.Striped; import java.io.File; +import java.io.FileNotFoundException; import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStream; @@ -760,14 +761,33 @@ ContainerCommandResponseProto handleGetContainerChecksumInfo( try { checksumTree = checksumManager.getContainerChecksumInfo(containerData); } catch (IOException ex) { - // For file not found or other inability to read the file, return an error to the client. - LOG.error("Error occurred when reading checksum file for container {}", containerData.getContainerID(), ex); - return ContainerCommandResponseProto.newBuilder() - .setCmdType(request.getCmdType()) - .setTraceID(request.getTraceID()) - .setResult(IO_EXCEPTION) - .setMessage(ex.getMessage()) - .build(); + // Only build from metadata if the file doesn't exist + if (ex instanceof FileNotFoundException) { + try { + LOG.info("Checksum tree file not found for container {}. Building merkle tree from container metadata.", + containerData.getContainerID()); + ContainerProtos.ContainerChecksumInfo checksumInfo = updateAndGetContainerChecksumFromMetadata(kvContainer); + checksumTree = checksumInfo.toByteString(); + } catch (IOException metadataEx) { + LOG.error("Failed to build merkle tree from metadata for container {}", + containerData.getContainerID(), metadataEx); + return ContainerCommandResponseProto.newBuilder() + .setCmdType(request.getCmdType()) + .setTraceID(request.getTraceID()) + .setResult(IO_EXCEPTION) + .setMessage("Failed to get or build merkle tree: " + metadataEx.getMessage()) + .build(); + } + } else { + // For other inability to read the file, return an error to the client. + LOG.error("Error occurred when reading checksum file for container {}", containerData.getContainerID(), ex); + return ContainerCommandResponseProto.newBuilder() + .setCmdType(request.getCmdType()) + .setTraceID(request.getTraceID()) + .setResult(IO_EXCEPTION) + .setMessage(ex.getMessage()) + .build(); + } } return getGetContainerMerkleTreeResponse(request, checksumTree); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/checksum/TestContainerCommandReconciliation.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/checksum/TestContainerCommandReconciliation.java index e4e25566bf42..dc3522f094ec 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/checksum/TestContainerCommandReconciliation.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/checksum/TestContainerCommandReconciliation.java @@ -58,6 +58,7 @@ import static org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod.KERBEROS; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -101,6 +102,7 @@ import org.apache.hadoop.ozone.client.OzoneVolume; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.container.TestHelper; +import org.apache.hadoop.ozone.container.checksum.ContainerChecksumTreeManager; import org.apache.hadoop.ozone.container.checksum.ContainerMerkleTreeWriter; import org.apache.hadoop.ozone.container.checksum.DNContainerOperationClient; import org.apache.hadoop.ozone.container.common.helpers.BlockData; @@ -230,11 +232,10 @@ public void testGetChecksumInfoNonexistentReplica() { } /** - * Tests reading the container checksum info file from a datanode where the container exists, but the file has not - * yet been created. + * Tests container checksum file creation if it doesn't exist during getContainerChecksumInfo call. */ @Test - public void testGetChecksumInfoNonexistentFile() throws Exception { + public void testMerkleTreeCreationDuringGetChecksumInfo() throws Exception { String volume = UUID.randomUUID().toString(); String bucket = UUID.randomUUID().toString(); long containerID = writeDataAndGetContainer(true, volume, bucket); @@ -244,14 +245,39 @@ public void testGetChecksumInfoNonexistentFile() throws Exception { .getContainerSet().getContainer(containerID); File treeFile = getContainerChecksumFile(container.getContainerData()); // Closing the container should have generated the tree file. + ContainerProtos.ContainerChecksumInfo srcChecksumInfo = ContainerChecksumTreeManager.readChecksumInfo( + container.getContainerData()).get(); assertTrue(treeFile.exists()); assertTrue(treeFile.delete()); + ContainerProtos.ContainerChecksumInfo destChecksumInfo = dnClient.getContainerChecksumInfo( + containerID, targetDN.getDatanodeDetails()); + assertNotNull(destChecksumInfo); + assertTreesSortedAndMatch(srcChecksumInfo.getContainerMerkleTree(), destChecksumInfo.getContainerMerkleTree()); + } + + /** + * Tests reading the container checksum info file from a datanode where there's an IO error + * that's not related to file not found (e.g., permission error). Such errors should not + * trigger fallback to building from metadata. + */ + @Test + public void testGetChecksumInfoIOError() throws Exception { + String volume = UUID.randomUUID().toString(); + String bucket = UUID.randomUUID().toString(); + long containerID = writeDataAndGetContainer(true, volume, bucket); + // Pick a datanode and make its checksum file unreadable to simulate permission error. + HddsDatanodeService targetDN = cluster.getHddsDatanodes().get(0); + Container container = targetDN.getDatanodeStateMachine().getContainer() + .getContainerSet().getContainer(containerID); + File treeFile = getContainerChecksumFile(container.getContainerData()); + assertTrue(treeFile.exists()); + // Make the server unable to read the file (permission error, not file not found). + assertTrue(treeFile.setReadable(false)); + StorageContainerException ex = assertThrows(StorageContainerException.class, () -> dnClient.getContainerChecksumInfo(containerID, targetDN.getDatanodeDetails())); assertEquals(ContainerProtos.Result.IO_EXCEPTION, ex.getResult()); - assertTrue(ex.getMessage().contains("Checksum file does not exist"), ex.getMessage() + - " did not contain the expected string"); } /**