-
Notifications
You must be signed in to change notification settings - Fork 588
HDDS-10374. Make container scanner generate merkle trees during the scan #7490
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 19 commits
a8b8dbc
999a913
b0d1ba9
382bce2
28b1889
dc182e8
a3401a9
a25d44d
7550a3c
847f8d8
452c294
dc45eca
1cb291f
d6b21d2
2a2dbbd
c9a077c
0bbbdc5
7b971a9
15d6848
0989881
834be96
e73757e
f0d8efe
4cb054c
8abedb6
577a075
4c8d843
0bd4127
61fae12
61f30f3
192eb7b
0cf79f6
8b30f54
9c74f4b
d550669
97e02ea
22b41b8
f49a9dd
f5d4dbf
1140c90
e0aa7cb
62d7794
9c3b87c
9322b4a
9b75957
f615275
dadc829
076a82e
cc55527
35879b4
1ab8c14
6c8be07
60a1a6e
d035c17
53336ae
56e7ed4
2504638
4be9992
c0b89dd
e24a24e
dc27f74
e2974b4
34b4b9a
5fda700
de6a757
1574cdd
7ff972c
02a3ac6
54cbf92
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -188,7 +188,6 @@ public DataScanResult fullCheck(DataTransferThrottler throttler, Canceler cancel | |
|
|
||
| LOG.debug("Running data checks for container {}", containerID); | ||
| try { | ||
| // TODO HDDS-10374 this tree will get updated with the container's contents as it is scanned. | ||
| ContainerMerkleTreeWriter dataTree = new ContainerMerkleTreeWriter(); | ||
| List<ContainerScanError> dataErrors = scanData(dataTree, throttler, canceler); | ||
| if (containerIsDeleted()) { | ||
|
|
@@ -375,12 +374,15 @@ private List<ContainerScanError> scanBlock(DBHandle db, File dbFile, BlockData b | |
| // So, we need to make sure, chunk length > 0, before declaring | ||
| // the missing chunk file. | ||
| if (!block.getChunks().isEmpty() && block.getChunks().get(0).getLen() > 0) { | ||
| ContainerScanError error = new ContainerScanError(FailureType.MISSING_CHUNK_FILE, | ||
| ContainerScanError error = new ContainerScanError(FailureType.MISSING_DATA_FILE, | ||
| new File(containerDataFromDisk.getChunksPath()), new IOException("Missing chunk file " + | ||
| chunkFile.getAbsolutePath())); | ||
| blockErrors.add(error); | ||
| } | ||
| } else if (chunk.getChecksumData().getType() != ContainerProtos.ChecksumType.NONE) { | ||
| // Before adding chunks, add a block entry to the tree to represent cases where the block exists but has no | ||
| // chunks. | ||
| currentTree.addBlock(block.getBlockID().getLocalID()); | ||
errose28 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| int bytesPerChecksum = chunk.getChecksumData().getBytesPerChecksum(); | ||
| ByteBuffer buffer = BUFFER_POOL.getBuffer(bytesPerChecksum); | ||
| // Keep scanning the block even if there are errors with individual chunks. | ||
|
|
@@ -418,6 +420,14 @@ private static List<ContainerScanError> verifyChecksum(BlockData block, | |
|
|
||
| List<ContainerScanError> scanErrors = new ArrayList<>(); | ||
|
|
||
| // Information used to populate the merkle tree. Chunk metadata will be the same, but we must fill in the | ||
| // checksums with what we actually observe. | ||
| ContainerProtos.ChunkInfo.Builder observedChunkBuilder = chunk.toBuilder(); | ||
| ContainerProtos.ChecksumData.Builder observedChecksumData = chunk.getChecksumData().toBuilder(); | ||
| observedChecksumData.clearChecksums(); | ||
| boolean chunkHealthy = true; | ||
| boolean chunkMissing = false; | ||
|
|
||
| ChecksumData checksumData = | ||
| ChecksumData.getFromProtoBuf(chunk.getChecksumData()); | ||
| int checksumCount = checksumData.getChecksums().size(); | ||
|
|
@@ -430,10 +440,7 @@ private static List<ContainerScanError> verifyChecksum(BlockData block, | |
| if (layout == ContainerLayoutVersion.FILE_PER_BLOCK) { | ||
| channel.position(chunk.getOffset()); | ||
| } | ||
| // Only report one error per chunk. Reporting corruption at every "bytes per checksum" interval will lead to a | ||
| // large amount of errors when a full chunk is corrupted. | ||
| boolean chunkHealthy = true; | ||
| for (int i = 0; i < checksumCount && chunkHealthy; i++) { | ||
| for (int i = 0; i < checksumCount; i++) { | ||
| // limit last read for FILE_PER_BLOCK, to avoid reading next chunk | ||
| if (layout == ContainerLayoutVersion.FILE_PER_BLOCK && | ||
| i == checksumCount - 1 && | ||
|
|
@@ -453,7 +460,11 @@ private static List<ContainerScanError> verifyChecksum(BlockData block, | |
| ByteString expected = checksumData.getChecksums().get(i); | ||
| ByteString actual = cal.computeChecksum(buffer) | ||
| .getChecksums().get(0); | ||
| if (!expected.equals(actual)) { | ||
| observedChecksumData.addChecksums(actual); | ||
| // Only report one error per chunk. Reporting corruption at every "bytes per checksum" interval will lead to a | ||
| // large amount of errors when a full chunk is corrupted. | ||
| // Continue scanning the chunk even after the first error so the full merkle tree can be built. | ||
| if (chunkHealthy && !expected.equals(actual)) { | ||
| String message = String | ||
| .format("Inconsistent read for chunk=%s" + | ||
| " checksum item %d" + | ||
|
|
@@ -465,26 +476,46 @@ private static List<ContainerScanError> verifyChecksum(BlockData block, | |
| StringUtils.bytes2Hex(expected.asReadOnlyByteBuffer()), | ||
| StringUtils.bytes2Hex(actual.asReadOnlyByteBuffer()), | ||
| block.getBlockID()); | ||
| chunkHealthy = false; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We can break the loop here because smallest unit is chunk. |
||
| scanErrors.add(new ContainerScanError(FailureType.CORRUPT_CHUNK, chunkFile, | ||
| new OzoneChecksumException(message))); | ||
| chunkHealthy = false; | ||
| } | ||
| } | ||
| // If all the checksums match, also check that the length stored in the metadata matches the number of bytes | ||
| // seen on the disk. | ||
|
|
||
| observedChunkBuilder.setLen(bytesRead); | ||
| // If we haven't seen any errors after scanning the whole chunk, verify that the length stored in the metadata | ||
| // matches the number of bytes seen on the disk. | ||
| if (chunkHealthy && bytesRead != chunk.getLen()) { | ||
| String message = String | ||
| .format("Inconsistent read for chunk=%s expected length=%d" | ||
| + " actual length=%d for block %s", | ||
| chunk.getChunkName(), | ||
| chunk.getLen(), bytesRead, block.getBlockID()); | ||
| scanErrors.add(new ContainerScanError(FailureType.INCONSISTENT_CHUNK_LENGTH, chunkFile, | ||
| new IOException(message))); | ||
| if (bytesRead == 0) { | ||
| // If we could not find any data for the chunk, report it as missing. | ||
| chunkMissing = true; | ||
| chunkHealthy = false; | ||
| String message = String.format("Missing chunk=%s with expected length=%d for block %s", | ||
| chunk.getChunkName(), chunk.getLen(), block.getBlockID()); | ||
| scanErrors.add(new ContainerScanError(FailureType.MISSING_CHUNK, chunkFile, new IOException(message))); | ||
| } else { | ||
| // We found data for the chunk, but it was shorter than expected. | ||
| String message = String | ||
| .format("Inconsistent read for chunk=%s expected length=%d" | ||
| + " actual length=%d for block %s", | ||
| chunk.getChunkName(), | ||
| chunk.getLen(), bytesRead, block.getBlockID()); | ||
| chunkHealthy = false; | ||
| scanErrors.add(new ContainerScanError(FailureType.INCONSISTENT_CHUNK_LENGTH, chunkFile, | ||
| new IOException(message))); | ||
| } | ||
| } | ||
| } catch (IOException ex) { | ||
| scanErrors.add(new ContainerScanError(FailureType.MISSING_CHUNK_FILE, chunkFile, ex)); | ||
| // An unknown error occurred trying to access the chunk. Report it as corrupted. | ||
| chunkHealthy = false; | ||
| scanErrors.add(new ContainerScanError(FailureType.CORRUPT_CHUNK, chunkFile, ex)); | ||
| } | ||
|
|
||
| // Missing chunks should not be added to the merkle tree. | ||
| if (!chunkMissing) { | ||
| observedChunkBuilder.setChecksumData(observedChecksumData); | ||
| currentTree.addChunks(block.getBlockID().getLocalID(), chunkHealthy, observedChunkBuilder.build()); | ||
| } | ||
| return scanErrors; | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -626,13 +626,18 @@ ContainerCommandResponseProto handleCloseContainer( | |
| return getSuccessResponse(request); | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * Create a Merkle tree for the container if it does not exist. | ||
| * Write the merkle tree for this container using the existing checksum metadata only. The data is not read or | ||
| * validated by this method, so it is expected to run quickly. | ||
| * | ||
| * If a checksum file already exists on the disk, this method will do nothing. The existing file would have either | ||
| * been made from the metadata or data itself so there is no need to recreate it from the metadata. | ||
| * | ||
| * TODO: This method should be changed to private after HDDS-10374 is merged. | ||
| * | ||
| * @param container The container which will have a tree generated. | ||
| */ | ||
| @VisibleForTesting | ||
| public void createContainerMerkleTree(Container container) { | ||
| public void createContainerMerkleTreeFromMetadata(Container container) { | ||
| if (ContainerChecksumTreeManager.checksumFileExist(container)) { | ||
| return; | ||
| } | ||
|
|
@@ -1393,7 +1398,7 @@ public void markContainerForClose(Container container) | |
| } finally { | ||
| container.writeUnlock(); | ||
| } | ||
| createContainerMerkleTree(container); | ||
| createContainerMerkleTreeFromMetadata(container); | ||
| ContainerLogger.logClosing(container.getContainerData()); | ||
| sendICR(container); | ||
| } | ||
|
|
@@ -1426,7 +1431,6 @@ public void markContainerUnhealthy(Container container, ScanResult reason) | |
| } finally { | ||
| container.writeUnlock(); | ||
| } | ||
| createContainerMerkleTree(container); | ||
|
||
| // Even if the container file is corrupted/missing and the unhealthy | ||
| // update fails, the unhealthy state is kept in memory and sent to | ||
| // SCM. Write a corresponding entry to the container log as well. | ||
|
|
@@ -1457,7 +1461,7 @@ public void quasiCloseContainer(Container container, String reason) | |
| } finally { | ||
| container.writeUnlock(); | ||
| } | ||
| createContainerMerkleTree(container); | ||
| createContainerMerkleTreeFromMetadata(container); | ||
| ContainerLogger.logQuasiClosed(container.getContainerData(), reason); | ||
| sendICR(container); | ||
| } | ||
|
|
@@ -1491,7 +1495,7 @@ public void closeContainer(Container container) | |
| } finally { | ||
| container.writeUnlock(); | ||
| } | ||
| createContainerMerkleTree(container); | ||
| createContainerMerkleTreeFromMetadata(container); | ||
| ContainerLogger.logClosed(container.getContainerData()); | ||
| sendICR(container); | ||
| } | ||
|
|
@@ -1599,12 +1603,12 @@ private ContainerProtos.ContainerChecksumInfo updateAndGetContainerChecksum(KeyV | |
| BlockData blockData = blockIterator.nextBlock(); | ||
| List<ContainerProtos.ChunkInfo> chunkInfos = blockData.getChunks(); | ||
| // TODO: Add empty blocks to the merkle tree. Done in HDDS-10374, needs to be backported. | ||
| merkleTree.addChunks(blockData.getLocalID(), chunkInfos); | ||
| // Assume all chunks are healthy when building the tree from metadata. Scanner will identify corruption when | ||
| // it runs after. | ||
| merkleTree.addChunks(blockData.getLocalID(), true, chunkInfos); | ||
| } | ||
| } | ||
| ContainerProtos.ContainerChecksumInfo checksumInfo = checksumManager | ||
| .writeContainerDataTree(containerData, merkleTree); | ||
| return checksumInfo; | ||
| return checksumManager.writeContainerDataTree(containerData, merkleTree); | ||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,9 +32,10 @@ public enum FailureType { | |
| MISSING_METADATA_DIR, | ||
| MISSING_CONTAINER_FILE, | ||
| MISSING_CHUNKS_DIR, | ||
| MISSING_CHUNK_FILE, | ||
| MISSING_DATA_FILE, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. qq: Is this for the cases when
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes. I changed the name so it is agnostic of container layout. |
||
| CORRUPT_CONTAINER_FILE, | ||
| CORRUPT_CHUNK, | ||
| MISSING_CHUNK, | ||
| INCONSISTENT_CHUNK_LENGTH, | ||
| INACCESSIBLE_DB, | ||
| WRITE_FAILURE, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should there be a way to tell if a checksum failed vs. scanner has not yet run? Should failure to generate checksum == -1?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As discussed, the in-memory hash should be what is written to disk as it is an in-memory cache. If updating the Merkle tree is failing, we can have a metric + log message and/or a status report to SCM or a new Admin API to Datanode to query the Merkle tree (I think there are going to be multiple debug scenarios where we would like to query a Datanode what it knows independent of SCM reports).