HDFS-16479. EC: NameNode should not send a reconstruction work when the source datanodes are insufficient #4138
BlockManager.java:

```diff
@@ -2163,6 +2163,17 @@ BlockReconstructionWork scheduleReconstruction(BlockInfo block,
       return null;
     }
 
+    // skip if source datanodes for reconstructing ec block are not enough
+    if (block.isStriped()) {
+      BlockInfoStriped stripedBlock = (BlockInfoStriped) block;
+      int cellsNum = (int) ((stripedBlock.getNumBytes() - 1) / stripedBlock.getCellSize() + 1);
+      int minRequiredSources = Math.min(cellsNum, stripedBlock.getDataBlockNum());
+      if (minRequiredSources > srcNodes.length) {
+        LOG.debug("Block {} cannot be reconstructed due to shortage of source datanodes ", block);
+        return null;
+      }
+    }
+
     // liveReplicaNodes can include READ_ONLY_SHARED replicas which are
     // not included in the numReplicas.liveReplicas() count
     assert liveReplicaNodes.size() >= numReplicas.liveReplicas();
```

Member (inline comment on the added `return null;`): Should we increment the metrics before returning?
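For intuition, here is a minimal standalone sketch of the arithmetic in the new guard (not HDFS code; the cell size and layout constants are assumed for illustration). The number of cells is the ceiling of numBytes / cellSize, and the required source count is capped at the policy's data-block number, so a short EC block needs fewer live sources than a full stripe:

```java
public class MinRequiredSourcesSketch {

  // Mirrors the patch: cellsNum = ceil(numBytes / cellSize),
  // minRequiredSources = min(cellsNum, dataBlockNum).
  static int minRequiredSources(long numBytes, long cellSize, int dataBlockNum) {
    int cellsNum = (int) ((numBytes - 1) / cellSize + 1);
    return Math.min(cellsNum, dataBlockNum);
  }

  public static void main(String[] args) {
    long cellSize = 1024 * 1024; // hypothetical 1 MiB cell size
    int dataBlockNum = 3;        // RS-3-2: 3 data units, 2 parity units

    // Full stripe: all 3 data units hold a cell, so 3 sources are required.
    System.out.println(minRequiredSources(3 * cellSize, cellSize, dataBlockNum)); // 3

    // Short block: only 2 cells were written, so 2 sources suffice.
    System.out.println(minRequiredSources(2 * cellSize, cellSize, dataBlockNum)); // 2
  }
}
```

This is exactly the shape the two new tests exercise below: the first uses a full stripe (3 required sources), the second a short block (2 required sources).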
TestBlockManager.java:

```diff
@@ -852,6 +852,101 @@ public void testChooseSrcDNWithDupECInDecommissioningNode() throws Exception {
         0, numReplicas.redundantInternalBlocks());
   }
 
+  @Test
+  public void testSkipReconstructionWithManyBusyNodes() {
+    long blockId = -9223372036854775776L; // real ec block id
+    // RS-3-2 EC policy
+    ErasureCodingPolicy ecPolicy =
+        SystemErasureCodingPolicies.getPolicies().get(1);
+
+    // striped blockInfo: 3 data blocks + 2 parity blocks
+    Block aBlock = new Block(blockId, ecPolicy.getCellSize() * ecPolicy.getNumDataUnits(), 0);
+    BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlock, ecPolicy);
+
+    // create 4 storageInfo, which means 1 block is missing
+    DatanodeStorageInfo ds1 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage1", "1.1.1.1", "rack1", "host1");
+    DatanodeStorageInfo ds2 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage2", "2.2.2.2", "rack2", "host2");
+    DatanodeStorageInfo ds3 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage3", "3.3.3.3", "rack3", "host3");
+    DatanodeStorageInfo ds4 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage4", "4.4.4.4", "rack4", "host4");
+
+    // link block with storage
+    aBlockInfoStriped.addStorage(ds1, aBlock);
+    aBlockInfoStriped.addStorage(ds2, new Block(blockId + 1, 0, 0));
+    aBlockInfoStriped.addStorage(ds3, new Block(blockId + 2, 0, 0));
+    aBlockInfoStriped.addStorage(ds4, new Block(blockId + 3, 0, 0));
+
+    addEcBlockToBM(blockId, ecPolicy);
+    aBlockInfoStriped.setBlockCollectionId(mockINodeId);
+
+    // reconstruction should be scheduled
+    BlockReconstructionWork work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNotNull(work);
+
+    // simulate that 2 nodes reach maxReplicationStreams
+    for (int i = 0; i < bm.maxReplicationStreams; i++) {
+      ds3.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
+      ds4.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
+    }
+
+    // reconstruction should be skipped since the number of non-busy nodes is not enough
+    work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNull(work);
+  }
+
+  @Test
+  public void testSkipReconstructionWithManyBusyNodes2() {
+    long blockId = -9223372036854775776L; // real ec block id
+    // RS-3-2 EC policy
+    ErasureCodingPolicy ecPolicy =
+        SystemErasureCodingPolicies.getPolicies().get(1);
+
+    // striped blockInfo: 2 data blocks + 2 parity blocks
+    Block aBlock = new Block(blockId, ecPolicy.getCellSize() * (ecPolicy.getNumDataUnits() - 1), 0);
+    BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlock, ecPolicy);
+
+    // create 3 storageInfo, which means 1 block is missing
+    DatanodeStorageInfo ds1 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage1", "1.1.1.1", "rack1", "host1");
+    DatanodeStorageInfo ds2 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage2", "2.2.2.2", "rack2", "host2");
+    DatanodeStorageInfo ds3 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage3", "3.3.3.3", "rack3", "host3");
+
+    // link block with storage
+    aBlockInfoStriped.addStorage(ds1, aBlock);
+    aBlockInfoStriped.addStorage(ds2, new Block(blockId + 1, 0, 0));
+    aBlockInfoStriped.addStorage(ds3, new Block(blockId + 2, 0, 0));
+
+    addEcBlockToBM(blockId, ecPolicy);
+    aBlockInfoStriped.setBlockCollectionId(mockINodeId);
+
+    // reconstruction should be scheduled
+    BlockReconstructionWork work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNotNull(work);
+
+    // simulate that 1 node reaches maxReplicationStreams
+    for (int i = 0; i < bm.maxReplicationStreams; i++) {
+      ds2.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
+    }
+
+    // reconstruction should still be scheduled since 2 non-busy source nodes
+    // remain for the 2 data blocks
+    work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNotNull(work);
+
+    // simulate that 1 more node reaches maxReplicationStreams
+    for (int i = 0; i < bm.maxReplicationStreams; i++) {
+      ds3.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
+    }
+
+    // reconstruction should be skipped since the number of non-busy nodes is not enough
+    work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNull(work);
+  }
+
   @Test
   public void testFavorDecomUntilHardLimit() throws Exception {
     bm.maxReplicationStreams = 0;
```
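For context on how these tests mark nodes busy: repeatedly calling incrementPendingReplicationWithoutTargets() raises a node's pending reconstruction count to maxReplicationStreams, after which source selection passes it over. Below is a simplified, assumed model of that filter, for illustration only; the real logic lives in BlockManager#chooseSourceDatanodes and also handles decommissioning nodes and the replication hard limit:

```java
public class BusySourceSketch {

  // Assumed behavior for this sketch: a node at or above its
  // reconstruction-stream limit is treated as busy and is not
  // offered as a source for new EC reconstruction work.
  static boolean isEligibleSource(int pendingReconstructions, int maxReplicationStreams) {
    return pendingReconstructions < maxReplicationStreams;
  }

  public static void main(String[] args) {
    int maxReplicationStreams = 2; // hypothetical limit

    // Shape of testSkipReconstructionWithManyBusyNodes2 after the first
    // loop: 3 live nodes, one of them (ds2) incremented to the limit.
    int[] pending = {0, 2, 0};
    int eligible = 0;
    for (int p : pending) {
      if (isEligibleSource(p, maxReplicationStreams)) {
        eligible++;
      }
    }
    // 2 eligible sources >= 2 required cells, so work is still scheduled.
    System.out.println("eligible sources: " + eligible);
  }
}
```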
Review comment: Is this logic the same as BlockInfoStriped.getRealDataBlockNum()? Can we use or extract the logic from there, or do some refactoring there, so the logic is kept in one place? That way, if there is an issue in the logic, changing it in one place fixes all the places.
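If that refactoring is adopted, the guard could delegate to the existing accessor instead of recomputing the cell arithmetic inline. A hedged sketch, assuming getRealDataBlockNum() returns min(dataBlockNum, ceil(numBytes / cellSize)) for blocks in this state, as the comment implies:

```java
// Sketch only: assumes getRealDataBlockNum() already encodes
// min(dataBlockNum, ceil(numBytes / cellSize)) for this block state.
if (block.isStriped()) {
  BlockInfoStriped stripedBlock = (BlockInfoStriped) block;
  if (stripedBlock.getRealDataBlockNum() > srcNodes.length) {
    LOG.debug("Block {} cannot be reconstructed due to shortage of"
        + " source datanodes ", block);
    return null;
  }
}
```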