HDDS-10316. Speed up TestReconTasks #6223
Changes from all commits
@@ -41,8 +41,12 @@
 import org.apache.ozone.test.LambdaTestUtils;
 import org.hadoop.ozone.recon.schema.ContainerSchemaDefinition;
 import org.hadoop.ozone.recon.schema.tables.pojos.UnhealthyContainers;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.MethodOrderer.OrderAnnotation;
+import org.junit.jupiter.api.Order;
+import org.junit.jupiter.api.TestMethodOrder;
+import org.junit.jupiter.api.TestInstance;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.Timeout;
 import org.slf4j.event.Level;
@@ -58,12 +62,14 @@
  * Integration Tests for Recon's tasks.
  */
 @Timeout(300)
+@TestInstance(TestInstance.Lifecycle.PER_CLASS)
+@TestMethodOrder(OrderAnnotation.class)
 public class TestReconTasks {
   private MiniOzoneCluster cluster = null;
   private OzoneConfiguration conf;

-  @BeforeEach
-  public void init() throws Exception {
+  @BeforeAll
+  void init() throws Exception {
     conf = new OzoneConfiguration();
     conf.set(HDDS_CONTAINER_REPORT_INTERVAL, "5s");
     conf.set(HDDS_PIPELINE_REPORT_INTERVAL, "5s");
@@ -74,21 +80,22 @@ public void init() throws Exception {
     conf.set("ozone.scm.stale.node.interval", "6s");
     conf.set("ozone.scm.dead.node.interval", "8s");
-    cluster = MiniOzoneCluster.newBuilder(conf).setNumDatanodes(1)
+    cluster = MiniOzoneCluster.newBuilder(conf).setNumDatanodes(3)
         .includeRecon(true).build();
     cluster.waitForClusterToBeReady();
     GenericTestUtils.setLogLevel(SCMDatanodeHeartbeatDispatcher.LOG,
         Level.DEBUG);
   }

-  @AfterEach
-  public void shutdown() {
+  @AfterAll
+  void shutdown() {
     if (cluster != null) {
       cluster.shutdown();
     }
   }

   @Test
+  @Order(3)
   public void testSyncSCMContainerInfo() throws Exception {
     ReconStorageContainerManagerFacade reconScm =
         (ReconStorageContainerManagerFacade)
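As context for the annotations added above: with a PER_CLASS test instance lifecycle, JUnit 5 allows @BeforeAll/@AfterAll to be non-static instance methods, so the MiniOzoneCluster is built once and shared by all tests, which then run in the sequence given by @Order. The following is a minimal, self-contained sketch of that pattern; the class, method, and fixture names are illustrative and are not part of this patch.

import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.MethodOrderer.OrderAnnotation;
import org.junit.jupiter.api.Order;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInstance;
import org.junit.jupiter.api.TestMethodOrder;

// Illustrative sketch only (not from this patch): PER_CLASS lifecycle lets
// @BeforeAll/@AfterAll be instance methods, so expensive setup happens once
// for the whole class, and @Order fixes the sequence the tests run in.
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@TestMethodOrder(OrderAnnotation.class)
class SharedClusterLifecycleExampleTest {

  private StringBuilder sharedFixture; // stands in for the shared MiniOzoneCluster

  @BeforeAll
  void startOnce() {
    // Runs exactly once, before the first @Test method.
    sharedFixture = new StringBuilder("cluster-started");
  }

  @Test
  @Order(1)
  void firstOrderedTest() {
    // Sees the state created in startOnce(); runs before secondOrderedTest().
    sharedFixture.append(":test1");
  }

  @Test
  @Order(2)
  void secondOrderedTest() {
    sharedFixture.append(":test2");
  }

  @AfterAll
  void stopOnce() {
    // Runs exactly once, after the last @Test method.
    sharedFixture = null;
  }
}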
@@ -121,6 +128,7 @@ public void testSyncSCMContainerInfo() throws Exception {
   }

   @Test
+  @Order(1)
   public void testMissingContainerDownNode() throws Exception {
     ReconStorageContainerManagerFacade reconScm =
         (ReconStorageContainerManagerFacade)
@@ -141,7 +149,7 @@ public void testMissingContainerDownNode() throws Exception {
         (ReconContainerManager) reconScm.getContainerManager();
     ContainerInfo containerInfo =
         scmContainerManager
-            .allocateContainer(RatisReplicationConfig.getInstance(ONE), "test");
+            .allocateContainer(RatisReplicationConfig.getInstance(ONE), "testMissingContainer");
Contributor
If you are increasing the datanodes, then it is better to keep the replication factor as THREE as well.
     long containerID = containerInfo.getContainerID();

     try (RDBBatchOperation rdbBatchOperation = new RDBBatchOperation()) {
@@ -181,6 +189,8 @@ public void testMissingContainerDownNode() throws Exception {
             0, 1000);
         return (allMissingContainers.isEmpty());
       });
+    // Cleaning up some data
+    scmContainerManager.deleteContainer(containerInfo.containerID());
     IOUtils.closeQuietly(client);
   }
@@ -202,6 +212,7 @@ public void testMissingContainerDownNode() throws Exception {
    * @throws Exception
    */
   @Test
+  @Order(2)
   public void testEmptyMissingContainerDownNode() throws Exception {
     ReconStorageContainerManagerFacade reconScm =
         (ReconStorageContainerManagerFacade)
@@ -219,9 +230,10 @@ public void testEmptyMissingContainerDownNode() throws Exception {
     ContainerManager scmContainerManager = scm.getContainerManager();
     ReconContainerManager reconContainerManager =
         (ReconContainerManager) reconScm.getContainerManager();
+    int previousContainerCount = reconContainerManager.getContainers().size();
     ContainerInfo containerInfo =
         scmContainerManager
-            .allocateContainer(RatisReplicationConfig.getInstance(ONE), "test");
+            .allocateContainer(RatisReplicationConfig.getInstance(ONE), "testEmptyMissingContainer");
Contributor
If you are increasing the datanodes, then it is better to keep the replication factor as THREE as well.

Author
I tried changing it to HddsProtos.ReplicationFactor.THREE, but it seems to have a problem with the number of pipelines.

Contributor
Ok, this might be because of not meeting the criteria of sufficient healthy nodes because default
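To make the suggestion concrete, here is a hedged sketch of what the change under discussion would look like at the allocation call shown above. It is not part of this patch; the identifiers are the ones already used in the test, and it assumes an open Ratis THREE pipeline is available across the 3 datanodes, which is the condition the author reports was not being met.

// Hypothetical variant of the allocateContainer(...) call above, switching the
// replication factor from ONE to HddsProtos.ReplicationFactor.THREE. The call
// can fail if no healthy THREE pipeline exists yet among the datanodes.
ContainerInfo containerInfo =
    scmContainerManager
        .allocateContainer(
            RatisReplicationConfig.getInstance(
                HddsProtos.ReplicationFactor.THREE),
            "testEmptyMissingContainer");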
     long containerID = containerInfo.getContainerID();

     Pipeline pipeline =
@@ -230,8 +242,8 @@ public void testEmptyMissingContainerDownNode() throws Exception {
     runTestOzoneContainerViaDataNode(containerID, client);

     // Make sure Recon got the container report with new container.
-    assertEquals(scmContainerManager.getContainers(),
-        reconContainerManager.getContainers());
+    assertEquals(scmContainerManager.getContainers().size(),
+        reconContainerManager.getContainers().size() - previousContainerCount);

     // Bring down the Datanode that had the container replica.
     cluster.shutdownHddsDatanode(pipeline.getFirstNode());
@@ -305,7 +317,8 @@ public void testEmptyMissingContainerDownNode() throws Exception {
             0, 1000);
         return (allMissingContainers.isEmpty());
       });
-
+    // Cleaning up some data
+    reconContainerManager.deleteContainer(containerInfo.containerID());
     IOUtils.closeQuietly(client);
   }
 }
Contributor
Thanks @raju-balpande for working on this patch. What is the need to increase the number of datanodes in the cluster?
Author
With 1 datanode it was working fine locally, but it was getting stuck in a wait condition in CI. After multiple such tries I found the CI working when we switched to 3 datanodes.
The performance after the changes can be viewed in https://github.com/raju-balpande/apache_ozone/actions/runs/8091976415/job/22112283975
and previously it was https://github.com/raju-balpande/apache_ozone/actions/runs/7843423779/job/21404234425
Contributor
Can you tell which test case and which wait condition it was getting stuck on when 1 DN was used for the cluster?
Author
Hi @devmadhuu,
It was getting stuck on the wait condition at TestReconTasks.java:173:

LambdaTestUtils.await(120000, 6000, () -> {
  List<UnhealthyContainers> allMissingContainers =
      reconContainerManager.getContainerSchemaManager()
          .getUnhealthyContainers(
              ContainerSchemaDefinition.UnHealthyContainerStates.MISSING,
              0, 1000);
  return (allMissingContainers.size() == 1);
});

As I see in the log https://github.com/raju-balpande/apache_ozone/actions/runs/7916623862/job/21611614999

Error: Tests run: 3, Failures: 0, Errors: 1, Skipped: 0, Time elapsed: 365.519 s <<< FAILURE! - in org.apache.hadoop.ozone.recon.TestReconTasks
Error: org.apache.hadoop.ozone.recon.TestReconTasks.testMissingContainerDownNode  Time elapsed: 300.006 s <<< ERROR!
java.util.concurrent.TimeoutException: testMissingContainerDownNode() timed out after 300 seconds
  at java.util.ArrayList.forEach(ArrayList.java:1259)
  at java.util.ArrayList.forEach(ArrayList.java:1259)
  Suppressed: java.lang.InterruptedException: sleep interrupted
    at java.lang.Thread.sleep(Native Method)
    at org.apache.ozone.test.LambdaTestUtils.await(LambdaTestUtils.java:133)
    at org.apache.ozone.test.LambdaTestUtils.await(LambdaTestUtils.java:180)
    at org.apache.hadoop.ozone.recon.TestReconTasks.testMissingContainerDownNode(TestReconTasks.java:173)
    at java.lang.reflect.Method.invoke(Method.java:498)
Contributor
Ok, thanks for the update @raju-balpande. However, I am not sure why the above test condition should time out with just 1 DN in CI and pass locally, because with 1 DN in the cluster we are shutting down that only DN, so the missing container count should be 1.