diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java index 5cd5b788495d..b65a17207e63 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java @@ -50,12 +50,11 @@ import org.apache.ozone.test.tag.Flaky; import org.apache.ratis.server.protocol.TermIndex; import org.assertj.core.api.Fail; -import org.junit.Ignore; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; import org.junit.jupiter.api.Timeout; import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.params.ParameterizedTest; @@ -96,8 +95,6 @@ * Tests the Ratis snapshots feature in OM. */ @Timeout(5000) -@Flaky("HDDS-8876") -@Disabled("HDDS-8880") public class TestOMRatisSnapshots { private MiniOzoneHAClusterImpl cluster = null; @@ -127,7 +124,7 @@ public class TestOMRatisSnapshots { * @throws IOException */ @BeforeEach - public void init() throws Exception { + public void init(TestInfo testInfo) throws Exception { conf = new OzoneConfiguration(); clusterId = UUID.randomUUID().toString(); scmId = UUID.randomUUID().toString(); @@ -137,9 +134,16 @@ public void init() throws Exception { StorageUnit.KB); conf.setStorageSize(OMConfigKeys. OZONE_OM_RATIS_SEGMENT_PREALLOCATED_SIZE_KEY, 16, StorageUnit.KB); + long snapshotThreshold = SNAPSHOT_THRESHOLD; + // TODO: refactor tests to run under a new class with different configs. + if (testInfo.getTestMethod().isPresent() && + testInfo.getTestMethod().get().getName() + .equals("testInstallSnapshot")) { + snapshotThreshold = SNAPSHOT_THRESHOLD * 10; + } conf.setLong( OMConfigKeys.OZONE_OM_RATIS_SNAPSHOT_AUTO_TRIGGER_THRESHOLD_KEY, - SNAPSHOT_THRESHOLD); + snapshotThreshold); cluster = (MiniOzoneHAClusterImpl) MiniOzoneCluster.newOMHABuilder(conf) .setClusterId(clusterId) .setScmId(scmId) @@ -444,9 +448,15 @@ public void testInstallIncrementalSnapshot(@TempDir Path tempDir) // Read & Write after snapshot installed. List newKeys = writeKeys(1); readKeys(newKeys); - assertNotNull(followerOMMetaMngr.getKeyTable( - TEST_BUCKET_LAYOUT).get(followerOMMetaMngr.getOzoneKey( - volumeName, bucketName, newKeys.get(0)))); + GenericTestUtils.waitFor(() -> { + try { + return followerOMMetaMngr.getKeyTable(TEST_BUCKET_LAYOUT) + .get(followerOMMetaMngr.getOzoneKey( + volumeName, bucketName, newKeys.get(0))) != null; + } catch (IOException e) { + throw new RuntimeException(e); + } + }, 100, 10000); // Verify follower candidate directory get cleaned String[] filesInCandidate = followerOM.getOmSnapshotProvider(). @@ -550,6 +560,7 @@ private IncrementData getNextIncrementalTarball( @Test @Timeout(300) + @Flaky("HDDS-8876") public void testInstallIncrementalSnapshotWithFailure() throws Exception { // Get the leader OM String leaderOMNodeId = OmFailoverProxyUtil @@ -649,6 +660,11 @@ public void testInstallIncrementalSnapshotWithFailure() throws Exception { .get(followerOMMetaMngr.getOzoneKey(volumeName, bucketName, key))); } + // There is a chance we end up checking the DBCheckpointMetrics + // before the follower sends another request to the leader + // to generate a checkpoint. + // TODO: Add wait check here, to avoid flakiness. + // Verify the metrics DBCheckpointMetrics dbMetrics = leaderOM.getMetrics(). getDBCheckpointMetrics(); @@ -664,9 +680,15 @@ public void testInstallIncrementalSnapshotWithFailure() throws Exception { // Read & Write after snapshot installed. List newKeys = writeKeys(1); readKeys(newKeys); - assertNotNull(followerOMMetaMngr.getKeyTable( - TEST_BUCKET_LAYOUT).get(followerOMMetaMngr.getOzoneKey( - volumeName, bucketName, newKeys.get(0)))); + GenericTestUtils.waitFor(() -> { + try { + return followerOMMetaMngr.getKeyTable(TEST_BUCKET_LAYOUT) + .get(followerOMMetaMngr.getOzoneKey( + volumeName, bucketName, newKeys.get(0))) != null; + } catch (IOException e) { + throw new RuntimeException(e); + } + }, 100, 10000); // Verify follower candidate directory get cleaned String[] filesInCandidate = followerOM.getOmSnapshotProvider(). @@ -675,7 +697,7 @@ public void testInstallIncrementalSnapshotWithFailure() throws Exception { assertEquals(0, filesInCandidate.length); } - @Ignore("Enable this unit test after RATIS-1481 used") + @Test public void testInstallSnapshotWithClientWrite() throws Exception { // Get the leader OM String leaderOMNodeId = OmFailoverProxyUtil