Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,14 @@ public void pause() {
getLifeCycle().transition(LifeCycle.State.PAUSED);
}

@Override
public void reinitialize() {
if (getLifeCycleState() == LifeCycle.State.PAUSED) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @hanishakoneru does OmStateMachine also need similar code?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: How were these tests passing before? I am not able to understand that; can you shed some light on it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After notifyInstallSnapshotFromLeader is called, ratis calls

                if (reply != null) {
                  LOG.info("{}: StateMachine successfully installed snapshot index {}. Reloading the StateMachine.",
                      getMemberId(), reply.getIndex());
                  stateMachine.pause();
                  state.updateInstalledSnapshotIndex(reply);
                  state.reloadStateMachine(reply.getIndex());
                }

stateMachine.pause() puts the state machine into the PAUSED state, and state.reloadStateMachine(reply.getIndex()) then triggers StateMachineUpdater#reload(), which in turn calls stateMachine.reinitialize().

This is the reason for the fix: reinitialize() has to move a PAUSED state machine back through STARTING to RUNNING.

As far as I know, at the end of TestSCMInstallSnapshotWithHA#testInstallSnapshot the followerSCM was also left in the PAUSED state, which was not checked before.

As for testInstallOldCheckpointFailure, notifyInstallSnapshotFromLeader is not actually called: without the fix in RATIS-1369, downloading a snapshot taken at index 0 is ignored by the follower SCM.

getLifeCycle().transition(LifeCycle.State.STARTING);
getLifeCycle().transition(LifeCycle.State.RUNNING);
}
}

@Override
public void close() throws IOException {
if (!isInitialized) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
import org.apache.ratis.server.protocol.TermIndex;

import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;

import org.junit.Assert;
import org.junit.jupiter.api.AfterEach;
Expand Down Expand Up @@ -121,7 +122,7 @@ public void testInstallSnapshot() throws Exception {
// Find the inactive SCM
String followerId = getInactiveSCM(cluster).getScmId();

StorageContainerManager follower = cluster.getSCM(followerId);
StorageContainerManager followerSCM = cluster.getSCM(followerId);
// Do some transactions so that the log index increases
List<ContainerInfo> containers = writeToIncreaseLogIndex(leaderSCM, 200);

Expand All @@ -131,15 +132,16 @@ public void testInstallSnapshot() throws Exception {
cluster.startInactiveSCM(followerId);

// The recently started should be lagging behind the leader .
SCMStateMachine followerSM =
followerSCM.getScmHAManager().getRatisServer().getSCMStateMachine();
long followerLastAppliedIndex =
follower.getScmHAManager().getRatisServer().getSCMStateMachine()
.getLastAppliedTermIndex().getIndex();
assertTrue(
followerLastAppliedIndex >= 200);
followerSM.getLastAppliedTermIndex().getIndex();
assertTrue(followerLastAppliedIndex >= 200);
assertFalse(followerSM.getLifeCycleState().isPausingOrPaused());

// Verify that the follower 's DB contains the transactions which were
// made while it was inactive.
SCMMetadataStore followerMetaStore = follower.getScmMetadataStore();
SCMMetadataStore followerMetaStore = followerSCM.getScmMetadataStore();
for (ContainerInfo containerInfo : containers) {
Assert.assertNotNull(followerMetaStore.getContainerTable()
.get(containerInfo.containerID()));
Expand All @@ -154,9 +156,9 @@ public void testInstallOldCheckpointFailure() throws Exception {
String followerId = getInactiveSCM(cluster).getScmId();
// Find the inactive SCM

StorageContainerManager follower = cluster.getSCM(followerId);
StorageContainerManager followerSCM = cluster.getSCM(followerId);
cluster.startInactiveSCM(followerId);
follower.exitSafeMode();
followerSCM.exitSafeMode();
DBCheckpoint leaderDbCheckpoint = leaderSCM.getScmMetadataStore().getStore()
.getCheckpoint(false);

Expand All @@ -165,8 +167,8 @@ public void testInstallOldCheckpointFailure() throws Exception {
TermIndex lastTermIndex = leaderSM.getLastAppliedTermIndex();

SCMStateMachine followerSM =
follower.getScmHAManager().getRatisServer().getSCMStateMachine();
follower.getScmMetadataStore().getTransactionInfoTable().
followerSCM.getScmHAManager().getRatisServer().getSCMStateMachine();
followerSCM.getScmMetadataStore().getTransactionInfoTable().
put(OzoneConsts.TRANSACTION_INFO_KEY, TransactionInfo.builder()
.setCurrentTerm(lastTermIndex.getTerm())
.setTransactionIndex(lastTermIndex.getIndex() + 100).build());
Expand All @@ -183,7 +185,7 @@ public void testInstallOldCheckpointFailure() throws Exception {
// state should be reloaded.
TermIndex followerTermIndex = followerSM.getLastAppliedTermIndex();
SCMHAManagerImpl scmhaManager =
(SCMHAManagerImpl) (follower.getScmHAManager());
(SCMHAManagerImpl) (followerSCM.getScmHAManager());
TermIndex newTermIndex =
scmhaManager.installCheckpoint(leaderNodeId, leaderDbCheckpoint);

Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xs
<declared.ozone.version>${ozone.version}</declared.ozone.version>

<!-- Apache Ratis version -->
<ratis.version>2.1.0-43915d2-SNAPSHOT</ratis.version>
<ratis.version>2.1.0-ff8aa66-SNAPSHOT</ratis.version>

<!-- Apache Ratis thirdparty version -->
<ratis.thirdparty.version>0.7.0-a398b19-SNAPSHOT</ratis.thirdparty.version>
Expand Down