diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManagerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManagerImpl.java
index 382157545908..a69f2cbbb436 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManagerImpl.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMHAManagerImpl.java
@@ -73,6 +73,7 @@ public SCMHAManagerImpl(final ConfigurationSource conf,
       final StorageContainerManager scm) throws IOException {
     this.conf = conf;
     this.scm = scm;
+    this.exitManager = new ExitManager();
     if (SCMHAUtils.isSCMHAEnabled(conf)) {
       this.transactionBuffer = new SCMHADBTransactionBufferImpl(scm);
       this.ratisServer = new SCMRatisServerImpl(conf, scm,
@@ -258,7 +259,7 @@ public TermIndex installCheckpoint(Path checkpointLocation,
       throw e;
     }
 
-    File dbBackup = null;
+    File dbBackup;
     try {
       dbBackup = HAUtils
           .replaceDBWithCheckpoint(lastAppliedIndex, oldDBLocation,
@@ -266,29 +267,41 @@ public TermIndex installCheckpoint(Path checkpointLocation,
       LOG.info("Replaced DB with checkpoint, term: {}, index: {}", term,
           lastAppliedIndex);
     } catch (Exception e) {
+      // If the latest checkpoint cannot be installed, rethrow the
+      // exception so that reinitialize can propagate the failure to
+      // Ratis to handle properly.
       LOG.error("Failed to install Snapshot as SCM failed to replace"
-          + " DB with downloaded checkpoint. Reloading old SCM state.", e);
+          + " DB with downloaded checkpoint. Checkpoint transaction {}",
+          checkpointTxnInfo.getTransactionIndex(), e);
+      throw e;
     }
+
     // Reload the DB store with the new checkpoint.
-    // Restart (unpause) the state machine and update its last applied index
-    // to the installed checkpoint's snapshot index.
     try {
       reloadSCMState();
       LOG.info("Reloaded SCM state with Term: {} and Index: {}", term,
           lastAppliedIndex);
     } catch (Exception ex) {
+      LOG.error("Failed to reload SCM state with Term: {} and Index: {}",
+          term, lastAppliedIndex, ex);
+      // Revert to the old DB, since the new DB may be corrupted, so
+      // that SCM can restart from the old DB.
       try {
-        // revert to the old db, since the new db may be a corrupted one,
-        // so that SCM can restart from the old db.
         if (dbBackup != null) {
-          dbBackup = HAUtils
-              .replaceDBWithCheckpoint(lastAppliedIndex, oldDBLocation,
+          dbBackup =
+              HAUtils.replaceDBWithCheckpoint(lastAppliedIndex, oldDBLocation,
                   dbBackup.toPath(), OzoneConsts.SCM_DB_BACKUP_PREFIX);
-          startServices();
+          LOG.error("Replacing SCM state with Term : {} and Index: {}",
+              termIndex.getTerm(), termIndex.getIndex());
+          // Reload SCM state from the restored old DB before terminating.
+          // Tests re-verify SCM after a corrupt DB has been loaded, and
+          // without this reloadSCMState call they fail with an NPE when
+          // looking up the DB location.
+          reloadSCMState();
         }
       } finally {
-        String errorMsg =
-            "Failed to reload SCM state and instantiate services.";
+        String errorMsg = "Failed to reload SCM state and instantiate "
+            + "services.";
         exitManager.exitSystem(1, errorMsg, ex, LOG);
       }
     }
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
index 8fa1866d5b7e..9aeda10225ff 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
@@ -357,7 +357,7 @@ public void pause() {
   }
 
   @Override
-  public void reinitialize() {
+  public void reinitialize() throws IOException {
     Preconditions.checkNotNull(installingDBCheckpoint);
     DBCheckpoint checkpoint = installingDBCheckpoint;
 
@@ -369,8 +369,8 @@ public void reinitialize() {
       termIndex =
           scm.getScmHAManager().installCheckpoint(checkpoint);
     } catch (Exception e) {
-      LOG.error("Failed to reinitialize SCMStateMachine.");
-      return;
+      LOG.error("Failed to reinitialize SCMStateMachine.", e);
+      throw new IOException(e);
     }
 
     // re-initialize the DBTransactionBuffer and update the lastAppliedIndex.