diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 1aea8f9dbcf0..5bdbb927690f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -131,6 +131,10 @@ public class AssignmentManager { "hbase.assignment.maximum.attempts"; private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE; + public static final String ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS = + "hbase.assignment.retry.immediately.maximum.attempts"; + private static final int DEFAULT_ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS = 3; + /** Region in Transition metrics threshold time */ public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD = "hbase.metrics.rit.stuck.warning.threshold"; @@ -151,6 +155,7 @@ public class AssignmentManager { private final int assignDispatchWaitQueueMaxSize; private final int assignDispatchWaitMillis; private final int assignMaxAttempts; + private final int assignRetryImmediatelyMaxAttempts; private final Object checkIfShouldMoveSystemRegionLock = new Object(); @@ -179,6 +184,8 @@ public AssignmentManager(final MasterServices master) { this.assignMaxAttempts = Math.max(1, conf.getInt(ASSIGN_MAX_ATTEMPTS, DEFAULT_ASSIGN_MAX_ATTEMPTS)); + this.assignRetryImmediatelyMaxAttempts = conf.getInt(ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS, + DEFAULT_ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS); int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY, DEFAULT_RIT_CHORE_INTERVAL_MSEC); @@ -308,6 +315,10 @@ int getAssignMaxAttempts() { return assignMaxAttempts; } + int getAssignRetryImmediatelyMaxAttempts() { + return assignRetryImmediatelyMaxAttempts; + } + public RegionStates getRegionStates() { return regionStates; } diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java index 1be7a9af992a..716db69a31d8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java @@ -226,20 +226,32 @@ private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode) return Flow.HAS_MORE_STATE; } - if (incrementAndCheckMaxAttempts(env, regionNode)) { + int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode) + .incrementAndGetRetries(); + int maxAttempts = env.getAssignmentManager().getAssignMaxAttempts(); + LOG.info("Retry={} of max={}; {}; {}", retries, maxAttempts, this, regionNode.toShortString()); + + if (retries >= maxAttempts) { env.getAssignmentManager().regionFailedOpen(regionNode, true); setFailure(getClass().getSimpleName(), new RetriesExhaustedException( "Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded")); regionNode.unsetProcedure(this); return Flow.NO_MORE_STATE; } + env.getAssignmentManager().regionFailedOpen(regionNode, false); // we failed to assign the region, force a new plan forceNewPlan = true; regionNode.setRegionLocation(null); setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE); - // Here we do not throw exception because we want to the region to be online ASAP - return Flow.HAS_MORE_STATE; + + if (retries > env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) { + // Throw an exception so that we back off before retrying after too many failed opens + throw new HBaseIOException("Failed to open region"); + } else { + // Here we do not throw an exception because we want the region to be online ASAP + return Flow.HAS_MORE_STATE; + } } private void 
closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) throws IOException { @@ -400,14 +412,6 @@ void unattachRemoteProc(RegionRemoteProcedureBase proc) { this.remoteProc = null; } - private boolean incrementAndCheckMaxAttempts(MasterProcedureEnv env, RegionStateNode regionNode) { - int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode) - .incrementAndGetRetries(); - int max = env.getAssignmentManager().getAssignMaxAttempts(); - LOG.info("Retry={} of max={}; {}; {}", retries, max, this, regionNode.toShortString()); - return retries >= max; - } - @Override protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state) throws IOException, InterruptedException {