-
Notifications
You must be signed in to change notification settings - Fork 3.4k
HBASE-27551 Add config options to delay assignment to retain last region location #4945
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
87fc143
69db293
d1c9ecb
20c454c
b5d21a0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,7 @@ | |
| import org.apache.hadoop.hbase.client.RetriesExhaustedException; | ||
| import org.apache.hadoop.hbase.master.MetricsAssignmentManager; | ||
| import org.apache.hadoop.hbase.master.RegionState.State; | ||
| import org.apache.hadoop.hbase.master.ServerManager; | ||
| import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure; | ||
| import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; | ||
| import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; | ||
|
|
@@ -95,6 +96,10 @@ | |
| * Notice that, although you may specify a target server, it just acts as a candidate; we do not | ||
| * guarantee that the region will finally be on the target server. If this is important for you, you | ||
| * should check whether the region is on the target server after the procedure is finished. | ||
| * </p> | ||
| * Alternatively, to try to retain assignments, the | ||
| * <b>hbase.master.scp.retain.assignment.force</b> option can be used together with | ||
| * <b>hbase.master.scp.retain.assignment</b>. | ||
| * <p/> | ||
| * When you want to schedule a TRSP, please check whether there is still one for this region, and | ||
| * the check should be under the RegionStateNode lock. We will remove the TRSP from a | ||
|
|
@@ -107,6 +112,25 @@ public class TransitRegionStateProcedure | |
|
|
||
| private static final Logger LOG = LoggerFactory.getLogger(TransitRegionStateProcedure.class); | ||
|
|
||
| public static final String FORCE_REGION_RETAINMENT = "hbase.master.scp.retain.assignment.force"; | ||
|
|
||
| public static final boolean DEFAULT_FORCE_REGION_RETAINMENT = false; | ||
|
|
||
| /** The wait time in millis before checking again if the region's previous RS is back online */ | ||
| public static final String FORCE_REGION_RETAINMENT_WAIT = | ||
| "hbase.master.scp.retain.assignment.force.wait"; | ||
|
|
||
| public static final int DEFAULT_FORCE_REGION_RETAINMENT_WAIT = 100; | ||
|
|
||
| /** | ||
| * The number of times to check if the region's previous RS is back online, before giving up and | ||
| * proceeding with assignment on a new RS | ||
| */ | ||
| public static final String FORCE_REGION_RETAINMENT_RETRIES = | ||
| "hbase.master.scp.retain.assignment.force.retries"; | ||
|
|
||
| public static final long DEFAULT_FORCE_REGION_RETAINMENT_RETRIES = 600; | ||
|
|
||
| private TransitionType type; | ||
|
|
||
| private RegionStateTransitionState initialState; | ||
|
|
@@ -126,6 +150,16 @@ public class TransitRegionStateProcedure | |
|
|
||
| private boolean isSplit; | ||
|
|
||
| private boolean forceRegionRetainment; | ||
|
|
||
| private ServerManager serverManager; | ||
|
||
|
|
||
| private int forceRegionRetainmentWait; | ||
|
|
||
| private long forceRegionRetainmentRetries; | ||
|
|
||
| private long retries; | ||
|
|
||
| public TransitRegionStateProcedure() { | ||
| } | ||
|
|
||
|
|
@@ -163,6 +197,23 @@ protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri, | |
| } | ||
| evictCache = | ||
| env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_CLOSE_KEY, DEFAULT_EVICT_ON_CLOSE); | ||
|
|
||
| readConfigs(env); | ||
| } | ||
|
|
||
| private void readConfigs(MasterProcedureEnv env) { | ||
| forceRegionRetainment = env.getMasterConfiguration().getBoolean(FORCE_REGION_RETAINMENT, | ||
| DEFAULT_FORCE_REGION_RETAINMENT); | ||
| forceRegionRetainmentWait = env.getMasterConfiguration().getInt(FORCE_REGION_RETAINMENT_WAIT, | ||
| DEFAULT_FORCE_REGION_RETAINMENT_WAIT); | ||
| forceRegionRetainmentRetries = env.getMasterConfiguration() | ||
| .getLong(FORCE_REGION_RETAINMENT_RETRIES, DEFAULT_FORCE_REGION_RETAINMENT_RETRIES); | ||
| serverManager = env.getMasterServices().getServerManager(); | ||
| } | ||
|
|
||
| @Override | ||
| protected void afterReplay(MasterProcedureEnv env) { | ||
| readConfigs(env); | ||
| } | ||
|
|
||
| protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri, | ||
|
|
@@ -188,6 +239,25 @@ protected boolean waitInitialized(MasterProcedureEnv env) { | |
| return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, getRegion()); | ||
| } | ||
|
|
||
| private void checkAndWaitForOriginalServer(ServerName lastHost) | ||
| throws ProcedureSuspendedException { | ||
| ServerName newNameForServer = serverManager.findServerWithSameHostnamePortWithLock(lastHost); | ||
| boolean isOnline = serverManager.createDestinationServersList().contains(newNameForServer); | ||
| if (!isOnline && retries < forceRegionRetainmentRetries) { | ||
| retries++; | ||
| LOG.info("Suspending the TRSP PID={} because {} is true and previous host {} " | ||
| + "for region is not yet online.", this.getProcId(), FORCE_REGION_RETAINMENT, lastHost); | ||
| setTimeout(forceRegionRetainmentWait); | ||
|
||
| setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); | ||
| throw new ProcedureSuspendedException(); | ||
| } | ||
| LOG.info( | ||
Apache9 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| "{} is true. TRSP PID={} waited {}ms for host {} to come back online. " | ||
| + "Did host come back online? {}", | ||
| FORCE_REGION_RETAINMENT, this.getProcId(), (retries * forceRegionRetainmentWait), lastHost, | ||
| isOnline); | ||
| } | ||
|
|
||
| private void queueAssign(MasterProcedureEnv env, RegionStateNode regionNode) | ||
| throws ProcedureSuspendedException { | ||
| boolean retain = false; | ||
|
|
@@ -200,9 +270,15 @@ private void queueAssign(MasterProcedureEnv env, RegionStateNode regionNode) | |
| regionNode.setRegionLocation(assignCandidate); | ||
| } else if (regionNode.getLastHost() != null) { | ||
| retain = true; | ||
| LOG.info("Setting lastHost as the region location {}", regionNode.getLastHost()); | ||
| LOG.info("Setting lastHost {} as the location for region {}", regionNode.getLastHost(), | ||
| regionNode.getRegionInfo().getEncodedName()); | ||
| regionNode.setRegionLocation(regionNode.getLastHost()); | ||
| } | ||
| if (regionNode.getRegionLocation() != null && forceRegionRetainment) { | ||
| LOG.warn("{} is set to true. This may delay regions re-assignment " | ||
| + "upon RegionServers crashes or restarts.", FORCE_REGION_RETAINMENT); | ||
| checkAndWaitForOriginalServer(regionNode.getRegionLocation()); | ||
| } | ||
| } | ||
| LOG.info("Starting {}; {}; forceNewPlan={}, retain={}", this, regionNode.toShortString(), | ||
| forceNewPlan, retain); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
None of the fields below should be stored here: when reloading, we use the default constructor to create the procedure and the deserialize method to restore its fields. So if you want to store them here, you need to serialize them, or you should implement the afterReplay method to initialize them...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for pointing this out. Since these are all config readings, I decided to go with the
`afterReplay` overriding option.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looking at other configs in TRSP, I prefer we store these configs in AssignmentManager, just like