Skip to content

Commit 279abdd

Browse files
committed
HBASE-29259 Master crash when loading procedures (#6906)
Signed-off-by: Nick Dimiduk <[email protected]> (cherry picked from commit 38fe074)
1 parent ca46488 commit 279abdd

File tree

5 files changed

+168
-13
lines changed

5 files changed

+168
-13
lines changed

hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,25 @@ protected void afterReplay(TEnvironment env) {
346346
// no-op
347347
}
348348

349+
/**
350+
* Called before we call the execute method of this procedure, but after we acquire the execution
351+
* lock and procedure scheduler lock.
352+
*/
353+
protected void beforeExec(TEnvironment env) throws ProcedureSuspendedException {
354+
// no-op
355+
}
356+
357+
/**
358+
* Called after we call the execute method of this procedure, and also after we initialize all the
359+
* sub procedures and persist the state if persistence is needed.
360+
* <p>
361+
* This is a hook for work that must run after we initialize the sub procedures. See HBASE-29259 for more
362+
* details on why we can not release the region lock inside the execute method.
363+
*/
364+
protected void afterExec(TEnvironment env) {
365+
// no-op
366+
}
367+
349368
/**
350369
* Called when the procedure is marked as completed (success or rollback). The procedure
351370
* implementor may use this method to cleanup in-memory states. This operation will not be retried

hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1781,6 +1781,7 @@ private void execProcedure(RootProcedureState<TEnvironment> procStack,
17811781
reExecute = false;
17821782
procedure.resetPersistence();
17831783
try {
1784+
procedure.beforeExec(getEnvironment());
17841785
subprocs = procedure.doExecute(getEnvironment());
17851786
if (subprocs != null && subprocs.length == 0) {
17861787
subprocs = null;
@@ -1790,11 +1791,13 @@ private void execProcedure(RootProcedureState<TEnvironment> procStack,
17901791
suspended = true;
17911792
} catch (ProcedureYieldException e) {
17921793
LOG.trace("Yield {}", procedure, e);
1794+
procedure.afterExec(getEnvironment());
17931795
yieldProcedure(procedure);
17941796
return;
17951797
} catch (InterruptedException e) {
17961798
LOG.trace("Yield interrupt {}", procedure, e);
17971799
handleInterruptedException(procedure, e);
1800+
procedure.afterExec(getEnvironment());
17981801
yieldProcedure(procedure);
17991802
return;
18001803
} catch (Throwable e) {
@@ -1866,6 +1869,7 @@ private void execProcedure(RootProcedureState<TEnvironment> procStack,
18661869
updateStoreOnExec(procStack, procedure, subprocs);
18671870
}
18681871
}
1872+
procedure.afterExec(getEnvironment());
18691873

18701874
// if the store is not running we are aborting
18711875
if (!store.isRunning()) {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -283,11 +283,22 @@ private void unattach(MasterProcedureEnv env) {
283283
getParent(env).unattachRemoteProc(this);
284284
}
285285

286+
@Override
287+
protected void beforeExec(MasterProcedureEnv env) {
288+
RegionStateNode regionNode = getRegionNode(env);
289+
regionNode.lock();
290+
}
291+
292+
@Override
293+
protected void afterExec(MasterProcedureEnv env) {
294+
RegionStateNode regionNode = getRegionNode(env);
295+
regionNode.unlock();
296+
}
297+
286298
@Override
287299
protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
288300
throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
289301
RegionStateNode regionNode = getRegionNode(env);
290-
regionNode.lock();
291302
try {
292303
switch (state) {
293304
case REGION_REMOTE_PROCEDURE_DISPATCH: {
@@ -333,8 +344,6 @@ protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
333344
setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
334345
skipPersistence();
335346
throw new ProcedureSuspendedException();
336-
} finally {
337-
regionNode.unlock();
338347
}
339348
}
340349

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure;
3333
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
3434
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
35-
import org.apache.hadoop.hbase.procedure2.Procedure;
3635
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
3736
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
3837
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
@@ -313,19 +312,18 @@ private Flow confirmClosed(MasterProcedureEnv env, RegionStateNode regionNode)
313312
return Flow.HAS_MORE_STATE;
314313
}
315314

316-
// Override to lock RegionStateNode
317-
@SuppressWarnings("rawtypes")
318315
@Override
319-
protected Procedure[] execute(MasterProcedureEnv env)
320-
throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
316+
protected void beforeExec(MasterProcedureEnv env) {
321317
RegionStateNode regionNode =
322318
env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
323319
regionNode.lock();
324-
try {
325-
return super.execute(env);
326-
} finally {
327-
regionNode.unlock();
328-
}
320+
}
321+
322+
@Override
323+
protected void afterExec(MasterProcedureEnv env) {
324+
RegionStateNode regionNode =
325+
env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
326+
regionNode.unlock();
329327
}
330328

331329
private RegionStateNode getRegionStateNode(MasterProcedureEnv env) {
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master.assignment;
19+
20+
import java.io.IOException;
21+
import java.io.UncheckedIOException;
22+
import org.apache.hadoop.hbase.HBaseClassTestRule;
23+
import org.apache.hadoop.hbase.HBaseTestingUtility;
24+
import org.apache.hadoop.hbase.ServerName;
25+
import org.apache.hadoop.hbase.TableName;
26+
import org.apache.hadoop.hbase.client.RegionInfo;
27+
import org.apache.hadoop.hbase.master.HMaster;
28+
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure.TransitionType;
29+
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
30+
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
31+
import org.apache.hadoop.hbase.procedure2.Procedure;
32+
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
33+
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
34+
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
35+
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
36+
import org.apache.hadoop.hbase.testclassification.MasterTests;
37+
import org.apache.hadoop.hbase.testclassification.MediumTests;
38+
import org.apache.hadoop.hbase.util.Bytes;
39+
import org.junit.AfterClass;
40+
import org.junit.BeforeClass;
41+
import org.junit.ClassRule;
42+
import org.junit.Test;
43+
import org.junit.experimental.categories.Category;
44+
45+
import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState;
46+
47+
/**
48+
* Testcase for HBASE-29259
49+
*/
50+
@Category({ MasterTests.class, MediumTests.class })
51+
public class TestTRSPPersistUninitializedSubProc {
52+
53+
@ClassRule
54+
public static final HBaseClassTestRule CLASS_RULE =
55+
HBaseClassTestRule.forClass(TestTRSPPersistUninitializedSubProc.class);
56+
57+
private static HBaseTestingUtility UTIL = new HBaseTestingUtility();
58+
59+
private static byte[] CF = Bytes.toBytes("cf");
60+
61+
private static TableName TN = TableName.valueOf("tn");
62+
63+
public static class TRSPForTest extends TransitRegionStateProcedure {
64+
65+
private boolean injected = false;
66+
67+
public TRSPForTest() {
68+
}
69+
70+
public TRSPForTest(MasterProcedureEnv env, RegionInfo hri, ServerName assignCandidate,
71+
boolean forceNewPlan, TransitionType type) {
72+
super(env, hri, assignCandidate, forceNewPlan, type);
73+
}
74+
75+
@Override
76+
protected Procedure[] execute(MasterProcedureEnv env)
77+
throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
78+
Procedure[] subProcs = super.execute(env);
79+
if (!injected && subProcs != null && subProcs[0] instanceof CloseRegionProcedure) {
80+
injected = true;
81+
ServerName sn = ((CloseRegionProcedure) subProcs[0]).targetServer;
82+
env.getMasterServices().getServerManager().expireServer(sn);
83+
try {
84+
UTIL.waitFor(15000, () -> env.getMasterServices().getProcedures().stream().anyMatch(
85+
p -> p instanceof ServerCrashProcedure && p.getState() != ProcedureState.INITIALIZING));
86+
} catch (IOException e) {
87+
throw new UncheckedIOException(e);
88+
}
89+
// sleep 10 seconds to let the SCP interrupt the TRSP, where we will call TRSP.serverCrashed
90+
Thread.sleep(10000);
91+
}
92+
return subProcs;
93+
}
94+
}
95+
96+
@BeforeClass
97+
public static void setUpBeforeClass() throws Exception {
98+
UTIL.startMiniCluster(2);
99+
UTIL.getAdmin().balancerSwitch(false, true);
100+
UTIL.createTable(TN, CF);
101+
UTIL.waitTableAvailable(TN);
102+
}
103+
104+
@AfterClass
105+
public static void tearDownAfterClass() throws Exception {
106+
UTIL.shutdownMiniCluster();
107+
}
108+
109+
@Test
110+
public void testServerCrash() throws Exception {
111+
HMaster master = UTIL.getHBaseCluster().getMaster();
112+
ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor();
113+
RegionInfo region = UTIL.getAdmin().getRegions(TN).get(0);
114+
RegionStateNode rsn =
115+
master.getAssignmentManager().getRegionStates().getRegionStateNode(region);
116+
TRSPForTest trsp =
117+
new TRSPForTest(procExec.getEnvironment(), region, null, false, TransitionType.REOPEN);
118+
// attach it to RegionStateNode, to simulate normal reopen
119+
rsn.setProcedure(trsp);
120+
procExec.submitProcedure(trsp);
121+
ProcedureTestingUtility.waitProcedure(procExec, trsp);
122+
// make sure we do not store an invalid procedure in the procedure store
123+
ProcedureTestingUtility.restart(procExec);
124+
}
125+
}

0 commit comments

Comments
 (0)