-
Notifications
You must be signed in to change notification settings - Fork 3.4k
HBASE-22414 Interruption of moving regions in RSGroup will cause regi… #323
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
a44b321
3471e7f
3586a3f
310bc21
27ecfc2
f9927e5
d12694c
e559550
cc9e0a0
ec50f4f
670ab0e
bdb68f5
3ed9621
9ca8b9d
963fba6
8bf26f4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -204,6 +204,7 @@ private void checkServersAndTables(Set<Address> servers, Set<TableName> tables, | |
| private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroupName) | ||
| throws IOException { | ||
| boolean hasRegionsToMove; | ||
| int retry = 0; | ||
| RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName); | ||
| Set<Address> allSevers = new HashSet<>(servers); | ||
| do { | ||
|
|
@@ -215,7 +216,12 @@ private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroup | |
| if (!targetGrp.containsTable(region.getTable())) { | ||
| LOG.info("Moving server region {}, which do not belong to RSGroup {}", | ||
| region.getShortNameToLog(), targetGroupName); | ||
| this.master.getAssignmentManager().move(region); | ||
| try { | ||
| this.master.getAssignmentManager().move(region); | ||
| }catch (IOException ioe){ | ||
| LOG.error("Move region {} from group failed, will retry, current retry time is {}", | ||
|
||
| region.getShortNameToLog(), retry, ioe); | ||
| } | ||
| if (master.getAssignmentManager().getRegionStates(). | ||
| getRegionState(region).isFailedOpen()) { | ||
| continue; | ||
|
|
@@ -229,13 +235,15 @@ private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroup | |
| iter.remove(); | ||
| } | ||
| } | ||
|
|
||
| retry++; | ||
| try { | ||
| rsGroupInfoManager.wait(1000); | ||
| } catch (InterruptedException e) { | ||
| LOG.warn("Sleep interrupted", e); | ||
| Thread.currentThread().interrupt(); | ||
| } | ||
| } while (hasRegionsToMove); | ||
| } while (hasRegionsToMove && retry <= 50); | ||
sunhelly marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
wchevreuil marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| /** | ||
|
|
@@ -247,23 +255,49 @@ private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroup | |
| */ | ||
| private void moveTableRegionsToGroup(Set<TableName> tables, String targetGroupName) | ||
wchevreuil marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| throws IOException { | ||
| boolean hasRegionsToMove; | ||
| int retry = 0; | ||
| RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName); | ||
| for (TableName table : tables) { | ||
| if (master.getAssignmentManager().isTableDisabled(table)) { | ||
| LOG.debug("Skipping move regions because the table {} is disabled", table); | ||
| continue; | ||
| } | ||
| LOG.info("Moving region(s) for table {} to RSGroup {}", table, targetGroupName); | ||
| for (RegionInfo region : master.getAssignmentManager().getRegionStates() | ||
| .getRegionsOfTable(table)) { | ||
| ServerName sn = | ||
| master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region); | ||
| if (!targetGrp.containsServer(sn.getAddress())) { | ||
| LOG.info("Moving region {} to RSGroup {}", region.getShortNameToLog(), targetGroupName); | ||
| master.getAssignmentManager().move(region); | ||
| Set<TableName> allTables = new HashSet<>(tables); | ||
| do { | ||
| hasRegionsToMove = false; | ||
| for (Iterator<TableName> iter = allTables.iterator(); iter.hasNext(); ) { | ||
| TableName table = iter.next(); | ||
| if (master.getAssignmentManager().isTableDisabled(table)) { | ||
| LOG.debug("Skipping move regions because the table {} is disabled", table); | ||
| continue; | ||
| } | ||
| LOG.info("Moving region(s) for table {} to RSGroup {}", table, targetGroupName); | ||
| for (RegionInfo region : master.getAssignmentManager().getRegionStates() | ||
| .getRegionsOfTable(table)) { | ||
| ServerName sn = | ||
| master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region); | ||
| if (!targetGrp.containsServer(sn.getAddress())) { | ||
| LOG.info("Moving region {} to RSGroup {}", region.getShortNameToLog(), targetGroupName); | ||
| try { | ||
| master.getAssignmentManager().move(region); | ||
| }catch (IOException ioe){ | ||
| LOG.error("Move region {} to group failed, will retry, current retry time is {}", | ||
| region.getShortNameToLog(), retry, ioe); | ||
| } | ||
| hasRegionsToMove = true; | ||
| } | ||
| } | ||
|
|
||
| if (!hasRegionsToMove) { | ||
| LOG.info("Table {} has no more regions to move for RSGroup", table.getNameAsString()); | ||
| iter.remove(); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| retry++; | ||
| try { | ||
| rsGroupInfoManager.wait(1000); | ||
| } catch (InterruptedException e) { | ||
| LOG.warn("Sleep interrupted", e); | ||
| Thread.currentThread().interrupt(); | ||
| } | ||
| } while (hasRegionsToMove && retry <= 50); | ||
wchevreuil marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| @edu.umd.cs.findbugs.annotations.SuppressWarnings( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
| */ | ||
| package org.apache.hadoop.hbase.rsgroup; | ||
|
|
||
| import static org.apache.hadoop.hbase.util.Threads.sleep; | ||
| import static org.junit.Assert.assertEquals; | ||
| import static org.junit.Assert.assertFalse; | ||
| import static org.junit.Assert.assertTrue; | ||
|
|
@@ -29,15 +30,18 @@ | |
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Set; | ||
| import java.util.concurrent.atomic.AtomicBoolean; | ||
| import org.apache.hadoop.hbase.ClusterMetrics.Option; | ||
| import org.apache.hadoop.hbase.HBaseClassTestRule; | ||
| import org.apache.hadoop.hbase.ServerName; | ||
| import org.apache.hadoop.hbase.TableName; | ||
| import org.apache.hadoop.hbase.Waiter; | ||
| import org.apache.hadoop.hbase.client.RegionInfo; | ||
| import org.apache.hadoop.hbase.constraint.ConstraintException; | ||
| import org.apache.hadoop.hbase.master.RegionState; | ||
| import org.apache.hadoop.hbase.master.assignment.RegionStateNode; | ||
| import org.apache.hadoop.hbase.net.Address; | ||
| import org.apache.hadoop.hbase.testclassification.MediumTests; | ||
| import org.apache.hadoop.hbase.testclassification.LargeTests; | ||
| import org.apache.hadoop.hbase.util.Bytes; | ||
| import org.junit.After; | ||
| import org.junit.AfterClass; | ||
|
|
@@ -52,7 +56,7 @@ | |
|
|
||
| import org.apache.hbase.thirdparty.com.google.common.collect.Sets; | ||
|
|
||
| @Category({ MediumTests.class }) | ||
| @Category({ LargeTests.class }) | ||
| public class TestRSGroupsAdmin2 extends TestRSGroupsBase { | ||
|
|
||
| @ClassRule | ||
|
|
@@ -459,4 +463,197 @@ public boolean evaluate() throws Exception { | |
| Assert.assertEquals(null, rsGroupAdmin.getRSGroupInfo(fooGroup.getName())); | ||
| } | ||
|
|
||
| @Test | ||
| public void testFailedMoveWhenMoveServer() throws Exception { | ||
| final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); | ||
|
||
| final byte[] familyNameBytes = Bytes.toBytes("f"); | ||
| final int tableRegionCount = 10; | ||
| // All the regions created below will be assigned to the default group. | ||
| TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); | ||
| TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { | ||
| @Override | ||
| public boolean evaluate() throws Exception { | ||
| List<String> regions = getTableRegionMap().get(tableName); | ||
| if (regions == null) { | ||
| return false; | ||
| } | ||
| return getTableRegionMap().get(tableName).size() >= tableRegionCount; | ||
| } | ||
| }); | ||
|
|
||
| // get target server to move, which should has more than one regions | ||
| // randomly set a region state to SPLITTING | ||
| Map<ServerName, List<String>> assignMap = getTableServerRegionMap().get(tableName); | ||
| String rregion = null; | ||
| ServerName toMoveServer = null; | ||
| for (ServerName server : assignMap.keySet()) { | ||
| rregion = assignMap.get(server).size() > 1 && !newGroup.containsServer(server.getAddress()) ? | ||
| assignMap.get(server).get(0) : | ||
| null; | ||
| if (rregion != null) { | ||
| toMoveServer = server; | ||
| break; | ||
| } | ||
| } | ||
| assert toMoveServer != null; | ||
| RegionInfo ri = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(). | ||
| getRegionInfo(Bytes.toBytesBinary(rregion)); | ||
| RegionStateNode rsn = | ||
| TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates() | ||
| .getRegionStateNode(ri); | ||
| rsn.setState(RegionState.State.SPLITTING); | ||
|
|
||
| // start thread to recover region state | ||
| final ServerName movedServer = toMoveServer; | ||
| final String sregion = rregion; | ||
| AtomicBoolean changed = new AtomicBoolean(false); | ||
| Thread t1 = new Thread(() -> { | ||
wchevreuil marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| LOG.debug("thread1 start running, will recover region state"); | ||
| long current = System.currentTimeMillis(); | ||
| while (System.currentTimeMillis() - current <= 50000) { | ||
wchevreuil marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| List<RegionInfo> regions = master.getAssignmentManager().getRegionsOnServer(movedServer); | ||
| LOG.debug("server region size is:{}", regions.size()); | ||
| assert regions.size() >= 1; | ||
| // when there is exactly one region left, we can determine the move operation encountered | ||
| // exception caused by the strange region state. | ||
| if (regions.size() == 1) { | ||
| assertEquals(regions.get(0).getRegionNameAsString(), sregion); | ||
| rsn.setState(RegionState.State.OPEN); | ||
| LOG.info("set region {} state OPEN", sregion); | ||
| changed.set(true); | ||
| break; | ||
| } | ||
| sleep(5000); | ||
| } | ||
| }); | ||
| t1.start(); | ||
|
|
||
| // move target server to group | ||
| Thread t2 = new Thread(() -> { | ||
| LOG.info("thread2 start running, to move regions"); | ||
| try { | ||
| rsGroupAdmin.moveServers(Sets.newHashSet(movedServer.getAddress()), newGroup.getName()); | ||
| } catch (IOException e) { | ||
| LOG.error("move server error", e); | ||
| } | ||
| }); | ||
| t2.start(); | ||
|
|
||
| t1.join(); | ||
| t2.join(); | ||
|
|
||
| TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { | ||
| @Override | ||
| public boolean evaluate() { | ||
| if (changed.get()) { | ||
| return master.getAssignmentManager().getRegionsOnServer(movedServer).size() == 0 && !rsn | ||
| .getRegionLocation().equals(movedServer); | ||
| } | ||
| return false; | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| @Test | ||
| public void testFailedMoveWhenMoveTable() throws Exception { | ||
|
||
| final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1); | ||
| final byte[] familyNameBytes = Bytes.toBytes("f"); | ||
| final int tableRegionCount = 5; | ||
| // All the regions created below will be assigned to the default group. | ||
| TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount); | ||
| TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { | ||
| @Override | ||
| public boolean evaluate() throws Exception { | ||
| List<String> regions = getTableRegionMap().get(tableName); | ||
| if (regions == null) { | ||
| return false; | ||
| } | ||
| return getTableRegionMap().get(tableName).size() >= tableRegionCount; | ||
| } | ||
| }); | ||
|
|
||
| // randomly set a region state to SPLITTING | ||
| Map<ServerName, List<String>> assignMap = getTableServerRegionMap().get(tableName); | ||
| String rregion = null; | ||
| ServerName srcServer = null; | ||
| for (ServerName server : assignMap.keySet()) { | ||
| rregion = assignMap.get(server).size() >= 1 && !newGroup.containsServer(server.getAddress()) ? | ||
| assignMap.get(server).get(0) : | ||
| null; | ||
| if (rregion != null) { | ||
| srcServer = server; | ||
| break; | ||
| } | ||
| } | ||
| assert srcServer != null; | ||
| RegionInfo ri = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(). | ||
| getRegionInfo(Bytes.toBytesBinary(rregion)); | ||
| RegionStateNode rsn = | ||
| TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates() | ||
| .getRegionStateNode(ri); | ||
| rsn.setState(RegionState.State.SPLITTING); | ||
|
|
||
| // move table to group | ||
| Thread t2 = new Thread(() -> { | ||
| LOG.info("thread2 start running, to move regions"); | ||
| try { | ||
| rsGroupAdmin.moveTables(Sets.newHashSet(tableName), newGroup.getName()); | ||
| } catch (IOException e) { | ||
| LOG.error("move server error", e); | ||
| } | ||
| }); | ||
| t2.start(); | ||
|
|
||
| // start thread to recover region state | ||
| final ServerName ss = srcServer; | ||
| final String sregion = rregion; | ||
| AtomicBoolean changed = new AtomicBoolean(false); | ||
| Thread t1 = new Thread(() -> { | ||
wchevreuil marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| LOG.info("thread1 start running, will recover region state"); | ||
| long current = System.currentTimeMillis(); | ||
| while (System.currentTimeMillis() - current <= 50000) { | ||
wchevreuil marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| List<RegionInfo> regions = master.getAssignmentManager().getRegionsOnServer(ss); | ||
| List<RegionInfo> tableRegions = new ArrayList<>(); | ||
| for (RegionInfo regionInfo : regions) { | ||
| if (regionInfo.getTable().equals(tableName)) { | ||
| tableRegions.add(regionInfo); | ||
| } | ||
| } | ||
| LOG.debug("server table region size is:{}", tableRegions.size()); | ||
| assert tableRegions.size() >= 1; | ||
| // when there is exactly one region left, we can determine the move operation encountered | ||
| // exception caused by the strange region state. | ||
| if (tableRegions.size() == 1) { | ||
| assertEquals(tableRegions.get(0).getRegionNameAsString(), sregion); | ||
| rsn.setState(RegionState.State.OPEN); | ||
| LOG.info("set region {} state OPEN", sregion); | ||
| changed.set(true); | ||
| break; | ||
| } | ||
| sleep(5000); | ||
| } | ||
| }); | ||
| t1.start(); | ||
|
|
||
| t1.join(); | ||
| t2.join(); | ||
|
|
||
| TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() { | ||
| @Override | ||
| public boolean evaluate() { | ||
| if (changed.get()) { | ||
| boolean serverHasTableRegions = false; | ||
| for (RegionInfo regionInfo : master.getAssignmentManager().getRegionsOnServer(ss)) { | ||
| if (regionInfo.getTable().equals(tableName)) { | ||
| serverHasTableRegions = true; | ||
| break; | ||
| } | ||
| } | ||
| return !serverHasTableRegions && !rsn.getRegionLocation().equals(ss); | ||
| } | ||
| return false; | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| } | ||
wchevreuil marked this conversation as resolved.
Show resolved
Hide resolved
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this really a good idea to group "TO and FROM" into one method? It's an open question, just want to hear some reasoning from you. thanks
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think added To/From to method name makes codes more easy to be understood.
And just like writing in the docs, TO means move tables regions, FROM means move server regions.