Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ private void checkServersAndTables(Set<Address> servers, Set<TableName> tables,
private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroupName)
throws IOException {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this really a good idea to group "TO and FROM" into one method? It's an open question, just want to hear some reasoning from you. thanks

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think added To/From to method name makes codes more easy to be understood.
And just like writing in the docs, TO means move tables regions, FROM means move server regions.

boolean hasRegionsToMove;
int retry = 0;
RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName);
Set<Address> allSevers = new HashSet<>(servers);
do {
Expand All @@ -215,7 +216,12 @@ private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroup
if (!targetGrp.containsTable(region.getTable())) {
LOG.info("Moving server region {}, which do not belong to RSGroup {}",
region.getShortNameToLog(), targetGroupName);
this.master.getAssignmentManager().move(region);
try {
this.master.getAssignmentManager().move(region);
}catch (IOException ioe){
LOG.error("Move region {} from group failed, will retry, current retry time is {}",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May get too verbose. How about log as debug on each retry, then if max number of reties has been reached, log as error?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I agree. But I think we need failed regions' names to move them to the target group servers after the failed call, because we can't call move tables or servers methods to move failed regions again.Maybe I can add a failed regions list in the ex message when max number of reties has been reached.What do you think about it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have changed this log level to DEBUG, and added a failed regions list in the ex message when max number of reties has been reached.

region.getShortNameToLog(), retry, ioe);
}
if (master.getAssignmentManager().getRegionStates().
getRegionState(region).isFailedOpen()) {
continue;
Expand All @@ -229,13 +235,15 @@ private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroup
iter.remove();
}
}

retry++;
try {
rsGroupInfoManager.wait(1000);
} catch (InterruptedException e) {
LOG.warn("Sleep interrupted", e);
Thread.currentThread().interrupt();
}
} while (hasRegionsToMove);
} while (hasRegionsToMove && retry <= 50);
}

/**
Expand All @@ -247,23 +255,49 @@ private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroup
*/
private void moveTableRegionsToGroup(Set<TableName> tables, String targetGroupName)
throws IOException {
boolean hasRegionsToMove;
int retry = 0;
RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName);
for (TableName table : tables) {
if (master.getAssignmentManager().isTableDisabled(table)) {
LOG.debug("Skipping move regions because the table {} is disabled", table);
continue;
}
LOG.info("Moving region(s) for table {} to RSGroup {}", table, targetGroupName);
for (RegionInfo region : master.getAssignmentManager().getRegionStates()
.getRegionsOfTable(table)) {
ServerName sn =
master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region);
if (!targetGrp.containsServer(sn.getAddress())) {
LOG.info("Moving region {} to RSGroup {}", region.getShortNameToLog(), targetGroupName);
master.getAssignmentManager().move(region);
Set<TableName> allTables = new HashSet<>(tables);
do {
hasRegionsToMove = false;
for (Iterator<TableName> iter = allTables.iterator(); iter.hasNext(); ) {
TableName table = iter.next();
if (master.getAssignmentManager().isTableDisabled(table)) {
LOG.debug("Skipping move regions because the table {} is disabled", table);
continue;
}
LOG.info("Moving region(s) for table {} to RSGroup {}", table, targetGroupName);
for (RegionInfo region : master.getAssignmentManager().getRegionStates()
.getRegionsOfTable(table)) {
ServerName sn =
master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region);
if (!targetGrp.containsServer(sn.getAddress())) {
LOG.info("Moving region {} to RSGroup {}", region.getShortNameToLog(), targetGroupName);
try {
master.getAssignmentManager().move(region);
}catch (IOException ioe){
LOG.error("Move region {} to group failed, will retry, current retry time is {}",
region.getShortNameToLog(), retry, ioe);
}
hasRegionsToMove = true;
}
}

if (!hasRegionsToMove) {
LOG.info("Table {} has no more regions to move for RSGroup", table.getNameAsString());
iter.remove();
}
}
}

retry++;
try {
rsGroupInfoManager.wait(1000);
} catch (InterruptedException e) {
LOG.warn("Sleep interrupted", e);
Thread.currentThread().interrupt();
}
} while (hasRegionsToMove && retry <= 50);
}

@edu.umd.cs.findbugs.annotations.SuppressWarnings(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hbase.rsgroup;

import static org.apache.hadoop.hbase.util.Threads.sleep;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
Expand All @@ -29,15 +30,18 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.hadoop.hbase.ClusterMetrics.Option;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.constraint.ConstraintException;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
import org.apache.hadoop.hbase.net.Address;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.After;
import org.junit.AfterClass;
Expand All @@ -52,7 +56,7 @@

import org.apache.hbase.thirdparty.com.google.common.collect.Sets;

@Category({ MediumTests.class })
@Category({ LargeTests.class })
public class TestRSGroupsAdmin2 extends TestRSGroupsBase {

@ClassRule
Expand Down Expand Up @@ -459,4 +463,197 @@ public boolean evaluate() throws Exception {
Assert.assertEquals(null, rsGroupAdmin.getRSGroupInfo(fooGroup.getName()));
}

@Test
public void testFailedMoveWhenMoveServer() throws Exception {
final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

addGroup already invokes _ moveServerRegionsFromGroup_ indirectly, and adds an RS to it. We could simplify this test if we just create a new group from RSGroupAdminServer.addRSGroup, pick one of the RSes from assignMap, change one of its regions state, then call later call RSGroupAdminServer.moveServers?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, good idea.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case, I have used rsGroupAdmin.addRSGroup to add a new group without adding server. Then I create a multi-region table, choose a region to change state, and record the server that the region we changed on. In the last call RSGroupAdminServer.moveServers to move the recorded server, and check if the error message contains the region name.

final byte[] familyNameBytes = Bytes.toBytes("f");
final int tableRegionCount = 10;
// All the regions created below will be assigned to the default group.
TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount);
TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
@Override
public boolean evaluate() throws Exception {
List<String> regions = getTableRegionMap().get(tableName);
if (regions == null) {
return false;
}
return getTableRegionMap().get(tableName).size() >= tableRegionCount;
}
});

// get target server to move, which should has more than one regions
// randomly set a region state to SPLITTING
Map<ServerName, List<String>> assignMap = getTableServerRegionMap().get(tableName);
String rregion = null;
ServerName toMoveServer = null;
for (ServerName server : assignMap.keySet()) {
rregion = assignMap.get(server).size() > 1 && !newGroup.containsServer(server.getAddress()) ?
assignMap.get(server).get(0) :
null;
if (rregion != null) {
toMoveServer = server;
break;
}
}
assert toMoveServer != null;
RegionInfo ri = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().
getRegionInfo(Bytes.toBytesBinary(rregion));
RegionStateNode rsn =
TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
.getRegionStateNode(ri);
rsn.setState(RegionState.State.SPLITTING);

// start thread to recover region state
final ServerName movedServer = toMoveServer;
final String sregion = rregion;
AtomicBoolean changed = new AtomicBoolean(false);
Thread t1 = new Thread(() -> {
LOG.debug("thread1 start running, will recover region state");
long current = System.currentTimeMillis();
while (System.currentTimeMillis() - current <= 50000) {
List<RegionInfo> regions = master.getAssignmentManager().getRegionsOnServer(movedServer);
LOG.debug("server region size is:{}", regions.size());
assert regions.size() >= 1;
// when there is exactly one region left, we can determine the move operation encountered
// exception caused by the strange region state.
if (regions.size() == 1) {
assertEquals(regions.get(0).getRegionNameAsString(), sregion);
rsn.setState(RegionState.State.OPEN);
LOG.info("set region {} state OPEN", sregion);
changed.set(true);
break;
}
sleep(5000);
}
});
t1.start();

// move target server to group
Thread t2 = new Thread(() -> {
LOG.info("thread2 start running, to move regions");
try {
rsGroupAdmin.moveServers(Sets.newHashSet(movedServer.getAddress()), newGroup.getName());
} catch (IOException e) {
LOG.error("move server error", e);
}
});
t2.start();

t1.join();
t2.join();

TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
@Override
public boolean evaluate() {
if (changed.get()) {
return master.getAssignmentManager().getRegionsOnServer(movedServer).size() == 0 && !rsn
.getRegionLocation().equals(movedServer);
}
return false;
}
});
}

@Test
public void testFailedMoveWhenMoveTable() throws Exception {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as to the tested methods, we may apply some code reuse, given how similarly the two test structures are?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I make a function named setARegionState to randomly choosing a region and set its state. The difference of the two UTs is at adding group and moving operation. Is this OK?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will add a new skeleton function recoverRegionStateThread at the newly attached patch.

final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1);
final byte[] familyNameBytes = Bytes.toBytes("f");
final int tableRegionCount = 5;
// All the regions created below will be assigned to the default group.
TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount);
TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
@Override
public boolean evaluate() throws Exception {
List<String> regions = getTableRegionMap().get(tableName);
if (regions == null) {
return false;
}
return getTableRegionMap().get(tableName).size() >= tableRegionCount;
}
});

// randomly set a region state to SPLITTING
Map<ServerName, List<String>> assignMap = getTableServerRegionMap().get(tableName);
String rregion = null;
ServerName srcServer = null;
for (ServerName server : assignMap.keySet()) {
rregion = assignMap.get(server).size() >= 1 && !newGroup.containsServer(server.getAddress()) ?
assignMap.get(server).get(0) :
null;
if (rregion != null) {
srcServer = server;
break;
}
}
assert srcServer != null;
RegionInfo ri = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().
getRegionInfo(Bytes.toBytesBinary(rregion));
RegionStateNode rsn =
TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
.getRegionStateNode(ri);
rsn.setState(RegionState.State.SPLITTING);

// move table to group
Thread t2 = new Thread(() -> {
LOG.info("thread2 start running, to move regions");
try {
rsGroupAdmin.moveTables(Sets.newHashSet(tableName), newGroup.getName());
} catch (IOException e) {
LOG.error("move server error", e);
}
});
t2.start();

// start thread to recover region state
final ServerName ss = srcServer;
final String sregion = rregion;
AtomicBoolean changed = new AtomicBoolean(false);
Thread t1 = new Thread(() -> {
LOG.info("thread1 start running, will recover region state");
long current = System.currentTimeMillis();
while (System.currentTimeMillis() - current <= 50000) {
List<RegionInfo> regions = master.getAssignmentManager().getRegionsOnServer(ss);
List<RegionInfo> tableRegions = new ArrayList<>();
for (RegionInfo regionInfo : regions) {
if (regionInfo.getTable().equals(tableName)) {
tableRegions.add(regionInfo);
}
}
LOG.debug("server table region size is:{}", tableRegions.size());
assert tableRegions.size() >= 1;
// when there is exactly one region left, we can determine the move operation encountered
// exception caused by the strange region state.
if (tableRegions.size() == 1) {
assertEquals(tableRegions.get(0).getRegionNameAsString(), sregion);
rsn.setState(RegionState.State.OPEN);
LOG.info("set region {} state OPEN", sregion);
changed.set(true);
break;
}
sleep(5000);
}
});
t1.start();

t1.join();
t2.join();

TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
@Override
public boolean evaluate() {
if (changed.get()) {
boolean serverHasTableRegions = false;
for (RegionInfo regionInfo : master.getAssignmentManager().getRegionsOnServer(ss)) {
if (regionInfo.getTable().equals(tableName)) {
serverHasTableRegions = true;
break;
}
}
return !serverHasTableRegions && !rsn.getRegionLocation().equals(ss);
}
return false;
}
});
}

}