diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon index dd10be45c936..a5d45760cc02 100644 --- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon +++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon @@ -580,9 +580,13 @@ AssignmentManager assignmentManager = master.getAssignmentManager(); <%java>String description = null; if (tableName.equals(TableName.META_TABLE_NAME)){ description = "The hbase:meta table holds references to all User Table regions."; - } else if (tableName.equals(CanaryTool.DEFAULT_WRITE_TABLE_NAME)){ - description = "The hbase:canary table is used to sniff the write availbility of" - + " each regionserver."; + } else if (tableName.getNameWithNamespaceInclAsString().startsWith(CanaryTool.DEFAULT_WRITE_TABLE_NAME_WITH_NAMESPACE_PREFIX)){ + description = "The " + tableName.getNameWithNamespaceInclAsString() + " table is used to sniff the write availbility of "; + if(tableName.getNameWithNamespaceInclAsString().startsWith(CanaryTool.DEFAULT_WRITE_TABLE_NAME_WITH_NAMESPACE_PREFIX + "_")){ + description = description + tableName.getNameWithNamespaceInclAsString().substring(CanaryTool.DEFAULT_WRITE_TABLE_NAME_WITH_NAMESPACE_PREFIX.length() + 1) + " RSGroup"; + }else{ + description = description + "null RSGroup"; + } } else if (tableName.equals(PermissionStorage.ACL_TABLE_NAME)){ description = "The hbase:acl table holds information about acl."; } else if (tableName.equals(VisibilityConstants.LABELS_TABLE_NAME)){ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java index 92dca7c24c92..0ae1abd5312d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java @@ -28,7 +28,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; -import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; @@ -48,18 +47,17 @@ import java.util.concurrent.atomic.LongAdder; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.AuthUtil; import org.apache.hadoop.hbase.ChoreService; import org.apache.hadoop.hbase.ClusterMetrics; -import org.apache.hadoop.hbase.ClusterMetrics.Option; import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HBaseInterfaceAudience; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HRegionLocation; -import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.NamespaceDescriptor; import org.apache.hadoop.hbase.ScheduledChore; import org.apache.hadoop.hbase.ServerName; @@ -75,6 +73,7 @@ import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionLocator; +import org.apache.hadoop.hbase.client.RegionStatesCount; import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.client.Table; @@ -82,10 +81,11 @@ import org.apache.hadoop.hbase.client.TableDescriptorBuilder; import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter; import org.apache.hadoop.hbase.http.InfoServer; +import org.apache.hadoop.hbase.net.Address; +import org.apache.hadoop.hbase.rsgroup.RSGroupInfo; import org.apache.hadoop.hbase.tool.CanaryTool.RegionTask.TaskType; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; -import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.ReflectionUtils; import org.apache.hadoop.hbase.util.RegionSplitter; import org.apache.hadoop.hbase.zookeeper.EmptyWatcher; @@ -303,6 +303,7 @@ public void publishReadTiming(String znode, String server, long msTime) { */ public static class RegionStdOutSink extends StdOutSink { private Map perTableReadLatency = new HashMap<>(); + private Map perTableWriteLatency = new HashMap<>(); private LongAdder writeLatency = new LongAdder(); private final ConcurrentMap> regionMap = new ConcurrentHashMap<>(); @@ -400,12 +401,22 @@ public Map getReadLatencyMap() { return this.perTableReadLatency; } + public Map getWriteLatencyMap() { + return this.perTableWriteLatency; + } + public LongAdder initializeAndGetReadLatencyForTable(String tableName) { LongAdder initLatency = new LongAdder(); this.perTableReadLatency.put(tableName, initLatency); return initLatency; } + public LongAdder initializeAndGetWriteLatencyForTable(String tableName) { + LongAdder initLatency = new LongAdder(); + this.perTableWriteLatency.put(tableName, initLatency); + return initLatency; + } + public void initializeWriteLatency() { this.writeLatency.reset(); } @@ -777,8 +788,10 @@ public Void call() { private static final Logger LOG = LoggerFactory.getLogger(Canary.class); - public static final TableName DEFAULT_WRITE_TABLE_NAME = - TableName.valueOf(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "canary"); + public static final String DEFAULT_WRITE_TABLE_NAME_PREFIX = "Canary"; + public static final String DEFAULT_WRITE_TABLE_NAME_WITH_NAMESPACE_PREFIX = + NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR + ":" + + CanaryTool.DEFAULT_WRITE_TABLE_NAME_PREFIX; private static final String CANARY_TABLE_FAMILY_NAME = "Test"; @@ -810,8 +823,6 @@ public Void call() { "hbase.canary.region.write.sniffing"; public static final String HBASE_CANARY_REGION_WRITE_TABLE_TIMEOUT = "hbase.canary.region.write.table.timeout"; - public static final String HBASE_CANARY_REGION_WRITE_TABLE_NAME = - "hbase.canary.region.write.table.name"; public static final String HBASE_CANARY_REGION_READ_TABLE_TIMEOUT = "hbase.canary.region.read.table.timeout"; @@ -938,14 +949,6 @@ private int parseArgs(String[] args) { printUsageAndExit(); } conf.setLong(HBASE_CANARY_REGION_WRITE_TABLE_TIMEOUT, configuredWriteTableTimeout); - } else if (cmd.equals("-writeTable")) { - i++; - - if (i == args.length) { - System.err.println("-writeTable takes a string tablename value argument."); - printUsageAndExit(); - } - conf.set(HBASE_CANARY_REGION_WRITE_TABLE_NAME, args[i]); } else if (cmd.equals("-f")) { i++; if (i == args.length) { @@ -1131,8 +1134,8 @@ private void printUsageAndExit() { System.err.println(" -f exit on first error; default=true"); System.err.println(" -failureAsError treat read/write failure as error"); System.err.println(" -t timeout for canary-test run; default=600000ms"); - System.err.println(" -writeSniffing enable write sniffing"); - System.err.println(" -writeTable the table used for write sniffing; default=hbase:canary"); + System.err.println(" -writeSniffing enable write sniffing, each rsgroup will create a default" + + " table, which tablename is hbase:Canary_rsgroupname"); System.err.println(" -writeTableTimeout timeout for writeTable; default=600000ms"); System.err.println( " -readTableTimeouts =," + "=,..."); @@ -1278,8 +1281,6 @@ private Monitor newMonitor(final Connection connection, String[] monitorTargets) boolean failOnError = conf.getBoolean(HBASE_CANARY_FAIL_ON_ERROR, true); int permittedFailures = conf.getInt(HBASE_CANARY_ZOOKEEPER_PERMITTED_FAILURES, 0); boolean writeSniffing = conf.getBoolean(HBASE_CANARY_REGION_WRITE_SNIFFING, false); - String writeTableName = - conf.get(HBASE_CANARY_REGION_WRITE_TABLE_NAME, DEFAULT_WRITE_TABLE_NAME.getNameAsString()); long configuredWriteTableTimeout = conf.getLong(HBASE_CANARY_REGION_WRITE_TABLE_TIMEOUT, DEFAULT_TIMEOUT); @@ -1295,8 +1296,8 @@ private Monitor newMonitor(final Connection connection, String[] monitorTargets) } else { monitor = new RegionMonitor(connection, monitorTargets, useRegExp, getSink(connection.getConfiguration(), RegionStdOutSink.class), this.executor, - writeSniffing, TableName.valueOf(writeTableName), failOnError, configuredReadTableTimeouts, - configuredWriteTableTimeout, permittedFailures); + writeSniffing, failOnError, configuredReadTableTimeouts, configuredWriteTableTimeout, + permittedFailures); } return monitor; } @@ -1416,7 +1417,6 @@ private static class RegionMonitor extends Monitor { private long lastCheckTime = -1; private boolean writeSniffing; - private TableName writeTableName; private int writeDataTTL; private float regionsLowerLimit; private float regionsUpperLimit; @@ -1433,14 +1433,13 @@ private static class RegionMonitor extends Monitor { private long configuredWriteTableTimeout; public RegionMonitor(Connection connection, String[] monitorTargets, boolean useRegExp, - Sink sink, ExecutorService executor, boolean writeSniffing, TableName writeTableName, - boolean treatFailureAsError, HashMap configuredReadTableTimeouts, - long configuredWriteTableTimeout, long allowedFailures) { + Sink sink, ExecutorService executor, boolean writeSniffing, boolean treatFailureAsError, + HashMap configuredReadTableTimeouts, long configuredWriteTableTimeout, + long allowedFailures) { super(connection, monitorTargets, useRegExp, sink, executor, treatFailureAsError, allowedFailures); Configuration conf = connection.getConfiguration(); this.writeSniffing = writeSniffing; - this.writeTableName = writeTableName; this.writeDataTTL = conf.getInt(HConstants.HBASE_CANARY_WRITE_DATA_TTL_KEY, DEFAULT_WRITE_DATA_TTL); this.regionsLowerLimit = @@ -1493,20 +1492,22 @@ public void run() { } if (writeSniffing) { + Set canaryTableList = null; if (EnvironmentEdgeManager.currentTime() - lastCheckTime > checkPeriod) { try { - checkWriteTableDistribution(); + canaryTableList = checkWriteTableDistribution(); } catch (IOException e) { LOG.error("Check canary table distribution failed!", e); } lastCheckTime = EnvironmentEdgeManager.currentTime(); } - // sniff canary table with write operation - regionSink.initializeWriteLatency(); - LongAdder writeTableLatency = regionSink.getWriteLatency(); - taskFutures - .addAll(CanaryTool.sniff(admin, regionSink, admin.getDescriptor(writeTableName), - executor, TaskType.WRITE, this.rawScanEnabled, writeTableLatency, readAllCF)); + for (TableName tableName : canaryTableList) { + // sniff canary table with write operation + LongAdder writeLatency = regionSink + .initializeAndGetWriteLatencyForTable(tableName.getNameWithNamespaceInclAsString()); + taskFutures.addAll(CanaryTool.sniff(admin, regionSink, admin.getDescriptor(tableName), + executor, TaskType.WRITE, this.rawScanEnabled, writeLatency, readAllCF)); + } } for (Future future : taskFutures) { @@ -1534,15 +1535,19 @@ public void run() { } } if (this.writeSniffing) { - String writeTableStringName = this.writeTableName.getNameAsString(); - long actualWriteLatency = regionSink.getWriteLatency().longValue(); - LOG.info("Write operation for {} took {}ms. Configured write timeout {}ms.", - writeTableStringName, actualWriteLatency, this.configuredWriteTableTimeout); - // Check that the writeTable write operation latency does not exceed the configured - // timeout. - if (actualWriteLatency > this.configuredWriteTableTimeout) { - LOG.error("Write operation for {} exceeded the configured write timeout.", - writeTableStringName); + Map actualWriteTableLatency = regionSink.getWriteLatencyMap(); + for (Map.Entry entry : actualWriteTableLatency.entrySet()) { + String tableName = entry.getKey(); + long actual = entry.getValue().longValue(); + if (actual > configuredWriteTableTimeout) { + LOG.error( + "Write operation for {} took {}ms exceeded the configured write timeout." + + "(Configured write timeout {}ms.", + tableName, actual, configuredWriteTableTimeout); + } else { + LOG.info("Write operation for {} took {}ms (Configured write timeout {}ms.", + tableName, actual, configuredWriteTableTimeout); + } } } } catch (Exception e) { @@ -1607,7 +1612,8 @@ private List> sniff(TaskType taskType, RegionStdOutSink regionSink) for (TableDescriptor td : admin.listTableDescriptors()) { if ( admin.tableExists(td.getTableName()) && admin.isTableEnabled(td.getTableName()) - && (!td.getTableName().equals(writeTableName)) + && (!td.getTableName().getNameWithNamespaceInclAsString() + .startsWith(DEFAULT_WRITE_TABLE_NAME_WITH_NAMESPACE_PREFIX)) ) { LongAdder readLatency = regionSink.initializeAndGetReadLatencyForTable(td.getTableName().getNameAsString()); @@ -1618,60 +1624,82 @@ private List> sniff(TaskType taskType, RegionStdOutSink regionSink) return taskFutures; } - private void checkWriteTableDistribution() throws IOException { - if (!admin.tableExists(writeTableName)) { - int numberOfServers = admin.getRegionServers().size(); - if (numberOfServers == 0) { - throw new IllegalStateException("No live regionservers"); + private Set checkWriteTableDistribution() throws IOException { + Set canaryTableList = new HashSet<>(); + ClusterMetrics clusterMetrics = admin.getClusterMetrics(); + Map groupNameServerNumsMap = new HashMap<>(); + for (ServerName serverName : clusterMetrics.getServersName()) { + RSGroupInfo rsGroup = + admin.getRSGroup(Address.fromParts(serverName.getHostname(), serverName.getPort())); + if (rsGroup == null) { + continue; + } + String groupName = rsGroup.getName(); + Integer count = groupNameServerNumsMap.get(groupName); + if (count == null) { + groupNameServerNumsMap.put(groupName, 1); + } else { + groupNameServerNumsMap.put(groupName, ++count); } - createWriteTable(numberOfServers); - } - - if (!admin.isTableEnabled(writeTableName)) { - admin.enableTable(writeTableName); } - ClusterMetrics status = - admin.getClusterMetrics(EnumSet.of(Option.SERVERS_NAME, Option.MASTER)); - int numberOfServers = status.getServersName().size(); - if (status.getServersName().contains(status.getMasterName())) { - numberOfServers -= 1; + if (groupNameServerNumsMap.isEmpty()) { + throw new IllegalStateException("No live regionservers"); } - List> pairs = - MetaTableAccessor.getTableRegionsAndLocations(connection, writeTableName); - int numberOfRegions = pairs.size(); - if ( - numberOfRegions < numberOfServers * regionsLowerLimit - || numberOfRegions > numberOfServers * regionsUpperLimit - ) { - admin.disableTable(writeTableName); - admin.deleteTable(writeTableName); - createWriteTable(numberOfServers); - } - HashSet serverSet = new HashSet<>(); - for (Pair pair : pairs) { - serverSet.add(pair.getSecond()); - } - int numberOfCoveredServers = serverSet.size(); - if (numberOfCoveredServers < numberOfServers) { - admin.balance(); + Map tableStates = clusterMetrics.getTableRegionStatesCount(); + for (Map.Entry entry : groupNameServerNumsMap.entrySet()) { + String group = entry.getKey(); + int numberOfServers = entry.getValue(); + String tableName = + DEFAULT_WRITE_TABLE_NAME_PREFIX + (StringUtils.isBlank(group) ? "" : ("_" + group)); + TableName canaryTable = + TableName.valueOf(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, tableName); + RegionStatesCount states = tableStates.get(canaryTable); + int regions = 0; + if (states == null) { + createWriteTable(canaryTable, group, numberOfServers); + } else { + if (states.getTotalRegions() <= 0) { + admin.enableTable(canaryTable); + regions = admin.getRegions(canaryTable).size(); + } else { + regions = states.getTotalRegions(); + } + if ( + regions < numberOfServers * regionsLowerLimit + || regions > numberOfServers * regionsUpperLimit + ) { + admin.disableTable(canaryTable); + admin.deleteTable(canaryTable); + createWriteTable(canaryTable, group, numberOfServers); + } + } + canaryTableList.add(canaryTable); } + return canaryTableList; } - private void createWriteTable(int numberOfServers) throws IOException { + private void createWriteTable(TableName tableName, String group, int numberOfServers) + throws IOException { int numberOfRegions = (int) (numberOfServers * regionsLowerLimit); - LOG.info("Number of live regionservers {}, pre-splitting the canary table into {} regions " - + "(current lower limit of regions per server is {} and you can change it with config {}).", + LOG.info( + "Number of live regionservers {} " + + (StringUtils.isBlank(group) ? "" : (" of group " + group)) + + ", pre-splitting the canary table into {} regions (current" + + " lower limit of regions per server is {} and you can change it with config {}).", numberOfServers, numberOfRegions, regionsLowerLimit, HConstants.HBASE_CANARY_WRITE_PERSERVER_REGIONS_LOWERLIMIT_KEY); ColumnFamilyDescriptor family = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes(CANARY_TABLE_FAMILY_NAME)) .setMaxVersions(1).setTimeToLive(writeDataTTL).build(); - TableDescriptor desc = - TableDescriptorBuilder.newBuilder(writeTableName).setColumnFamily(family).build(); + TableDescriptorBuilder tableDescriptorBuilder = + TableDescriptorBuilder.newBuilder(tableName).setColumnFamily(family); + if (StringUtils.isNotBlank(group) && !group.equals("default")) { + tableDescriptorBuilder.setRegionServerGroup(group); + } byte[][] splits = new RegionSplitter.HexStringSplit().split(numberOfRegions); - admin.createTable(desc, splits); + admin.createTable(tableDescriptorBuilder.build(), splits); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java index 93c147f3f96c..bf8b6b57658d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java @@ -221,7 +221,7 @@ public void testCanaryRegionTaskResult() throws Exception { assertNotNull("verify getColumnFamily()", res.getColumnFamily()); assertNotNull("verify getColumnFamilyNameAsString()", res.getColumnFamilyNameAsString()); - if (regionName.contains(CanaryTool.DEFAULT_WRITE_TABLE_NAME.getNameAsString())) { + if (regionName.contains(CanaryTool.DEFAULT_WRITE_TABLE_NAME_WITH_NAMESPACE_PREFIX)) { assertTrue("write to region " + regionName + " succeeded", res.isWriteSuccess()); assertTrue("write took some time", res.getWriteLatency() > -1); } else {