diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 21fc291d9253..4f0e0dbb3db0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -136,6 +136,7 @@
 import org.apache.hadoop.hbase.master.cleaner.SnapshotCleanerChore;
 import org.apache.hadoop.hbase.master.hbck.HbckChore;
 import org.apache.hadoop.hbase.master.http.MasterDumpServlet;
+import org.apache.hadoop.hbase.master.http.MasterHealthServlet;
 import org.apache.hadoop.hbase.master.http.MasterRedirectServlet;
 import org.apache.hadoop.hbase.master.http.MasterStatusServlet;
 import org.apache.hadoop.hbase.master.http.api_v1.ResourceConfigFactory;
@@ -744,6 +745,11 @@ protected Class<? extends HttpServlet> getDumpServlet() {
     return MasterDumpServlet.class;
   }
 
+  @Override
+  protected Class<? extends HttpServlet> getHealthServlet() {
+    return MasterHealthServlet.class;
+  }
+
   @Override
   public MetricsMaster getMasterMetrics() {
     return metricsMaster;
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/http/MasterHealthServlet.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/http/MasterHealthServlet.java
new file mode 100644
index 000000000000..fc677de080dc
--- /dev/null
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/http/MasterHealthServlet.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.http;
+
+import java.io.IOException;
+import java.util.EnumSet;
+import java.util.Optional;
+import javax.servlet.http.HttpServletRequest;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.ClusterMetrics;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.client.ConnectionFactory;
+import org.apache.hadoop.hbase.client.RpcConnectionRegistry;
+import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.monitoring.HealthCheckServlet;
+import org.apache.yetus.audience.InterfaceAudience;
+
+/**
+ * Health endpoint for the master. After the generic checks in {@link HealthCheckServlet} pass,
+ * it opens a short-lived client connection bootstrapped from this master's own address and
+ * issues RPCs through it.
+ */
+@InterfaceAudience.Private
+public class MasterHealthServlet extends HealthCheckServlet<HMaster> {
+
+  // Keys read from the master's configuration to override the probe connection's client
+  // settings; the *_DEFAULT values apply when a key is unset.
+  private static final String CLIENT_RPC_TIMEOUT = "healthcheck.hbase.client.rpc.timeout";
+  private static final int CLIENT_RPC_TIMEOUT_DEFAULT = 5000;
+  private static final String CLIENT_RETRIES = "healthcheck.hbase.client.retries";
+  private static final int CLIENT_RETRIES_DEFAULT = 2;
+  private static final String CLIENT_OPERATION_TIMEOUT =
+    "healthcheck.hbase.client.operation.timeout";
+  private static final int CLIENT_OPERATION_TIMEOUT_DEFAULT = 15000;
+
+  public MasterHealthServlet() {
+    super(HMaster.MASTER);
+  }
+
+  /**
+   * Probes this master over RPC.
+   * @param master the master hosting this servlet
+   * @param req    the incoming request (unused)
+   * @return always empty, so the caller's default message is used
+   * @throws IOException if the cluster id cannot be fetched or an RPC fails
+   */
+  @Override
+  protected Optional<String> check(HMaster master, HttpServletRequest req) throws IOException {
+    // Work on a copy so the overrides below apply to this probe connection only.
+    Configuration conf = new Configuration(master.getConfiguration());
+    conf.set(HConstants.CLIENT_CONNECTION_REGISTRY_IMPL_CONF_KEY,
+      RpcConnectionRegistry.class.getName());
+    conf.set(RpcConnectionRegistry.BOOTSTRAP_NODES, master.getServerName().getAddress().toString());
+    conf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY,
+      conf.getInt(CLIENT_RPC_TIMEOUT, CLIENT_RPC_TIMEOUT_DEFAULT));
+    conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
+      conf.getInt(CLIENT_RETRIES, CLIENT_RETRIES_DEFAULT));
+    conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT,
+      conf.getInt(CLIENT_OPERATION_TIMEOUT, CLIENT_OPERATION_TIMEOUT_DEFAULT));
+
+    try (Connection conn = ConnectionFactory.createConnection(conf)) {
+      // this will fail if the server is not accepting requests
+      if (conn.getClusterId() == null) {
+        throw new IOException("Could not retrieve clusterId from self via rpc");
+      }
+
+      if (master.isActiveMaster() && master.isOnline()) {
+        // this will fail if there is a problem with the active master
+        try (Admin admin = conn.getAdmin()) {
+          admin.getClusterMetrics(EnumSet.of(ClusterMetrics.Option.CLUSTER_ID));
+        }
+      }
+    }
+
+    return Optional.empty();
+  }
+}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/monitoring/HealthCheckServlet.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/monitoring/HealthCheckServlet.java
new file mode 100644
index 000000000000..b7ec6239e451
--- /dev/null
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/monitoring/HealthCheckServlet.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.monitoring;
+
+import java.io.IOException;
+import java.util.Optional;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.yetus.audience.InterfaceAudience;
+
+/**
+ * Base class for the {@code /health} servlets. It looks the server up in the servlet context,
+ * runs checks common to every server type followed by the subclass's {@link #check}, and answers
+ * 200 with a message on success or 500 with the exception text on any failure.
+ * @param <T> the server type being checked
+ */
+@InterfaceAudience.Private
+public abstract class HealthCheckServlet<T extends HRegionServer> extends HttpServlet {
+
+  private final String serverLookupKey;
+
+  /**
+   * @param serverLookupKey servlet-context attribute name under which the server is registered
+   */
+  public HealthCheckServlet(String serverLookupKey) {
+    this.serverLookupKey = serverLookupKey;
+  }
+
+  @SuppressWarnings("unchecked")
+  @Override
+  protected void doGet(HttpServletRequest req, HttpServletResponse resp)
+    throws ServletException, IOException {
+    T server = (T) getServletContext().getAttribute(serverLookupKey);
+    try {
+      checkGeneric(server);
+      Optional<String> message = check(server, req);
+      resp.setStatus(HttpServletResponse.SC_OK);
+      resp.getWriter().write(message.orElse("ok"));
+    } catch (Exception e) {
+      // Any failure, checked or not, is reported to the caller as an unhealthy server.
+      resp.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
+      resp.getWriter().write(e.toString());
+    } finally {
+      resp.getWriter().close();
+    }
+  }
+
+  /** Checks that apply to every server type: it exists, is not going down, and RPC is up. */
+  private void checkGeneric(T server) throws IOException {
+    if (server == null) {
+      throw new IOException("Unable to get access to " + serverLookupKey);
+    }
+    if (server.isAborted() || server.isStopped() || server.isStopping() || server.isKilled()) {
+      throw new IOException("The " + serverLookupKey + " is stopping!");
+    }
+    if (!server.getRpcServer().isStarted()) {
+      throw new IOException("The " + serverLookupKey + "'s RpcServer is not started");
+    }
+  }
+
+  /**
+   * Server-specific health check, run after the generic checks have passed.
+   * @param server the server to inspect, never null here
+   * @param req    the request, for checks that accept query parameters
+   * @return a message for the response body, or empty to answer with "ok"
+   * @throws IOException if the server is unhealthy
+   */
+  protected abstract Optional<String> check(T server, HttpServletRequest req) throws IOException;
+}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 9005fdae5d99..f4304a96cde2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -154,6 +154,7 @@
 import org.apache.hadoop.hbase.regionserver.handler.RSProcedureHandler;
 import org.apache.hadoop.hbase.regionserver.handler.RegionReplicaFlushHandler;
 import org.apache.hadoop.hbase.regionserver.http.RSDumpServlet;
+import org.apache.hadoop.hbase.regionserver.http.RSHealthServlet;
 import org.apache.hadoop.hbase.regionserver.http.RSStatusServlet;
 import org.apache.hadoop.hbase.regionserver.throttle.FlushThroughputControllerFactory;
 import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController;
@@ -361,7 +362,7 @@ public class HRegionServer extends Thread
 
   // A state before we go into stopped state. At this stage we're closing user
   // space regions.
-  private boolean stopping = false;
+  private volatile boolean stopping = false;
   private volatile boolean killed = false;
   private volatile boolean shutDown = false;
 
@@ -850,6 +851,10 @@ protected Class<? extends HttpServlet> getDumpServlet() {
     return RSDumpServlet.class;
   }
 
+  protected Class<? extends HttpServlet> getHealthServlet() {
+    return RSHealthServlet.class;
+  }
+
   /**
    * Used by {@link RSDumpServlet} to generate debugging information.
    */
@@ -2329,6 +2334,7 @@ private void putUpWebUI() throws IOException {
       try {
         this.infoServer = new InfoServer(getProcessName(), addr, port, false, this.conf);
         infoServer.addPrivilegedServlet("dump", "/dump", getDumpServlet());
+        infoServer.addPrivilegedServlet("health", "/health", getHealthServlet());
         configureInfoServer();
         this.infoServer.start();
         break;
@@ -3051,6 +3057,10 @@ public boolean isStopping() {
     return this.stopping;
   }
 
+  public boolean isKilled() {
+    return this.killed;
+  }
+
   @Override
   public Configuration getConfiguration() {
     return conf;
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/http/RSHealthServlet.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/http/RSHealthServlet.java
new file mode 100644
index 000000000000..dcf9aec21843
--- /dev/null
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/http/RSHealthServlet.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver.http;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import javax.servlet.http.HttpServletRequest;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.hbase.monitoring.HealthCheckServlet;
+import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.yetus.audience.InterfaceAudience;
+
+/**
+ * Health endpoint for the region server. It remembers when each online region was first seen
+ * unavailable and fails once one has stayed that way longer than the optional
+ * {@code maxUnavailableMillis} request parameter allows.
+ */
+@InterfaceAudience.Private
+public class RSHealthServlet extends HealthCheckServlet<HRegionServer> {
+
+  // Encoded region name -> time a health check first observed the region as unavailable.
+  private final Map<String, Instant> regionUnavailableSince = new ConcurrentHashMap<>();
+
+  public RSHealthServlet() {
+    super(HRegionServer.REGIONSERVER);
+  }
+
+  /**
+   * Reports how many online regions are unavailable and for how long the oldest has been so.
+   * @param regionServer the region server hosting this servlet
+   * @param req          may carry {@code maxUnavailableMillis}; a missing or non-numeric value
+   *                     disables the limit
+   * @return "ok", followed by details when at least one region is unavailable
+   * @throws IOException if a region has been unavailable for longer than the limit
+   */
+  @Override
+  protected Optional<String> check(HRegionServer regionServer, HttpServletRequest req)
+    throws IOException {
+    long maxUnavailableMillis = Optional.ofNullable(req.getParameter("maxUnavailableMillis"))
+      .filter(StringUtils::isNumeric).map(Long::parseLong).orElse(Long.MAX_VALUE);
+
+    Instant oldestUnavailableSince = Instant.MAX;
+    String longestUnavailableRegion = null;
+    int unavailableCount = 0;
+    // Whatever is left in this set after the loop is no longer online here and gets forgotten.
+    Set<String> regionsPreviouslyUnavailable = new HashSet<>(regionUnavailableSince.keySet());
+
+    for (HRegion region : regionServer.getOnlineRegionsLocalContext()) {
+      String encodedName = region.getRegionInfo().getEncodedName();
+      regionsPreviouslyUnavailable.remove(encodedName);
+      if (!region.isAvailable()) {
+        unavailableCount++;
+        Instant unavailableSince =
+          regionUnavailableSince.computeIfAbsent(encodedName, k -> Instant.now());
+
+        if (unavailableSince.isBefore(oldestUnavailableSince)) {
+          oldestUnavailableSince = unavailableSince;
+          longestUnavailableRegion = encodedName;
+        }
+      } else {
+        regionUnavailableSince.remove(encodedName);
+      }
+    }
+
+    regionUnavailableSince.keySet().removeAll(regionsPreviouslyUnavailable);
+
+    if (unavailableCount == 0) {
+      // Nothing to time: Duration.between(Instant.MAX, now).toMillis() would overflow a long
+      // and fail the check on a healthy server.
+      return Optional.of("ok");
+    }
+
+    Duration longestUnavailableRegionTime = Duration.between(oldestUnavailableSince, Instant.now());
+    if (longestUnavailableRegionTime.toMillis() > maxUnavailableMillis) {
+      throw new IOException("Region " + longestUnavailableRegion
+        + " has been unavailable too long, since " + oldestUnavailableSince);
+    }
+
+    return Optional
+      .of("ok - unavailableRegions: " + unavailableCount + ", longestUnavailableDuration: "
+        + longestUnavailableRegionTime + ", longestUnavailableRegion: " + longestUnavailableRegion);
+  }
+}