From 51eef0d0ab9ab7fbfad3c0b241749947c8e98c61 Mon Sep 17 00:00:00 2001 From: Nick Dimiduk Date: Thu, 19 May 2022 12:57:42 +0200 Subject: [PATCH 1/2] HBASE-26366 Provide meaningful parent spans to ZK interactions Signed-off-by: Andrew Purtell --- .../apache/hadoop/hbase/HBaseServerBase.java | 108 +++--- .../hadoop/hbase/MetaRegionLocationCache.java | 5 +- .../apache/hadoop/hbase/master/HMaster.java | 88 ++--- .../hbase/master/HMasterCommandLine.java | 13 +- .../hbase/master/RegionServerTracker.java | 5 +- .../hbase/regionserver/HRegionServer.java | 281 +++++++++------- .../HRegionServerCommandLine.java | 9 +- .../TestRegionServerInternalsTracing.java | 314 ++++++++++++++++++ .../hbase/zookeeper/MasterAddressTracker.java | 13 +- .../hbase/zookeeper/RecoverableZooKeeper.java | 37 +-- .../hadoop/hbase/zookeeper/ZKWatcher.java | 66 ++-- 11 files changed, 661 insertions(+), 278 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerInternalsTracing.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/HBaseServerBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/HBaseServerBase.java index a4168d058abc..625139964ab1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/HBaseServerBase.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/HBaseServerBase.java @@ -21,6 +21,8 @@ import static org.apache.hadoop.hbase.HConstants.HBASE_SPLIT_WAL_COORDINATED_BY_ZK; import com.google.errorprone.annotations.RestrictedApi; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.context.Scope; import java.io.IOException; import java.lang.management.MemoryType; import java.net.BindException; @@ -57,6 +59,7 @@ import org.apache.hadoop.hbase.security.UserProvider; import org.apache.hadoop.hbase.security.access.AccessChecker; import org.apache.hadoop.hbase.security.access.ZKPermissionWatcher; +import org.apache.hadoop.hbase.trace.TraceUtil; import org.apache.hadoop.hbase.unsafe.HBasePlatformDependent; import org.apache.hadoop.hbase.util.Addressing; import org.apache.hadoop.hbase.util.CommonFSUtils; @@ -231,60 +234,67 @@ protected final void initializeFileSystem() throws IOException { !canUpdateTableDescriptor(), cacheTableDescriptor()); } - public HBaseServerBase(Configuration conf, String name) - throws ZooKeeperConnectionException, IOException { + public HBaseServerBase(Configuration conf, String name) throws IOException { super(name); // thread name - this.conf = conf; - this.eventLoopGroupConfig = - NettyEventLoopGroupConfig.setup(conf, getClass().getSimpleName() + "-EventLoopGroup"); - this.startcode = EnvironmentEdgeManager.currentTime(); - this.userProvider = UserProvider.instantiate(conf); - this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000); - this.sleeper = new Sleeper(this.msgInterval, this); - this.namedQueueRecorder = createNamedQueueRecord(); - this.rpcServices = createRpcServices(); - useThisHostnameInstead = getUseThisHostnameInstead(conf); - InetSocketAddress addr = rpcServices.getSocketAddress(); - String hostName = StringUtils.isBlank(useThisHostnameInstead) - ? 
addr.getHostName() - : this.useThisHostnameInstead; - serverName = ServerName.valueOf(hostName, addr.getPort(), this.startcode); - // login the zookeeper client principal (if using security) - ZKAuthentication.loginClient(this.conf, HConstants.ZK_CLIENT_KEYTAB_FILE, - HConstants.ZK_CLIENT_KERBEROS_PRINCIPAL, hostName); - // login the server principal (if using secure Hadoop) - login(userProvider, hostName); - // init superusers and add the server principal (if using security) - // or process owner as default super user. - Superusers.initialize(conf); - zooKeeper = - new ZKWatcher(conf, getProcessName() + ":" + addr.getPort(), this, canCreateBaseZNode()); - - this.configurationManager = new ConfigurationManager(); - setupWindows(conf, configurationManager); - - initializeFileSystem(); - - this.choreService = new ChoreService(getName(), true); - this.executorService = new ExecutorService(getName()); - - this.metaRegionLocationCache = new MetaRegionLocationCache(zooKeeper); - - if (clusterMode()) { - if ( - conf.getBoolean(HBASE_SPLIT_WAL_COORDINATED_BY_ZK, DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK) - ) { - csm = new ZkCoordinatedStateManager(this); + final Span span = TraceUtil.createSpan("HBaseServerBase.cxtor"); + try (Scope ignored = span.makeCurrent()) { + this.conf = conf; + this.eventLoopGroupConfig = + NettyEventLoopGroupConfig.setup(conf, getClass().getSimpleName() + "-EventLoopGroup"); + this.startcode = EnvironmentEdgeManager.currentTime(); + this.userProvider = UserProvider.instantiate(conf); + this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000); + this.sleeper = new Sleeper(this.msgInterval, this); + this.namedQueueRecorder = createNamedQueueRecord(); + this.rpcServices = createRpcServices(); + useThisHostnameInstead = getUseThisHostnameInstead(conf); + InetSocketAddress addr = rpcServices.getSocketAddress(); + String hostName = StringUtils.isBlank(useThisHostnameInstead) + ? addr.getHostName() + : this.useThisHostnameInstead; + serverName = ServerName.valueOf(hostName, addr.getPort(), this.startcode); + // login the zookeeper client principal (if using security) + ZKAuthentication.loginClient(this.conf, HConstants.ZK_CLIENT_KEYTAB_FILE, + HConstants.ZK_CLIENT_KERBEROS_PRINCIPAL, hostName); + // login the server principal (if using secure Hadoop) + login(userProvider, hostName); + // init superusers and add the server principal (if using security) + // or process owner as default super user. 
+ Superusers.initialize(conf); + zooKeeper = + new ZKWatcher(conf, getProcessName() + ":" + addr.getPort(), this, canCreateBaseZNode()); + + this.configurationManager = new ConfigurationManager(); + setupWindows(conf, configurationManager); + + initializeFileSystem(); + + this.choreService = new ChoreService(getName(), true); + this.executorService = new ExecutorService(getName()); + + this.metaRegionLocationCache = new MetaRegionLocationCache(zooKeeper); + + if (clusterMode()) { + if ( + conf.getBoolean(HBASE_SPLIT_WAL_COORDINATED_BY_ZK, DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK) + ) { + csm = new ZkCoordinatedStateManager(this); + } else { + csm = null; + } + clusterStatusTracker = new ClusterStatusTracker(zooKeeper, this); + clusterStatusTracker.start(); } else { csm = null; + clusterStatusTracker = null; } - clusterStatusTracker = new ClusterStatusTracker(zooKeeper, this); - clusterStatusTracker.start(); - } else { - csm = null; - clusterStatusTracker = null; + putUpWebUI(); + } catch (Throwable t) { + TraceUtil.setError(span, t); + throw t; + } finally { + span.end(); } - putUpWebUI(); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/MetaRegionLocationCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/MetaRegionLocationCache.java index b4ab1f28944f..b144342236af 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/MetaRegionLocationCache.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/MetaRegionLocationCache.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hbase; +import io.opentelemetry.context.Context; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -91,8 +92,8 @@ public MetaRegionLocationCache(ZKWatcher zkWatcher) { ThreadFactory threadFactory = new ThreadFactoryBuilder().setDaemon(true).build(); RetryCounterFactory retryFactory = new RetryCounterFactory(Integer.MAX_VALUE, SLEEP_INTERVAL_MS_BETWEEN_RETRIES, SLEEP_INTERVAL_MS_MAX); - threadFactory.newThread(() -> loadMetaLocationsFromZk(retryFactory.create(), ZNodeOpType.INIT)) - .start(); + threadFactory.newThread(Context.current() + .wrap(() -> loadMetaLocationsFromZk(retryFactory.create(), ZNodeOpType.INIT))).start(); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 42367857db10..239e30e7c171 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -24,6 +24,8 @@ import static org.apache.hadoop.hbase.util.DNS.MASTER_HOSTNAME_KEY; import com.google.errorprone.annotations.RestrictedApi; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.context.Scope; import java.io.IOException; import java.io.InterruptedIOException; import java.lang.reflect.Constructor; @@ -221,6 +223,7 @@ import org.apache.hadoop.hbase.security.SecurityConstants; import org.apache.hadoop.hbase.security.Superusers; import org.apache.hadoop.hbase.security.UserProvider; +import org.apache.hadoop.hbase.trace.TraceUtil; import org.apache.hadoop.hbase.util.Addressing; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.CommonFSUtils; @@ -471,7 +474,8 @@ public class HMaster extends HBaseServerBase implements Maste */ public HMaster(final Configuration conf) throws IOException { super(conf, "Master"); - try { + final Span span = TraceUtil.createSpan("HMaster.cxtor"); + try (Scope ignored = span.makeCurrent()) { if 
(conf.getBoolean(MAINTENANCE_MODE, false)) { LOG.info("Detected {}=true via configuration.", MAINTENANCE_MODE); maintenanceMode = true; @@ -534,8 +538,11 @@ public HMaster(final Configuration conf) throws IOException { } catch (Throwable t) { // Make sure we log the exception. HMaster is often started via reflection and the // cause of failed startup is lost. + TraceUtil.setError(span, t); LOG.error("Failed construction of Master", t); throw t; + } finally { + span.end(); } } @@ -565,7 +572,7 @@ public void run() { try { installShutdownHook(); registerConfigurationObservers(); - Threads.setDaemonThreadRunning(new Thread(() -> { + Threads.setDaemonThreadRunning(new Thread(() -> TraceUtil.trace(() -> { try { int infoPort = putUpJettyServer(); startActiveMasterManager(infoPort); @@ -578,17 +585,22 @@ public void run() { abort(error, t); } } - }), getName() + ":becomeActiveMaster"); + }, "HMaster.becomeActiveMaster")), getName() + ":becomeActiveMaster"); while (!isStopped() && !isAborted()) { sleeper.sleep(); } - stopInfoServer(); - closeClusterConnection(); - stopServiceThreads(); - if (this.rpcServices != null) { - this.rpcServices.stop(); + final Span span = TraceUtil.createSpan("HMaster exiting main loop"); + try (Scope ignored = span.makeCurrent()) { + stopInfoServer(); + closeClusterConnection(); + stopServiceThreads(); + if (this.rpcServices != null) { + this.rpcServices.stop(); + } + closeZooKeeper(); + } finally { + span.end(); } - closeZooKeeper(); } finally { if (this.clusterSchemaService != null) { // If on way out, then we are no longer active master. @@ -3089,36 +3101,38 @@ public MemoryBoundedLogMessageBuffer getRegionServerFatalLogBuffer() { * Shutdown the cluster. Master runs a coordinated stop of all RegionServers and then itself. */ public void shutdown() throws IOException { - if (cpHost != null) { - cpHost.preShutdown(); - } + TraceUtil.trace(() -> { + if (cpHost != null) { + cpHost.preShutdown(); + } - // Tell the servermanager cluster shutdown has been called. This makes it so when Master is - // last running server, it'll stop itself. Next, we broadcast the cluster shutdown by setting - // the cluster status as down. RegionServers will notice this change in state and will start - // shutting themselves down. When last has exited, Master can go down. - if (this.serverManager != null) { - this.serverManager.shutdownCluster(); - } - if (this.clusterStatusTracker != null) { - try { - this.clusterStatusTracker.setClusterDown(); - } catch (KeeperException e) { - LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e); + // Tell the servermanager cluster shutdown has been called. This makes it so when Master is + // last running server, it'll stop itself. Next, we broadcast the cluster shutdown by setting + // the cluster status as down. RegionServers will notice this change in state and will start + // shutting themselves down. When last has exited, Master can go down. + if (this.serverManager != null) { + this.serverManager.shutdownCluster(); } - } - // Stop the procedure executor. Will stop any ongoing assign, unassign, server crash etc., - // processing so we can go down. - if (this.procedureExecutor != null) { - this.procedureExecutor.stop(); - } - // Shutdown our cluster connection. This will kill any hosted RPCs that might be going on; - // this is what we want especially if the Master is in startup phase doing call outs to - // hbase:meta, etc. when cluster is down. Without ths connection close, we'd have to wait on - // the rpc to timeout. 
- if (this.asyncClusterConnection != null) { - this.asyncClusterConnection.close(); - } + if (this.clusterStatusTracker != null) { + try { + this.clusterStatusTracker.setClusterDown(); + } catch (KeeperException e) { + LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e); + } + } + // Stop the procedure executor. Will stop any ongoing assign, unassign, server crash etc., + // processing so we can go down. + if (this.procedureExecutor != null) { + this.procedureExecutor.stop(); + } + // Shutdown our cluster connection. This will kill any hosted RPCs that might be going on; + // this is what we want especially if the Master is in startup phase doing call outs to + // hbase:meta, etc. when cluster is down. Without ths connection close, we'd have to wait on + // the rpc to timeout. + if (this.asyncClusterConnection != null) { + this.asyncClusterConnection.close(); + } + }, "HMaster.shutdown"); } public void stopMaster() throws IOException { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java index da2c5bc11e80..822e4e27cee6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java @@ -17,6 +17,8 @@ */ package org.apache.hadoop.hbase.master; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.context.Scope; import java.io.File; import java.io.IOException; import java.util.List; @@ -30,13 +32,13 @@ import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.ConnectionFactory; import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.trace.TraceUtil; import org.apache.hadoop.hbase.util.JVMClusterUtil; import org.apache.hadoop.hbase.util.ServerCommandLine; import org.apache.hadoop.hbase.zookeeper.MiniZooKeeperCluster; import org.apache.hadoop.hbase.zookeeper.ZKAuthentication; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.yetus.audience.InterfaceAudience; -import org.apache.zookeeper.KeeperException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -161,7 +163,8 @@ public int run(String args[]) throws Exception { private int startMaster() { Configuration conf = getConf(); - try { + final Span span = TraceUtil.createSpan("HMasterCommandLine.startMaster"); + try (Scope ignored = span.makeCurrent()) { // If 'local', defer to LocalHBaseCluster instance. Starts master // and regionserver both in the one JVM. 
if (LocalHBaseCluster.isLocal(conf)) { @@ -251,8 +254,11 @@ private int startMaster() { if (master.isAborted()) throw new RuntimeException("HMaster Aborted"); } } catch (Throwable t) { + TraceUtil.setError(span, t); LOG.error("Master exiting", t); return 1; + } finally { + span.end(); } return 0; } @@ -310,8 +316,7 @@ private void waitOnMasterThreads(LocalHBaseCluster cluster) throws InterruptedEx public static class LocalHMaster extends HMaster { private MiniZooKeeperCluster zkcluster = null; - public LocalHMaster(Configuration conf) - throws IOException, KeeperException, InterruptedException { + public LocalHMaster(Configuration conf) throws IOException { super(conf); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java index 645690c135e1..418fc4187adb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hbase.master; +import io.opentelemetry.context.Context; import java.io.IOException; import java.io.InterruptedIOException; import java.util.Collections; @@ -70,8 +71,8 @@ public class RegionServerTracker extends ZKListener { public RegionServerTracker(ZKWatcher watcher, MasterServices server) { super(watcher); this.server = server; - this.executor = Executors.newSingleThreadExecutor( - new ThreadFactoryBuilder().setDaemon(true).setNameFormat("RegionServerTracker-%d").build()); + this.executor = Context.current().wrap(Executors.newSingleThreadExecutor( + new ThreadFactoryBuilder().setDaemon(true).setNameFormat("RegionServerTracker-%d").build())); watcher.registerListener(this); refresh(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 7dc474dcdb2a..70300c237eb2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -23,6 +23,8 @@ import static org.apache.hadoop.hbase.HConstants.HBASE_SPLIT_WAL_MAX_SPLITTER; import static org.apache.hadoop.hbase.util.DNS.UNSAFE_RS_HOSTNAME_KEY; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.context.Scope; import java.io.IOException; import java.io.PrintWriter; import java.lang.management.MemoryUsage; @@ -137,6 +139,7 @@ import org.apache.hadoop.hbase.security.Superusers; import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.security.UserProvider; +import org.apache.hadoop.hbase.trace.TraceUtil; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.CompressionTest; import org.apache.hadoop.hbase.util.CoprocessorConfigurationUtil; @@ -471,7 +474,8 @@ public class HRegionServer extends HBaseServerBase */ public HRegionServer(final Configuration conf) throws IOException { super(conf, "RegionServer"); // thread name - try { + final Span span = TraceUtil.createSpan("HRegionServer.cxtor"); + try (Scope ignored = span.makeCurrent()) { this.dataFsOk = true; this.masterless = !clusterMode(); MemorySizeUtil.checkForClusterFreeHeapMemoryLimit(this.conf); @@ -516,8 +520,11 @@ public HRegionServer(final Configuration conf) throws IOException { } catch (Throwable t) { // Make sure we log the exception. 
HRegionServer is often started via reflection and the // cause of failed startup is lost. + TraceUtil.setError(span, t); LOG.error("Failed construction RegionServer", t); throw t; + } finally { + span.end(); } } @@ -645,7 +652,8 @@ public String getClusterId() { * In here we just put up the RpcServer, setup Connection, and ZooKeeper. */ private void preRegistrationInitialization() { - try { + final Span span = TraceUtil.createSpan("HRegionServer.preRegistrationInitialization"); + try (Scope ignored = span.makeCurrent()) { initializeZooKeeper(); setupClusterConnection(); bootstrapNodeManager = new BootstrapNodeManager(asyncClusterConnection, masterAddressTracker); @@ -655,8 +663,11 @@ private void preRegistrationInitialization() { } catch (Throwable t) { // Call stop if error or process will stick around for ever since server // puts up non-daemon threads. + TraceUtil.setError(span, t); this.rpcServices.stop(); abort("Initialization of RS failed. Hence aborting RS.", t); + } finally { + span.end(); } } @@ -766,35 +777,39 @@ public void run() { // start up all Services. Use RetryCounter to get backoff in case Master is struggling to // come up. LOG.debug("About to register with Master."); - RetryCounterFactory rcf = - new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5); - RetryCounter rc = rcf.create(); - while (keepLooping()) { - RegionServerStartupResponse w = reportForDuty(); - if (w == null) { - long sleepTime = rc.getBackoffTimeAndIncrementAttempts(); - LOG.warn("reportForDuty failed; sleeping {} ms and then retrying.", sleepTime); - this.sleeper.sleep(sleepTime); - } else { - handleReportForDutyResponse(w); - break; + TraceUtil.trace(() -> { + RetryCounterFactory rcf = + new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5); + RetryCounter rc = rcf.create(); + while (keepLooping()) { + RegionServerStartupResponse w = reportForDuty(); + if (w == null) { + long sleepTime = rc.getBackoffTimeAndIncrementAttempts(); + LOG.warn("reportForDuty failed; sleeping {} ms and then retrying.", sleepTime); + this.sleeper.sleep(sleepTime); + } else { + handleReportForDutyResponse(w); + break; + } } - } + }, "HRegionServer.registerWithMaster"); } if (!isStopped() && isHealthy()) { - // start the snapshot handler and other procedure handlers, - // since the server is ready to run - if (this.rspmHost != null) { - this.rspmHost.start(); - } - // Start the Quota Manager - if (this.rsQuotaManager != null) { - rsQuotaManager.start(getRpcServer().getScheduler()); - } - if (this.rsSpaceQuotaManager != null) { - this.rsSpaceQuotaManager.start(); - } + TraceUtil.trace(() -> { + // start the snapshot handler and other procedure handlers, + // since the server is ready to run + if (this.rspmHost != null) { + this.rspmHost.start(); + } + // Start the Quota Manager + if (this.rsQuotaManager != null) { + rsQuotaManager.start(getRpcServer().getScheduler()); + } + if (this.rsSpaceQuotaManager != null) { + this.rsSpaceQuotaManager.start(); + } + }, "HRegionServer.startup"); } // We registered with the Master. Go into run mode. @@ -845,119 +860,124 @@ public void run() { } } - if (this.leaseManager != null) { - this.leaseManager.closeAfterLeasesExpire(); - } - if (this.splitLogWorker != null) { - splitLogWorker.stop(); - } - stopInfoServer(); - // Send cache a shutdown. 
- if (blockCache != null) { - blockCache.shutdown(); - } - if (mobFileCache != null) { - mobFileCache.shutdown(); - } - - // Send interrupts to wake up threads if sleeping so they notice shutdown. - // TODO: Should we check they are alive? If OOME could have exited already - if (this.hMemManager != null) { - this.hMemManager.stop(); - } - if (this.cacheFlusher != null) { - this.cacheFlusher.interruptIfNecessary(); - } - if (this.compactSplitThread != null) { - this.compactSplitThread.interruptIfNecessary(); - } + final Span span = TraceUtil.createSpan("HRegionServer exiting main loop"); + try (Scope ignored = span.makeCurrent()) { + if (this.leaseManager != null) { + this.leaseManager.closeAfterLeasesExpire(); + } + if (this.splitLogWorker != null) { + splitLogWorker.stop(); + } + stopInfoServer(); + // Send cache a shutdown. + if (blockCache != null) { + blockCache.shutdown(); + } + if (mobFileCache != null) { + mobFileCache.shutdown(); + } - // Stop the snapshot and other procedure handlers, forcefully killing all running tasks - if (rspmHost != null) { - rspmHost.stop(this.abortRequested.get() || this.killed); - } + // Send interrupts to wake up threads if sleeping so they notice shutdown. + // TODO: Should we check they are alive? If OOME could have exited already + if (this.hMemManager != null) { + this.hMemManager.stop(); + } + if (this.cacheFlusher != null) { + this.cacheFlusher.interruptIfNecessary(); + } + if (this.compactSplitThread != null) { + this.compactSplitThread.interruptIfNecessary(); + } - if (this.killed) { - // Just skip out w/o closing regions. Used when testing. - } else if (abortRequested.get()) { - if (this.dataFsOk) { - closeUserRegions(abortRequested.get()); // Don't leave any open file handles + // Stop the snapshot and other procedure handlers, forcefully killing all running tasks + if (rspmHost != null) { + rspmHost.stop(this.abortRequested.get() || this.killed); } - LOG.info("aborting server " + this.serverName); - } else { - closeUserRegions(abortRequested.get()); - LOG.info("stopping server " + this.serverName); - } - regionReplicationBufferManager.stop(); - closeClusterConnection(); - // Closing the compactSplit thread before closing meta regions - if (!this.killed && containsMetaTableRegions()) { - if (!abortRequested.get() || this.dataFsOk) { - if (this.compactSplitThread != null) { - this.compactSplitThread.join(); - this.compactSplitThread = null; + + if (this.killed) { + // Just skip out w/o closing regions. Used when testing. 
+ } else if (abortRequested.get()) { + if (this.dataFsOk) { + closeUserRegions(abortRequested.get()); // Don't leave any open file handles + } + LOG.info("aborting server " + this.serverName); + } else { + closeUserRegions(abortRequested.get()); + LOG.info("stopping server " + this.serverName); + } + regionReplicationBufferManager.stop(); + closeClusterConnection(); + // Closing the compactSplit thread before closing meta regions + if (!this.killed && containsMetaTableRegions()) { + if (!abortRequested.get() || this.dataFsOk) { + if (this.compactSplitThread != null) { + this.compactSplitThread.join(); + this.compactSplitThread = null; + } + closeMetaTableRegions(abortRequested.get()); } - closeMetaTableRegions(abortRequested.get()); } - } - if (!this.killed && this.dataFsOk) { - waitOnAllRegionsToClose(abortRequested.get()); - LOG.info("stopping server " + this.serverName + "; all regions closed."); - } + if (!this.killed && this.dataFsOk) { + waitOnAllRegionsToClose(abortRequested.get()); + LOG.info("stopping server " + this.serverName + "; all regions closed."); + } - // Stop the quota manager - if (rsQuotaManager != null) { - rsQuotaManager.stop(); - } - if (rsSpaceQuotaManager != null) { - rsSpaceQuotaManager.stop(); - rsSpaceQuotaManager = null; - } + // Stop the quota manager + if (rsQuotaManager != null) { + rsQuotaManager.stop(); + } + if (rsSpaceQuotaManager != null) { + rsSpaceQuotaManager.stop(); + rsSpaceQuotaManager = null; + } - // flag may be changed when closing regions throws exception. - if (this.dataFsOk) { - shutdownWAL(!abortRequested.get()); - } + // flag may be changed when closing regions throws exception. + if (this.dataFsOk) { + shutdownWAL(!abortRequested.get()); + } - // Make sure the proxy is down. - if (this.rssStub != null) { - this.rssStub = null; - } - if (this.lockStub != null) { - this.lockStub = null; - } - if (this.rpcClient != null) { - this.rpcClient.close(); - } - if (this.leaseManager != null) { - this.leaseManager.close(); - } - if (this.pauseMonitor != null) { - this.pauseMonitor.stop(); - } + // Make sure the proxy is down. + if (this.rssStub != null) { + this.rssStub = null; + } + if (this.lockStub != null) { + this.lockStub = null; + } + if (this.rpcClient != null) { + this.rpcClient.close(); + } + if (this.leaseManager != null) { + this.leaseManager.close(); + } + if (this.pauseMonitor != null) { + this.pauseMonitor.stop(); + } - if (!killed) { - stopServiceThreads(); - } + if (!killed) { + stopServiceThreads(); + } - if (this.rpcServices != null) { - this.rpcServices.stop(); - } + if (this.rpcServices != null) { + this.rpcServices.stop(); + } - try { - deleteMyEphemeralNode(); - } catch (KeeperException.NoNodeException nn) { - // pass - } catch (KeeperException e) { - LOG.warn("Failed deleting my ephemeral node", e); - } - // We may have failed to delete the znode at the previous step, but - // we delete the file anyway: a second attempt to delete the znode is likely to fail again. - ZNodeClearer.deleteMyEphemeralNodeOnDisk(); + try { + deleteMyEphemeralNode(); + } catch (KeeperException.NoNodeException nn) { + // pass + } catch (KeeperException e) { + LOG.warn("Failed deleting my ephemeral node", e); + } + // We may have failed to delete the znode at the previous step, but + // we delete the file anyway: a second attempt to delete the znode is likely to fail again. 
+ ZNodeClearer.deleteMyEphemeralNodeOnDisk(); - closeZooKeeper(); - LOG.info("Exiting; stopping=" + this.serverName + "; zookeeper connection closed."); + closeZooKeeper(); + LOG.info("Exiting; stopping=" + this.serverName + "; zookeeper connection closed."); + } finally { + span.end(); + } } private boolean containsMetaTableRegions() { @@ -998,7 +1018,8 @@ protected void tryRegionServerReport(long reportStartTime, long reportEndTime) return; } ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime); - try { + final Span span = TraceUtil.createSpan("HRegionServer.tryRegionServerReport"); + try (Scope ignored = span.makeCurrent()) { RegionServerReportRequest.Builder request = RegionServerReportRequest.newBuilder(); request.setServer(ProtobufUtil.toServerName(this.serverName)); request.setLoad(sl); @@ -1007,14 +1028,18 @@ protected void tryRegionServerReport(long reportStartTime, long reportEndTime) IOException ioe = ProtobufUtil.getRemoteException(se); if (ioe instanceof YouAreDeadException) { // This will be caught and handled as a fatal error in run() + TraceUtil.setError(span, ioe); throw ioe; } if (rssStub == rss) { rssStub = null; } + TraceUtil.setError(span, se); // Couldn't connect to the master, get location from zk and reconnect // Method blocks until new master is found or we are stopped createRegionServerStatusStub(true); + } finally { + span.end(); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServerCommandLine.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServerCommandLine.java index 3b21171c28e7..f17d79db5ea3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServerCommandLine.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServerCommandLine.java @@ -17,9 +17,12 @@ */ package org.apache.hadoop.hbase.regionserver; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.context.Scope; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.LocalHBaseCluster; +import org.apache.hadoop.hbase.trace.TraceUtil; import org.apache.hadoop.hbase.util.ServerCommandLine; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; @@ -47,7 +50,8 @@ protected String getUsage() { private int start() throws Exception { Configuration conf = getConf(); - try { + final Span span = TraceUtil.createSpan("HRegionServerCommandLine.start"); + try (Scope ignored = span.makeCurrent()) { // If 'local', don't start a region server here. Defer to // LocalHBaseCluster. It manages 'local' clusters. if (LocalHBaseCluster.isLocal(conf)) { @@ -63,8 +67,11 @@ private int start() throws Exception { } } } catch (Throwable t) { + TraceUtil.setError(span, t); LOG.error("Region server exiting", t); return 1; + } finally { + span.end(); } return 0; } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerInternalsTracing.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerInternalsTracing.java new file mode 100644 index 000000000000..7b7c4b6feadf --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerInternalsTracing.java @@ -0,0 +1,314 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.regionserver; + +import static org.apache.hadoop.hbase.client.trace.hamcrest.SpanDataMatchers.hasName; +import static org.apache.hadoop.hbase.client.trace.hamcrest.SpanDataMatchers.hasParentSpanId; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.allOf; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.endsWith; +import static org.hamcrest.Matchers.hasItem; +import static org.hamcrest.Matchers.hasItems; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.startsWith; + +import io.opentelemetry.sdk.trace.data.SpanData; +import java.util.List; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.MiniClusterRule; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.trace.StringTraceRenderer; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.testclassification.RegionServerTests; +import org.apache.hadoop.hbase.trace.OpenTelemetryClassRule; +import org.hamcrest.Matcher; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.ExternalResource; +import org.junit.rules.RuleChain; +import org.junit.rules.TestRule; +import org.junit.runners.model.Statement; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Test that sundry operations internal to the region server are traced as expected. + */ +@Category({ MediumTests.class, RegionServerTests.class }) +public class TestRegionServerInternalsTracing { + private static final Logger LOG = LoggerFactory.getLogger(TestRegionServerInternalsTracing.class); + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestRegionServerInternalsTracing.class); + + private static final String NO_PARENT_ID = "0000000000000000"; + private static List spans; + + /** + * Wait for the underlying cluster to come up -- defined by meta being available. 
+ */ + private static class Setup extends ExternalResource { + private final Supplier testingUtilSupplier; + + public Setup(final Supplier testingUtilSupplier) { + this.testingUtilSupplier = testingUtilSupplier; + } + + @Override + protected void before() throws Throwable { + final HBaseTestingUtil testingUtil = testingUtilSupplier.get(); + testingUtil.waitTableAvailable(TableName.META_TABLE_NAME); + } + } + + private static class Noop extends Statement { + @Override + public void evaluate() throws Throwable { + } + } + + @ClassRule + public static TestRule classRule = (base, description) -> new Statement() { + @Override + public void evaluate() throws Throwable { + // setup and tear down the cluster, collecting all the spans produced in the process. + final OpenTelemetryClassRule otelClassRule = OpenTelemetryClassRule.create(); + final MiniClusterRule miniClusterRule = MiniClusterRule.newBuilder().build(); + final Setup setup = new Setup(miniClusterRule::getTestingUtility); + final TestRule clusterRule = + RuleChain.outerRule(otelClassRule).around(miniClusterRule).around(setup); + clusterRule.apply(new Noop(), description).evaluate(); + spans = otelClassRule.getSpans(); + if (LOG.isDebugEnabled()) { + StringTraceRenderer renderer = new StringTraceRenderer(spans); + renderer.render(LOG::debug); + } + base.evaluate(); + } + }; + + @Test + public void testHBaseServerBaseConstructor() { + final Matcher hbaseServerBaseMatcher = + allOf(hasName("HBaseServerBase.cxtor"), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span from the HBaseServerBase constructor.", spans, + hasItem(hbaseServerBaseMatcher)); + final List hbaseServerBaseSpans = + spans.stream().filter(hbaseServerBaseMatcher::matches).collect(Collectors.toList()); + assertThat("there should be two spans from the HBaseServerBase constructor, one for the master" + + " and one for the region server.", hbaseServerBaseSpans, hasSize(2)); + assertThat("both should show zookeeper interaction.", spans, + hasItems( + allOf(hasParentSpanId(hbaseServerBaseSpans.get(0)), + hasName(containsString("RecoverableZookeeper."))), + allOf(hasParentSpanId(hbaseServerBaseSpans.get(1)), + hasName(containsString("RecoverableZookeeper."))))); + } + + @Test + public void testHMasterConstructor() { + final Matcher masterConstructorMatcher = + allOf(hasName("HMaster.cxtor"), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span from the HMaster constructor.", spans, + hasItem(masterConstructorMatcher)); + final SpanData masterConstructorSpan = spans.stream().filter(masterConstructorMatcher::matches) + .findAny().orElseThrow(AssertionError::new); + assertThat("the HMaster constructor span should show zookeeper interaction.", spans, hasItem( + allOf(hasName(startsWith("RecoverableZookeeper.")), hasParentSpanId(masterConstructorSpan)))); + } + + @Test + public void testHMasterBecomeActiveMaster() { + final Matcher masterBecomeActiveMasterMatcher = + allOf(hasName("HMaster.becomeActiveMaster"), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span from the HMaster.becomeActiveMaster.", spans, + hasItem(masterBecomeActiveMasterMatcher)); + final SpanData masterBecomeActiveMasterSpan = spans.stream() + .filter(masterBecomeActiveMasterMatcher::matches).findAny().orElseThrow(AssertionError::new); + assertThat("the HMaster.becomeActiveMaster span should show zookeeper interaction.", spans, + hasItem(allOf(hasName(startsWith("RecoverableZookeeper.")), + hasParentSpanId(masterBecomeActiveMasterSpan)))); + assertThat("the 
HMaster.becomeActiveMaster span should show Region interaction.", spans, + hasItem( + allOf(hasName(startsWith("Region.")), hasParentSpanId(masterBecomeActiveMasterSpan)))); + assertThat("the HMaster.becomeActiveMaster span should show RegionScanner interaction.", spans, + hasItem(allOf(hasName(startsWith("RegionScanner.")), + hasParentSpanId(masterBecomeActiveMasterSpan)))); + assertThat("the HMaster.becomeActiveMaster span should show hbase:meta interaction.", spans, + hasItem(allOf(hasName(containsString("hbase:meta")), + hasParentSpanId(masterBecomeActiveMasterSpan)))); + assertThat("the HMaster.becomeActiveMaster span should show WAL interaction.", spans, + hasItem(allOf(hasName(startsWith("WAL.")), hasParentSpanId(masterBecomeActiveMasterSpan)))); + } + + @Test + public void testZKWatcherHMaster() { + final Matcher mZKWatcherMatcher = + allOf(hasName(startsWith("ZKWatcher-master")), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span from the ZKWatcher running in the HMaster.", spans, + hasItem(mZKWatcherMatcher)); + final SpanData mZKWatcherSpan = + spans.stream().filter(mZKWatcherMatcher::matches).findAny().orElseThrow(AssertionError::new); + assertThat("the ZKWatcher running in the HMaster span should invoke processEvent.", spans, + hasItem(allOf(hasName(containsString("processEvent")), hasParentSpanId(mZKWatcherSpan)))); + } + + @Test + public void testHMasterShutdown() { + final Matcher masterShutdownMatcher = + allOf(hasName("HMaster.shutdown"), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span from the HMaster.shutdown.", spans, + hasItem(masterShutdownMatcher)); + final SpanData masterShutdownSpan = spans.stream().filter(masterShutdownMatcher::matches) + .findAny().orElseThrow(AssertionError::new); + assertThat("the HMaster.shutdown span should show zookeeper interaction.", spans, hasItem( + allOf(hasName(startsWith("RecoverableZookeeper.")), hasParentSpanId(masterShutdownSpan)))); + assertThat("the HMaster.shutdown span should show AsyncConnection interaction.", spans, + hasItem(allOf(hasName(startsWith("AsyncConnection.")), hasParentSpanId(masterShutdownSpan)))); + } + + @Test + public void testHMasterExitingMainLoop() { + final Matcher masterExitingMainLoopMatcher = + allOf(hasName("HMaster exiting main loop"), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span from the HMaster exiting main loop.", spans, + hasItem(masterExitingMainLoopMatcher)); + final SpanData masterExitingMainLoopSpan = spans.stream() + .filter(masterExitingMainLoopMatcher::matches).findAny().orElseThrow(AssertionError::new); + assertThat("the HMaster exiting main loop span should show zookeeper interaction.", spans, + hasItem(allOf(hasName(startsWith("RecoverableZookeeper.")), + hasParentSpanId(masterExitingMainLoopSpan)))); + assertThat("the HMaster exiting main loop span should show WAL interaction.", spans, + hasItem(allOf(hasName(startsWith("WAL.")), hasParentSpanId(masterExitingMainLoopSpan)))); + assertThat("the HMaster exiting main loop span should show AsyncConnection interaction.", spans, + hasItem(allOf(hasName(startsWith("AsyncConnection.")), + hasParentSpanId(masterExitingMainLoopSpan)))); + } + + @Test + public void testTryRegionServerReport() { + final Matcher tryRegionServerReportMatcher = + allOf(hasName("HRegionServer.tryRegionServerReport"), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span for the region server sending a report.", spans, + hasItem(tryRegionServerReportMatcher)); + final SpanData 
tryRegionServerReportSpan = spans.stream() + .filter(tryRegionServerReportMatcher::matches).findAny().orElseThrow(AssertionError::new); + assertThat( + "the region server report span should have an invocation of the RegionServerReport RPC.", + spans, hasItem(allOf(hasName(endsWith("RegionServerStatusService/RegionServerReport")), + hasParentSpanId(tryRegionServerReportSpan)))); + } + + @Test + public void testHRegionServerStartup() { + final Matcher regionServerStartupMatcher = + allOf(hasName("HRegionServer.startup"), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span from the HRegionServer startup procedure.", spans, + hasItem(regionServerStartupMatcher)); + final SpanData regionServerStartupSpan = spans.stream() + .filter(regionServerStartupMatcher::matches).findAny().orElseThrow(AssertionError::new); + assertThat("the HRegionServer startup procedure span should show zookeeper interaction.", spans, + hasItem(allOf(hasName(startsWith("RecoverableZookeeper.")), + hasParentSpanId(regionServerStartupSpan)))); + } + + @Test + public void testHRegionServerConstructor() { + final Matcher rsConstructorMatcher = + allOf(hasName("HRegionServer.cxtor"), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span from the HRegionServer constructor.", spans, + hasItem(rsConstructorMatcher)); + final SpanData rsConstructorSpan = spans.stream().filter(rsConstructorMatcher::matches) + .findAny().orElseThrow(AssertionError::new); + assertThat("the HRegionServer constructor span should show zookeeper interaction.", spans, + hasItem( + allOf(hasName(startsWith("RecoverableZookeeper.")), hasParentSpanId(rsConstructorSpan)))); + assertThat("the HRegionServer constructor span should invoke the MasterAddressTracker.", spans, + hasItem( + allOf(hasName(startsWith("MasterAddressTracker.")), hasParentSpanId(rsConstructorSpan)))); + } + + @Test + public void testHRegionServerPreRegistrationInitialization() { + final Matcher rsPreRegistrationInitializationMatcher = + allOf(hasName("HRegionServer.preRegistrationInitialization"), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span from the HRegionServer preRegistrationInitialization.", + spans, hasItem(rsPreRegistrationInitializationMatcher)); + final SpanData rsPreRegistrationInitializationSpan = + spans.stream().filter(rsPreRegistrationInitializationMatcher::matches).findAny() + .orElseThrow(AssertionError::new); + assertThat( + "the HRegionServer preRegistrationInitialization span should show zookeeper interaction.", + spans, hasItem(allOf(hasName(startsWith("RecoverableZookeeper.")), + hasParentSpanId(rsPreRegistrationInitializationSpan)))); + } + + @Test + public void testHRegionServerRegisterWithMaster() { + final Matcher rsRegisterWithMasterMatcher = + allOf(hasName("HRegionServer.registerWithMaster"), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span from the HRegionServer registerWithMaster.", spans, + hasItem(rsRegisterWithMasterMatcher)); + final SpanData rsRegisterWithMasterSpan = spans.stream() + .filter(rsRegisterWithMasterMatcher::matches).findAny().orElseThrow(AssertionError::new); + assertThat("the HRegionServer registerWithMaster span should show zookeeper interaction.", + spans, hasItem(allOf(hasName(startsWith("RecoverableZookeeper.")), + hasParentSpanId(rsRegisterWithMasterSpan)))); + assertThat( + "the HRegionServer registerWithMaster span should have an invocation of the" + + " RegionServerStartup RPC.", + spans, 
hasItem(allOf(hasName(endsWith("RegionServerStatusService/RegionServerStartup")), + hasParentSpanId(rsRegisterWithMasterSpan)))); + } + + @Test + public void testZKWatcherRegionServer() { + final Matcher rsZKWatcherMatcher = + allOf(hasName(startsWith("ZKWatcher-regionserver")), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span from the ZKWatcher running in the HRegionServer.", spans, + hasItem(rsZKWatcherMatcher)); + final SpanData rsZKWatcherSpan = + spans.stream().filter(rsZKWatcherMatcher::matches).findAny().orElseThrow(AssertionError::new); + assertThat("the ZKWatcher running in the HRegionServer span should invoke processEvent.", spans, + hasItem(allOf(hasName(containsString("processEvent")), hasParentSpanId(rsZKWatcherSpan)))); + } + + @Test + public void testHRegionServerExitingMainLoop() { + final Matcher rsExitingMainLoopMatcher = + allOf(hasName("HRegionServer exiting main loop"), hasParentSpanId(NO_PARENT_ID)); + assertThat("there should be a span from the HRegionServer exiting main loop.", spans, + hasItem(rsExitingMainLoopMatcher)); + final SpanData rsExitingMainLoopSpan = spans.stream().filter(rsExitingMainLoopMatcher::matches) + .findAny().orElseThrow(AssertionError::new); + assertThat("the HRegionServer exiting main loop span should show zookeeper interaction.", spans, + hasItem(allOf(hasName(startsWith("RecoverableZookeeper.")), + hasParentSpanId(rsExitingMainLoopSpan)))); + assertThat("the HRegionServer exiting main loop span should show AsyncConnection interaction.", + spans, hasItem( + allOf(hasName(startsWith("AsyncConnection.")), hasParentSpanId(rsExitingMainLoopSpan)))); + assertThat("the HRegionServer exiting main loop span should invoke CloseMetaHandler.", spans, + hasItem(allOf(hasName("CloseMetaHandler"), hasParentSpanId(rsExitingMainLoopSpan)))); + } +} diff --git a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/MasterAddressTracker.java b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/MasterAddressTracker.java index 649178f7a01f..840ee2d215f6 100644 --- a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/MasterAddressTracker.java +++ b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/MasterAddressTracker.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.exceptions.DeserializationException; +import org.apache.hadoop.hbase.trace.TraceUtil; import org.apache.yetus.audience.InterfaceAudience; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.data.Stat; @@ -71,11 +72,13 @@ public MasterAddressTracker(ZKWatcher watcher, Abortable abortable) { } private void loadBackupMasters() { - try { - backupMasters = Collections.unmodifiableList(getBackupMastersAndRenewWatch(watcher)); - } catch (InterruptedIOException e) { - abortable.abort("Unexpected exception handling nodeChildrenChanged event", e); - } + TraceUtil.trace(() -> { + try { + backupMasters = Collections.unmodifiableList(getBackupMastersAndRenewWatch(watcher)); + } catch (InterruptedIOException e) { + abortable.abort("Unexpected exception handling nodeChildrenChanged event", e); + } + }, "MasterAddressTracker.loadBackupMasters"); } @Override diff --git a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java index a80d9b1fe0d3..dc301bf2b807 100644 --- 
a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java +++ b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java @@ -200,8 +200,8 @@ public synchronized void reconnectAfterExpiration() * throw NoNodeException if the path does not exist. */ public void delete(String path, int version) throws InterruptedException, KeeperException { - Span span = TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.delete").startSpan(); - try (Scope scope = span.makeCurrent()) { + final Span span = TraceUtil.createSpan("RecoverableZookeeper.delete"); + try (Scope ignored = span.makeCurrent()) { RetryCounter retryCounter = retryCounterFactory.create(); boolean isRetry = false; // False for first attempt, true for all retries. while (true) { @@ -247,8 +247,8 @@ public Stat exists(String path, Watcher watcher) throws KeeperException, Interru private Stat exists(String path, Watcher watcher, Boolean watch) throws InterruptedException, KeeperException { - Span span = TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.exists").startSpan(); - try (Scope scope = span.makeCurrent()) { + final Span span = TraceUtil.createSpan("RecoverableZookeeper.exists"); + try (Scope ignored = span.makeCurrent()) { RetryCounter retryCounter = retryCounterFactory.create(); while (true) { try { @@ -306,9 +306,8 @@ public List getChildren(String path, Watcher watcher) private List getChildren(String path, Watcher watcher, Boolean watch) throws InterruptedException, KeeperException { - Span span = - TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.getChildren").startSpan(); - try (Scope scope = span.makeCurrent()) { + final Span span = TraceUtil.createSpan("RecoverableZookeeper.getChildren"); + try (Scope ignored = span.makeCurrent()) { RetryCounter retryCounter = retryCounterFactory.create(); while (true) { try { @@ -357,8 +356,8 @@ public byte[] getData(String path, Watcher watcher, Stat stat) private byte[] getData(String path, Watcher watcher, Boolean watch, Stat stat) throws InterruptedException, KeeperException { - Span span = TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.getData").startSpan(); - try (Scope scope = span.makeCurrent()) { + final Span span = TraceUtil.createSpan("RecoverableZookeeper.getData"); + try (Scope ignored = span.makeCurrent()) { RetryCounter retryCounter = retryCounterFactory.create(); while (true) { try { @@ -404,8 +403,8 @@ public byte[] getData(String path, boolean watch, Stat stat) */ public Stat setData(String path, byte[] data, int version) throws KeeperException, InterruptedException { - Span span = TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.setData").startSpan(); - try (Scope scope = span.makeCurrent()) { + final Span span = TraceUtil.createSpan("RecoverableZookeeper.setData"); + try (Scope ignored = span.makeCurrent()) { RetryCounter retryCounter = retryCounterFactory.create(); byte[] newData = ZKMetadata.appendMetaData(id, data); boolean isRetry = false; @@ -452,8 +451,8 @@ public Stat setData(String path, byte[] data, int version) * @return list of ACLs */ public List getAcl(String path, Stat stat) throws KeeperException, InterruptedException { - Span span = TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.getAcl").startSpan(); - try (Scope scope = span.makeCurrent()) { + final Span span = TraceUtil.createSpan("RecoverableZookeeper.getAcl"); + try (Scope ignored = span.makeCurrent()) { RetryCounter retryCounter = retryCounterFactory.create(); 
while (true) { try { @@ -483,8 +482,8 @@ public List getAcl(String path, Stat stat) throws KeeperException, Interrup */ public Stat setAcl(String path, List acls, int version) throws KeeperException, InterruptedException { - Span span = TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.setAcl").startSpan(); - try (Scope scope = span.makeCurrent()) { + final Span span = TraceUtil.createSpan("RecoverableZookeeper.setAcl"); + try (Scope ignored = span.makeCurrent()) { RetryCounter retryCounter = retryCounterFactory.create(); while (true) { try { @@ -520,8 +519,8 @@ public Stat setAcl(String path, List acls, int version) */ public String create(String path, byte[] data, List acl, CreateMode createMode) throws KeeperException, InterruptedException { - Span span = TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.create").startSpan(); - try (Scope scope = span.makeCurrent()) { + final Span span = TraceUtil.createSpan("RecoverableZookeeper.create"); + try (Scope ignored = span.makeCurrent()) { byte[] newData = ZKMetadata.appendMetaData(id, data); switch (createMode) { case EPHEMERAL: @@ -646,8 +645,8 @@ private Iterable prepareZKMulti(Iterable ops) throws UnsupportedOperatio * Run multiple operations in a transactional manner. Retry before throwing exception */ public List multi(Iterable ops) throws KeeperException, InterruptedException { - Span span = TraceUtil.getGlobalTracer().spanBuilder("RecoverableZookeeper.multi").startSpan(); - try (Scope scope = span.makeCurrent()) { + final Span span = TraceUtil.createSpan("RecoverableZookeeper.multi"); + try (Scope ignored = span.makeCurrent()) { RetryCounter retryCounter = retryCounterFactory.create(); Iterable multiOps = prepareZKMulti(ops); while (true) { diff --git a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKWatcher.java b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKWatcher.java index b80dd699cca4..5e6b56a98b65 100644 --- a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKWatcher.java +++ b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKWatcher.java @@ -34,6 +34,7 @@ import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ZooKeeperConnectionException; import org.apache.hadoop.hbase.security.Superusers; +import org.apache.hadoop.hbase.trace.TraceUtil; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.security.UserGroupInformation; @@ -552,45 +553,47 @@ public ZNodePaths getZNodePaths() { } private void processEvent(WatchedEvent event) { - switch (event.getType()) { - // If event type is NONE, this is a connection status change - case None: { - connectionEvent(event); - break; - } + TraceUtil.trace(() -> { + switch (event.getType()) { + // If event type is NONE, this is a connection status change + case None: { + connectionEvent(event); + break; + } - // Otherwise pass along to the listeners - case NodeCreated: { - for (ZKListener listener : listeners) { - listener.nodeCreated(event.getPath()); + // Otherwise pass along to the listeners + case NodeCreated: { + for (ZKListener listener : listeners) { + listener.nodeCreated(event.getPath()); + } + break; } - break; - } - case NodeDeleted: { - for (ZKListener listener : listeners) { - listener.nodeDeleted(event.getPath()); + case NodeDeleted: { + for (ZKListener listener : listeners) { + listener.nodeDeleted(event.getPath()); + } + break; } - break; - } - case NodeDataChanged: { - for (ZKListener 
listener : listeners) { - listener.nodeDataChanged(event.getPath()); + case NodeDataChanged: { + for (ZKListener listener : listeners) { + listener.nodeDataChanged(event.getPath()); + } + break; } - break; - } - case NodeChildrenChanged: { - for (ZKListener listener : listeners) { - listener.nodeChildrenChanged(event.getPath()); + case NodeChildrenChanged: { + for (ZKListener listener : listeners) { + listener.nodeChildrenChanged(event.getPath()); + } + break; } - break; + default: + LOG.error("Invalid event of type {} received for path {}. Ignoring.", event.getState(), + event.getPath()); } - default: - LOG.error("Invalid event of type {} received for path {}. Ignoring.", event.getState(), - event.getPath()); - } + }, "ZKWatcher.processEvent: " + event.getType() + " " + event.getPath()); } /** @@ -602,7 +605,8 @@ private void processEvent(WatchedEvent event) { public void process(WatchedEvent event) { LOG.debug(prefix("Received ZooKeeper Event, " + "type=" + event.getType() + ", " + "state=" + event.getState() + ", " + "path=" + event.getPath())); - zkEventProcessor.submit(() -> processEvent(event)); + final String spanName = ZKWatcher.class.getSimpleName() + "-" + identifier; + zkEventProcessor.submit(() -> TraceUtil.trace(() -> processEvent(event), spanName)); } // Connection management From b933fca0996e9bf69295162e803dac8e4bdf4114 Mon Sep 17 00:00:00 2001 From: Nick Dimiduk Date: Fri, 3 Jun 2022 16:44:24 +0200 Subject: [PATCH 2/2] PR feedback, also... - decouple spawned thread spans from parent thread spans, just in case - ensure all manually-managed spans have a status set - rename/move test to better match its scope --- .../trace/hamcrest/SpanDataMatchers.java | 22 +++--- .../org/apache/hadoop/hbase/ChoreService.java | 6 +- .../apache/hadoop/hbase/trace/TraceUtil.java | 31 ++++++++ .../apache/hadoop/hbase/HBaseServerBase.java | 2 + .../hadoop/hbase/MetaRegionLocationCache.java | 72 ++++++++++--------- .../apache/hadoop/hbase/master/HMaster.java | 5 +- .../hbase/master/HMasterCommandLine.java | 2 + .../hbase/master/RegionServerTracker.java | 45 +++++++----- .../hbase/regionserver/HRegionServer.java | 5 ++ .../HRegionServerCommandLine.java | 2 + ...g.java => TestServerInternalsTracing.java} | 60 +++++++++------- .../hbase/zookeeper/RecoverableZooKeeper.java | 34 ++++++++- .../hadoop/hbase/zookeeper/ZKWatcher.java | 2 +- 13 files changed, 190 insertions(+), 98 deletions(-) rename hbase-server/src/test/java/org/apache/hadoop/hbase/{regionserver/TestRegionServerInternalsTracing.java => TestServerInternalsTracing.java} (87%) diff --git a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/trace/hamcrest/SpanDataMatchers.java b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/trace/hamcrest/SpanDataMatchers.java index 6d0468c32ed5..d021f4d3aaf0 100644 --- a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/trace/hamcrest/SpanDataMatchers.java +++ b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/trace/hamcrest/SpanDataMatchers.java @@ -26,7 +26,6 @@ import io.opentelemetry.api.trace.StatusCode; import io.opentelemetry.sdk.trace.data.EventData; import io.opentelemetry.sdk.trace.data.SpanData; -import io.opentelemetry.sdk.trace.data.StatusData; import io.opentelemetry.semconv.trace.attributes.SemanticAttributes; import java.time.Duration; import java.util.Objects; @@ -144,23 +143,20 @@ protected String featureValueOf(SpanData item) { }; } - public static Matcher hasStatusWithCode(StatusCode statusCode) { - final Matcher matcher = is(equalTo(statusCode)); - 
return new TypeSafeMatcher() { - @Override - protected boolean matchesSafely(SpanData item) { - final StatusData statusData = item.getStatus(); - return statusData != null && statusData.getStatusCode() != null - && matcher.matches(statusData.getStatusCode()); - } - + public static Matcher hasStatusWithCode(Matcher matcher) { + return new FeatureMatcher(matcher, "SpanData with StatusCode that", + "statusWithCode") { @Override - public void describeTo(Description description) { - description.appendText("SpanData with StatusCode that ").appendDescriptionOf(matcher); + protected StatusCode featureValueOf(SpanData actual) { + return actual.getStatus().getStatusCode(); } }; } + public static Matcher hasStatusWithCode(StatusCode statusCode) { + return hasStatusWithCode(is(equalTo(statusCode))); + } + public static Matcher hasTraceId(String traceId) { return hasTraceId(is(equalTo(traceId))); } diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/ChoreService.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/ChoreService.java index 939d75fd7295..1bba8d49120f 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/ChoreService.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/ChoreService.java @@ -26,6 +26,7 @@ import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.ThreadFactory; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.hadoop.hbase.trace.TraceUtil; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -163,8 +164,9 @@ public boolean scheduleChore(ScheduledChore chore) { chore.getChoreService().cancelChore(chore); } chore.setChoreService(this); - ScheduledFuture future = scheduler.scheduleAtFixedRate(chore, chore.getInitialDelay(), - chore.getPeriod(), chore.getTimeUnit()); + ScheduledFuture future = + scheduler.scheduleAtFixedRate(TraceUtil.tracedRunnable(chore, chore.getName()), + chore.getInitialDelay(), chore.getPeriod(), chore.getTimeUnit()); scheduledChores.put(chore, future); return true; } catch (Exception e) { diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/trace/TraceUtil.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/trace/TraceUtil.java index 7dc24a54ab66..5b1fb86a351a 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/trace/TraceUtil.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/trace/TraceUtil.java @@ -135,6 +135,31 @@ private static void endSpan(CompletableFuture future, Span span) { }); } + /** + * Wrap the provided {@code runnable} in a {@link Runnable} that is traced. + */ + public static Runnable tracedRunnable(final Runnable runnable, final String spanName) { + return tracedRunnable(runnable, () -> createSpan(spanName)); + } + + /** + * Wrap the provided {@code runnable} in a {@link Runnable} that is traced. + */ + public static Runnable tracedRunnable(final Runnable runnable, + final Supplier spanSupplier) { + // N.B. This method name follows the convention of this class, i.e., tracedFuture, rather than + // the convention of the OpenTelemetry classes, i.e., Context#wrap. + return () -> { + final Span span = spanSupplier.get(); + try (final Scope ignored = span.makeCurrent()) { + runnable.run(); + span.setStatus(StatusCode.OK); + } finally { + span.end(); + } + }; + } + /** * A {@link Runnable} that may also throw. * @param the type of {@link Throwable} that can be produced. 
@@ -144,11 +169,17 @@ public interface ThrowingRunnable { void run() throws T; } + /** + * Trace the execution of {@code runnable}. + */ public static void trace(final ThrowingRunnable runnable, final String spanName) throws T { trace(runnable, () -> createSpan(spanName)); } + /** + * Trace the execution of {@code runnable}. + */ public static void trace(final ThrowingRunnable runnable, final Supplier spanSupplier) throws T { Span span = spanSupplier.get(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/HBaseServerBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/HBaseServerBase.java index 625139964ab1..f39e26091463 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/HBaseServerBase.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/HBaseServerBase.java @@ -22,6 +22,7 @@ import com.google.errorprone.annotations.RestrictedApi; import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; import io.opentelemetry.context.Scope; import java.io.IOException; import java.lang.management.MemoryType; @@ -289,6 +290,7 @@ public HBaseServerBase(Configuration conf, String name) throws IOException { clusterStatusTracker = null; } putUpWebUI(); + span.setStatus(StatusCode.OK); } catch (Throwable t) { TraceUtil.setError(span, t); throw t; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/MetaRegionLocationCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/MetaRegionLocationCache.java index b144342236af..c364607e6ccb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/MetaRegionLocationCache.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/MetaRegionLocationCache.java @@ -17,7 +17,6 @@ */ package org.apache.hadoop.hbase; -import io.opentelemetry.context.Context; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -25,6 +24,7 @@ import java.util.concurrent.ThreadFactory; import org.apache.hadoop.hbase.exceptions.DeserializationException; import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.trace.TraceUtil; import org.apache.hadoop.hbase.types.CopyOnWriteArrayMap; import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.hadoop.hbase.util.RetryCounterFactory; @@ -92,8 +92,8 @@ public MetaRegionLocationCache(ZKWatcher zkWatcher) { ThreadFactory threadFactory = new ThreadFactoryBuilder().setDaemon(true).build(); RetryCounterFactory retryFactory = new RetryCounterFactory(Integer.MAX_VALUE, SLEEP_INTERVAL_MS_BETWEEN_RETRIES, SLEEP_INTERVAL_MS_MAX); - threadFactory.newThread(Context.current() - .wrap(() -> loadMetaLocationsFromZk(retryFactory.create(), ZNodeOpType.INIT))).start(); + threadFactory.newThread(() -> loadMetaLocationsFromZk(retryFactory.create(), ZNodeOpType.INIT)) + .start(); } /** @@ -102,41 +102,43 @@ public MetaRegionLocationCache(ZKWatcher zkWatcher) { * @param retryCounter controls the number of retries and sleep between retries. */ private void loadMetaLocationsFromZk(RetryCounter retryCounter, ZNodeOpType opType) { - List znodes = null; - while (retryCounter.shouldRetry()) { - try { - znodes = watcher.getMetaReplicaNodesAndWatchChildren(); - break; - } catch (KeeperException ke) { - LOG.debug("Error populating initial meta locations", ke); - if (!retryCounter.shouldRetry()) { - // Retries exhausted and watchers not set. This is not a desirable state since the cache - // could remain stale forever. Propagate the exception. 
- watcher.abort("Error populating meta locations", ke); - return; - } + TraceUtil.trace(() -> { + List znodes = null; + while (retryCounter.shouldRetry()) { try { - retryCounter.sleepUntilNextRetry(); - } catch (InterruptedException ie) { - LOG.error("Interrupted while loading meta locations from ZK", ie); - Thread.currentThread().interrupt(); - return; + znodes = watcher.getMetaReplicaNodesAndWatchChildren(); + break; + } catch (KeeperException ke) { + LOG.debug("Error populating initial meta locations", ke); + if (!retryCounter.shouldRetry()) { + // Retries exhausted and watchers not set. This is not a desirable state since the cache + // could remain stale forever. Propagate the exception. + watcher.abort("Error populating meta locations", ke); + return; + } + try { + retryCounter.sleepUntilNextRetry(); + } catch (InterruptedException ie) { + LOG.error("Interrupted while loading meta locations from ZK", ie); + Thread.currentThread().interrupt(); + return; + } } } - } - if (znodes == null || znodes.isEmpty()) { - // No meta znodes exist at this point but we registered a watcher on the base znode to listen - // for updates. They will be handled via nodeChildrenChanged(). - return; - } - if (znodes.size() == cachedMetaLocations.size()) { - // No new meta znodes got added. - return; - } - for (String znode : znodes) { - String path = ZNodePaths.joinZNode(watcher.getZNodePaths().baseZNode, znode); - updateMetaLocation(path, opType); - } + if (znodes == null || znodes.isEmpty()) { + // No meta znodes exist at this point but we registered a watcher on the base znode to + // listen for updates. They will be handled via nodeChildrenChanged(). + return; + } + if (znodes.size() == cachedMetaLocations.size()) { + // No new meta znodes got added. + return; + } + for (String znode : znodes) { + String path = ZNodePaths.joinZNode(watcher.getZNodePaths().baseZNode, znode); + updateMetaLocation(path, opType); + } + }, "MetaRegionLocationCache.loadMetaLocationsFromZk"); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 239e30e7c171..c232e21136ac 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -25,6 +25,7 @@ import com.google.errorprone.annotations.RestrictedApi; import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; import io.opentelemetry.context.Scope; import java.io.IOException; import java.io.InterruptedIOException; @@ -535,6 +536,7 @@ public HMaster(final Configuration conf) throws IOException { cachedClusterId = new CachedClusterId(this, conf); this.regionServerTracker = new RegionServerTracker(zooKeeper, this); this.rpcServices.start(zooKeeper); + span.setStatus(StatusCode.OK); } catch (Throwable t) { // Make sure we log the exception. HMaster is often started via reflection and the // cause of failed startup is lost. 
@@ -572,7 +574,7 @@ public void run() { try { installShutdownHook(); registerConfigurationObservers(); - Threads.setDaemonThreadRunning(new Thread(() -> TraceUtil.trace(() -> { + Threads.setDaemonThreadRunning(new Thread(TraceUtil.tracedRunnable(() -> { try { int infoPort = putUpJettyServer(); startActiveMasterManager(infoPort); @@ -598,6 +600,7 @@ public void run() { this.rpcServices.stop(); } closeZooKeeper(); + span.setStatus(StatusCode.OK); } finally { span.end(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java index 822e4e27cee6..0941c62bac85 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.master; import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; import io.opentelemetry.context.Scope; import java.io.File; import java.io.IOException; @@ -253,6 +254,7 @@ private int startMaster() { master.join(); if (master.isAborted()) throw new RuntimeException("HMaster Aborted"); } + span.setStatus(StatusCode.OK); } catch (Throwable t) { TraceUtil.setError(span, t); LOG.error("Master exiting", t); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java index 418fc4187adb..71ca500a045e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java @@ -17,7 +17,9 @@ */ package org.apache.hadoop.hbase.master; -import io.opentelemetry.context.Context; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; +import io.opentelemetry.context.Scope; import java.io.IOException; import java.io.InterruptedIOException; import java.util.Collections; @@ -30,6 +32,7 @@ import org.apache.hadoop.hbase.ServerMetricsBuilder; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.VersionInfoUtil; +import org.apache.hadoop.hbase.trace.TraceUtil; import org.apache.hadoop.hbase.zookeeper.ZKListener; import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; @@ -71,8 +74,8 @@ public class RegionServerTracker extends ZKListener { public RegionServerTracker(ZKWatcher watcher, MasterServices server) { super(watcher); this.server = server; - this.executor = Context.current().wrap(Executors.newSingleThreadExecutor( - new ThreadFactoryBuilder().setDaemon(true).setNameFormat("RegionServerTracker-%d").build())); + this.executor = Executors.newSingleThreadExecutor( + new ThreadFactoryBuilder().setDaemon(true).setNameFormat("RegionServerTracker-%d").build()); watcher.registerListener(this); refresh(); } @@ -185,22 +188,28 @@ private void processAsActiveMaster(Set newServers) { private synchronized void refresh() { List names; - try { - names = ZKUtil.listChildrenAndWatchForNewChildren(watcher, watcher.getZNodePaths().rsZNode); - } catch (KeeperException e) { - // here we need to abort as we failed to set watcher on the rs node which means that we can - // not track the node deleted event any more. - server.abort("Unexpected zk exception getting RS nodes", e); - return; - } - Set newServers = CollectionUtils.isEmpty(names) - ? 
Collections.emptySet() - : names.stream().map(ServerName::parseServerName) - .collect(Collectors.collectingAndThen(Collectors.toSet(), Collections::unmodifiableSet)); - if (active) { - processAsActiveMaster(newServers); + final Span span = TraceUtil.createSpan("RegionServerTracker.refresh"); + try (final Scope ignored = span.makeCurrent()) { + try { + names = ZKUtil.listChildrenAndWatchForNewChildren(watcher, watcher.getZNodePaths().rsZNode); + } catch (KeeperException e) { + // here we need to abort as we failed to set watcher on the rs node which means that we can + // not track the node deleted event any more. + server.abort("Unexpected zk exception getting RS nodes", e); + return; + } + Set newServers = CollectionUtils.isEmpty(names) + ? Collections.emptySet() + : names.stream().map(ServerName::parseServerName) + .collect(Collectors.collectingAndThen(Collectors.toSet(), Collections::unmodifiableSet)); + if (active) { + processAsActiveMaster(newServers); + } + this.regionServers = newServers; + span.setStatus(StatusCode.OK); + } finally { + span.end(); } - this.regionServers = newServers; } @Override diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 70300c237eb2..0ee4c5d7464c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -24,6 +24,7 @@ import static org.apache.hadoop.hbase.util.DNS.UNSAFE_RS_HOSTNAME_KEY; import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; import io.opentelemetry.context.Scope; import java.io.IOException; import java.io.PrintWriter; @@ -517,6 +518,7 @@ public HRegionServer(final Configuration conf) throws IOException { masterAddressTracker = null; } this.rpcServices.start(zooKeeper); + span.setStatus(StatusCode.OK); } catch (Throwable t) { // Make sure we log the exception. HRegionServer is often started via reflection and the // cause of failed startup is lost. @@ -660,6 +662,7 @@ private void preRegistrationInitialization() { regionReplicationBufferManager = new RegionReplicationBufferManager(this); // Setup RPC client for master communication this.rpcClient = asyncClusterConnection.getRpcClient(); + span.setStatus(StatusCode.OK); } catch (Throwable t) { // Call stop if error or process will stick around for ever since server // puts up non-daemon threads. 
@@ -975,6 +978,7 @@ public void run() { closeZooKeeper(); LOG.info("Exiting; stopping=" + this.serverName + "; zookeeper connection closed."); + span.setStatus(StatusCode.OK); } finally { span.end(); } @@ -1024,6 +1028,7 @@ protected void tryRegionServerReport(long reportStartTime, long reportEndTime) request.setServer(ProtobufUtil.toServerName(this.serverName)); request.setLoad(sl); rss.regionServerReport(null, request.build()); + span.setStatus(StatusCode.OK); } catch (ServiceException se) { IOException ioe = ProtobufUtil.getRemoteException(se); if (ioe instanceof YouAreDeadException) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServerCommandLine.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServerCommandLine.java index f17d79db5ea3..db74380d2ed4 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServerCommandLine.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServerCommandLine.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.regionserver; import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; import io.opentelemetry.context.Scope; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HConstants; @@ -66,6 +67,7 @@ private int start() throws Exception { throw new RuntimeException("HRegionServer Aborted"); } } + span.setStatus(StatusCode.OK); } catch (Throwable t) { TraceUtil.setError(span, t); LOG.error("Region server exiting", t); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerInternalsTracing.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestServerInternalsTracing.java similarity index 87% rename from hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerInternalsTracing.java rename to hbase-server/src/test/java/org/apache/hadoop/hbase/TestServerInternalsTracing.java index 7b7c4b6feadf..69e6e4ac83df 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerInternalsTracing.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestServerInternalsTracing.java @@ -15,10 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.hadoop.hbase.regionserver; +package org.apache.hadoop.hbase; import static org.apache.hadoop.hbase.client.trace.hamcrest.SpanDataMatchers.hasName; import static org.apache.hadoop.hbase.client.trace.hamcrest.SpanDataMatchers.hasParentSpanId; +import static org.apache.hadoop.hbase.client.trace.hamcrest.SpanDataMatchers.hasStatusWithCode; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.allOf; import static org.hamcrest.Matchers.containsString; @@ -26,16 +27,14 @@ import static org.hamcrest.Matchers.hasItem; import static org.hamcrest.Matchers.hasItems; import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.isOneOf; import static org.hamcrest.Matchers.startsWith; +import io.opentelemetry.api.trace.StatusCode; import io.opentelemetry.sdk.trace.data.SpanData; import java.util.List; import java.util.function.Supplier; import java.util.stream.Collectors; -import org.apache.hadoop.hbase.HBaseClassTestRule; -import org.apache.hadoop.hbase.HBaseTestingUtil; -import org.apache.hadoop.hbase.MiniClusterRule; -import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.trace.StringTraceRenderer; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.testclassification.RegionServerTests; @@ -55,12 +54,12 @@ * Test that sundry operations internal to the region server are traced as expected. */ @Category({ MediumTests.class, RegionServerTests.class }) -public class TestRegionServerInternalsTracing { - private static final Logger LOG = LoggerFactory.getLogger(TestRegionServerInternalsTracing.class); +public class TestServerInternalsTracing { + private static final Logger LOG = LoggerFactory.getLogger(TestServerInternalsTracing.class); @ClassRule public static final HBaseClassTestRule CLASS_RULE = - HBaseClassTestRule.forClass(TestRegionServerInternalsTracing.class); + HBaseClassTestRule.forClass(TestServerInternalsTracing.class); private static final String NO_PARENT_ID = "0000000000000000"; private static List spans; @@ -110,8 +109,8 @@ public void evaluate() throws Throwable { @Test public void testHBaseServerBaseConstructor() { - final Matcher hbaseServerBaseMatcher = - allOf(hasName("HBaseServerBase.cxtor"), hasParentSpanId(NO_PARENT_ID)); + final Matcher hbaseServerBaseMatcher = allOf(hasName("HBaseServerBase.cxtor"), + hasParentSpanId(NO_PARENT_ID), hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span from the HBaseServerBase constructor.", spans, hasItem(hbaseServerBaseMatcher)); final List hbaseServerBaseSpans = @@ -128,8 +127,8 @@ public void testHBaseServerBaseConstructor() { @Test public void testHMasterConstructor() { - final Matcher masterConstructorMatcher = - allOf(hasName("HMaster.cxtor"), hasParentSpanId(NO_PARENT_ID)); + final Matcher masterConstructorMatcher = allOf(hasName("HMaster.cxtor"), + hasParentSpanId(NO_PARENT_ID), hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span from the HMaster constructor.", spans, hasItem(masterConstructorMatcher)); final SpanData masterConstructorSpan = spans.stream().filter(masterConstructorMatcher::matches) @@ -141,7 +140,8 @@ public void testHMasterConstructor() { @Test public void testHMasterBecomeActiveMaster() { final Matcher masterBecomeActiveMasterMatcher = - allOf(hasName("HMaster.becomeActiveMaster"), hasParentSpanId(NO_PARENT_ID)); + allOf(hasName("HMaster.becomeActiveMaster"), hasParentSpanId(NO_PARENT_ID), + 
hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span from the HMaster.becomeActiveMaster.", spans, hasItem(masterBecomeActiveMasterMatcher)); final SpanData masterBecomeActiveMasterSpan = spans.stream() @@ -164,8 +164,8 @@ public void testHMasterBecomeActiveMaster() { @Test public void testZKWatcherHMaster() { - final Matcher mZKWatcherMatcher = - allOf(hasName(startsWith("ZKWatcher-master")), hasParentSpanId(NO_PARENT_ID)); + final Matcher mZKWatcherMatcher = allOf(hasName(startsWith("ZKWatcher-master")), + hasParentSpanId(NO_PARENT_ID), hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span from the ZKWatcher running in the HMaster.", spans, hasItem(mZKWatcherMatcher)); final SpanData mZKWatcherSpan = @@ -176,8 +176,8 @@ public void testZKWatcherHMaster() { @Test public void testHMasterShutdown() { - final Matcher masterShutdownMatcher = - allOf(hasName("HMaster.shutdown"), hasParentSpanId(NO_PARENT_ID)); + final Matcher masterShutdownMatcher = allOf(hasName("HMaster.shutdown"), + hasParentSpanId(NO_PARENT_ID), hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span from the HMaster.shutdown.", spans, hasItem(masterShutdownMatcher)); final SpanData masterShutdownSpan = spans.stream().filter(masterShutdownMatcher::matches) @@ -191,7 +191,8 @@ public void testHMasterShutdown() { @Test public void testHMasterExitingMainLoop() { final Matcher masterExitingMainLoopMatcher = - allOf(hasName("HMaster exiting main loop"), hasParentSpanId(NO_PARENT_ID)); + allOf(hasName("HMaster exiting main loop"), hasParentSpanId(NO_PARENT_ID), + hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span from the HMaster exiting main loop.", spans, hasItem(masterExitingMainLoopMatcher)); final SpanData masterExitingMainLoopSpan = spans.stream() @@ -209,7 +210,8 @@ public void testHMasterExitingMainLoop() { @Test public void testTryRegionServerReport() { final Matcher tryRegionServerReportMatcher = - allOf(hasName("HRegionServer.tryRegionServerReport"), hasParentSpanId(NO_PARENT_ID)); + allOf(hasName("HRegionServer.tryRegionServerReport"), hasParentSpanId(NO_PARENT_ID), + hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span for the region server sending a report.", spans, hasItem(tryRegionServerReportMatcher)); final SpanData tryRegionServerReportSpan = spans.stream() @@ -222,8 +224,8 @@ spans, hasItem(allOf(hasName(endsWith("RegionServerStatusService/RegionServerRep @Test public void testHRegionServerStartup() { - final Matcher regionServerStartupMatcher = - allOf(hasName("HRegionServer.startup"), hasParentSpanId(NO_PARENT_ID)); + final Matcher regionServerStartupMatcher = allOf(hasName("HRegionServer.startup"), + hasParentSpanId(NO_PARENT_ID), hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span from the HRegionServer startup procedure.", spans, hasItem(regionServerStartupMatcher)); final SpanData regionServerStartupSpan = spans.stream() @@ -235,8 +237,8 @@ public void testHRegionServerStartup() { @Test public void testHRegionServerConstructor() { - final Matcher rsConstructorMatcher = - allOf(hasName("HRegionServer.cxtor"), hasParentSpanId(NO_PARENT_ID)); + final Matcher rsConstructorMatcher = allOf(hasName("HRegionServer.cxtor"), + hasParentSpanId(NO_PARENT_ID), hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span 
from the HRegionServer constructor.", spans, hasItem(rsConstructorMatcher)); final SpanData rsConstructorSpan = spans.stream().filter(rsConstructorMatcher::matches) @@ -252,7 +254,8 @@ public void testHRegionServerConstructor() { @Test public void testHRegionServerPreRegistrationInitialization() { final Matcher rsPreRegistrationInitializationMatcher = - allOf(hasName("HRegionServer.preRegistrationInitialization"), hasParentSpanId(NO_PARENT_ID)); + allOf(hasName("HRegionServer.preRegistrationInitialization"), hasParentSpanId(NO_PARENT_ID), + hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span from the HRegionServer preRegistrationInitialization.", spans, hasItem(rsPreRegistrationInitializationMatcher)); final SpanData rsPreRegistrationInitializationSpan = @@ -267,7 +270,8 @@ spans, hasItem(allOf(hasName(startsWith("RecoverableZookeeper.")), @Test public void testHRegionServerRegisterWithMaster() { final Matcher rsRegisterWithMasterMatcher = - allOf(hasName("HRegionServer.registerWithMaster"), hasParentSpanId(NO_PARENT_ID)); + allOf(hasName("HRegionServer.registerWithMaster"), hasParentSpanId(NO_PARENT_ID), + hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span from the HRegionServer registerWithMaster.", spans, hasItem(rsRegisterWithMasterMatcher)); final SpanData rsRegisterWithMasterSpan = spans.stream() @@ -285,7 +289,8 @@ spans, hasItem(allOf(hasName(endsWith("RegionServerStatusService/RegionServerSta @Test public void testZKWatcherRegionServer() { final Matcher rsZKWatcherMatcher = - allOf(hasName(startsWith("ZKWatcher-regionserver")), hasParentSpanId(NO_PARENT_ID)); + allOf(hasName(startsWith("ZKWatcher-regionserver")), hasParentSpanId(NO_PARENT_ID), + hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span from the ZKWatcher running in the HRegionServer.", spans, hasItem(rsZKWatcherMatcher)); final SpanData rsZKWatcherSpan = @@ -297,7 +302,8 @@ public void testZKWatcherRegionServer() { @Test public void testHRegionServerExitingMainLoop() { final Matcher rsExitingMainLoopMatcher = - allOf(hasName("HRegionServer exiting main loop"), hasParentSpanId(NO_PARENT_ID)); + allOf(hasName("HRegionServer exiting main loop"), hasParentSpanId(NO_PARENT_ID), + hasStatusWithCode(isOneOf(StatusCode.OK, StatusCode.ERROR))); assertThat("there should be a span from the HRegionServer exiting main loop.", spans, hasItem(rsExitingMainLoopMatcher)); final SpanData rsExitingMainLoopSpan = spans.stream().filter(rsExitingMainLoopMatcher::matches) diff --git a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java index dc301bf2b807..5ac4ffa329b3 100644 --- a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java +++ b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.zookeeper; import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; import io.opentelemetry.context.Scope; import java.io.IOException; import java.lang.management.ManagementFactory; @@ -207,6 +208,7 @@ public void delete(String path, int version) throws InterruptedException, Keeper while (true) { try { checkZk().delete(path, version); + span.setStatus(StatusCode.OK); return; } catch (KeeperException e) { switch (e.code()) { @@ -217,15 +219,18 
@@ public void delete(String path, int version) throws InterruptedException, Keeper return; } LOG.debug("Node {} already deleted, retry={}", path, isRetry); + TraceUtil.setError(span, e); throw e; case CONNECTIONLOSS: case OPERATIONTIMEOUT: case REQUESTTIMEOUT: + TraceUtil.setError(span, e); retryOrThrow(retryCounter, e, "delete"); break; default: + TraceUtil.setError(span, e); throw e; } } @@ -258,16 +263,19 @@ private Stat exists(String path, Watcher watcher, Boolean watch) } else { nodeStat = checkZk().exists(path, watch); } + span.setStatus(StatusCode.OK); return nodeStat; } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: case OPERATIONTIMEOUT: case REQUESTTIMEOUT: + TraceUtil.setError(span, e); retryOrThrow(retryCounter, e, "exists"); break; default: + TraceUtil.setError(span, e); throw e; } } @@ -317,16 +325,19 @@ private List getChildren(String path, Watcher watcher, Boolean watch) } else { children = checkZk().getChildren(path, watch); } + span.setStatus(StatusCode.OK); return children; } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: case OPERATIONTIMEOUT: case REQUESTTIMEOUT: + TraceUtil.setError(span, e); retryOrThrow(retryCounter, e, "getChildren"); break; default: + TraceUtil.setError(span, e); throw e; } } @@ -367,16 +378,19 @@ private byte[] getData(String path, Watcher watcher, Boolean watch, Stat stat) } else { revData = checkZk().getData(path, watch, stat); } + span.setStatus(StatusCode.OK); return ZKMetadata.removeMetaData(revData); } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: case OPERATIONTIMEOUT: case REQUESTTIMEOUT: + TraceUtil.setError(span, e); retryOrThrow(retryCounter, e, "getData"); break; default: + TraceUtil.setError(span, e); throw e; } } @@ -410,12 +424,14 @@ public Stat setData(String path, byte[] data, int version) boolean isRetry = false; while (true) { try { + span.setStatus(StatusCode.OK); return checkZk().setData(path, newData, version); } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: case OPERATIONTIMEOUT: case REQUESTTIMEOUT: + TraceUtil.setError(span, e); retryOrThrow(retryCounter, e, "setData"); break; case BADVERSION: @@ -430,11 +446,13 @@ public Stat setData(String path, byte[] data, int version) } } catch (KeeperException keeperException) { // the ZK is not reliable at this moment. 
just throwing exception + TraceUtil.setError(span, e); throw keeperException; } } // throw other exceptions and verified bad version exceptions default: + TraceUtil.setError(span, e); throw e; } } @@ -456,16 +474,19 @@ public List getAcl(String path, Stat stat) throws KeeperException, Interrup RetryCounter retryCounter = retryCounterFactory.create(); while (true) { try { + span.setStatus(StatusCode.OK); return checkZk().getACL(path, stat); } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: case OPERATIONTIMEOUT: case REQUESTTIMEOUT: + TraceUtil.setError(span, e); retryOrThrow(retryCounter, e, "getAcl"); break; default: + TraceUtil.setError(span, e); throw e; } } @@ -487,15 +508,18 @@ public Stat setAcl(String path, List acls, int version) RetryCounter retryCounter = retryCounterFactory.create(); while (true) { try { + span.setStatus(StatusCode.OK); return checkZk().setACL(path, acls, version); } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: case OPERATIONTIMEOUT: + TraceUtil.setError(span, e); retryOrThrow(retryCounter, e, "setAcl"); break; default: + TraceUtil.setError(span, e); throw e; } } @@ -525,14 +549,19 @@ public String create(String path, byte[] data, List acl, CreateMode createM switch (createMode) { case EPHEMERAL: case PERSISTENT: + span.setStatus(StatusCode.OK); return createNonSequential(path, newData, acl, createMode); case EPHEMERAL_SEQUENTIAL: case PERSISTENT_SEQUENTIAL: + span.setStatus(StatusCode.OK); return createSequential(path, newData, acl, createMode); default: - throw new IllegalArgumentException("Unrecognized CreateMode: " + createMode); + final IllegalArgumentException e = + new IllegalArgumentException("Unrecognized CreateMode: " + createMode); + TraceUtil.setError(span, e); + throw e; } } finally { span.end(); @@ -651,16 +680,19 @@ public List multi(Iterable ops) throws KeeperException, Interrupte Iterable multiOps = prepareZKMulti(ops); while (true) { try { + span.setStatus(StatusCode.OK); return checkZk().multi(multiOps); } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: case OPERATIONTIMEOUT: case REQUESTTIMEOUT: + TraceUtil.setError(span, e); retryOrThrow(retryCounter, e, "multi"); break; default: + TraceUtil.setError(span, e); throw e; } } diff --git a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKWatcher.java b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKWatcher.java index 5e6b56a98b65..feaa62fd77bb 100644 --- a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKWatcher.java +++ b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKWatcher.java @@ -606,7 +606,7 @@ public void process(WatchedEvent event) { LOG.debug(prefix("Received ZooKeeper Event, " + "type=" + event.getType() + ", " + "state=" + event.getState() + ", " + "path=" + event.getPath())); final String spanName = ZKWatcher.class.getSimpleName() + "-" + identifier; - zkEventProcessor.submit(() -> TraceUtil.trace(() -> processEvent(event), spanName)); + zkEventProcessor.submit(TraceUtil.tracedRunnable(() -> processEvent(event), spanName)); } // Connection management
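
Editor's note (illustrative, not part of the patch): the changes above all follow one span lifecycle — create the span on the executing thread, make it current for the duration of the work, always set a status, and end the span in a finally block — whether through TraceUtil.tracedRunnable or the hand-managed spans in RecoverableZooKeeper. The standalone sketch below shows that lifecycle using the public OpenTelemetry API directly; the class name, tracer name, and wrap(...) helper are hypothetical and chosen only for illustration, they are not HBase APIs.

import io.opentelemetry.api.GlobalOpenTelemetry;
import io.opentelemetry.api.trace.Span;
import io.opentelemetry.api.trace.StatusCode;
import io.opentelemetry.api.trace.Tracer;
import io.opentelemetry.context.Scope;

public final class TracedRunnableSketch {

  // Hypothetical instrumentation name; the real code obtains its Tracer through HBase's TraceUtil.
  private static final Tracer TRACER = GlobalOpenTelemetry.getTracer("org.example.sketch");

  private TracedRunnableSketch() {
  }

  /**
   * Wrap {@code runnable} so its execution is recorded under {@code spanName}: the span is
   * created on the executing thread, made current for the duration of the work, given an
   * explicit status, and ended in a finally block.
   */
  public static Runnable wrap(Runnable runnable, String spanName) {
    return () -> {
      // setNoParent() makes the span a root span even if the submitting thread had an active
      // span. The patch reaches the same end differently: it stops wrapping executor tasks
      // with Context.current().wrap(...), so no parent context is propagated to the worker.
      Span span = TRACER.spanBuilder(spanName).setNoParent().startSpan();
      try (Scope ignored = span.makeCurrent()) {
        runnable.run();
        span.setStatus(StatusCode.OK);
      } catch (RuntimeException e) {
        // Slightly more defensive than a literal transcription of the helper in the patch:
        // record the failure on the span before rethrowing.
        span.setStatus(StatusCode.ERROR);
        span.recordException(e);
        throw e;
      } finally {
        span.end();
      }
    };
  }
}

A caller would submit, for example, executor.submit(wrap(() -> processEvent(event), "ZKWatcher.processEvent")) and get one root span per event regardless of which span was current at submission time, which is the "decouple spawned thread spans from parent thread spans" behavior described in the second commit message.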