diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java index 93df9523dd23..e117af4893c4 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java @@ -1003,6 +1003,7 @@ private RegionLocations locateRegionInMeta(TableName tableName, byte[] row, bool // Query the meta region long pauseBase = connectionConfig.getPauseMillis(); takeUserRegionLock(); + final long lockStartTime = EnvironmentEdgeManager.currentTime(); try { // We don't need to check if useCache is enabled or not. Even if useCache is false // we already cleared the cache for this row before acquiring userRegion lock so if this @@ -1113,6 +1114,10 @@ rpcControllerFactory, getMetaLookupPool(), connectionConfig.getMetaReadRpcTimeou } } finally { userRegionLock.unlock(); + // update duration of the lock being held + if (metrics != null) { + metrics.updateUserRegionLockHeld(EnvironmentEdgeManager.currentTime() - lockStartTime); + } } try { Thread.sleep(ConnectionUtils.getPauseTime(pauseBase, tries)); @@ -1126,9 +1131,19 @@ rpcControllerFactory, getMetaLookupPool(), connectionConfig.getMetaReadRpcTimeou void takeUserRegionLock() throws IOException { try { long waitTime = connectionConfig.getMetaOperationTimeout(); + if (metrics != null) { + metrics.updateUserRegionLockQueue(userRegionLock.getQueueLength()); + } + final long waitStartTime = EnvironmentEdgeManager.currentTime(); if (!userRegionLock.tryLock(waitTime, TimeUnit.MILLISECONDS)) { + if (metrics != null) { + metrics.incrUserRegionLockTimeout(); + } throw new LockTimeoutException("Failed to get user region lock in" + waitTime + " ms. " + " for accessing meta region server."); + } else if (metrics != null) { + // successfully grabbed the lock, start timer of holding the lock + metrics.updateUserRegionLockWaiting(EnvironmentEdgeManager.currentTime() - waitStartTime); } } catch (InterruptedException ie) { LOG.error("Interrupted while waiting for a lock", ie); diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java index 0f9b5a5ee868..4e89f3f49288 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java @@ -359,6 +359,10 @@ public Counter newMetric(Class clazz, String name, String scope) { private final Counter nsLookups; private final Counter nsLookupsFailed; private final Timer overloadedBackoffTimer; + private final Counter userRegionLockTimeoutCount; + private final Timer userRegionLockWaitingTimer; + private final Timer userRegionLockHeldTimer; + private final Histogram userRegionLockQueueHist; // dynamic metrics @@ -443,6 +447,15 @@ protected Ratio getRatio() { this.nsLookups = registry.counter(name(this.getClass(), NS_LOOKUPS, scope)); this.nsLookupsFailed = registry.counter(name(this.getClass(), NS_LOOKUPS_FAILED, scope)); + this.userRegionLockTimeoutCount = + registry.counter(name(this.getClass(), "userRegionLockTimeoutCount", scope)); + this.userRegionLockWaitingTimer = + registry.timer(name(this.getClass(), "userRegionLockWaitingDuration", scope)); + this.userRegionLockHeldTimer = + registry.timer(name(this.getClass(), "userRegionLockHeldDuration", scope)); + this.userRegionLockQueueHist = + registry.histogram(name(MetricsConnection.class, "userRegionLockQueueLength", scope)); + this.overloadedBackoffTimer = registry.timer(name(this.getClass(), "overloadedBackoffDurationMs", scope)); @@ -598,6 +611,41 @@ public void incrementServerOverloadedBackoffTime(long time, TimeUnit timeUnit) { overloadedBackoffTimer.update(time, timeUnit); } + /** incr */ + public void incrUserRegionLockTimeout() { + userRegionLockTimeoutCount.inc(); + } + + /** get */ + public Counter getUserRegionLockTimeout() { + return userRegionLockTimeoutCount; + } + + public Timer getUserRegionLockWaitingTimer() { + return userRegionLockWaitingTimer; + } + + public Timer getUserRegionLockHeldTimer() { + return userRegionLockHeldTimer; + } + + public Histogram getUserRegionLockQueue() { + return userRegionLockQueueHist; + } + + /** update */ + public void updateUserRegionLockWaiting(long duration) { + userRegionLockWaitingTimer.update(duration, TimeUnit.MILLISECONDS); + } + + public void updateUserRegionLockHeld(long duration) { + userRegionLockHeldTimer.update(duration, TimeUnit.MILLISECONDS); + } + + public void updateUserRegionLockQueue(int count) { + userRegionLockQueueHist.update(count); + } + /** Return the connection count of the metrics within a scope */ public long getConnectionCount() { return connectionCount.getCount(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java index 87b704cbe9a3..00425b74c1c7 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java @@ -435,6 +435,7 @@ public void testUserRegionLockThrowsException() throws IOException, InterruptedE conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 0); conf.setLong(HConstants.HBASE_CLIENT_META_OPERATION_TIMEOUT, 2000); conf.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 2000); + conf.setBoolean(MetricsConnection.CLIENT_SIDE_METRICS_ENABLED_KEY, true); try (ConnectionImplementation conn = (ConnectionImplementation) ConnectionFactory.createConnection(conf)) { @@ -459,6 +460,28 @@ public void testUserRegionLockThrowsException() throws IOException, InterruptedE assertTrue(client1.getException() instanceof LockTimeoutException ^ client2.getException() instanceof LockTimeoutException); + + // obtain the client metrics + MetricsConnection metrics = conn.getConnectionMetrics(); + long queueCount = metrics.getUserRegionLockQueue().getCount(); + assertEquals("Queue of userRegionLock should be updated twice. queueCount: " + queueCount, + queueCount, 2); + + long timeoutCount = metrics.getUserRegionLockTimeout().getCount(); + assertEquals("Timeout of userRegionLock should happen once. timeoutCount: " + timeoutCount, + timeoutCount, 1); + + long waitingTimerCount = metrics.getUserRegionLockWaitingTimer().getCount(); + assertEquals("userRegionLock should be grabbed successfully once. waitingTimerCount: " + + waitingTimerCount, waitingTimerCount, 1); + + long heldTimerCount = metrics.getUserRegionLockHeldTimer().getCount(); + assertEquals( + "userRegionLock should be held successfully once. heldTimerCount: " + heldTimerCount, + heldTimerCount, 1); + double heldTime = metrics.getUserRegionLockHeldTimer().getSnapshot().getMax(); + assertTrue("Max held time should be greater than 2 seconds. heldTime: " + heldTime, + heldTime >= 2E9); } }