Skip to content

Commit ff97540

Browse files
authored
HBASE-27684: add client metrics related to user region lock. (#5081)
Signed-off-by: Andrew Purtell <[email protected]> Signed-off-by: David Manning <[email protected]> Signed-off-by: Rushabh Shah <[email protected]> Signed-off-by: Tanuj Khurana <[email protected]>
1 parent e7a958a commit ff97540

File tree

3 files changed

+86
-0
lines changed

3 files changed

+86
-0
lines changed

hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1003,6 +1003,7 @@ private RegionLocations locateRegionInMeta(TableName tableName, byte[] row, bool
10031003
// Query the meta region
10041004
long pauseBase = connectionConfig.getPauseMillis();
10051005
takeUserRegionLock();
1006+
final long lockStartTime = EnvironmentEdgeManager.currentTime();
10061007
try {
10071008
// We don't need to check if useCache is enabled or not. Even if useCache is false
10081009
// we already cleared the cache for this row before acquiring userRegion lock so if this
@@ -1113,6 +1114,10 @@ rpcControllerFactory, getMetaLookupPool(), connectionConfig.getMetaReadRpcTimeou
11131114
}
11141115
} finally {
11151116
userRegionLock.unlock();
1117+
// update duration of the lock being held
1118+
if (metrics != null) {
1119+
metrics.updateUserRegionLockHeld(EnvironmentEdgeManager.currentTime() - lockStartTime);
1120+
}
11161121
}
11171122
try {
11181123
Thread.sleep(ConnectionUtils.getPauseTime(pauseBase, tries));
@@ -1126,9 +1131,19 @@ rpcControllerFactory, getMetaLookupPool(), connectionConfig.getMetaReadRpcTimeou
11261131
void takeUserRegionLock() throws IOException {
11271132
try {
11281133
long waitTime = connectionConfig.getMetaOperationTimeout();
1134+
if (metrics != null) {
1135+
metrics.updateUserRegionLockQueue(userRegionLock.getQueueLength());
1136+
}
1137+
final long waitStartTime = EnvironmentEdgeManager.currentTime();
11291138
if (!userRegionLock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
1139+
if (metrics != null) {
1140+
metrics.incrUserRegionLockTimeout();
1141+
}
11301142
throw new LockTimeoutException("Failed to get user region lock in" + waitTime + " ms. "
11311143
+ " for accessing meta region server.");
1144+
} else if (metrics != null) {
1145+
// successfully grabbed the lock, start timer of holding the lock
1146+
metrics.updateUserRegionLockWaiting(EnvironmentEdgeManager.currentTime() - waitStartTime);
11321147
}
11331148
} catch (InterruptedException ie) {
11341149
LOG.error("Interrupted while waiting for a lock", ie);

hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,10 @@ public Counter newMetric(Class<?> clazz, String name, String scope) {
359359
private final Counter nsLookups;
360360
private final Counter nsLookupsFailed;
361361
private final Timer overloadedBackoffTimer;
362+
private final Counter userRegionLockTimeoutCount;
363+
private final Timer userRegionLockWaitingTimer;
364+
private final Timer userRegionLockHeldTimer;
365+
private final Histogram userRegionLockQueueHist;
362366

363367
// dynamic metrics
364368

@@ -443,6 +447,15 @@ protected Ratio getRatio() {
443447
this.nsLookups = registry.counter(name(this.getClass(), NS_LOOKUPS, scope));
444448
this.nsLookupsFailed = registry.counter(name(this.getClass(), NS_LOOKUPS_FAILED, scope));
445449

450+
this.userRegionLockTimeoutCount =
451+
registry.counter(name(this.getClass(), "userRegionLockTimeoutCount", scope));
452+
this.userRegionLockWaitingTimer =
453+
registry.timer(name(this.getClass(), "userRegionLockWaitingDuration", scope));
454+
this.userRegionLockHeldTimer =
455+
registry.timer(name(this.getClass(), "userRegionLockHeldDuration", scope));
456+
this.userRegionLockQueueHist =
457+
registry.histogram(name(MetricsConnection.class, "userRegionLockQueueLength", scope));
458+
446459
this.overloadedBackoffTimer =
447460
registry.timer(name(this.getClass(), "overloadedBackoffDurationMs", scope));
448461

@@ -602,6 +615,41 @@ public void incrementServerOverloadedBackoffTime(long time, TimeUnit timeUnit) {
602615
overloadedBackoffTimer.update(time, timeUnit);
603616
}
604617

618+
/** incr */
619+
public void incrUserRegionLockTimeout() {
620+
userRegionLockTimeoutCount.inc();
621+
}
622+
623+
/** get */
624+
public Counter getUserRegionLockTimeout() {
625+
return userRegionLockTimeoutCount;
626+
}
627+
628+
public Timer getUserRegionLockWaitingTimer() {
629+
return userRegionLockWaitingTimer;
630+
}
631+
632+
public Timer getUserRegionLockHeldTimer() {
633+
return userRegionLockHeldTimer;
634+
}
635+
636+
public Histogram getUserRegionLockQueue() {
637+
return userRegionLockQueueHist;
638+
}
639+
640+
/** update */
641+
public void updateUserRegionLockWaiting(long duration) {
642+
userRegionLockWaitingTimer.update(duration, TimeUnit.MILLISECONDS);
643+
}
644+
645+
public void updateUserRegionLockHeld(long duration) {
646+
userRegionLockHeldTimer.update(duration, TimeUnit.MILLISECONDS);
647+
}
648+
649+
public void updateUserRegionLockQueue(int count) {
650+
userRegionLockQueueHist.update(count);
651+
}
652+
605653
/** Return the connection count of the metrics within a scope */
606654
public long getConnectionCount() {
607655
return connectionCount.getCount();

hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,7 @@ public void testUserRegionLockThrowsException() throws IOException, InterruptedE
563563
conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 0);
564564
conf.setLong(HConstants.HBASE_CLIENT_META_OPERATION_TIMEOUT, 2000);
565565
conf.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 2000);
566+
conf.setBoolean(MetricsConnection.CLIENT_SIDE_METRICS_ENABLED_KEY, true);
566567

567568
try (ConnectionImplementation conn =
568569
(ConnectionImplementation) ConnectionFactory.createConnection(conf)) {
@@ -587,6 +588,28 @@ public void testUserRegionLockThrowsException() throws IOException, InterruptedE
587588

588589
assertTrue(client1.getException() instanceof LockTimeoutException
589590
^ client2.getException() instanceof LockTimeoutException);
591+
592+
// obtain the client metrics
593+
MetricsConnection metrics = conn.getConnectionMetrics();
594+
long queueCount = metrics.getUserRegionLockQueue().getCount();
595+
assertEquals("Queue of userRegionLock should be updated twice. queueCount: " + queueCount,
596+
queueCount, 2);
597+
598+
long timeoutCount = metrics.getUserRegionLockTimeout().getCount();
599+
assertEquals("Timeout of userRegionLock should happen once. timeoutCount: " + timeoutCount,
600+
timeoutCount, 1);
601+
602+
long waitingTimerCount = metrics.getUserRegionLockWaitingTimer().getCount();
603+
assertEquals("userRegionLock should be grabbed successfully once. waitingTimerCount: "
604+
+ waitingTimerCount, waitingTimerCount, 1);
605+
606+
long heldTimerCount = metrics.getUserRegionLockHeldTimer().getCount();
607+
assertEquals(
608+
"userRegionLock should be held successfully once. heldTimerCount: " + heldTimerCount,
609+
heldTimerCount, 1);
610+
double heldTime = metrics.getUserRegionLockHeldTimer().getSnapshot().getMax();
611+
assertTrue("Max held time should be greater than 2 seconds. heldTime: " + heldTime,
612+
heldTime >= 2E9);
590613
}
591614
}
592615

0 commit comments

Comments
 (0)