Skip to content

Commit b53d19a

Browse files
avijayanhwx authored and bshashikant committed
HDDS-1783 : Latency metric for applyTransaction in ContainerStateMachine (#1363).
1 parent 5ff76cb commit b53d19a

File tree

4 files changed

+43
-4
lines changed

4 files changed

+43
-4
lines changed

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ public class CSMMetrics {
6060
private @Metric MutableCounterLong numStartTransactionVerifyFailures;
6161
private @Metric MutableCounterLong numContainerNotOpenVerifyFailures;
6262

63+
private @Metric MutableRate applyTransaction;
64+
private @Metric MutableRate writeStateMachineData;
65+
6366
public CSMMetrics() {
6467
int numCmdTypes = ContainerProtos.Type.values().length;
6568
this.opsLatency = new MutableRate[numCmdTypes];
@@ -186,6 +189,10 @@ public long getNumBytesCommittedCount() {
186189
return numBytesCommittedCount.value();
187190
}
188191

192+
public MutableRate getApplyTransactionLatency() {
193+
return applyTransaction;
194+
}
195+
189196
public void incPipelineLatency(ContainerProtos.Type type, long latencyNanos) {
190197
opsLatency[type.ordinal()].add(latencyNanos);
191198
transactionLatency.add(latencyNanos);
@@ -199,6 +206,13 @@ public void incNumContainerNotOpenVerifyFailures() {
199206
numContainerNotOpenVerifyFailures.incr();
200207
}
201208

209+
public void recordApplyTransactionCompletion(long latencyNanos) {
210+
applyTransaction.add(latencyNanos);
211+
}
212+
213+
public void recordWriteStateMachineCompletion(long latencyNanos) {
214+
writeStateMachineData.add(latencyNanos);
215+
}
202216

203217
public void unRegister() {
204218
MetricsSystem ms = DefaultMetricsSystem.instance();

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,8 @@ private ExecutorService getCommandExecutor(
411411
}
412412

413413
private CompletableFuture<Message> handleWriteChunk(
414-
ContainerCommandRequestProto requestProto, long entryIndex, long term) {
414+
ContainerCommandRequestProto requestProto, long entryIndex, long term,
415+
long startTime) {
415416
final WriteChunkRequestProto write = requestProto.getWriteChunk();
416417
RaftServer server = ratisServer.getServer();
417418
Preconditions.checkState(server instanceof RaftServerProxy);
@@ -461,6 +462,8 @@ private CompletableFuture<Message> handleWriteChunk(
461462
write.getBlockID() + " logIndex " + entryIndex + " chunkName " +
462463
write.getChunkData().getChunkName());
463464
raftFuture.complete(r::toByteString);
465+
metrics.recordWriteStateMachineCompletion(
466+
Time.monotonicNowNanos() - startTime);
464467
}
465468

466469
writeChunkFutureMap.remove(entryIndex);
@@ -477,6 +480,7 @@ private CompletableFuture<Message> handleWriteChunk(
477480
public CompletableFuture<Message> writeStateMachineData(LogEntryProto entry) {
478481
try {
479482
metrics.incNumWriteStateMachineOps();
483+
long writeStateMachineStartTime = Time.monotonicNowNanos();
480484
ContainerCommandRequestProto requestProto =
481485
getContainerCommandRequestProto(
482486
entry.getStateMachineLogEntry().getLogData());
@@ -493,7 +497,7 @@ public CompletableFuture<Message> writeStateMachineData(LogEntryProto entry) {
493497
switch (cmdType) {
494498
case WriteChunk:
495499
return handleWriteChunk(requestProto, entry.getIndex(),
496-
entry.getTerm());
500+
entry.getTerm(), writeStateMachineStartTime);
497501
default:
498502
throw new IllegalStateException("Cmd Type:" + cmdType
499503
+ " should not have state machine data");
@@ -673,6 +677,7 @@ public CompletableFuture<Message> applyTransaction(TransactionContext trx) {
673677
.setTerm(trx.getLogEntry().getTerm())
674678
.setLogIndex(index);
675679

680+
long applyTxnStartTime = Time.monotonicNowNanos();
676681
try {
677682
applyTransactionSemaphore.acquire();
678683
metrics.incNumApplyTransactionsOps();
@@ -740,7 +745,11 @@ public CompletableFuture<Message> applyTransaction(TransactionContext trx) {
740745
}
741746
}
742747
return applyTransactionFuture;
743-
}).whenComplete((r, t) -> applyTransactionSemaphore.release());
748+
}).whenComplete((r, t) -> {
749+
applyTransactionSemaphore.release();
750+
metrics.recordApplyTransactionCompletion(
751+
Time.monotonicNowNanos() - applyTxnStartTime);
752+
});
744753
return applyTransactionFuture;
745754
} catch (IOException | InterruptedException e) {
746755
metrics.incNumApplyTransactionsFails();

hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ HDFS-SITE.XML_dfs.datanode.address=0.0.0.0:1019
4747
HDFS-SITE.XML_dfs.datanode.http.address=0.0.0.0:1012
4848
CORE-SITE.XML_dfs.data.transfer.protection=authentication
4949
CORE-SITE.XML_hadoop.security.authentication=kerberos
50-
COER-SITE.XML_hadoop.security.auth_to_local=RULE:[2:$1@$0](.*@EXAMPLE.COM)s/@.*///L
50+
CORE-SITE.XML_hadoop.security.auth_to_local=RULE:[2:$1@$0](.*@EXAMPLE.COM)s/@.*///L
5151
CORE-SITE.XML_hadoop.security.key.provider.path=kms://http@kms:9600/kms
5252

5353
#temporary disable authorization as org.apache.hadoop.yarn.server.api.ResourceTrackerPB is not properly annotated to support it

hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.apache.hadoop.ozone.container.common.transport.server.ratis;
2020

2121
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
22+
import static org.apache.hadoop.test.MetricsAsserts.getDoubleGauge;
2223
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
2324

2425
import java.io.File;
@@ -49,6 +50,8 @@
4950
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
5051

5152
import static org.apache.ratis.rpc.SupportedRpcType.GRPC;
53+
import static org.junit.Assert.assertTrue;
54+
5255
import org.apache.ratis.protocol.RaftGroupId;
5356
import org.apache.ratis.util.function.CheckedBiConsumer;
5457

@@ -118,6 +121,12 @@ static void runContainerStateMachineMetrics(
118121
assertCounter("NumStartTransactionVerifyFailures", 0L, metric);
119122
assertCounter("NumContainerNotOpenVerifyFailures", 0L, metric);
120123
assertCounter("WriteChunkNumOps", 0L, metric);
124+
double applyTransactionLatency = getDoubleGauge(
125+
"ApplyTransactionAvgTime", metric);
126+
assertTrue(applyTransactionLatency == 0.0);
127+
double writeStateMachineLatency = getDoubleGauge(
128+
"WriteStateMachineDataAvgTime", metric);
129+
assertTrue(writeStateMachineLatency == 0.0);
121130

122131
// Write Chunk
123132
BlockID blockID = ContainerTestHelper.getTestBlockID(ContainerTestHelper.
@@ -152,6 +161,13 @@ static void runContainerStateMachineMetrics(
152161
RaftGroupId.valueOf(pipeline.getId().getId()).toString());
153162
assertCounter("NumQueryStateMachineOps", 1L, metric);
154163
assertCounter("NumApplyTransactionOps", 1L, metric);
164+
applyTransactionLatency = getDoubleGauge(
165+
"ApplyTransactionAvgTime", metric);
166+
assertTrue(applyTransactionLatency > 0.0);
167+
writeStateMachineLatency = getDoubleGauge(
168+
"WriteStateMachineDataAvgTime", metric);
169+
assertTrue(writeStateMachineLatency > 0.0);
170+
155171
} finally {
156172
if (client != null) {
157173
client.close();

0 commit comments

Comments
 (0)