Skip to content

Commit e4b4fec

Browse files
bshashikanthanishakoneru
authored andcommitted
HDDS-1224. Restructure code to validate the response from server in the Read path (#806)
1 parent ea3b0a1 commit e4b4fec

File tree

8 files changed

+168
-158
lines changed

8 files changed

+168
-158
lines changed

hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/XceiverClientGrpc.java

Lines changed: 46 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,14 @@
2323
import org.apache.hadoop.conf.Configuration;
2424
import org.apache.hadoop.hdds.HddsUtils;
2525
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
26-
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
2726
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandRequestProto;
2827
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto;
2928
import org.apache.hadoop.hdds.protocol.datanode.proto.XceiverClientProtocolServiceGrpc;
3029
import org.apache.hadoop.hdds.protocol.datanode.proto.XceiverClientProtocolServiceGrpc.XceiverClientProtocolServiceStub;
3130
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
3231
import org.apache.hadoop.hdds.scm.client.HddsClientUtils;
3332
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
33+
import org.apache.hadoop.hdds.scm.storage.CheckedBiFunction;
3434
import org.apache.hadoop.hdds.security.exception.SCMSecurityException;
3535
import org.apache.hadoop.hdds.security.x509.SecurityConfig;
3636
import org.apache.hadoop.hdds.tracing.GrpcClientInterceptor;
@@ -62,7 +62,6 @@
6262
import java.util.concurrent.Semaphore;
6363
import java.util.concurrent.TimeUnit;
6464
import java.util.concurrent.TimeoutException;
65-
import java.util.stream.Collectors;
6665

6766
/**
6867
* A Client for the storageContainer protocol.
@@ -83,15 +82,15 @@ public class XceiverClientGrpc extends XceiverClientSpi {
8382
* data nodes.
8483
*
8584
* @param pipeline - Pipeline that defines the machines.
86-
* @param config -- Ozone Config
85+
* @param config -- Ozone Config
8786
*/
8887
public XceiverClientGrpc(Pipeline pipeline, Configuration config) {
8988
super();
9089
Preconditions.checkNotNull(pipeline);
9190
Preconditions.checkNotNull(config);
9291
this.pipeline = pipeline;
9392
this.config = config;
94-
this.secConfig = new SecurityConfig(config);
93+
this.secConfig = new SecurityConfig(config);
9594
this.semaphore =
9695
new Semaphore(HddsClientUtils.getMaxOutstandingRequests(config));
9796
this.metrics = XceiverClientManager.getXceiverClientMetrics();
@@ -101,7 +100,7 @@ public XceiverClientGrpc(Pipeline pipeline, Configuration config) {
101100

102101
/**
103102
* To be used when grpc token is not enabled.
104-
* */
103+
*/
105104
@Override
106105
public void connect() throws Exception {
107106
// leader by default is the 1st datanode in the datanode list of pipleline
@@ -112,7 +111,7 @@ public void connect() throws Exception {
112111

113112
/**
114113
* Passed encoded token to GRPC header when security is enabled.
115-
* */
114+
*/
116115
@Override
117116
public void connect(String encodedToken) throws Exception {
118117
// leader by default is the 1st datanode in the datanode list of pipleline
@@ -132,11 +131,10 @@ private void connectToDatanode(DatanodeDetails dn, String encodedToken)
132131
}
133132

134133
// Add credential context to the client call
135-
String userName = UserGroupInformation.getCurrentUser()
136-
.getShortUserName();
134+
String userName = UserGroupInformation.getCurrentUser().getShortUserName();
137135
LOG.debug("Connecting to server Port : " + dn.getIpAddress());
138-
NettyChannelBuilder channelBuilder = NettyChannelBuilder.forAddress(dn
139-
.getIpAddress(), port).usePlaintext()
136+
NettyChannelBuilder channelBuilder =
137+
NettyChannelBuilder.forAddress(dn.getIpAddress(), port).usePlaintext()
140138
.maxInboundMessageSize(OzoneConsts.OZONE_SCM_CHUNK_MAX_SIZE)
141139
.intercept(new ClientCredentialInterceptor(userName, encodedToken),
142140
new GrpcClientInterceptor());
@@ -149,8 +147,8 @@ private void connectToDatanode(DatanodeDetails dn, String encodedToken)
149147
if (trustCertCollectionFile != null) {
150148
sslContextBuilder.trustManager(trustCertCollectionFile);
151149
}
152-
if (secConfig.isGrpcMutualTlsRequired() && clientCertChainFile != null &&
153-
privateKeyFile != null) {
150+
if (secConfig.isGrpcMutualTlsRequired() && clientCertChainFile != null
151+
&& privateKeyFile != null) {
154152
sslContextBuilder.keyManager(clientCertChainFile, privateKeyFile);
155153
}
156154

@@ -216,77 +214,83 @@ public ContainerCommandResponseProto sendCommand(
216214
}
217215

218216
@Override
219-
public XceiverClientReply sendCommand(
220-
ContainerCommandRequestProto request, List<DatanodeDetails> excludeDns)
217+
public ContainerCommandResponseProto sendCommand(
218+
ContainerCommandRequestProto request, List<CheckedBiFunction> validators)
221219
throws IOException {
222-
Preconditions.checkState(HddsUtils.isReadOnly(request));
223-
return sendCommandWithTraceIDAndRetry(request, excludeDns);
220+
try {
221+
XceiverClientReply reply;
222+
reply = sendCommandWithTraceIDAndRetry(request, validators);
223+
ContainerCommandResponseProto responseProto = reply.getResponse().get();
224+
return responseProto;
225+
} catch (ExecutionException | InterruptedException e) {
226+
throw new IOException("Failed to execute command " + request, e);
227+
}
224228
}
225229

226230
private XceiverClientReply sendCommandWithTraceIDAndRetry(
227-
ContainerCommandRequestProto request, List<DatanodeDetails> excludeDns)
231+
ContainerCommandRequestProto request, List<CheckedBiFunction> validators)
228232
throws IOException {
229233
try (Scope scope = GlobalTracer.get()
230234
.buildSpan("XceiverClientGrpc." + request.getCmdType().name())
231235
.startActive(true)) {
232236
ContainerCommandRequestProto finalPayload =
233237
ContainerCommandRequestProto.newBuilder(request)
234-
.setTraceID(TracingUtil.exportCurrentSpan())
235-
.build();
236-
return sendCommandWithRetry(finalPayload, excludeDns);
238+
.setTraceID(TracingUtil.exportCurrentSpan()).build();
239+
return sendCommandWithRetry(finalPayload, validators);
237240
}
238241
}
239242

240243
private XceiverClientReply sendCommandWithRetry(
241-
ContainerCommandRequestProto request, List<DatanodeDetails> excludeDns)
244+
ContainerCommandRequestProto request, List<CheckedBiFunction> validators)
242245
throws IOException {
243246
ContainerCommandResponseProto responseProto = null;
247+
IOException ioException = null;
244248

245249
// In case of an exception or an error, we will try to read from the
246250
// datanodes in the pipeline in a round robin fashion.
247251

248252
// TODO: cache the correct leader info in here, so that any subsequent calls
249253
// should first go to leader
250-
List<DatanodeDetails> dns = pipeline.getNodes();
251-
List<DatanodeDetails> healthyDns =
252-
excludeDns != null ? dns.stream().filter(dnId -> {
253-
for (DatanodeDetails excludeId : excludeDns) {
254-
if (dnId.equals(excludeId)) {
255-
return false;
256-
}
257-
}
258-
return true;
259-
}).collect(Collectors.toList()) : dns;
260254
XceiverClientReply reply = new XceiverClientReply(null);
261-
for (DatanodeDetails dn : healthyDns) {
255+
for (DatanodeDetails dn : pipeline.getNodes()) {
262256
try {
263257
LOG.debug("Executing command " + request + " on datanode " + dn);
264258
// In case the command gets retried on a 2nd datanode,
265259
// sendCommandAsyncCall will create a new channel and async stub
266260
// in case these don't exist for the specific datanode.
267261
reply.addDatanode(dn);
268262
responseProto = sendCommandAsync(request, dn).getResponse().get();
269-
if (responseProto.getResult() == ContainerProtos.Result.SUCCESS) {
270-
break;
263+
if (validators != null && !validators.isEmpty()) {
264+
for (CheckedBiFunction validator : validators) {
265+
validator.apply(request, responseProto);
266+
}
271267
}
272-
} catch (ExecutionException | InterruptedException e) {
268+
break;
269+
} catch (ExecutionException | InterruptedException | IOException e) {
273270
LOG.debug("Failed to execute command " + request + " on datanode " + dn
274271
.getUuidString(), e);
275-
if (Status.fromThrowable(e.getCause()).getCode()
276-
== Status.UNAUTHENTICATED.getCode()) {
277-
throw new SCMSecurityException("Failed to authenticate with "
278-
+ "GRPC XceiverServer with Ozone block token.");
272+
if (!(e instanceof IOException)) {
273+
if (Status.fromThrowable(e.getCause()).getCode()
274+
== Status.UNAUTHENTICATED.getCode()) {
275+
throw new SCMSecurityException("Failed to authenticate with "
276+
+ "GRPC XceiverServer with Ozone block token.");
277+
}
278+
ioException = new IOException(e);
279+
} else {
280+
ioException = (IOException) e;
279281
}
282+
responseProto = null;
280283
}
281284
}
282285

283286
if (responseProto != null) {
284287
reply.setResponse(CompletableFuture.completedFuture(responseProto));
285288
return reply;
286289
} else {
287-
throw new IOException(
288-
"Failed to execute command " + request + " on the pipeline "
289-
+ pipeline.getId());
290+
Preconditions.checkNotNull(ioException);
291+
LOG.error("Failed to execute command " + request + " on the pipeline "
292+
+ pipeline.getId());
293+
throw ioException;
290294
}
291295
}
292296

hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/BlockInputStream.java

Lines changed: 34 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -21,29 +21,29 @@
2121
import com.google.common.annotations.VisibleForTesting;
2222
import com.google.common.base.Preconditions;
2323
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
24-
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
25-
import org.apache.hadoop.hdds.scm.XceiverClientReply;
2624
import org.apache.hadoop.hdds.scm.container.common.helpers
2725
.StorageContainerException;
2826
import org.apache.hadoop.ozone.common.Checksum;
2927
import org.apache.hadoop.ozone.common.ChecksumData;
28+
import org.apache.hadoop.ozone.common.OzoneChecksumException;
3029
import org.apache.ratis.thirdparty.com.google.protobuf.ByteString;
3130
import org.apache.hadoop.fs.Seekable;
3231
import org.apache.hadoop.hdds.scm.XceiverClientManager;
3332
import org.apache.hadoop.hdds.scm.XceiverClientSpi;
3433
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ChunkInfo;
3534
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos
3635
.ReadChunkResponseProto;
36+
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.
37+
ContainerCommandResponseProto;
38+
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.
39+
ContainerCommandRequestProto;
3740
import org.apache.hadoop.hdds.client.BlockID;
38-
3941
import java.io.EOFException;
4042
import java.io.IOException;
4143
import java.io.InputStream;
4244
import java.nio.ByteBuffer;
43-
import java.util.ArrayList;
4445
import java.util.Arrays;
4546
import java.util.List;
46-
import java.util.concurrent.ExecutionException;
4747

4848
/**
4949
* An {@link InputStream} used by the REST service in combination with the
@@ -74,7 +74,7 @@ public class BlockInputStream extends InputStream implements Seekable {
7474
private List<ByteBuffer> buffers;
7575
private int bufferIndex;
7676
private long bufferPosition;
77-
private final boolean verifyChecksum;
77+
private boolean verifyChecksum;
7878

7979
/**
8080
* Creates a new BlockInputStream.
@@ -323,41 +323,8 @@ private boolean chunksRemaining() {
323323
private synchronized void readChunkFromContainer() throws IOException {
324324
// Read the chunk at chunkIndex
325325
final ChunkInfo chunkInfo = chunks.get(chunkIndex);
326-
List<DatanodeDetails> excludeDns = null;
327326
ByteString byteString;
328-
List<DatanodeDetails> dnList = getDatanodeList();
329-
while (true) {
330-
List<DatanodeDetails> dnListFromReadChunkCall = new ArrayList<>();
331-
byteString = readChunk(chunkInfo, excludeDns, dnListFromReadChunkCall);
332-
try {
333-
if (byteString.size() != chunkInfo.getLen()) {
334-
// Bytes read from chunk should be equal to chunk size.
335-
throw new IOException(String
336-
.format("Inconsistent read for chunk=%s len=%d bytesRead=%d",
337-
chunkInfo.getChunkName(), chunkInfo.getLen(),
338-
byteString.size()));
339-
}
340-
ChecksumData checksumData =
341-
ChecksumData.getFromProtoBuf(chunkInfo.getChecksumData());
342-
if (verifyChecksum) {
343-
Checksum.verifyChecksum(byteString, checksumData);
344-
}
345-
break;
346-
} catch (IOException ioe) {
347-
// we will end up in this situation only if the checksum mismatch
348-
// happens or the length of the chunk mismatches.
349-
// In this case, read should be retried on a different replica.
350-
// TODO: Inform SCM of a possible corrupt container replica here
351-
if (excludeDns == null) {
352-
excludeDns = new ArrayList<>();
353-
}
354-
excludeDns.addAll(dnListFromReadChunkCall);
355-
if (excludeDns.size() == dnList.size()) {
356-
throw ioe;
357-
}
358-
}
359-
}
360-
327+
byteString = readChunk(chunkInfo);
361328
buffers = byteString.asReadOnlyByteBufferList();
362329
bufferIndex = 0;
363330
chunkIndexOfCurrentBuffer = chunkIndex;
@@ -372,28 +339,20 @@ private synchronized void readChunkFromContainer() throws IOException {
372339
* Send RPC call to get the chunk from the container.
373340
*/
374341
@VisibleForTesting
375-
protected ByteString readChunk(final ChunkInfo chunkInfo,
376-
List<DatanodeDetails> excludeDns, List<DatanodeDetails> dnListFromReply)
342+
protected ByteString readChunk(final ChunkInfo chunkInfo)
377343
throws IOException {
378-
XceiverClientReply reply;
379-
ReadChunkResponseProto readChunkResponse = null;
344+
ReadChunkResponseProto readChunkResponse;
380345
try {
381-
reply = ContainerProtocolCalls
382-
.readChunk(xceiverClient, chunkInfo, blockID, traceID, excludeDns);
383-
ContainerProtos.ContainerCommandResponseProto response;
384-
response = reply.getResponse().get();
385-
ContainerProtocolCalls.validateContainerResponse(response);
386-
readChunkResponse = response.getReadChunk();
387-
dnListFromReply.addAll(reply.getDatanodes());
346+
List<CheckedBiFunction> validators =
347+
ContainerProtocolCalls.getValidatorList();
348+
validators.add(validator);
349+
readChunkResponse = ContainerProtocolCalls
350+
.readChunk(xceiverClient, chunkInfo, blockID, traceID, validators);
388351
} catch (IOException e) {
389352
if (e instanceof StorageContainerException) {
390353
throw e;
391354
}
392355
throw new IOException("Unexpected OzoneException: " + e.toString(), e);
393-
} catch (ExecutionException | InterruptedException e) {
394-
throw new IOException(
395-
"Failed to execute ReadChunk command for chunk " + chunkInfo
396-
.getChunkName(), e);
397356
}
398357
return readChunkResponse.getData();
399358
}
@@ -403,6 +362,26 @@ protected List<DatanodeDetails> getDatanodeList() {
403362
return xceiverClient.getPipeline().getNodes();
404363
}
405364

365+
private CheckedBiFunction<ContainerCommandRequestProto,
366+
ContainerCommandResponseProto, IOException> validator =
367+
(request, response) -> {
368+
ReadChunkResponseProto readChunkResponse = response.getReadChunk();
369+
final ChunkInfo chunkInfo = readChunkResponse.getChunkData();
370+
ByteString byteString = readChunkResponse.getData();
371+
if (byteString.size() != chunkInfo.getLen()) {
372+
// Bytes read from chunk should be equal to chunk size.
373+
throw new OzoneChecksumException(String
374+
.format("Inconsistent read for chunk=%s len=%d bytesRead=%d",
375+
chunkInfo.getChunkName(), chunkInfo.getLen(),
376+
byteString.size()));
377+
}
378+
ChecksumData checksumData =
379+
ChecksumData.getFromProtoBuf(chunkInfo.getChecksumData());
380+
if (verifyChecksum) {
381+
Checksum.verifyChecksum(byteString, checksumData);
382+
}
383+
};
384+
406385
@Override
407386
public synchronized void seek(long pos) throws IOException {
408387
if (pos < 0 || (chunks.size() == 0 && pos > 0)

hadoop-hdds/client/src/test/java/org/apache/hadoop/hdds/scm/storage/TestBlockInputStream.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,7 @@ private static class DummyBlockInputStream extends BlockInputStream {
114114
}
115115

116116
@Override
117-
protected ByteString readChunk(final ChunkInfo chunkInfo,
118-
List<DatanodeDetails> excludeDns, List<DatanodeDetails> dnListFromReply)
117+
protected ByteString readChunk(final ChunkInfo chunkInfo)
119118
throws IOException {
120119
return getByteString(chunkInfo.getChunkName(), (int) chunkInfo.getLen());
121120
}

0 commit comments

Comments
 (0)