diff --git a/LICENSE-binary b/LICENSE-binary
index b61b7f3166733..5c433304b7666 100644
--- a/LICENSE-binary
+++ b/LICENSE-binary
@@ -352,7 +352,7 @@ io.reactivex:rxnetty:0.4.20
io.swagger:swagger-annotations:1.5.4
javax.inject:javax.inject:1
net.java.dev.jna:jna:5.2.0
-net.minidev:accessors-smart:1.2
+net.minidev:accessors-smart:1.21
org.apache.avro:avro:1.11.4
org.apache.commons:commons-compress:1.26.1
org.apache.commons:commons-configuration2:2.10.1
@@ -419,7 +419,7 @@ org.xerial.snappy:snappy-java:1.1.10.4
org.yaml:snakeyaml:2.0
org.wildfly.openssl:wildfly-openssl:2.2.5.Final
ro.isdc.wro4j:wro4j-maven-plugin:1.8.0
-software.amazon.awssdk:bundle:2.29.52
+software.amazon.awssdk:bundle:2.33.8
software.amazon.s3.analyticsaccelerator:analyticsaccelerator-s3:1.3.0
net.jodah:failsafe:2.4.4
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractUnbufferTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractUnbufferTest.java
index 59eb57af8135e..0392c22fbee41 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractUnbufferTest.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractUnbufferTest.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.fs.contract;
+import org.assertj.core.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -30,6 +31,7 @@
import static org.apache.hadoop.fs.contract.ContractTestUtils.createFile;
import static org.apache.hadoop.fs.contract.ContractTestUtils.dataset;
+import static org.apache.hadoop.fs.contract.ContractTestUtils.readNBytes;
/**
* Contract tests for {@link org.apache.hadoop.fs.CanUnbuffer#unbuffer}.
@@ -145,10 +147,12 @@ protected void validateFileContents(FSDataInputStream stream, int length,
int startIndex)
throws IOException {
byte[] streamData = new byte[length];
- assertEquals(length, stream.read(streamData),
- "failed to read expected number of bytes from "
- + "stream. This may be transient");
+ final int read = readNBytes(stream, streamData, 0, length);
+ Assertions.assertThat(read)
+ .describedAs("failed to read expected number of bytes from stream. %s", stream)
+ .isEqualTo(length);
byte[] validateFileBytes;
+
if (startIndex == 0 && length == fileBytes.length) {
validateFileBytes = fileBytes;
} else {
diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml
index 9967f3d79c9cb..7dbbd0b140b50 100644
--- a/hadoop-project/pom.xml
+++ b/hadoop-project/pom.xml
@@ -209,7 +209,7 @@
1.0-beta-1
900
1.12.720
- 2.29.52
+ 2.35.4
3.1.1
1.3.0
1.0.1
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSClientIOException.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSClientIOException.java
index b61667d1c502b..af187e3580db1 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSClientIOException.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSClientIOException.java
@@ -54,4 +54,8 @@ public String getMessage() {
public boolean retryable() {
return getCause().retryable();
}
+
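+ /**
+  * Get the operation which triggered this exception.
+  * @return the operation name.
+  */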
+ public String getOperation() {
+ return operation;
+ }
}
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSNoResponseException.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSNoResponseException.java
index b8562714b1aae..49ebd3a42fdf4 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSNoResponseException.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSNoResponseException.java
@@ -24,6 +24,12 @@
* Status code 443, no response from server. This is considered idempotent.
*/
public class AWSNoResponseException extends AWSServiceIOException {
+
+ /**
+ * Constructor.
+ * @param operation operation in progress.
+ * @param cause inner cause
+ */
public AWSNoResponseException(String operation,
AwsServiceException cause) {
super(operation, cause);
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
index 361806545403b..75db90092d151 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
@@ -21,6 +21,7 @@
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.Options;
+import org.apache.hadoop.fs.s3a.impl.ChecksumSupport;
import org.apache.hadoop.fs.s3a.impl.streams.StreamIntegration;
import org.apache.hadoop.security.ssl.DelegatingSSLSocketFactory;
@@ -1836,15 +1837,53 @@ private Constants() {
*/
public static final boolean CHECKSUM_VALIDATION_DEFAULT = false;
+ /**
+ * Should checksums always be generated?
+ * Not all third-party stores accept this being enabled for every request.
+ * Value: {@value}.
+ */
+ public static final String CHECKSUM_GENERATION =
+ "fs.s3a.checksum.generation";
+
+ /**
+ * Default value of {@link #CHECKSUM_GENERATION}.
+ * Value: {@value}.
+ */
+ public static final boolean DEFAULT_CHECKSUM_GENERATION = false;
+
/**
* Indicates the algorithm used to create the checksum for the object
* to be uploaded to S3. Unset by default. It supports the following values:
- * 'CRC32', 'CRC32C', 'SHA1', and 'SHA256'
+ * 'CRC32', 'CRC32C', 'SHA1', 'SHA256', 'CRC64_NVME', 'NONE', ''.
+ * When checksum calculation is enabled this MUST be set to a valid algorithm.
* value:{@value}
*/
public static final String CHECKSUM_ALGORITHM =
"fs.s3a.create.checksum.algorithm";
+ /**
+ * Default checksum algorithm: {@code "NONE"}.
+ */
+ public static final String DEFAULT_CHECKSUM_ALGORITHM =
+ ChecksumSupport.NONE;
+
+ /**
+ * Send a {@code Content-MD5} header with every request.
+ * This is required when performing some operations with third party stores
+ * (for example: bulk delete).
+ * It is supported by AWS S3, though it has unexpected behavior with AWS S3 Express storage.
+ * See https://github.com/aws/aws-sdk-java-v2/issues/6459 for details.
+ */
+ public static final String REQUEST_MD5_HEADER =
+ "fs.s3a.request.md5.header";
+
+ /**
+ * Default value of {@link #REQUEST_MD5_HEADER}.
+ * Value: {@value}.
+ */
+ public static final boolean DEFAULT_REQUEST_MD5_HEADER = true;
+
+
/**
* Are extensions classes, such as {@code fs.s3a.aws.credentials.provider},
* going to be loaded from the same classloader that loaded
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java
index 7b5aa5ff934ce..41e904ec9de1b 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java
@@ -30,6 +30,8 @@
import org.slf4j.LoggerFactory;
import software.amazon.awssdk.awscore.util.AwsHostNameUtils;
+import software.amazon.awssdk.core.checksums.RequestChecksumCalculation;
+import software.amazon.awssdk.core.checksums.ResponseChecksumValidation;
import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration;
import software.amazon.awssdk.core.client.config.SdkAdvancedClientOption;
import software.amazon.awssdk.core.interceptor.ExecutionInterceptor;
@@ -41,6 +43,7 @@
import software.amazon.awssdk.metrics.LoggingMetricPublisher;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.s3accessgrants.plugin.S3AccessGrantsPlugin;
+import software.amazon.awssdk.services.s3.LegacyMd5Plugin;
import software.amazon.awssdk.services.s3.S3AsyncClient;
import software.amazon.awssdk.services.s3.S3AsyncClientBuilder;
import software.amazon.awssdk.services.s3.S3BaseClientBuilder;
@@ -202,11 +205,34 @@ private , ClientT> Build
configureEndpointAndRegion(builder, parameters, conf);
+ // add a plugin to add a Content-MD5 header.
+ // this is required when performing some operations with third party stores
+ // (for example: bulk delete), and is somewhat harmless when working with AWS S3.
+ if (parameters.isMd5HeaderEnabled()) {
+ LOG.debug("MD5 header enabled");
+ builder.addPlugin(LegacyMd5Plugin.create());
+ }
+
+ //when to calculate request checksums.
+ final RequestChecksumCalculation checksumCalculation =
+ parameters.isChecksumCalculationEnabled()
+ ? RequestChecksumCalculation.WHEN_SUPPORTED
+ : RequestChecksumCalculation.WHEN_REQUIRED;
+ LOG.debug("Using checksum calculation policy: {}", checksumCalculation);
+ builder.requestChecksumCalculation(checksumCalculation);
+
+ // response checksum validation. Slow, even with CRC32 checksums.
+ final ResponseChecksumValidation checksumValidation;
+ checksumValidation = parameters.isChecksumValidationEnabled()
+ ? ResponseChecksumValidation.WHEN_SUPPORTED
+ : ResponseChecksumValidation.WHEN_REQUIRED;
+ LOG.debug("Using checksum validation policy: {}", checksumValidation);
+ builder.responseChecksumValidation(checksumValidation);
+
maybeApplyS3AccessGrantsConfigurations(builder, conf);
S3Configuration serviceConfiguration = S3Configuration.builder()
.pathStyleAccessEnabled(parameters.isPathStyleAccess())
- .checksumValidationEnabled(parameters.isChecksumValidationEnabled())
.build();
final ClientOverrideConfiguration.Builder override =
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
index 079b4022f2225..863a63f0c14a8 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
@@ -1173,10 +1173,15 @@ private ClientManager createClientManager(URI fsURI, boolean dtEnabled) throws I
.withTransferManagerExecutor(unboundedThreadPool)
.withRegion(configuredRegion)
.withFipsEnabled(fipsEnabled)
+ .withS3ExpressStore(s3ExpressStore)
.withExpressCreateSession(
conf.getBoolean(S3EXPRESS_CREATE_SESSION, S3EXPRESS_CREATE_SESSION_DEFAULT))
.withChecksumValidationEnabled(
conf.getBoolean(CHECKSUM_VALIDATION, CHECKSUM_VALIDATION_DEFAULT))
+ .withChecksumCalculationEnabled(
+ conf.getBoolean(CHECKSUM_GENERATION, DEFAULT_CHECKSUM_GENERATION))
+ .withMd5HeaderEnabled(conf.getBoolean(REQUEST_MD5_HEADER,
+ DEFAULT_REQUEST_MD5_HEADER))
.withClientSideEncryptionEnabled(isCSEEnabled)
.withClientSideEncryptionMaterials(cseMaterials)
.withAnalyticsAcceleratorEnabled(isAnalyticsAcceleratorEnabled)
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java
index 63ad42dab7adb..af4708120216d 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.fs.s3a;
+import software.amazon.awssdk.awscore.exception.AwsErrorDetails;
import software.amazon.awssdk.awscore.exception.AwsServiceException;
import software.amazon.awssdk.core.exception.AbortedException;
import software.amazon.awssdk.core.exception.ApiCallAttemptTimeoutException;
@@ -240,8 +241,13 @@ public static IOException translateException(@Nullable String operation,
? (S3Exception) ase
: null;
int status = ase.statusCode();
- if (ase.awsErrorDetails() != null) {
- message = message + ":" + ase.awsErrorDetails().errorCode();
+ // error details, may be null
+ final AwsErrorDetails errorDetails = ase.awsErrorDetails();
+ // error code, will be null if errorDetails is null
+ String errorCode = "";
+ if (errorDetails != null) {
+ errorCode = errorDetails.errorCode();
+ message = message + ":" + errorCode;
}
// big switch on the HTTP status code.
@@ -308,6 +314,8 @@ public static IOException translateException(@Nullable String operation,
// precondition failure: the object is there, but the precondition
// (e.g. etag) didn't match. Assume remote file change during
// rename or status passed in to openfile had an etag which didn't match.
+ // See the SC_200 handler for the treatment of the S3 Express failure
+ // variant.
case SC_412_PRECONDITION_FAILED:
ioe = new RemoteFileChangedException(path, message, "", ase);
break;
@@ -352,6 +360,16 @@ public static IOException translateException(@Nullable String operation,
return ((MultiObjectDeleteException) exception)
.translateException(message);
}
+ if (PRECONDITION_FAILED.equals(errorCode)) {
+ // S3 Express stores report conflict in conditional writes
+ // as a 200 + an error code of "PreconditionFailed".
+ // This is mapped to RemoteFileChangedException for consistency
+ // with SC_412_PRECONDITION_FAILED handling.
+ return new RemoteFileChangedException(path,
+ operation,
+ exception.getMessage(),
+ exception);
+ }
// other 200: FALL THROUGH
default:
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java
index 559cd49c34582..58d3813075695 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java
@@ -187,6 +187,11 @@ final class S3ClientCreationParameters {
*/
private String region;
+ /**
+ * Is this an S3 Express store?
+ */
+ private boolean s3ExpressStore;
+
/**
* Enable S3Express create session.
*/
@@ -207,6 +212,17 @@ final class S3ClientCreationParameters {
*/
private boolean isAnalyticsAcceleratorEnabled;
+ /**
+ * Is the MD5 header enabled?
+ */
+ private boolean md5HeaderEnabled;
+
+ /**
+ * Is checksum calculation enabled?
+ */
+ private boolean checksumCalculationEnabled;
+
+
/**
* List of execution interceptors to include in the chain
* of interceptors in the SDK.
@@ -255,10 +271,18 @@ public S3ClientCreationParameters withRequesterPays(
return this;
}
+ /**
+ * Is this a requester pays bucket?
+ * @return true if the bucket is requester pays.
+ */
public boolean isRequesterPays() {
return requesterPays;
}
+ /**
+ * Get the credentials.
+ * @return the credential provider.
+ */
public AwsCredentialsProvider getCredentialSet() {
return credentialSet;
}
@@ -275,6 +299,10 @@ public S3ClientCreationParameters withCredentialSet(
return this;
}
+ /**
+ * Get UA suffix.
+ * @return suffix.
+ */
public String getUserAgentSuffix() {
return userAgentSuffix;
}
@@ -536,6 +564,20 @@ public String getKmsRegion() {
return kmsRegion;
}
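+ /**
+  * Is this an S3 Express store?
+  * @return true if the client is configured for an S3 Express store.
+  */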
+ public boolean isS3ExpressStore() {
+ return s3ExpressStore;
+ }
+
+ /**
+ * Set builder value.
+ * @param value new value
+ * @return the builder
+ */
+ public S3ClientCreationParameters withS3ExpressStore(final boolean value) {
+ s3ExpressStore = value;
+ return this;
+ }
+
/**
* Should s3express createSession be called?
* @return true if the client should enable createSession.
@@ -564,10 +606,46 @@ public S3ClientCreationParameters withChecksumValidationEnabled(final boolean va
return this;
}
+ /**
+ * Is checksum validation on every request enabled?
+ * @return true if validation is on every request.
+ */
public boolean isChecksumValidationEnabled() {
return checksumValidationEnabled;
}
+ /**
+ * Should MD5 headers be added?
+ * @return true to always add an MD5 header.
+ */
+ public boolean isMd5HeaderEnabled() {
+ return md5HeaderEnabled;
+ }
+
+ /**
+ * Set builder value.
+ * @param value new value
+ * @return the builder
+ */
+ public S3ClientCreationParameters withMd5HeaderEnabled(final boolean value) {
+ md5HeaderEnabled = value;
+ return this;
+ }
+
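+ /**
+  * Is checksum calculation enabled for all requests?
+  * @return true if checksums are generated on every request.
+  */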
+ public boolean isChecksumCalculationEnabled() {
+ return checksumCalculationEnabled;
+ }
+
+ /**
+ * Set builder value.
+ * @param value new value
+ * @return the builder
+ */
+ public S3ClientCreationParameters withChecksumCalculationEnabled(final boolean value) {
+ checksumCalculationEnabled = value;
+ return this;
+ }
+
@Override
public String toString() {
return "S3ClientCreationParameters{" +
@@ -580,8 +658,10 @@ public String toString() {
", multiPartThreshold=" + multiPartThreshold +
", multipartCopy=" + multipartCopy +
", region='" + region + '\'' +
+ ", s3ExpressStore=" + s3ExpressStore +
", expressCreateSession=" + expressCreateSession +
", checksumValidationEnabled=" + checksumValidationEnabled +
+ ", md5HeaderEnabled=" + md5HeaderEnabled +
'}';
}
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java
index 364f780863a01..c8a3864d59e2c 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java
@@ -313,7 +313,8 @@ private CompleteMultipartUploadResponse finalizeMultipartUpload(
}
try (AuditSpan span = activateAuditSpan()) {
CompleteMultipartUploadResponse uploadResult;
- uploadResult = invoker.retry("Completing multipart upload", destKey,
+ uploadResult = invoker.retry("Completing multipart upload id " + uploadId,
+ destKey,
true,
retrying,
() -> {
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/CustomSdkSigner.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/CustomSdkSigner.java
index b378602165074..e374a1ad9fe5b 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/CustomSdkSigner.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/CustomSdkSigner.java
@@ -104,6 +104,7 @@ public SdkHttpFullRequest sign(SdkHttpFullRequest request,
/**
* Parse the bucket name from the host.
+ * This does not work for path-style access; the hostname of the endpoint is returned.
* @param host hostname
* @return the parsed bucket name; if "kms" is KMS signing.
*/
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java
index bd76d83ee096f..9cd68a5ba8ea8 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java
@@ -18,7 +18,6 @@
package org.apache.hadoop.fs.s3a.auth;
-import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
@@ -199,10 +198,31 @@ private RolePolicies() {
public static final String S3_RESTORE_OBJECT = "s3:RestoreObject";
/**
- * S3Express session permission; required unless sessions are disabled.
+ * Everything: {@value}.
+ */
+ public static final String EVERYTHING_ARN = "*";
+
+
+ /**
+ * All S3Express buckets: {@value}.
+ * S3Express adds another "domain" for permissions: S3 Express ARNs and S3 Express operations,
+ * of which createSession is one key operation.
+ * See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-express-security.html
+ * Note: this wildcard pattern came from AWS Q; if it is wrong blame GenerativeAI.
+ */
+ public static final String S3EXPRESS_ALL_BUCKETS = "arn:aws:s3express:*:*:bucket/*--*--x-s3";
+
+ /**
+ * S3Express session permission; required unless sessions are disabled: {@value}.
+ * See https://docs.aws.amazon.com/AmazonS3/latest/API/API_CreateSession.html
*/
public static final String S3EXPRESS_CREATE_SESSION_POLICY = "s3express:CreateSession";
+ /**
+ * S3 Express All operations: {@value}.
+ */
+ public static final String S3EXPRESS_ALL_OPERATIONS = "s3express:*";
+
/**
* Actions needed to read a file in S3 through S3A, excluding
* SSE-KMS.
@@ -224,7 +244,7 @@ private RolePolicies() {
*/
private static final String[] S3_ROOT_READ_OPERATIONS =
new String[]{
- S3_ALL_GET
+ S3_ALL_GET,
};
public static final List<String> S3_ROOT_READ_OPERATIONS_LIST =
@@ -239,7 +259,7 @@ private RolePolicies() {
public static final String[] S3_BUCKET_READ_OPERATIONS =
new String[]{
S3_ALL_GET,
- S3_BUCKET_ALL_LIST
+ S3_BUCKET_ALL_LIST,
};
/**
@@ -281,7 +301,7 @@ private RolePolicies() {
S3_PUT_OBJECT,
S3_PUT_OBJECT_ACL,
S3_DELETE_OBJECT,
- S3_ABORT_MULTIPART_UPLOAD
+ S3_ABORT_MULTIPART_UPLOAD,
}));
/**
@@ -292,6 +312,13 @@ private RolePolicies() {
S3_ALL_BUCKETS,
S3_ALL_OPERATIONS);
+ /**
+ * All S3 Express operations; required when working with S3 Express stores.
+ */
+ public static final Statement STATEMENT_S3EXPRESS = statement(true,
+ S3EXPRESS_ALL_BUCKETS,
+ S3EXPRESS_ALL_OPERATIONS);
+
/**
* The s3:GetBucketLocation permission is for all buckets, not for
* any named bucket, which complicates permissions.
@@ -310,8 +337,9 @@ private RolePolicies() {
public static List<Statement> allowS3Operations(String bucket,
boolean write) {
// add the bucket operations for the specific bucket ARN
- ArrayList<Statement> statements =
+ List<Statement> statements =
Lists.newArrayList(
+ STATEMENT_S3EXPRESS,
statement(true,
bucketToArn(bucket),
S3_GET_BUCKET_LOCATION, S3_BUCKET_ALL_LIST));
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/MagicCommitIntegration.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/MagicCommitIntegration.java
index ba1dd400f6d7b..9ada0d565a342 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/MagicCommitIntegration.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/MagicCommitIntegration.java
@@ -64,7 +64,7 @@ public MagicCommitIntegration(S3AFileSystem owner,
boolean magicCommitEnabled) {
super(owner.createStoreContext());
this.owner = owner;
- this.magicCommitEnabled = magicCommitEnabled;
+ this.magicCommitEnabled = magicCommitEnabled && owner.isMultipartUploadEnabled();
}
/**
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java
index b14f5f7bd2370..567eb571012a8 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java
@@ -18,12 +18,12 @@
package org.apache.hadoop.fs.s3a.impl;
+import java.util.Locale;
import java.util.Set;
import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableSet;
import software.amazon.awssdk.services.s3.model.ChecksumAlgorithm;
-import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ConfigurationHelper;
@@ -34,6 +34,22 @@
*/
public final class ChecksumSupport {
+ /**
+ * Special checksum algorithm to declare that no checksum
+ * is required: {@value}.
+ */
+ public static final String NONE = "NONE";
+
+ /**
+ * CRC32C, mapped to CRC32_C algorithm class.
+ */
+ public static final String CRC32C = "CRC32C";
+
+ /**
+ * CRC64NVME, mapped to CRC64_NVME algorithm class.
+ */
+ public static final String CRC64NVME = "CRC64NVME";
+
private ChecksumSupport() {
}
@@ -43,6 +59,7 @@ private ChecksumSupport() {
private static final Set<ChecksumAlgorithm> SUPPORTED_CHECKSUM_ALGORITHMS = ImmutableSet.of(
ChecksumAlgorithm.CRC32,
ChecksumAlgorithm.CRC32_C,
+ ChecksumAlgorithm.CRC64_NVME,
ChecksumAlgorithm.SHA1,
ChecksumAlgorithm.SHA256);
@@ -58,14 +75,21 @@ public static ChecksumAlgorithm getChecksumAlgorithm(Configuration conf) {
CHECKSUM_ALGORITHM,
ChecksumAlgorithm.class,
configValue -> {
- if (StringUtils.isBlank(configValue)) {
+ // default values and handling algorithms names without underscores.
+ String val = configValue == null
+ ? NONE
+ : configValue.toUpperCase(Locale.ROOT);
+ switch (val) {
+ case "":
+ case NONE:
return null;
- }
- if (ChecksumAlgorithm.CRC32_C.toString().equalsIgnoreCase(configValue)) {
- // In case the configuration value is CRC32C, without underscore.
+ case CRC32C:
return ChecksumAlgorithm.CRC32_C;
+ case CRC64NVME:
+ return ChecksumAlgorithm.CRC64_NVME;
+ default:
+ throw new IllegalArgumentException("Checksum algorithm is not supported: " + configValue);
}
- throw new IllegalArgumentException("Checksum algorithm is not supported: " + configValue);
});
}
}
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/InternalConstants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/InternalConstants.java
index 8cf435f7ca603..d8448655769eb 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/InternalConstants.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/InternalConstants.java
@@ -313,4 +313,9 @@ private InternalConstants() {
public static final String UPLOAD_PROGRESS_LOG_NAME =
"org.apache.hadoop.fs.s3a.S3AFileSystem.Progress";
+ /**
+ * AWS Error code for conditional put failure on s3 express buckets: {@value}.
+ */
+ public static final String PRECONDITION_FAILED = "PreconditionFailed";
+
}
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
index 1f65caeb5e219..a4a08f093ef1f 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
@@ -931,7 +931,9 @@ Here are some the S3A properties for use in production.
Indicates the algorithm used to create the checksum for the object
to be uploaded to S3. Unset by default. It supports the following values:
- 'CRC32', 'CRC32C', 'SHA1', and 'SHA256'
+    'CRC32', 'CRC32C', 'SHA1', 'SHA256', 'CRC64_NVME', 'none'.
+ The CRC64_NVME option requires aws-crt on the classpath, and is still
+ tangibly slower than CRC32C, which has its own instruction on x86 and ARM.
@@ -1433,6 +1435,9 @@ The "fast" output stream
1. Uploads blocks in parallel in background threads.
1. Begins uploading blocks as soon as the buffered data exceeds this partition
size.
+1. Uses any checksum set in `fs.s3a.create.checksum.algorithm` to calculate an upload
+ checksum on data written; this is included in the file/part upload and verified
+ on the store. This can be a source of third-party store compatibility issues.
1. When buffering data to disk, uses the directory/directories listed in
`fs.s3a.buffer.dir`. The size of data which can be buffered is limited
to the available disk space.
@@ -1707,16 +1712,7 @@ rate.
The best practise for using this option is to disable multipart purges in
normal use of S3A, enabling only in manual/scheduled housekeeping operations.
-### S3A "fadvise" input policy support
-
-The S3A Filesystem client supports the notion of input policies, similar
-to that of the Posix `fadvise()` API call. This tunes the behavior of the S3A
-client to optimise HTTP GET requests for the different use cases.
-
-See [Improving data input performance through fadvise](./performance.html#fadvise)
-for the details.
-
-##Metrics
+## Metrics
S3A metrics can be monitored through Hadoop's metrics2 framework. S3A creates
its own metrics system called s3a-file-system, and each instance of the client
@@ -1754,7 +1750,127 @@ also get recorded, for example the following:
Note that low-level metrics from the AWS SDK itself are not currently included
in these metrics.
-## Other Topics
+
+## Checksums
+
+The S3 Client can use checksums in its requests to an S3 store in a number of ways:
+
+1. To attach a checksum to each request made to the store.
+2. To attach a `Content-MD5` hash of the request payload.
+3. To provide a checksum of data being PUT/POSTed to the store.
+4. To validate data downloaded from the store.
+
+The various options available can impact performance and compatibility.
+To understand the risks and issues here, know that:
+* Request checksum generation (item 1) and validation (item 4) can be done "when required" or "always".
+  The "always" option is stricter, but can result in third-party compatibility issues.
+* Some third-party stores require the `Content-MD5` header and will fail without it (item 2).
+* Data upload checksums (item 3) can be computationally expensive and incompatible with third-party stores.
+* The most efficient data upload checksum is CRC32C; there are explicit opcodes for this in x86 and ARM CPUs, with the appropriate implementation circuitry.
+* Data download validation checksums are also computationally expensive.
+
+| Option | Purpose | Values | Default |
+|------------------------------------|------------------------------------------------|---------|----------|
+| `fs.s3a.request.md5.header` | Enable MD5 header | boolean | `true` |
+| `fs.s3a.checksum.generation` | Generate checksums on all requests | boolean | `false` |
+| `fs.s3a.checksum.validation` | Validate checksums on download | boolean | `false` |
+| `fs.s3a.create.checksum.algorithm` | Checksum Algorithm when creating/copying files | `NONE`, `CRC32`, `CRC32C`, `CRC32_C`, `CRC64NVME` , `CRC64_NVME`, `SHA256`, `SHA1` | `""` |
+
+
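+A minimal sketch of how the boolean options can be declared in `core-site.xml`; the values shown are simply the defaults from the table above:
+
+```xml
+<property>
+  <name>fs.s3a.request.md5.header</name>
+  <value>true</value>
+</property>
+
+<property>
+  <name>fs.s3a.checksum.generation</name>
+  <value>false</value>
+</property>
+
+<property>
+  <name>fs.s3a.checksum.validation</name>
+  <value>false</value>
+</property>
+```
+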
+Turning on checksum generation and validation may seem like obvious actions, but consider
+this: you are communicating with an S3 store over an HTTPS channel, which includes
+cryptographically strong HMAC checksums of every block transmitted.
+These are far more robust than the CRC* algorithms, and the computational cost is already
+being paid for: so why add more?
+
+With TLS ensuring the network traffic isn't altered from the moment it is encrypted to when
+it is decrypted, all extra checksum generation/validation does is ensure that there's no
+accidental corruption between the data being generated and uploaded, or between being downloaded and read.
+
+This could potentially deal with memory/buffering/bus issues on the servers.
+However this is what ECC RAM is for.
+If you do suspect requests being corrupted during writing or reading, the options may
+be worth considering.
+As it is, they are off by default to avoid compatibility problems.
+
+Note: if you have a real example of where these checksum options have identified memory corruption,
+please let us know.
+
+### Content-MD5 Header on requests: `fs.s3a.request.md5.header`
+
+Send a `Content-MD5` header with every request?
+
+This header is required when interacting with some third-party stores.
+It is supported by AWS S3, though it has some unexpected behavior with AWS S3 Express storage; see
+[issue 6459](https://github.com/aws/aws-sdk-java-v2/issues/6459).
+As that appears to have been fixed in the 2.35.4 SDK release, this option is enabled by default.
+
+### Request checksum generation: `fs.s3a.checksum.generation`
+
+Should checksums be generated for all requests made to the store?
+
+* Incompatible with some third-party stores.
+* If `true` then multipart upload (i.e. large file upload) may fail if `fs.s3a.create.checksum.algorithm`
+  is not set to a valid algorithm (i.e. something other than `NONE`).
+
+The option is therefore `false` by default to avoid these problems.
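+
+A sketch of a configuration which turns generation on; `CRC32C` is used here purely as an illustration of "a valid algorithm":
+
+```xml
+<property>
+  <name>fs.s3a.checksum.generation</name>
+  <value>true</value>
+</property>
+
+<property>
+  <name>fs.s3a.create.checksum.algorithm</name>
+  <value>CRC32C</value>
+</property>
+```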
+
+### Checksum validation: `fs.s3a.checksum.validation`
+
+Should the checksums of downloaded data be validated?
+
+This hurts performance and should only be used if considered important.
+
+### Creation checksum: `fs.s3a.create.checksum.algorithm`
+
+This is the algorithm to use when checksumming data during file creation and copy.
+
+Options: `NONE`, `CRC32`, `CRC32C`, `CRC32_C`, `CRC64NVME`, `CRC64_NVME`, `SHA256`, `SHA1`
+
+The option `NONE` is new to Hadoop 3.4.3; previously an empty string was required for the same behavior.
+
+The `CRC64NVME`/`CRC64_NVME` option is also new to Hadoop 3.4.3 and requires the `aws-crt` module to be on the classpath, otherwise an error is printed:
+
+```
+java.lang.RuntimeException: Could not load software.amazon.awssdk.crt.checksums.CRC64NVME.
+Add dependency on 'software.amazon.awssdk.crt:aws-crt' module to enable CRC64NVME feature.
+```
+
+Checksum/algorithm incompatibilities may surface as a failure in "Completing multipart upload".
+
+First, as a failure reported as a "missing part":
+```
+org.apache.hadoop.fs.s3a.AWSBadRequestException: Completing multipart upload id l8itQB.
+5u7TcWqznqbGfTjHv06mxb4IlBNcZiDWrBAS0t1EMJGkr9J1QD2UAwDM5rLUZqypJfWCoPJtySxA3QK9QqKTBdKr3LXYjYJ_r9lRcGdzBRbnIJeI8tBr8yqtS on
+test/testCommitEmptyFile/empty-commit.txt:
+software.amazon.awssdk.services.s3.model.S3Exception: One or more of the specified parts could not be found.
+The part may not have been uploaded, or the specified entity tag may not match the part's entity tag.
+(Service: S3, Status Code: 400, Request ID: AQ0J4B66H626Y3FH,
+Extended Request ID: g1zo25aQCZfqFh3vOzrzOBp9RjJEWmKImRcfWhvaeFHQ2hZo1xTm3GVMD03zN+d+cFB6oNeelNc=)
+(SDK Attempt Count: 1):InvalidPart: One or more of the specified parts could not be found.
+The part may not have been uploaded, or the specified entity tag may not match the part's entity tag.
+(Service: S3, Status Code: 400, Request ID: AQ0J4B66H626Y3FH, Extended Request ID:
+g1zo25aQCZfqFh3vOzrzOBp9RjJEWmKImRcfWhvaeFHQ2hZo1xTm3GVMD03zN+d+cFB6oNeelNc=) (SDK Attempt Count: 1)
+```
+
+Alternatively, as a failure of multipart uploads when a checksum algorithm is set and the parts are not uploaded in sequential order.
+
+```
+org.apache.hadoop.fs.s3a.AWSStatus500Exception:
+ Completing multipart upload id A8rf256dBVbDtIVLr40KaMGKw9DY.rhgNP5zmn1ap97YjPaIO2Ac3XXL_T.2HCtIrGUpx5bdOTgvVeZzVHuoWI4pKv_MeMMVqBHJGP7u_q4PR8AxWvSq0Lsv724HT1fQ
+ on test/testMultipartUploadReverseOrderNonContiguousPartNumbers:
+software.amazon.awssdk.services.s3.model.S3Exception: We encountered an internal error.
+Please try again.
+(Service: S3, Status Code: 500, Request ID: WTBY2FX76Q5F5YWB,
+Extended Request ID: eWQWk8V8rmVmKImWVCI2rHyFS3XQSPgIkjfAyzzZCgVgyeRqox8mO8qO4ODMB6IUY0+rYqqsnOX2zXiQcRzFlb9p3nSkEEc+T0CYurLaH28=)
+(SDK Attempt Count: 3)
+```
+
+This is only possible through the FileSystem multipart API; normal data writes, including
+those through the magic committer, will not encounter it.
+
+## Other Topics
### Copying Data with distcp
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/reading.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/reading.md
index fa4572bb165b1..99490bb4b3368 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/reading.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/reading.md
@@ -163,6 +163,16 @@ configured to use the vector IO API, it is likely to be significantly
faster to use the classic stream and its parallel reads.
+## S3A "fadvise" input policy support: `fs.s3a.experimental.input.fadvise`
+
+The S3A Filesystem client supports the notion of input policies, similar
+to that of the Posix `fadvise()` API call. This tunes the behavior of the S3A
+client to optimise HTTP GET requests for the different use cases.
+
+See [Improving data input performance through fadvise](./performance.html#fadvise)
+for the details.
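+
+As a sketch, the policy can be set in `core-site.xml`; the value `random` below is just one policy, chosen for illustration:
+
+```xml
+<property>
+  <name>fs.s3a.experimental.input.fadvise</name>
+  <value>random</value>
+</property>
+```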
+
+
## Developer Topics
### Stream IOStatistics
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md
index 4176b20e8f54a..049f27c2e5bdf 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md
@@ -585,6 +585,19 @@ on third party stores.
<property>
  <name>test.fs.s3a.performance.enabled</name>
  <value>false</value>
</property>
+
+<property>
+  <name>fs.s3a.ext.multipart.commit.consumes.upload.id</name>
+  <value>true</value>
+</property>
+
```
See [Third Party Stores](third_party_stores.html) for more on this topic.
@@ -736,9 +749,23 @@ For stores with stricter semantics, these test cases must be disabled.
```
+### Changing expectations on multipart upload retries: `ITestS3AContractMultipartUploader` and `ITestUploadRecovery`
+
+If the store reports errors when trying to list/abort completed multipart uploads,
+expect failures in `ITestUploadRecovery` and `ITestS3AContractMultipartUploader`.
+The tests can be reconfigured to expect failure by setting the option
+`fs.s3a.ext.multipart.commit.consumes.upload.id` to `true`.
+
+Note that this can also be set as a per-bucket option, as shown in the sketch below.
+
+```xml
+<property>
+  <name>fs.s3a.ext.multipart.commit.consumes.upload.id</name>
+  <value>true</value>
+</property>
+```
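+
+A sketch of the same option scoped to a single bucket; the bucket name `example` is only a placeholder:
+
+```xml
+<property>
+  <name>fs.s3a.bucket.example.ext.multipart.commit.consumes.upload.id</name>
+  <value>true</value>
+</property>
+```
+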
### Tests which may fail (and which you can ignore)
-* `ITestS3AContractMultipartUploader` tests `testMultipartUploadAbort` and `testSingleUpload` raising `FileNotFoundException`
* `ITestS3AMiscOperations.testEmptyFileChecksums`: if the FS encrypts data always.
## Debugging Test failures
@@ -837,10 +864,15 @@ Key features of `AbstractS3ATestBase`
* `getFileSystem()` returns the S3A Filesystem bonded to the contract test Filesystem
defined in `fs.s3a.contract.test`
* will automatically skip all tests if that URL is unset.
-* Extends `AbstractFSContractTestBase` and `Assert` for all their methods.
+* Extends `AbstractFSContractTestBase`
+* Uses AssertJ for all assertions, _not_ those of JUnit5.
Having shared base classes may help reduce future maintenance too. Please
-use them/
+use them.
+
+We adopted AssertJ assertions long before the move to JUnit 5.
+While there are still many tests with legacy JUnit 4 assertions, all new test cases
+should use AssertJ assertions and MUST NOT use the JUnit 5 assertion methods.
### Secure
@@ -873,7 +905,7 @@ against other regions, or with third party S3 implementations. Thus the
URL can be overridden for testing elsewhere.
-### Works With Other S3 Stored
+### Works With Other S3 Stores
Don't assume AWS S3 US-East only, do allow for working with external S3 implementations.
Those may be behind the latest S3 API features, not support encryption, session
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md
index f6fea9338a424..a7151bbcb7f20 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md
@@ -42,19 +42,9 @@ The features which may be unavailable include:
* Bucket lifecycle rules to clean up pending uploads.
* Support for multipart uploads.
* Conditional file creation. (`fs.s3a.create.conditional.enabled = false`)
+* Variations in checksum calculation on uploads.
+* Requirement for Content-MD5 headers.
-### Disabling Change Detection
-
-The (default) etag-based change detection logic expects stores to provide an Etag header in HEAD/GET requests,
-and to support it as a precondition in subsequent GET and COPY calls.
-If a store does not do this, disable the checks.
-
-```xml
-<property>
-  <name>fs.s3a.change.detection.mode</name>
-  <value>none</value>
-</property>
-```
## Connecting to a third party object store over HTTPS
The core setting for a third party store is to change the endpoint in `fs.s3a.endpoint`.
@@ -65,7 +55,7 @@ path style access must also be enabled in `fs.s3a.path.style.access`.
The v4 signing algorithm requires a region to be set in `fs.s3a.endpoint.region`.
A non-empty value is generally sufficient, though some deployments may require
-a specific value.
+a specific value.
*Important:* do not use `auto` or `sdk` as these may be used
in the future for specific region binding algorithms.
@@ -87,7 +77,7 @@ then these must be set, either in XML or (preferred) in a JCEKS file.
  <name>fs.s3a.endpoint.region</name>
- <value>anything</value>
+ <value>anything except: sdk, auto, ec2</value>
@@ -104,7 +94,14 @@ then these must be set, either in XML or (preferred) in a JCEKS file.
If per-bucket settings are used here, then third-party stores and credentials may be used alongside an AWS store.
+### Region Naming
+
+The AWS SDK requires that a region name be supplied for signing, and that the region match the endpoint used.
+
+Third-party stores don't normally care about the name of a region, *only that a region is supplied*.
+You should set `fs.s3a.endpoint.region` to anything except the following reserved names: `sdk`, `ec2` and `auto`.
+We have plans for those.
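+
+A sketch; the value itself is arbitrary, it just must not be one of the reserved names:
+
+```xml
+<property>
+  <name>fs.s3a.endpoint.region</name>
+  <value>anything</value>
+</property>
+```
+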
## Other issues
@@ -120,7 +117,7 @@ This can be addressed in two ways
#### S3Guard uploads command
-This can be executed on a schedule, or manually
+This can be executed on a schedule, or manually:
```
hadoop s3guard uploads -abort -force s3a://bucket/
@@ -174,10 +171,79 @@ false to disable use of these features.
```
+## Controlling Upload Checksums and MD5 Headers
+
+It may be necessary to change how checksums are attached to uploads by:
+1. Restoring the attachment of a `Content-MD5` header in requests.
+2. Restricting checksum generation to only when required.
+
+```xml
+<property>
+  <name>fs.s3a.request.md5.header</name>
+  <value>true</value>
+  <description>re-enable calculation and inclusion of an MD5 HEADER on data upload operations</description>
+</property>
+
+<property>
+  <name>fs.s3a.checksum.generation</name>
+  <value>false</value>
+  <description>Calculate and attach a message checksum on every operation.</description>
+</property>
+
+<property>
+  <name>fs.s3a.checksum.validation</name>
+  <value>false</value>
+  <description>Validate data checksums on download</description>
+</property>
+```
+
+These options are set for best compatibility and performance by default; they may need tuning for specific stores.
+
+See [checksums](index.html#checksums) for more details.
+
+### Disabling Change Detection
+
+The (default) etag-based change detection logic expects stores to provide an Etag header in HEAD/GET requests,
+and to support it as a precondition in subsequent GET and COPY calls.
+If a store does not do this, disable the checks.
+
+```xml
+<property>
+  <name>fs.s3a.change.detection.mode</name>
+  <value>none</value>
+</property>
+```
+
+## Handling Null Etags
+
+Some object stores do not support etags, that is: they return `null` or an empty string as the etag of an object on both HEAD and GET requests.
+
+This breaks version management in the classic input stream *and* metadata caching in the analytics stream.
+
+To work with such a store:
+* Set `fs.s3a.input.stream.type` to `classic`
+* Set `fs.s3a.change.detection.mode` to `none`
+
+```xml
+<property>
+  <name>fs.s3a.input.stream.type</name>
+  <value>classic</value>
+</property>
+
+<property>
+  <name>fs.s3a.change.detection.mode</name>
+  <value>none</value>
+</property>
+```
+
+Note: the [cloudstore](https://github.com/steveloughran/cloudstore) `etag` command will retrieve and print an object's etag,
+and can be used to help debug this situation.
+The etag value of a newly created object SHOULD be a non-empty string.
# Troubleshooting
-The most common problem when talking to third-party stores are
+The most common problems when talking to third-party stores are:
1. The S3A client is still configured to talk to the AWS S3 endpoint. This leads to authentication failures and/or reports that the bucket is unknown.
2. Path access has not been enabled, the client is generating a host name for the target bucket and it does not exist.
@@ -185,11 +251,12 @@ The most common problem when talking to third-party stores are
4. JVM HTTPS settings include the certificates needed to negotiate a TLS connection with the store.
-## How to improve troubleshooting
+## How to Troubleshoot Problems
-### log more network info
+### Log More Network Info
+
+There are some very low level logs which can be printed.
-There are some very low level logs.
```properties
# Log all HTTP requests made; includes S3 interaction. This may
# include sensitive information such as account IDs in HTTP headers.
@@ -203,7 +270,7 @@ log4j.logger.io.netty.handler.logging=DEBUG
log4j.logger.io.netty.handler.codec.http2.Http2FrameLogger=DEBUG
```
-### Cut back on retries, shorten timeouts
+### Reduce Retries; Shorten Timeouts
By default, there's a lot of retries going on in the AWS connector (which even retries on DNS failures)
and in the S3A code which invokes it.
@@ -263,7 +330,7 @@ the AWS SDK itself still makes a limited attempt to retry.
There's an external utility, [cloudstore](https://github.com/steveloughran/cloudstore) whose [storediag](https://github.com/steveloughran/cloudstore#command-storediag) exists to debug the connection settings to hadoop cloud storage.
```bash
-hadoop jar cloudstore-1.0.jar storediag s3a://nonexistent-bucket-example/
+hadoop jar cloudstore-1.1.jar storediag s3a://nonexistent-bucket-example/
```
The main reason it's not an ASF release is that it allows for a rapid release cycle, sometimes hours; if anyone doesn't trust
@@ -414,7 +481,47 @@ Fix: path style access
```
-# Connecting to Google Cloud Storage through the S3A connector
+# Settings for Specific Stores
+
+## Dell ECS through the S3A Connector
+
+As of October 2025 and the 2.33.8 AWS SDK, the settings needed to interact with Dell ECS at [ECS Test Drive](https://portal.ecstestdrive.com/) were:
+
+```xml
+<property>
+  <name>fs.s3a.region</name>
+  <value>region</value>
+  <description>arbitrary name</description>
+</property>
+
+<property>
+  <name>fs.s3a.endpoint.region</name>
+  <value>region</value>
+</property>
+
+<property>
+  <name>fs.s3a.path.style.access</name>
+  <value>true</value>
+</property>
+
+<property>
+  <name>fs.s3a.create.conditional.enabled</name>
+  <value>false</value>
+</property>
+
+<property>
+  <name>fs.s3a.request.md5.header</name>
+  <value>false</value>
+</property>
+
+<property>
+  <name>fs.s3a.checksum.generation</name>
+  <value>false</value>
+</property>
+```
+
+## Google Cloud Storage through the S3A Connector
It *is* possible to connect to google cloud storage through the S3A connector.
However, Google provide their own [Cloud Storage connector](https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage).
@@ -443,63 +550,68 @@ this makes renaming and deleting significantly slower.
 <property>
-  <name>fs.s3a.bucket.gcs-container.access.key</name>
+  <name>fs.s3a.access.key</name>
   <value>GOOG1EZ....</value>
 </property>

 <property>
-  <name>fs.s3a.bucket.gcs-container.secret.key</name>
+  <name>fs.s3a.secret.key</name>
   <value>SECRETS</value>
 </property>

 <property>
-  <name>fs.s3a.bucket.gcs-container.endpoint</name>
+  <name>fs.s3a.endpoint</name>
   <value>https://storage.googleapis.com</value>
 </property>

 <property>
-  <name>fs.s3a.bucket.gcs-container.bucket.probe</name>
-  <value>0</value>
+  <name>fs.s3a.endpoint.region</name>
+  <value>gcs</value>
 </property>

 <property>
-  <name>fs.s3a.bucket.gcs-container.list.version</name>
-  <value>1</value>
+  <name>fs.s3a.path.style.access</name>
+  <value>true</value>
 </property>

 <property>
-  <name>fs.s3a.bucket.gcs-container.multiobjectdelete.enable</name>
+  <name>fs.s3a.checksum.generation</name>
+  <value>false</value>
+  <description>Calculate and attach a message checksum on every operation. (default: false)</description>
 </property>

 <property>
-  <name>fs.s3a.bucket.gcs-container.path.style.access</name>
-  <value>true</value>
+  <name>fs.s3a.bucket.probe</name>
+  <value>0</value>
 </property>

 <property>
-  <name>fs.s3a.bucket.gcs-container.endpoint.region</name>
-  <value>gcs</value>
+  <name>fs.s3a.list.version</name>
+  <value>1</value>
 </property>

 <property>
-  <name>fs.s3a.multipart.uploads.enabled</name>
+  <name>fs.s3a.multiobjectdelete.enable</name>
   <value>false</value>
+</property>
+
+<property>
+  <name>fs.s3a.committer.magic.enabled</name>
+  <value>false</value>
 </property>

 <property>
   <name>fs.s3a.optimized.copy.from.local.enabled</name>
   <value>false</value>
 </property>

 <property>
   <name>fs.s3a.create.conditional.enabled</name>
   <value>false</value>
 </property>
```
@@ -531,3 +643,4 @@ It is also a way to regression test foundational S3A third-party store compatibi
_Note_ If anyone is set up to test this regularly, please let the hadoop developer team know if regressions do surface,
as it is not a common test configuration.
+We do use it to help test compatibility during SDK updates.
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md
index 151ee5bd8a465..9b70f12dc8a6b 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md
@@ -243,7 +243,7 @@ A credential provider listed in `fs.s3a.aws.credentials.provider` does not imple
the interface `software.amazon.awssdk.auth.credentials.AwsCredentialsProvider`.
```
-InstantiationIOException: `s3a://stevel-gcs/': Class org.apache.hadoop.fs.s3a.S3ARetryPolicy does not implement
+InstantiationIOException: `s3a://gcs/': Class org.apache.hadoop.fs.s3a.S3ARetryPolicy does not implement
software.amazon.awssdk.auth.credentials.AwsCredentialsProvider (configuration key fs.s3a.aws.credentials.provider)
at org.apache.hadoop.fs.s3a.impl.InstantiationIOException.isNotInstanceOf(InstantiationIOException.java:128)
at org.apache.hadoop.fs.s3a.S3AUtils.getInstanceFromReflection(S3AUtils.java:604)
@@ -354,7 +354,7 @@ org.apache.hadoop.fs.s3a.AWSBadRequestException: upload part #1 upload ID 112233
This is an obscure failure which was encountered as part of
[HADOOP-19221](https://issues.apache.org/jira/browse/HADOOP-19221) : an upload of part of a file could not
-be succesfully retried after a failure was reported on the first attempt.
+be successfully retried after a failure was reported on the first attempt.
1. It was only encountered during uploading files via the Staging Committers
2. And is a regression in the V2 SDK.
@@ -364,7 +364,7 @@ be succesfully retried after a failure was reported on the first attempt.
* If it is encountered on a release without the fix, please upgrade.
It may be that the problem arises in the AWS SDK's "TransferManager", which is used for a
-higher performance upload of data from the local fileystem. If this is the case. disable this feature:
+higher performance upload of data from the local filesystem. If this is the case. disable this feature:
```
fs.s3a.optimized.copy.from.local.enabled
@@ -409,6 +409,48 @@ affect the performance.
```
+### Status Code 400 "XAmzContentSHA256Mismatch: The Content-SHA256 you specified did not match what we received"
+
+Seen when working with a third-party store:
+
+```
+org.apache.hadoop.fs.s3a.AWSBadRequestException: PUT 0-byte object on test:
+software.amazon.awssdk.services.s3.model.S3Exception:
+The Content-SHA256 you specified did not match what we received
+(Service: S3, Status Code: 400, Request ID: 0c07c87d:196d43d824a:d7bca:eeb, Extended Request ID: 2af53adb49ffb141a32b534ad7ffbdf33a247f6b95b422011e0b109649d1fab7) (SDK Attempt Count: 1):
+XAmzContentSHA256Mismatch: The Content-SHA256 you specified did not match what we received
+```
+
+This happens when a file creation checksum algorithm has been enabled but the store does not
+support it, or does not implement it consistently with AWS S3.
+Fix: set `fs.s3a.create.checksum.algorithm` to `none`:
+
+```xml
+<property>
+  <name>fs.s3a.create.checksum.algorithm</name>
+  <value>none</value>
+</property>
+```
+
+### Status Code 400 "x-amz-sdk-checksum-algorithm specified, but no corresponding x-amz-checksum-* or x-amz-trailer headers were found"
+
+```
+org.apache.hadoop.fs.s3a.AWSBadRequestException: PUT 0-byte object on test
+software.amazon.awssdk.services.s3.model.InvalidRequestException
+x-amz-sdk-checksum-algorithm specified, but no corresponding x-amz-checksum-* or x-amz-trailer headers were found.
+ (Service: S3, Status Code: 400, Request ID: 012929bd17000198c8bc82d20509eecd6df79b1a, Extended Request ID: P9bq0Iv) (SDK Attempt Count: 1):
+```
+
+The checksum algorithm requested is not one supported by the store.
+In particular, the value `unknown_to_sdk_version` appears to cause it:
+
+```xml
+<property>
+  <name>fs.s3a.create.checksum.algorithm</name>
+  <value>unknown_to_sdk_version</value>
+</property>
+```
+
+
+
## Access Denied
HTTP error codes 401 and 403 are mapped to `AccessDeniedException` in the S3A connector.
@@ -436,6 +478,9 @@ java.nio.file.AccessDeniedException: bucket: doesBucketExist on bucket:
```
+If working with a third-party bucket, verify the `fs.s3a.endpoint` setting
+points to the third-party store.
+
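+A sketch; the endpoint URL below is a placeholder for the store's own HTTPS endpoint:
+
+```xml
+<property>
+  <name>fs.s3a.endpoint</name>
+  <value>https://storage.example.org</value>
+</property>
+```
+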
### `AccessDeniedException` All access to this object has been disabled
Caller has no permission to access the bucket at all.
@@ -560,6 +605,94 @@ Glacier.
If you want to access the file with S3A after writes, do not set `fs.s3a.create.storage.class` to `glacier` or `deep_archive`.
+### `AccessDeniedException` with `SignatureDoesNotMatch` on a third party bucket
+
+This can surface when interacting with, and especially when writing data to, a third-party bucket:
+
+```
+ Writing Object on example-file: software.amazon.awssdk.services.s3.model.S3Exception: Invalid argument. (Service: S3, Status Code: 403, Request ID: null) (SDK Attempt Count: 1):SignatureDoesNotMatch
+```
+
+The store does not accept the checksum which is calculated and attached to every request.
+Fix: disable this by setting `fs.s3a.checksum.generation` to `false`.
+
+```xml
+<property>
+  <name>fs.s3a.checksum.generation</name>
+  <value>false</value>
+  <description>Calculate and attach a message checksum on every operation. (default: false)</description>
+</property>
+```
+
+Full stack trace:
+
+```
+> bin/hadoop fs -touchz s3a://gcs/example-file
+2025-10-21 16:23:27,642 [main] WARN s3a.S3ABlockOutputStream (S3ABlockOutputStream.java:progressChanged(1335)) - Transfer failure of block FileBlock{index=1, destFile=/tmp/hadoop-stevel/s3a/s3ablock-0001-1358390699869033998.tmp, state=Upload, dataSize=0, limit=-1}
+2025-10-21 16:23:27,645 [main] DEBUG shell.Command (Command.java:displayError(481)) - touchz failure
+java.nio.file.AccessDeniedException: example-file: Writing Object on example-file: software.amazon.awssdk.services.s3.model.S3Exception: Invalid argument. (Service: S3, Status Code: 403, Request ID: null) (SDK Attempt Count: 1):SignatureDoesNotMatch
+ at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:271)
+ at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:124)
+ at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$4(Invoker.java:376)
+ at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:468)
+ at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:372)
+ at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:347)
+ at org.apache.hadoop.fs.s3a.WriteOperationHelper.retry(WriteOperationHelper.java:210)
+ at org.apache.hadoop.fs.s3a.WriteOperationHelper.putObject(WriteOperationHelper.java:534)
+ at org.apache.hadoop.fs.s3a.S3ABlockOutputStream.putObject(S3ABlockOutputStream.java:726)
+ at org.apache.hadoop.fs.s3a.S3ABlockOutputStream.close(S3ABlockOutputStream.java:518)
+ at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.close(FSDataOutputStream.java:77)
+ at org.apache.hadoop.fs.FSDataOutputStream.close(FSDataOutputStream.java:106)
+ at org.apache.hadoop.fs.shell.TouchCommands$Touchz.touchz(TouchCommands.java:89)
+ at org.apache.hadoop.fs.shell.TouchCommands$Touchz.processNonexistentPath(TouchCommands.java:85)
+ at org.apache.hadoop.fs.shell.Command.processArgument(Command.java:303)
+ at org.apache.hadoop.fs.shell.Command.processArguments(Command.java:285)
+ at org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:121)
+ at org.apache.hadoop.fs.shell.Command.run(Command.java:192)
+ at org.apache.hadoop.fs.FsShell.run(FsShell.java:327)
+ at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:82)
+ at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:97)
+ at org.apache.hadoop.fs.FsShell.main(FsShell.java:390)
+Caused by: software.amazon.awssdk.services.s3.model.S3Exception: Invalid argument. (Service: S3, Status Code: 403, Request ID: null) (SDK Attempt Count: 1)
+ at software.amazon.awssdk.services.s3.model.S3Exception$BuilderImpl.build(S3Exception.java:113)
+ at software.amazon.awssdk.services.s3.model.S3Exception$BuilderImpl.build(S3Exception.java:61)
+ at software.amazon.awssdk.core.internal.http.pipeline.stages.utils.RetryableStageHelper.retryPolicyDisallowedRetryException(RetryableStageHelper.java:168)
+ at software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage.execute(RetryableStage.java:73)
+ at software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage.execute(RetryableStage.java:36)
+ at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
+ at software.amazon.awssdk.core.internal.http.StreamManagingStage.execute(StreamManagingStage.java:53)
+ at software.amazon.awssdk.core.internal.http.StreamManagingStage.execute(StreamManagingStage.java:35)
+ at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.executeWithTimer(ApiCallTimeoutTrackingStage.java:82)
+ at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.execute(ApiCallTimeoutTrackingStage.java:62)
+ at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.execute(ApiCallTimeoutTrackingStage.java:43)
+ at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:50)
+ at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:32)
+ at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
+ at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206)
+ at software.amazon.awssdk.core.internal.http.pipeline.stages.ExecutionFailureExceptionReportingStage.execute(ExecutionFailureExceptionReportingStage.java:37)
+ at software.amazon.awssdk.core.internal.http.pipeline.stages.ExecutionFailureExceptionReportingStage.execute(ExecutionFailureExceptionReportingStage.java:26)
+ at software.amazon.awssdk.core.internal.http.AmazonSyncHttpClient$RequestExecutionBuilderImpl.execute(AmazonSyncHttpClient.java:210)
+ at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.invoke(BaseSyncClientHandler.java:103)
+ at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.doExecute(BaseSyncClientHandler.java:173)
+ at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.lambda$execute$1(BaseSyncClientHandler.java:80)
+ at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.measureApiCallSuccess(BaseSyncClientHandler.java:182)
+ at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.execute(BaseSyncClientHandler.java:74)
+ at software.amazon.awssdk.core.client.handler.SdkSyncClientHandler.execute(SdkSyncClientHandler.java:45)
+ at software.amazon.awssdk.awscore.client.handler.AwsSyncClientHandler.execute(AwsSyncClientHandler.java:53)
+ at software.amazon.awssdk.services.s3.DefaultS3Client.putObject(DefaultS3Client.java:11883)
+ at software.amazon.awssdk.services.s3.DelegatingS3Client.lambda$putObject$89(DelegatingS3Client.java:9716)
+ at software.amazon.awssdk.services.s3.internal.crossregion.S3CrossRegionSyncClient.invokeOperation(S3CrossRegionSyncClient.java:67)
+ at software.amazon.awssdk.services.s3.DelegatingS3Client.putObject(DelegatingS3Client.java:9716)
+ at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$putObjectDirect$14(S3AFileSystem.java:3332)
+ at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfSupplier(IOStatisticsBinding.java:650)
+ at org.apache.hadoop.fs.s3a.S3AFileSystem.putObjectDirect(S3AFileSystem.java:3330)
+ at org.apache.hadoop.fs.s3a.WriteOperationHelper.lambda$putObject$7(WriteOperationHelper.java:535)
+ at org.apache.hadoop.fs.store.audit.AuditingFunctions.lambda$withinAuditSpan$0(AuditingFunctions.java:62)
+ at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:122)
+ ... 20 more
+ touchz: example-file: Writing Object on example-file: software.amazon.awssdk.services.s3.model.S3Exception: Invalid argument. (Service: S3, Status Code: 403, Request ID: null) (SDK Attempt Count: 1):SignatureDoesNotMatch
+```
+
### "Unable to find a region via the region provider chain." when using session credentials.
Region must be provided when requesting session credentials, or an exception will be thrown with the
@@ -1241,6 +1374,24 @@ When working with S3 Express store buckets (unlike standard S3 buckets), follow
2. This setting ensures that all pending MPUs are aborted before the directory object is deleted, which is a requirement specific to S3 Express store buckets.
+### Status Code: 200 + "PreconditionFailed: At least one of the pre-conditions you specified did not hold"
+
+```
+software.amazon.awssdk.services.s3.model.S3Exception: At least one of the pre-conditions you specified did not hold
+(Service: S3, Status Code: 200, Request ID: 01a396cff3000198cc0439e40509a95e33467bdc, Extended Request ID: TZrsG8pBzlmXoV) (SDK Attempt Count: 1):
+PreconditionFailed: At least one of the pre-conditions you specified did not hold
+```
+
+An attempt to write to an S3 Express bucket using a conditional overwrite failed because another
+process was writing to the same path at the same time.
+
+Conditional overwrite during file creation is used when conditional creation is enabled
+(`fs.s3a.create.conditional.enabled`); this is true by default.
+It is used when:
+
+* A file is created through the `createFile()` builder API with the option
+  `fs.option.create.conditional.overwrite` set to true (see the sketch below).
+* File create performance has been enabled (`fs.s3a.performance.flags` includes `create` or is set to `*`).
+
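+Below is a minimal sketch of requesting a conditional overwrite through the builder API.
+The class, method, filesystem, path and payload are all illustrative, and whether `must()` or
+`opt()` is appropriate depends on whether the write should fail when the store cannot honour
+the option.
+
+```java
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/** Sketch: create a file with a conditional overwrite requested via the builder. */
+public final class ConditionalCreateExample {
+
+  public static void write(FileSystem fs, Path path, byte[] data) throws IOException {
+    try (FSDataOutputStream out = fs.createFile(path)
+        .overwrite(true)
+        // ask for a conditional PUT; a concurrent writer surfaces as "PreconditionFailed"
+        .must("fs.option.create.conditional.overwrite", true)
+        .build()) {
+      out.write(data);
+    }
+  }
+}
+```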
+
+
### Application hangs after reading a number of files
@@ -1333,6 +1484,39 @@ connections more frequently.
Something has been trying to write data to "/".
+### "Unable to create OutputStream with the given multipart upload and buffer configuration."
+
+This error is raised when an attempt is made to write to a store with
+`fs.s3a.multipart.uploads.enabled` set to `false` and `fs.s3a.fast.upload.buffer` set to `array`.
+
+The combination is rejected pre-emptively, when the output stream is created, rather than after
+so much data has been buffered in memory that the process runs out of heap space.
+
+If the store doesn't support multipart uploads, _use disk for buffering_.
+No other buffer option is safe: it leads to a state where small jobs work, but those which
+generate large amounts of data fail.
+
+```xml
+<property>
+  <name>fs.s3a.fast.upload.buffer</name>
+  <value>disk</value>
+</property>
+```
+
+```
+org.apache.hadoop.fs.PathIOException: `s3a://gcs/a2a8c3e4-5788-40c0-ad66-fe3fe63f4507': Unable to create OutputStream with the given multipart upload and buffer configuration.
+ at org.apache.hadoop.fs.s3a.S3AUtils.validateOutputStreamConfiguration(S3AUtils.java:985)
+ at org.apache.hadoop.fs.s3a.S3AFileSystem.innerCreateFile(S3AFileSystem.java:2201)
+ at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$create$5(S3AFileSystem.java:2068)
+ at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.invokeTrackingDuration(IOStatisticsBinding.java:546)
+ at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:527)
+ at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:448)
+ at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2881)
+ at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2900)
+ at org.apache.hadoop.fs.s3a.S3AFileSystem.create(S3AFileSystem.java:2067)
+ at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1233)
+ at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1210)
+ at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1091)
+```
+
## Best Practises
### Enabling low-level logging
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMkdirWithCreatePerf.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMkdirWithCreatePerf.java
index 4570320029d59..42d175a577674 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMkdirWithCreatePerf.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMkdirWithCreatePerf.java
@@ -31,6 +31,7 @@
import static org.apache.hadoop.fs.contract.ContractTestUtils.createFile;
import static org.apache.hadoop.fs.contract.ContractTestUtils.dataset;
import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_PERFORMANCE_TESTS_ENABLED;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.disableFilesystemCaching;
import static org.apache.hadoop.fs.s3a.S3ATestUtils.setPerformanceFlags;
import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfNotEnabled;
@@ -42,9 +43,9 @@ public class ITestS3AContractMkdirWithCreatePerf extends AbstractContractMkdirTe
@Override
protected Configuration createConfiguration() {
- return setPerformanceFlags(
- super.createConfiguration(),
- "create,mkdir");
+ final Configuration conf = super.createConfiguration();
+ disableFilesystemCaching(conf);
+ return setPerformanceFlags(conf, "create,mkdir");
}
@Override
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java
index 0afdf20595bcc..bb4cdf3a024d8 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java
@@ -18,10 +18,13 @@
package org.apache.hadoop.fs.contract.s3a;
+import java.io.FileNotFoundException;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.contract.AbstractContractMultipartUploaderTest;
import org.apache.hadoop.fs.contract.AbstractFSContract;
import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.apache.hadoop.fs.s3a.impl.ChecksumSupport;
import org.apache.hadoop.test.tags.IntegrationTest;
import org.apache.hadoop.test.tags.ScaleTest;
@@ -29,15 +32,23 @@
import org.junit.jupiter.api.Test;
import static org.apache.hadoop.fs.contract.ContractTestUtils.skip;
+import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_ALGORITHM;
+import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_GENERATION;
+import static org.apache.hadoop.fs.s3a.S3ATestConstants.DEFAULT_MULTIPART_COMMIT_CONSUMES_UPLOAD_ID;
import static org.apache.hadoop.fs.s3a.S3ATestConstants.DEFAULT_SCALE_TESTS_ENABLED;
import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_HUGE_PARTITION_SIZE;
import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_SCALE_TESTS_ENABLED;
+import static org.apache.hadoop.fs.s3a.S3ATestConstants.MULTIPART_COMMIT_CONSUMES_UPLOAD_ID;
import static org.apache.hadoop.fs.s3a.S3ATestConstants.SCALE_TEST_TIMEOUT_MILLIS;
import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads;
import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeNotS3ExpressFileSystem;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.disableFilesystemCaching;
import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestPropertyBool;
import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestPropertyBytes;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfAnalyticsAcceleratorEnabled;
+import static org.apache.hadoop.fs.s3a.impl.ChecksumSupport.getChecksumAlgorithm;
import static org.apache.hadoop.fs.s3a.scale.AbstractSTestS3AHugeFiles.DEFAULT_HUGE_PARTITION_SIZE;
/**
@@ -54,6 +65,8 @@ public class ITestS3AContractMultipartUploader extends
private int partitionSize;
+ private boolean mpuCommitConsumesUploadId;
+
/**
* S3 requires a minimum part size of 5MB (except the last part).
* @return 5MB+ value
@@ -95,7 +108,18 @@ protected boolean supportsConcurrentUploadsToSamePath() {
@Override
protected boolean finalizeConsumesUploadIdImmediately() {
- return false;
+ return mpuCommitConsumesUploadId;
+ }
+
+ @Override
+ protected Configuration createConfiguration() {
+ final Configuration conf = super.createConfiguration();
+    // clear any per-bucket overrides, then explicitly disable checksum generation/algorithm.
+ removeBaseAndBucketOverrides(conf, CHECKSUM_GENERATION, CHECKSUM_ALGORITHM);
+ conf.setBoolean(CHECKSUM_GENERATION, false);
+ conf.set(CHECKSUM_ALGORITHM, ChecksumSupport.NONE);
+ disableFilesystemCaching(conf);
+ return conf;
}
@BeforeEach
@@ -110,9 +134,16 @@ public void setup() throws Exception {
assume("Scale test disabled: to enable set property " +
KEY_SCALE_TESTS_ENABLED,
enabled);
+ final Configuration fsConf = getFileSystem().getConf();
+ assumeMultipartUploads(fsConf);
partitionSize = (int) getTestPropertyBytes(conf,
KEY_HUGE_PARTITION_SIZE,
DEFAULT_HUGE_PARTITION_SIZE);
+ mpuCommitConsumesUploadId = fsConf.getBoolean(
+ MULTIPART_COMMIT_CONSUMES_UPLOAD_ID,
+ DEFAULT_MULTIPART_COMMIT_CONSUMES_UPLOAD_ID);
+ LOG.info("{} = {}", MULTIPART_COMMIT_CONSUMES_UPLOAD_ID, mpuCommitConsumesUploadId);
+ LOG.info("{} = {}", CHECKSUM_ALGORITHM, getChecksumAlgorithm(fsConf));
}
/**
@@ -134,6 +165,7 @@ public void testMultipartUploadReverseOrder() throws Exception {
@Override
public void testMultipartUploadReverseOrderNonContiguousPartNumbers() throws Exception {
assumeNotS3ExpressFileSystem(getFileSystem());
+ final Configuration fsConf = getFileSystem().getConf();
super.testMultipartUploadReverseOrderNonContiguousPartNumbers();
}
@@ -149,4 +181,18 @@ public void testConcurrentUploads() throws Throwable {
"Analytics Accelerator currently does not support reading of over written files");
super.testConcurrentUploads();
}
+
+
+ @Test
+ @Override
+ public void testMultipartUploadAbort() throws Exception {
+ try {
+ super.testMultipartUploadAbort();
+ } catch (FileNotFoundException e) {
+ LOG.info("Multipart upload not found in abort()."
+ + " This is common on third-party stores: {}",
+ e.toString());
+ LOG.debug("Exception: ", e);
+ }
+ }
}
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAnalyticsAcceleratorStreamReading.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAnalyticsAcceleratorStreamReading.java
index 8f8f90f9b1e65..be0d2cc5b20c2 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAnalyticsAcceleratorStreamReading.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAnalyticsAcceleratorStreamReading.java
@@ -164,26 +164,33 @@ public void testMultiRowGroupParquet() throws Throwable {
FileStatus fileStatus = getFileSystem().getFileStatus(dest);
- byte[] buffer = new byte[3000];
+ final int size = 3000;
+ byte[] buffer = new byte[size];
+ int readLimit = Math.min(size, (int) fileStatus.getLen());
IOStatistics ioStats;
+ final IOStatistics fsIostats = getFileSystem().getIOStatistics();
+ final long initialAuditCount = fsIostats.counters()
+ .getOrDefault(AUDIT_REQUEST_EXECUTION, 0L);
+
try (FSDataInputStream inputStream = getFileSystem().open(dest)) {
ioStats = inputStream.getIOStatistics();
- inputStream.readFully(buffer, 0, (int) fileStatus.getLen());
+ inputStream.readFully(buffer, 0, readLimit);
}
verifyStatisticCounterValue(ioStats, STREAM_READ_ANALYTICS_OPENED, 1);
try (FSDataInputStream inputStream = getFileSystem().openFile(dest)
+ .withFileStatus(fileStatus)
.must(FS_OPTION_OPENFILE_READ_POLICY, FS_OPTION_OPENFILE_READ_POLICY_PARQUET)
.build().get()) {
ioStats = inputStream.getIOStatistics();
- inputStream.readFully(buffer, 0, (int) fileStatus.getLen());
+ inputStream.readFully(buffer, 0, readLimit);
}
verifyStatisticCounterValue(ioStats, STREAM_READ_ANALYTICS_OPENED, 1);
- verifyStatisticCounterValue(getFileSystem().getIOStatistics(), AUDIT_REQUEST_EXECUTION, 4);
+ verifyStatisticCounterValue(fsIostats, AUDIT_REQUEST_EXECUTION, initialAuditCount + 2);
}
@Test
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABlockOutputArray.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABlockOutputArray.java
index 18f665ecef2ce..0e43ee3f2ad61 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABlockOutputArray.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABlockOutputArray.java
@@ -28,6 +28,7 @@
import org.apache.hadoop.io.IOUtils;
import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
@@ -36,6 +37,7 @@
import static org.apache.hadoop.fs.StreamCapabilities.ABORTABLE_STREAM;
import static org.apache.hadoop.fs.s3a.Constants.*;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfNotEnabled;
import static org.apache.hadoop.fs.s3a.test.ExtraAssertions.assertCompleteAbort;
import static org.apache.hadoop.fs.s3a.test.ExtraAssertions.assertNoopAbort;
@@ -66,6 +68,16 @@ protected Configuration createConfiguration() {
return conf;
}
+ @Override
+ @BeforeEach
+ public void setup() throws Exception {
+ super.setup();
+
+ skipIfNotEnabled(getFileSystem().getConf(),
+ MULTIPART_UPLOADS_ENABLED,
+ "Store has disabled multipart uploads; skipping tests");
+ }
+
protected String getBlockOutputBufferName() {
return FAST_UPLOAD_BUFFER_ARRAY;
}
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABucketExistence.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABucketExistence.java
index 8b9a202f620e0..a7010ef68e39d 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABucketExistence.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABucketExistence.java
@@ -44,7 +44,11 @@
import static org.apache.hadoop.fs.s3a.Constants.FS_S3A;
import static org.apache.hadoop.fs.s3a.Constants.PATH_STYLE_ACCESS;
import static org.apache.hadoop.fs.s3a.Constants.S3A_BUCKET_PROBE;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestBucketName;
import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides;
+import static org.apache.hadoop.fs.s3a.S3AUtils.propagateBucketOptions;
+import static org.apache.hadoop.fs.s3a.impl.NetworkBinding.isAwsEndpoint;
import static org.apache.hadoop.test.LambdaTestUtils.intercept;
/**
@@ -59,6 +63,15 @@ public class ITestS3ABucketExistence extends AbstractS3ATestBase {
private final URI uri = URI.create(FS_S3A + "://" + randomBucket + "/");
+ @Override
+ protected Configuration createConfiguration() {
+ final Configuration conf = super.createConfiguration();
+ String endpoint = propagateBucketOptions(conf, getTestBucketName(conf)).get(ENDPOINT, "");
+ assume("Skipping existence probes",
+ isAwsEndpoint(endpoint));
+ return conf;
+ }
+
@SuppressWarnings("deprecation")
@Test
public void testNoBucketProbing() throws Exception {
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java
index 75266461565dc..665703cc26f5d 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java
@@ -19,9 +19,13 @@
package org.apache.hadoop.fs.s3a;
import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
import org.assertj.core.api.Assertions;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedClass;
+import org.junit.jupiter.params.provider.MethodSource;
import software.amazon.awssdk.services.s3.model.ChecksumAlgorithm;
import software.amazon.awssdk.services.s3.model.ChecksumMode;
import software.amazon.awssdk.services.s3.model.HeadObjectRequest;
@@ -31,8 +35,14 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3a.impl.ChecksumSupport;
-import static org.apache.hadoop.fs.contract.ContractTestUtils.rm;
+import static org.apache.hadoop.fs.contract.ContractTestUtils.skip;
import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_ALGORITHM;
+import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_GENERATION;
+import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_CHECKSUM_GENERATION;
+import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_VALIDATION;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestBucketName;
+import static org.apache.hadoop.fs.s3a.S3AUtils.propagateBucketOptions;
import static org.apache.hadoop.fs.s3a.audit.S3AAuditConstants.REJECT_OUT_OF_SPAN_OPERATIONS;
/**
@@ -40,31 +50,58 @@
* If CHECKSUM_ALGORITHM config is not set in auth-keys.xml,
* SHA256 algorithm will be picked.
*/
+@ParameterizedClass(name="checksum={0}")
+@MethodSource("params")
public class ITestS3AChecksum extends AbstractS3ATestBase {
- private static final ChecksumAlgorithm DEFAULT_CHECKSUM_ALGORITHM = ChecksumAlgorithm.SHA256;
+ public static final String UNKNOWN = "UNKNOWN_TO_SDK_VERSION";
private ChecksumAlgorithm checksumAlgorithm;
+ /**
+ * Parameterization.
+ */
+ public static Collection