From e0ab6025a342d7ab466eea76c85fb785ff131192 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Mon, 18 Aug 2025 19:30:37 +0100 Subject: [PATCH 01/24] HADOOP-19654. Upgrade AWS SDK to 2.32.23 This is just the library declaration change. --- LICENSE-binary | 2 +- hadoop-project/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/LICENSE-binary b/LICENSE-binary index b61b7f3166733..10dff251988b8 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -419,7 +419,7 @@ org.xerial.snappy:snappy-java:1.1.10.4 org.yaml:snakeyaml:2.0 org.wildfly.openssl:wildfly-openssl:2.2.5.Final ro.isdc.wro4j:wro4j-maven-plugin:1.8.0 -software.amazon.awssdk:bundle:2.29.52 +software.amazon.awssdk:bundle:2.32.23 software.amazon.s3.analyticsaccelerator:analyticsaccelerator-s3:1.3.0 net.jodah:failsafe:2.4.4 diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index 9967f3d79c9cb..9f7f48f5ac2b1 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -209,7 +209,7 @@ 1.0-beta-1 900 1.12.720 - 2.29.52 + 2.32.23 3.1.1 1.3.0 1.0.1 From c754f0ff8747ee424192e3a35078a65c35465433 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Wed, 20 Aug 2025 14:25:22 +0100 Subject: [PATCH 02/24] HADOOP-19654. Restore support for third party stores. * create legacy MD5 headers * downgrade request checksum calculation to "when required" * set the (existing) checksum validation flag the same way so the builder library is happy * add "none" as a valid checksum algorithm * document that and the existing "unknown_to_sdk_version" option * with troubleshooting * ITestS3AChecksum modified to handle these checksum changes. --- .../hadoop/fs/s3a/DefaultS3ClientFactory.java | 18 ++++- .../hadoop/fs/s3a/impl/ChecksumSupport.java | 8 ++- .../site/markdown/tools/hadoop-aws/index.md | 2 +- .../tools/hadoop-aws/troubleshooting_s3a.md | 42 ++++++++++++ .../hadoop/fs/s3a/ITestS3AChecksum.java | 68 +++++++++++++++---- .../hadoop/fs/s3a/ITestS3AEndpointRegion.java | 1 + .../hadoop/fs/s3a/auth/ITestCustomSigner.java | 7 +- 7 files changed, 129 insertions(+), 17 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java index 7b5aa5ff934ce..e47ebccd66dc1 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java @@ -30,6 +30,8 @@ import org.slf4j.LoggerFactory; import software.amazon.awssdk.awscore.util.AwsHostNameUtils; +import software.amazon.awssdk.core.checksums.RequestChecksumCalculation; +import software.amazon.awssdk.core.checksums.ResponseChecksumValidation; import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration; import software.amazon.awssdk.core.client.config.SdkAdvancedClientOption; import software.amazon.awssdk.core.interceptor.ExecutionInterceptor; @@ -41,6 +43,7 @@ import software.amazon.awssdk.metrics.LoggingMetricPublisher; import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.s3accessgrants.plugin.S3AccessGrantsPlugin; +import software.amazon.awssdk.services.s3.LegacyMd5Plugin; import software.amazon.awssdk.services.s3.S3AsyncClient; import software.amazon.awssdk.services.s3.S3AsyncClientBuilder; import software.amazon.awssdk.services.s3.S3BaseClientBuilder; @@ -202,11 +205,24 @@ private , ClientT> Build configureEndpointAndRegion(builder, parameters, conf); + // add a 
plugin to add a Content-MD5 header. + // this is required when performing some operations with third party stores + // (for example: bulk delete), and is harmless when working with AWS S3. + builder.addPlugin(LegacyMd5Plugin.create()); + + // do not do request checksums as this causes third-party store problems. + builder.requestChecksumCalculation(RequestChecksumCalculation.WHEN_REQUIRED); + + // response checksum validation. Slow, even with CRC32 checksums. + if (parameters.isChecksumValidationEnabled()) { + builder.responseChecksumValidation(ResponseChecksumValidation.WHEN_SUPPORTED); + } + + maybeApplyS3AccessGrantsConfigurations(builder, conf); S3Configuration serviceConfiguration = S3Configuration.builder() .pathStyleAccessEnabled(parameters.isPathStyleAccess()) - .checksumValidationEnabled(parameters.isChecksumValidationEnabled()) .build(); final ClientOverrideConfiguration.Builder override = diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java index b14f5f7bd2370..714d65a98bb41 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java @@ -34,6 +34,12 @@ */ public final class ChecksumSupport { + /** + * Special checksum algorithm to declare that no checksum + * is required: {@value}. + */ + public static final String NONE = "none"; + private ChecksumSupport() { } @@ -58,7 +64,7 @@ public static ChecksumAlgorithm getChecksumAlgorithm(Configuration conf) { CHECKSUM_ALGORITHM, ChecksumAlgorithm.class, configValue -> { - if (StringUtils.isBlank(configValue)) { + if (StringUtils.isBlank(configValue) || NONE.equalsIgnoreCase(configValue)) { return null; } if (ChecksumAlgorithm.CRC32_C.toString().equalsIgnoreCase(configValue)) { diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md index 1f65caeb5e219..7770c5576a513 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md @@ -931,7 +931,7 @@ Here are some the S3A properties for use in production. Indicates the algorithm used to create the checksum for the object to be uploaded to S3. Unset by default. It supports the following values: - 'CRC32', 'CRC32C', 'SHA1', and 'SHA256' + 'CRC32', 'CRC32C', 'SHA1', 'SHA256', "none" diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md index 151ee5bd8a465..5acb374e4cc09 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md @@ -409,6 +409,48 @@ affect the performance. 
``` +### Status Code 400 "XAmzContentSHA256Mismatch: The Content-SHA256 you specified did not match what we receive" + +Seen when working with a third-party store + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: PUT 0-byte object on test: +software.amazon.awssdk.services.s3.model.S3Exception: +The Content-SHA256 you specified did not match what we received +(Service: S3, Status Code: 400, Request ID: 0c07c87d:196d43d824a:d7bca:eeb, Extended Request ID: 2af53adb49ffb141a32b534ad7ffbdf33a247f6b95b422011e0b109649d1fab7) (SDK Attempt Count: 1): +XAmzContentSHA256Mismatch: The Content-SHA256 you specified did not match what we received +``` + +This happens when a file create checksum has been enabled but the store does not support it/support it consistently with AWS S3. + +```xml + + fs.s3a.create.checksum.algorithm + none + +``` + +### Status Code 400 "x-amz-sdk-checksum-algorithm specified, but no corresponding x-amz-checksum-* or x-amz-trailer headers were found" + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: PUT 0-byte object on test +software.amazon.awssdk.services.s3.model.InvalidRequestException +x-amz-sdk-checksum-algorithm specified, but no corresponding x-amz-checksum-* or x-amz-trailer headers were found. + (Service: S3, Status Code: 400, Request ID: 012929bd17000198c8bc82d20509eecd6df79b1a, Extended Request ID: P9bq0Iv) (SDK Attempt Count: 1): +``` + +The checksum algorithm to be used is not one supported by the store. +In particular, the value `unknown_to_sdk_version` appears to cause it. + +```xml + + fs.s3a.create.checksum.algorithm + unknown_to_sdk_version + +``` + + + ## Access Denied HTTP error codes 401 and 403 are mapped to `AccessDeniedException` in the S3A connector. diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java index 75266461565dc..3525d89efc703 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java @@ -19,9 +19,13 @@ package org.apache.hadoop.fs.s3a; import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedClass; +import org.junit.jupiter.params.provider.MethodSource; import software.amazon.awssdk.services.s3.model.ChecksumAlgorithm; import software.amazon.awssdk.services.s3.model.ChecksumMode; import software.amazon.awssdk.services.s3.model.HeadObjectRequest; @@ -31,8 +35,9 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.s3a.impl.ChecksumSupport; -import static org.apache.hadoop.fs.contract.ContractTestUtils.rm; +import static org.apache.hadoop.fs.contract.ContractTestUtils.skip; import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_ALGORITHM; +import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_VALIDATION; import static org.apache.hadoop.fs.s3a.audit.S3AAuditConstants.REJECT_OUT_OF_SPAN_OPERATIONS; /** @@ -40,31 +45,55 @@ * If CHECKSUM_ALGORITHM config is not set in auth-keys.xml, * SHA256 algorithm will be picked. 
*/ +@ParameterizedClass(name="checksum={0}") +@MethodSource("params") public class ITestS3AChecksum extends AbstractS3ATestBase { - private static final ChecksumAlgorithm DEFAULT_CHECKSUM_ALGORITHM = ChecksumAlgorithm.SHA256; + public static final String UNKNOWN = "UNKNOWN_TO_SDK_VERSION"; private ChecksumAlgorithm checksumAlgorithm; + /** + * Parameterization. + */ + public static Collection params() { + return Arrays.asList(new Object[][]{ + {"SHA256"}, + {"CRC32C"}, + {"SHA1"}, + {UNKNOWN}, + }); + } + private static final int[] SIZES = { - 1, 2, 3, 4, 5, 254, 255, 256, 257, 2 ^ 12 - 1 + 5, 255, 256, 257, 2 ^ 12 - 1 }; + private final String algorithmName; + + public ITestS3AChecksum(final String algorithmName) { + this.algorithmName = algorithmName; + } + @Override protected Configuration createConfiguration() { final Configuration conf = super.createConfiguration(); + // get the base checksum algorithm, if set it will be left alone. + final String al = conf.getTrimmed(CHECKSUM_ALGORITHM, ""); + if (!UNKNOWN.equals(algorithmName) && + (ChecksumSupport.NONE.equalsIgnoreCase(al) || UNKNOWN.equalsIgnoreCase(al))) { + skip("Skipping checksum algorithm tests"); + } S3ATestUtils.removeBaseAndBucketOverrides(conf, CHECKSUM_ALGORITHM, + CHECKSUM_VALIDATION, REJECT_OUT_OF_SPAN_OPERATIONS); S3ATestUtils.disableFilesystemCaching(conf); - checksumAlgorithm = ChecksumSupport.getChecksumAlgorithm(conf); - if (checksumAlgorithm == null) { - checksumAlgorithm = DEFAULT_CHECKSUM_ALGORITHM; - LOG.info("No checksum algorithm found in configuration, will use default {}", - checksumAlgorithm); - conf.set(CHECKSUM_ALGORITHM, checksumAlgorithm.toString()); - } + LOG.info("Using checksum algorithm {}", algorithmName); + conf.set(CHECKSUM_ALGORITHM, algorithmName); + conf.setBoolean(CHECKSUM_VALIDATION, true); conf.setBoolean(REJECT_OUT_OF_SPAN_OPERATIONS, false); + checksumAlgorithm = ChecksumSupport.getChecksumAlgorithm(conf); return conf; } @@ -77,14 +106,15 @@ public void testChecksum() throws IOException { private void validateChecksumForFilesize(int len) throws IOException { describe("Create a file of size " + len); - String src = String.format("%s-%04x", methodName.getMethodName(), len); - Path path = writeThenReadFile(src, len); + final Path path = methodPath(); + writeThenReadFile(path, len); assertChecksum(path); - rm(getFileSystem(), path, false, false); } private void assertChecksum(Path path) throws IOException { final String key = getFileSystem().pathToKey(path); + // issue a head request and include asking for the checksum details. + // such a query may require extra IAM permissions. 
HeadObjectRequest.Builder requestBuilder = getFileSystem().getRequestFactory() .newHeadObjectRequestBuilder(key) .checksumMode(ChecksumMode.ENABLED); @@ -101,6 +131,9 @@ private void assertChecksum(Path path) throws IOException { Assertions.assertThat(headObject.checksumCRC32C()) .describedAs("headObject.checksumCRC32C()") .isNotNull(); + Assertions.assertThat(headObject.checksumSHA256()) + .describedAs("headObject.checksumSHA256()") + .isNull(); break; case SHA1: Assertions.assertThat(headObject.checksumSHA1()) @@ -112,6 +145,15 @@ private void assertChecksum(Path path) throws IOException { .describedAs("headObject.checksumSHA256()") .isNotNull(); break; + case UNKNOWN_TO_SDK_VERSION: + // expect values to be null + Assertions.assertThat(headObject.checksumSHA256()) + .describedAs("headObject.checksumSHA256()") + .isNull(); + Assertions.assertThat(headObject.checksumCRC32()) + .describedAs("headObject.checksumCRC32()") + .isNull(); + break; default: fail("Checksum algorithm not supported: " + checksumAlgorithm); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java index ae0daf879b03c..e237a199b9750 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java @@ -484,6 +484,7 @@ public void testCentralEndpointAndNullRegionWithCRUD() throws Throwable { describe("Access the test bucket using central endpoint and" + " null region, perform file system CRUD operations"); final Configuration conf = getConfiguration(); + assumeStoreAwsHosted(getFileSystem()); final Configuration newConf = new Configuration(conf); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java index 1a4d354d5edc8..c90f032ae2764 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java @@ -53,9 +53,11 @@ import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.auth.ITestCustomSigner.CustomSignerInitializer.StoreValue; import org.apache.hadoop.fs.s3a.auth.delegation.DelegationTokenProvider; +import org.apache.hadoop.fs.s3a.impl.ChecksumSupport; import org.apache.hadoop.security.UserGroupInformation; import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_ALGORITHM; +import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_VALIDATION; import static org.apache.hadoop.fs.s3a.Constants.CUSTOM_SIGNERS; import static org.apache.hadoop.fs.s3a.Constants.ENABLE_MULTI_DELETE; import static org.apache.hadoop.fs.s3a.Constants.SIGNING_ALGORITHM_S3; @@ -218,6 +220,8 @@ private Configuration createTestConfig(String identifier) { Configuration conf = createConfiguration(); removeBaseAndBucketOverrides(conf, + CHECKSUM_ALGORITHM, + CHECKSUM_VALIDATION, CUSTOM_SIGNERS, SIGNING_ALGORITHM_S3, ENABLE_MULTI_DELETE); @@ -233,7 +237,8 @@ private Configuration createTestConfig(String identifier) { // Having the checksum algorithm in this test causes // x-amz-sdk-checksum-algorithm specified, but no corresponding // x-amz-checksum-* or x-amz-trailer headers were found - conf.unset(CHECKSUM_ALGORITHM); + conf.set(CHECKSUM_ALGORITHM, ChecksumSupport.NONE); + 
conf.setBoolean(CHECKSUM_VALIDATION, false); // make absolutely sure there is no caching. disableFilesystemCaching(conf); From 89ecb7087e7a1c066edce0359de5fd6a8e859d16 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Thu, 21 Aug 2025 11:22:06 +0100 Subject: [PATCH 03/24] HADOOP-19592. S3A: S3 Express bucket failure of conditional overwrite of multiparts is 200 + error * Recognise and wrap with RemoteFileChangedException * Fix up assertions in test. --- .../hadoop/fs/s3a/AWSClientIOException.java | 4 + .../hadoop/fs/s3a/WriteOperationHelper.java | 14 ++++ .../hadoop/fs/s3a/impl/InternalConstants.java | 5 ++ .../tools/hadoop-aws/troubleshooting_s3a.md | 18 +++++ .../ITestS3APutIfMatchAndIfNoneMatch.java | 75 ++++++++++++------- 5 files changed, 91 insertions(+), 25 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSClientIOException.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSClientIOException.java index b61667d1c502b..af187e3580db1 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSClientIOException.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSClientIOException.java @@ -54,4 +54,8 @@ public String getMessage() { public boolean retryable() { return getCause().retryable(); } + + public String getOperation() { + return operation; + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java index 364f780863a01..3f7456b499910 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java @@ -34,6 +34,7 @@ import software.amazon.awssdk.services.s3.model.MultipartUpload; import software.amazon.awssdk.services.s3.model.PutObjectRequest; import software.amazon.awssdk.services.s3.model.PutObjectResponse; +import software.amazon.awssdk.services.s3.model.S3Exception; import software.amazon.awssdk.services.s3.model.UploadPartRequest; import software.amazon.awssdk.services.s3.model.UploadPartResponse; import org.slf4j.Logger; @@ -54,6 +55,8 @@ import org.apache.hadoop.fs.store.audit.AuditSpanSource; import org.apache.hadoop.util.functional.CallableRaisingIOE; +import static org.apache.hadoop.fs.s3a.impl.InternalConstants.PRECONDITION_FAILED; +import static org.apache.hadoop.fs.s3a.impl.InternalConstants.SC_200_OK; import static org.apache.hadoop.util.Preconditions.checkNotNull; import static org.apache.hadoop.fs.s3a.Invoker.*; import static org.apache.hadoop.fs.store.audit.AuditingFunctions.withinAuditSpan; @@ -322,6 +325,17 @@ private CompleteMultipartUploadResponse finalizeMultipartUpload( return writeOperationHelperCallbacks.completeMultipartUpload(requestBuilder.build()); }); return uploadResult; + } catch (AWSS3IOException e) { + // S3 express buckets report the create conflict differently + // recognize and convert. 
+ if (e.statusCode() == SC_200_OK + && PRECONDITION_FAILED.equals(e.awsErrorDetails().errorCode())) { + throw new RemoteFileChangedException(destKey, + e.getOperation(), + e.getMessage(), + e.getCause()); + } + throw e; } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/InternalConstants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/InternalConstants.java index 8cf435f7ca603..d8448655769eb 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/InternalConstants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/InternalConstants.java @@ -313,4 +313,9 @@ private InternalConstants() { public static final String UPLOAD_PROGRESS_LOG_NAME = "org.apache.hadoop.fs.s3a.S3AFileSystem.Progress"; + /** + * AWS Error code for conditional put failure on s3 express buckets: {@value}. + */ + public static final String PRECONDITION_FAILED = "PreconditionFailed"; + } diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md index 5acb374e4cc09..e4a0ebff40396 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md @@ -1283,6 +1283,24 @@ When working with S3 Express store buckets (unlike standard S3 buckets), follow 2. This setting ensures that all pending MPUs are aborted before the directory object is deleted, which is a requirement specific to S3 Express store buckets. +## Status Code: 200 + "PreconditionFailed: At least one of the pre-conditions you specified did not hold" + +``` +software.amazon.awssdk.services.s3.model.S3Exception: At least one of the pre-conditions you specified did not hold +(Service: S3, Status Code: 200, Request ID: 01a396cff3000198cc0439e40509a95e33467bdc, Extended Request ID: TZrsG8pBzlmXoV) (SDK Attempt Count: 1): +PreconditionFailed: At least one of the pre-conditions you specified did not hold +``` + +An attempt to write to S3Express bucket using conditional overwrite failed because another process was writing at the same time. + +Conditional overwrite during file creation is used when conditional creation has been enabled (`fs.s3a.create.conditional.enabled`). +This is true by default. + +* A file is created using the `createFile()` API with the option `fs.option.create.conditional.overwrite` set to true. 
+* File create performance has been enabled with (`fs.s3a.performance.flags` including `create` or being `*`) + + + ### Application hangs after reading a number of files diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java index 99224945cbaa5..68ae6feead952 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java @@ -26,6 +26,7 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.opentest4j.AssertionFailedError; import software.amazon.awssdk.services.s3.model.S3Exception; import org.apache.commons.io.IOUtils; @@ -37,16 +38,19 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.AWSS3IOException; import org.apache.hadoop.fs.s3a.AbstractS3ATestBase; import org.apache.hadoop.fs.s3a.RemoteFileChangedException; import org.apache.hadoop.fs.s3a.S3AFileStatus; import org.apache.hadoop.fs.s3a.S3ATestUtils; import org.apache.hadoop.fs.s3a.Statistic; import org.apache.hadoop.fs.s3a.statistics.BlockOutputStreamStatistics; +import org.apache.hadoop.test.LambdaTestUtils; import static org.apache.hadoop.fs.Options.CreateFileOptionKeys.FS_OPTION_CREATE_CONDITIONAL_OVERWRITE; import static org.apache.hadoop.fs.Options.CreateFileOptionKeys.FS_OPTION_CREATE_CONDITIONAL_OVERWRITE_ETAG; import static org.apache.hadoop.fs.contract.ContractTestUtils.dataset; +import static org.apache.hadoop.fs.s3a.impl.InternalConstants.SC_200_OK; import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue; import static org.apache.hadoop.fs.s3a.Constants.FAST_UPLOAD_BUFFER_ARRAY; import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_MULTIPART; @@ -76,6 +80,8 @@ public class ITestS3APutIfMatchAndIfNoneMatch extends AbstractS3ATestBase { private static final byte[] SMALL_FILE_BYTES = dataset(TEST_FILE_LEN, 0, 255); private static final byte[] MULTIPART_FILE_BYTES = dataset(UPDATED_MULTIPART_THRESHOLD * 5, 'a', 'z' - 'a'); + public static final String PRECONDITION_FAILED = "PreconditionFailed"; + private BlockOutputStreamStatistics statistics; @Override @@ -107,16 +113,24 @@ public void setup() throws Exception { /** * Asserts that an S3Exception has the expected HTTP status code. - * * @param code Expected HTTP status code. - * @param ex Exception to validate. + * @param ex Exception to validate. + * @return the inner exception + * @throws AssertionError if the status code doesn't match. 
*/ - private static void assertS3ExceptionStatusCode(int code, Exception ex) { - S3Exception s3Exception = (S3Exception) ex.getCause(); - + private static S3Exception verifyS3ExceptionStatusCode(int code, Exception ex) { + final Throwable cause = ex.getCause(); + if (cause == null) { + throw new AssertionError("No inner exception of" + ex, ex); + } + if (!(cause instanceof S3Exception)) { + throw new AssertionError("Inner exception is not S3Exception under " + ex, ex); + } + S3Exception s3Exception = (S3Exception) cause; if (s3Exception.statusCode() != code) { throw new AssertionError("Expected status code " + code + " from " + ex, ex); } + return s3Exception; } /** @@ -296,12 +310,12 @@ public void testIfNoneMatchConflictOnOverwrite() throws Throwable { // attempted overwrite fails RemoteFileChangedException firstException = intercept(RemoteFileChangedException.class, () -> createFileWithFlags(fs, testFile, SMALL_FILE_BYTES, true, null)); - assertS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, firstException); + verifyS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, firstException); // second attempt also fails RemoteFileChangedException secondException = intercept(RemoteFileChangedException.class, () -> createFileWithFlags(fs, testFile, SMALL_FILE_BYTES, true, null)); - assertS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, secondException); + verifyS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, secondException); // Delete file and verify an overwrite works again fs.delete(testFile, false); @@ -320,14 +334,12 @@ public void testIfNoneMatchConflictOnMultipartUpload() throws Throwable { createFileWithFlags(fs, testFile, MULTIPART_FILE_BYTES, true, null, true); - RemoteFileChangedException firstException = intercept(RemoteFileChangedException.class, - () -> createFileWithFlags(fs, testFile, MULTIPART_FILE_BYTES, true, null, true)); - assertS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, firstException); + expectPreconditionFailure(() -> + createFileWithFlags(fs, testFile, MULTIPART_FILE_BYTES, true, null, true)); - RemoteFileChangedException secondException = intercept(RemoteFileChangedException.class, - () -> createFileWithFlags(fs, testFile, MULTIPART_FILE_BYTES, true, null, true)); - assertS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, secondException); - } + expectPreconditionFailure(() -> + createFileWithFlags(fs, testFile, MULTIPART_FILE_BYTES, true, null, true)); + } @Test public void testIfNoneMatchMultipartUploadWithRaceCondition() throws Throwable { @@ -348,8 +360,7 @@ public void testIfNoneMatchMultipartUploadWithRaceCondition() throws Throwable { createFileWithFlags(fs, testFile, SMALL_FILE_BYTES, true, null); // Closing the first stream should throw RemoteFileChangedException - RemoteFileChangedException exception = intercept(RemoteFileChangedException.class, stream::close); - assertS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, exception); + expectPreconditionFailure(stream::close); } @Test @@ -371,8 +382,23 @@ public void testIfNoneMatchTwoConcurrentMultipartUploads() throws Throwable { createFileWithFlags(fs, testFile, MULTIPART_FILE_BYTES, true, null, true); // Closing the first stream should throw RemoteFileChangedException - RemoteFileChangedException exception = intercept(RemoteFileChangedException.class, stream::close); - assertS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, exception); + // or or the S3 Express equivalent. 
+ expectPreconditionFailure(stream::close); + } + + /** + * Expect an operation to fail with an S3 classic or S3 Express precondition failure. + * @param eval closure to eval + * @throws Exception any other failure. + */ + private static void expectPreconditionFailure(final LambdaTestUtils.VoidCallable eval) throws Exception { + RemoteFileChangedException exception = intercept(RemoteFileChangedException.class, eval); + S3Exception s3Exception = (S3Exception) exception.getCause(); + if (!(s3Exception.statusCode() == SC_412_PRECONDITION_FAILED + || (s3Exception.statusCode() == SC_200_OK) + && PRECONDITION_FAILED.equals(s3Exception.awsErrorDetails().errorCode()))) { + throw exception; + } } @Test @@ -390,7 +416,7 @@ public void testIfNoneMatchOverwriteWithEmptyFile() throws Throwable { // close the stream, should throw RemoteFileChangedException RemoteFileChangedException exception = intercept(RemoteFileChangedException.class, stream::close); - assertS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, exception); + verifyS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, exception); } @Test @@ -407,7 +433,7 @@ public void testIfNoneMatchOverwriteEmptyFileWithFile() throws Throwable { // overwrite with non-empty file, should throw RemoteFileChangedException RemoteFileChangedException exception = intercept(RemoteFileChangedException.class, () -> createFileWithFlags(fs, testFile, SMALL_FILE_BYTES, true, null)); - assertS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, exception); + verifyS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, exception); } @Test @@ -425,7 +451,7 @@ public void testIfNoneMatchOverwriteEmptyWithEmptyFile() throws Throwable { FSDataOutputStream stream2 = getStreamWithFlags(fs, testFile, true, null); assertHasCapabilityConditionalCreate(stream2); RemoteFileChangedException exception = intercept(RemoteFileChangedException.class, stream2::close); - assertS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, exception); + verifyS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, exception); } @Test @@ -480,7 +506,7 @@ public void testIfMatchOverwriteWithOutdatedEtag() throws Throwable { // overwrite file with outdated etag. Should throw RemoteFileChangedException RemoteFileChangedException exception = intercept(RemoteFileChangedException.class, () -> createFileWithFlags(fs, path, SMALL_FILE_BYTES, false, etag)); - assertS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, exception); + verifyS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, exception); } @Test @@ -504,7 +530,7 @@ public void testIfMatchOverwriteDeletedFileWithEtag() throws Throwable { // overwrite file with etag. 
Should throw FileNotFoundException FileNotFoundException exception = intercept(FileNotFoundException.class, () -> createFileWithFlags(fs, path, SMALL_FILE_BYTES, false, etag)); - assertS3ExceptionStatusCode(SC_404_NOT_FOUND, exception); + verifyS3ExceptionStatusCode(SC_404_NOT_FOUND, exception); } @Test @@ -553,8 +579,7 @@ public void testIfMatchTwoMultipartUploadsRaceConditionOneClosesFirst() throws T stream1.close(); // Close second stream, should fail due to etag mismatch - RemoteFileChangedException exception = intercept(RemoteFileChangedException.class, stream2::close); - assertS3ExceptionStatusCode(SC_412_PRECONDITION_FAILED, exception); + expectPreconditionFailure(stream2::close); } @Disabled("conditional_write statistics not yet fully implemented") From 686bdf13e835cc2307b4440043e2fd63ae84beee Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Thu, 21 Aug 2025 16:08:33 +0100 Subject: [PATCH 04/24] HADOOP-19003. S3A Assume role tests failing against S3Express stores Add relevant statements for s3 access and skip all tests which expect partial access to paths on a bucket. --- .../hadoop/fs/s3a/auth/RolePolicies.java | 39 ++++++++++++++++--- .../hadoop/fs/s3a/auth/ITestAssumeRole.java | 16 +++++++- .../ITestAssumedRoleCommitOperations.java | 4 +- .../s3a/auth/ITestRestrictedReadAccess.java | 6 ++- .../hadoop/fs/s3a/auth/RoleTestUtils.java | 4 +- .../s3a/impl/ITestPartialRenamesDeletes.java | 5 ++- 6 files changed, 62 insertions(+), 12 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java index bd76d83ee096f..697f788ccb8ab 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java @@ -199,10 +199,31 @@ private RolePolicies() { public static final String S3_RESTORE_OBJECT = "s3:RestoreObject"; /** - * S3Express session permission; required unless sessions are disabled. + * Everything: {@value}. + */ + public static final String EVERYTHING_ARN = "*"; + + + /** + * All S3Express buckets: {@value}. + * S3Express adds another "domain" for permissions: S3 express ARNs and S3 Express operations, + * of which createSession is one key operation. + * See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-express-security.html + * Note: this wildcard patten came from AWS Q; if it is wrong blame GenerativeAI. + */ + public static final String S3EXPRESS_ALL_BUCKETS = "arn:aws:s3express:*:*:bucket/*--*--x-s3"; + + /** + * S3Express session permission; required unless sessions are disabled: {@value}. + * See https://docs.aws.amazon.com/AmazonS3/latest/API/API_CreateSession.html */ public static final String S3EXPRESS_CREATE_SESSION_POLICY = "s3express:CreateSession"; + /** + * S3 Express All operations: {@value}. + */ + public static final String S3EXPRESS_ALL_OPERATIONS = "s3express:*"; + /** * Actions needed to read a file in S3 through S3A, excluding * SSE-KMS. 
@@ -224,7 +245,7 @@ private RolePolicies() { */ private static final String[] S3_ROOT_READ_OPERATIONS = new String[]{ - S3_ALL_GET + S3_ALL_GET, }; public static final List S3_ROOT_READ_OPERATIONS_LIST = @@ -239,7 +260,7 @@ private RolePolicies() { public static final String[] S3_BUCKET_READ_OPERATIONS = new String[]{ S3_ALL_GET, - S3_BUCKET_ALL_LIST + S3_BUCKET_ALL_LIST, }; /** @@ -281,7 +302,7 @@ private RolePolicies() { S3_PUT_OBJECT, S3_PUT_OBJECT_ACL, S3_DELETE_OBJECT, - S3_ABORT_MULTIPART_UPLOAD + S3_ABORT_MULTIPART_UPLOAD, })); /** @@ -292,6 +313,13 @@ private RolePolicies() { S3_ALL_BUCKETS, S3_ALL_OPERATIONS); + /** + * S3 Express operations required for operation. + */ + public static final Statement STATEMENT_S3EXPRESS = statement(true, + S3EXPRESS_ALL_BUCKETS, + S3EXPRESS_ALL_OPERATIONS); + /** * The s3:GetBucketLocation permission is for all buckets, not for * any named bucket, which complicates permissions. @@ -310,8 +338,9 @@ private RolePolicies() { public static List allowS3Operations(String bucket, boolean write) { // add the bucket operations for the specific bucket ARN - ArrayList statements = + List statements = Lists.newArrayList( + STATEMENT_S3EXPRESS, statement(true, bucketToArn(bucket), S3_GET_BUCKET_LOCATION, S3_BUCKET_ALL_LIST)); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java index d9c84fa6914c9..06b052f0aee07 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java @@ -86,6 +86,9 @@ /** * Tests use of assumed roles. * Only run if an assumed role is provided. + *
+ * S3Express buckets only support access restrictions at the bucket level, rather than at paths underneath. + * All partial permission tests are disabled. */ @SuppressWarnings("ThrowableNotThrown") public class ITestAssumeRole extends AbstractS3ATestBase { @@ -203,7 +206,7 @@ protected Configuration createValidRoleConf() throws JsonProcessingException { conf.set(ASSUMED_ROLE_SESSION_DURATION, "45m"); // disable create session so there's no need to // add a role policy for it. - disableCreateSession(conf); + //disableCreateSession(conf); bindRolePolicy(conf, RESTRICTED_POLICY); return conf; @@ -462,12 +465,15 @@ public void testAssumeRolePoliciesOverrideRolePerms() throws Throwable { public void testReadOnlyOperations() throws Throwable { describe("Restrict role to read only"); + skipIfS3ExpressBucket(getConfiguration()); Configuration conf = createAssumedRoleConfig(); bindRolePolicy(conf, policy( statement(false, S3_ALL_BUCKETS, S3_PATH_WRITE_OPERATIONS), - STATEMENT_ALL_S3, STATEMENT_ALLOW_KMS_RW)); + STATEMENT_ALL_S3, + STATEMENT_S3EXPRESS, + STATEMENT_ALLOW_KMS_RW)); Path path = methodPath(); roleFS = (S3AFileSystem) path.getFileSystem(conf); // list the root path, expect happy @@ -505,6 +511,7 @@ public void testRestrictedWriteSubdir() throws Throwable { describe("Attempt writing to paths where a role only has" + " write access to a subdir of the bucket"); + skipIfS3ExpressBucket(getConfiguration()); Path restrictedDir = methodPath(); Path child = new Path(restrictedDir, "child"); // the full FS @@ -567,6 +574,7 @@ public void testAssumedRoleRetryHandler() throws Throwable { @Test public void testRestrictedCommitActions() throws Throwable { describe("Attempt commit operations against a path with restricted rights"); + skipIfS3ExpressBucket(getConfiguration()); Configuration conf = createAssumedRoleConfig(); final int uploadPartSize = 5 * 1024 * 1024; @@ -704,12 +712,14 @@ public void writeCSVData(final File localSrc) throws IOException { @Test public void testPartialDelete() throws Throwable { describe("delete with part of the child tree read only; multidelete"); + skipIfS3ExpressBucket(getConfiguration()); executePartialDelete(createAssumedRoleConfig(), false); } @Test public void testPartialDeleteSingleDelete() throws Throwable { describe("delete with part of the child tree read only"); + skipIfS3ExpressBucket(getConfiguration()); executePartialDelete(createAssumedRoleConfig(), true); } @@ -722,6 +732,7 @@ public void testBulkDeleteOnReadOnlyAccess() throws Throwable { @Test public void testBulkDeleteWithReadWriteAccess() throws Throwable { describe("Bulk delete with read write access"); + skipIfS3ExpressBucket(getConfiguration()); executeBulkDeleteOnSomeReadOnlyFiles(createAssumedRoleConfig()); } @@ -811,6 +822,7 @@ private static void bindReadOnlyRolePolicy(Configuration assumedRoleConfig, throws JsonProcessingException { bindRolePolicyStatements(assumedRoleConfig, STATEMENT_ALLOW_KMS_RW, statement(true, S3_ALL_BUCKETS, S3_ALL_OPERATIONS), + STATEMENT_S3EXPRESS, new Statement(Effects.Deny) .addActions(S3_PATH_WRITE_OPERATIONS) .addResources(directory(readOnlyDir)) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumedRoleCommitOperations.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumedRoleCommitOperations.java index 29201a8d5a0dc..721cb016f2138 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumedRoleCommitOperations.java +++ 
b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumedRoleCommitOperations.java @@ -70,7 +70,9 @@ public void setup() throws Exception { restrictedDir = super.path("restricted"); Configuration conf = newAssumedRoleConfig(getConfiguration(), getAssumedRoleARN()); - bindRolePolicyStatements(conf, STATEMENT_ALLOW_KMS_RW, + bindRolePolicyStatements(conf, + STATEMENT_ALLOW_KMS_RW, + STATEMENT_S3EXPRESS, statement(true, S3_ALL_BUCKETS, S3_BUCKET_READ_OPERATIONS), new RoleModel.Statement(RoleModel.Effects.Allow) .addActions(S3_PATH_RW_OPERATIONS) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestRestrictedReadAccess.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestRestrictedReadAccess.java index ae05a8dfc2076..8f00c11bb4589 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestRestrictedReadAccess.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestRestrictedReadAccess.java @@ -49,6 +49,7 @@ import static org.apache.hadoop.fs.s3a.Constants.ASSUMED_ROLE_ARN; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; import static org.apache.hadoop.fs.s3a.S3ATestUtils.lsR; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfS3ExpressBucket; import static org.apache.hadoop.fs.s3a.auth.RoleModel.Effects; import static org.apache.hadoop.fs.s3a.auth.RoleModel.Statement; import static org.apache.hadoop.fs.s3a.auth.RoleModel.directory; @@ -88,7 +89,9 @@ * To simplify maintenance, the operations tested are broken up into * their own methods, with fields used to share the restricted role and * created paths. - * + *
+ * Test are skipped if no assumed role was provided, or if the test bucket + * is an S3Express bucket, whose permissions model is different. */ public class ITestRestrictedReadAccess extends AbstractS3ATestBase { @@ -161,6 +164,7 @@ public class ITestRestrictedReadAccess extends AbstractS3ATestBase { public void setup() throws Exception { super.setup(); assumeRoleTests(); + skipIfS3ExpressBucket(getConfiguration()); } @AfterEach diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/RoleTestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/RoleTestUtils.java index a1dd177387ba4..579c5b86779df 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/RoleTestUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/RoleTestUtils.java @@ -164,8 +164,8 @@ public static Configuration newAssumedRoleConfig( conf.set(ASSUMED_ROLE_ARN, roleARN); conf.set(ASSUMED_ROLE_SESSION_NAME, "test"); conf.set(ASSUMED_ROLE_SESSION_DURATION, "15m"); - // force in bucket resolution during startup - conf.setInt(S3A_BUCKET_PROBE, 1); + // disable bucket resolution during startup as s3 express doesn't like it + conf.setInt(S3A_BUCKET_PROBE, 0); disableCreateSession(conf); disableFilesystemCaching(conf); return conf; diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestPartialRenamesDeletes.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestPartialRenamesDeletes.java index 8afb60d6139e0..31c99bc9b18fd 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestPartialRenamesDeletes.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestPartialRenamesDeletes.java @@ -74,7 +74,9 @@ * Test partial failures of delete and rename operations,. * * All these test have a unique path for each run, with a roleFS having - * full RW access to part of it, and R/O access to a restricted subdirectory + * full RW access to part of it, and R/O access to a restricted subdirectory. + *
+ * Tests are skipped on S3Express buckets or if no assumed role is provided. * *
    *
  1. @@ -221,6 +223,7 @@ public ITestPartialRenamesDeletes(final boolean multiDelete) { public void setup() throws Exception { super.setup(); assumeRoleTests(); + skipIfS3ExpressBucket(getConfiguration()); basePath = uniquePath(); readOnlyDir = new Path(basePath, "readonlyDir"); writableDir = new Path(basePath, "writableDir"); From 2839c231a61963393e07c350ab9157c665b85e26 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Thu, 21 Aug 2025 16:17:05 +0100 Subject: [PATCH 05/24] HADOOP-19654. create session now always seems to get called ...so adds cost to the assertion --- .../ITestAWSStatisticCollection.java | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/statistics/ITestAWSStatisticCollection.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/statistics/ITestAWSStatisticCollection.java index 7e6b5992744e7..5fbb674cd28c5 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/statistics/ITestAWSStatisticCollection.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/statistics/ITestAWSStatisticCollection.java @@ -22,10 +22,16 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.S3ATestUtils; import org.apache.hadoop.fs.s3a.performance.AbstractS3ACostTest; import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_PERFORMANCE; +import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_PERFORMANCE_FLAGS; +import static org.apache.hadoop.fs.s3a.Constants.S3EXPRESS_CREATE_SESSION; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.setPerformanceFlags; import static org.apache.hadoop.fs.s3a.Statistic.STORE_IO_REQUEST; +import static org.apache.hadoop.fs.s3a.impl.S3ExpressStorage.STORE_CAPABILITY_S3_EXPRESS_STORAGE; /** * Verify that AWS SDK statistics are wired up. @@ -35,7 +41,10 @@ public class ITestAWSStatisticCollection extends AbstractS3ACostTest { @Override public Configuration createConfiguration() { final Configuration conf = super.createConfiguration(); - conf.setBoolean(FS_S3A_CREATE_PERFORMANCE, true); + S3ATestUtils.removeBaseAndBucketOverrides(conf, + S3EXPRESS_CREATE_SESSION); + setPerformanceFlags(conf, "create"); + conf.setBoolean(S3EXPRESS_CREATE_SESSION, false); return conf; } @@ -44,8 +53,11 @@ public void testSDKMetricsCostOfGetFileStatusOnFile() throws Throwable { describe("performing getFileStatus on a file"); Path simpleFile = file(methodPath()); // and repeat on the file looking at AWS wired up stats - verifyMetrics(() -> getFileSystem().getFileStatus(simpleFile), - with(STORE_IO_REQUEST, 1)); + final S3AFileSystem fs = getFileSystem(); + verifyMetrics(() -> fs.getFileStatus(simpleFile), + with(STORE_IO_REQUEST, + fs.hasPathCapability(new Path("/"), STORE_CAPABILITY_S3_EXPRESS_STORAGE) + ? 2 : 1)); } } From 10acdee8d90f069834106a5ef7b9e3af97bc32f8 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Mon, 15 Sep 2025 18:40:05 +0100 Subject: [PATCH 06/24] HADOOP-19654. 
S3A: AWS SDK to 2.33.8 --- LICENSE-binary | 2 +- hadoop-project/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/LICENSE-binary b/LICENSE-binary index 10dff251988b8..9aaa3ad7abda3 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -419,7 +419,7 @@ org.xerial.snappy:snappy-java:1.1.10.4 org.yaml:snakeyaml:2.0 org.wildfly.openssl:wildfly-openssl:2.2.5.Final ro.isdc.wro4j:wro4j-maven-plugin:1.8.0 -software.amazon.awssdk:bundle:2.32.23 +software.amazon.awssdk:bundle:2.33.8 software.amazon.s3.analyticsaccelerator:analyticsaccelerator-s3:1.3.0 net.jodah:failsafe:2.4.4 diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index 9f7f48f5ac2b1..f2924cf9ff80c 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -209,7 +209,7 @@ 1.0-beta-1 900 1.12.720 - 2.32.23 + 2.33.8 3.1.1 1.3.0 1.0.1 From 82fd148ac9853fb62a59a00abef17ec8944c6a11 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Wed, 17 Sep 2025 18:56:30 +0100 Subject: [PATCH 07/24] HADOOP-19654. SDK update: ITestAWSStatisticCollection fails against s3 express Revert test to original assert (no special treatment of s3 express), and log Initiating GET request" for ease of log scanning. Add to the log4.properties file the settings needed to turn on wire logging, commented out by default. --- .../statistics/ITestAWSStatisticCollection.java | 15 ++++++++------- .../src/test/resources/log4j.properties | 7 ++++++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/statistics/ITestAWSStatisticCollection.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/statistics/ITestAWSStatisticCollection.java index 5fbb674cd28c5..2e2c8a2952a03 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/statistics/ITestAWSStatisticCollection.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/statistics/ITestAWSStatisticCollection.java @@ -19,6 +19,8 @@ package org.apache.hadoop.fs.s3a.statistics; import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; @@ -26,18 +28,18 @@ import org.apache.hadoop.fs.s3a.S3ATestUtils; import org.apache.hadoop.fs.s3a.performance.AbstractS3ACostTest; -import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_PERFORMANCE; -import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_PERFORMANCE_FLAGS; import static org.apache.hadoop.fs.s3a.Constants.S3EXPRESS_CREATE_SESSION; import static org.apache.hadoop.fs.s3a.S3ATestUtils.setPerformanceFlags; import static org.apache.hadoop.fs.s3a.Statistic.STORE_IO_REQUEST; -import static org.apache.hadoop.fs.s3a.impl.S3ExpressStorage.STORE_CAPABILITY_S3_EXPRESS_STORAGE; /** * Verify that AWS SDK statistics are wired up. 
*/ public class ITestAWSStatisticCollection extends AbstractS3ACostTest { + private static final Logger LOG = + LoggerFactory.getLogger(ITestAWSStatisticCollection.class); + @Override public Configuration createConfiguration() { final Configuration conf = super.createConfiguration(); @@ -50,14 +52,13 @@ public Configuration createConfiguration() { @Test public void testSDKMetricsCostOfGetFileStatusOnFile() throws Throwable { - describe("performing getFileStatus on a file"); + describe("Performing getFileStatus() on a file"); Path simpleFile = file(methodPath()); // and repeat on the file looking at AWS wired up stats final S3AFileSystem fs = getFileSystem(); + LOG.info("Initiating GET request for {}", simpleFile); verifyMetrics(() -> fs.getFileStatus(simpleFile), - with(STORE_IO_REQUEST, - fs.hasPathCapability(new Path("/"), STORE_CAPABILITY_S3_EXPRESS_STORAGE) - ? 2 : 1)); + with(STORE_IO_REQUEST, 1)); } } diff --git a/hadoop-tools/hadoop-aws/src/test/resources/log4j.properties b/hadoop-tools/hadoop-aws/src/test/resources/log4j.properties index f61668643a1b5..fe651742e02be 100644 --- a/hadoop-tools/hadoop-aws/src/test/resources/log4j.properties +++ b/hadoop-tools/hadoop-aws/src/test/resources/log4j.properties @@ -68,8 +68,13 @@ log4j.logger.org.apache.hadoop.fs.s3a.SDKV2Upgrade=WARN # include sensitive information such as account IDs in HTTP headers. # log4j.logger.software.amazon.awssdk.request=DEBUG +# Log TLS info +#log4j.logger.software.amazon.awssdk.thirdparty.org.apache.http.conn.ssl.SSLConnectionSocketFactory=DEBUG + + # Turn on low level HTTP protocol debugging -#log4j.logger.org.apache.http.wire=DEBUG +#log4j.logger.software.amazon.awssdk.thirdparty.org.apache.http.wire=DEBUG +#log4j.logger.software.amazon.awssdk.thirdparty.org.apache.http=DEBUG # async client #log4j.logger.io.netty.handler.logging=DEBUG From 78e4acfd750a160e7abc3eccf6165644cb1d8659 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Thu, 18 Sep 2025 14:39:20 +0100 Subject: [PATCH 08/24] HADOOP-19654. review comments --- .../java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java index 06b052f0aee07..e0029714d2bec 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java @@ -204,9 +204,6 @@ protected Configuration createValidRoleConf() throws JsonProcessingException { conf.set(ASSUMED_ROLE_ARN, roleARN); conf.set(ASSUMED_ROLE_SESSION_NAME, "valid"); conf.set(ASSUMED_ROLE_SESSION_DURATION, "45m"); - // disable create session so there's no need to - // add a role policy for it. - //disableCreateSession(conf); bindRolePolicy(conf, RESTRICTED_POLICY); return conf; From d376ddcc5173167e9d720f314433393a3cb980e2 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Tue, 14 Oct 2025 19:50:49 +0100 Subject: [PATCH 09/24] HADOOP-19654. 
S3A: options to disable checksum calculation & restore MD5 header fs.s3a.checksum.calculation.enabled (default: true) fs.s3a.md5.header.enabled (default: false) --- .../org/apache/hadoop/fs/s3a/Constants.java | 41 ++++++++++++- .../hadoop/fs/s3a/DefaultS3ClientFactory.java | 14 +++-- .../apache/hadoop/fs/s3a/S3AFileSystem.java | 4 ++ .../apache/hadoop/fs/s3a/S3ClientFactory.java | 60 +++++++++++++++++++ .../tools/hadoop-aws/third_party_stores.md | 23 +++++++ 5 files changed, 136 insertions(+), 6 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index 361806545403b..4f2200b2e4541 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -21,6 +21,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.Options; +import org.apache.hadoop.fs.s3a.impl.ChecksumSupport; import org.apache.hadoop.fs.s3a.impl.streams.StreamIntegration; import org.apache.hadoop.security.ssl.DelegatingSSLSocketFactory; @@ -1836,15 +1837,53 @@ private Constants() { */ public static final boolean CHECKSUM_VALIDATION_DEFAULT = false; + /** + * Should checksums always be calculated? + * Not all third-party stores like this being enabled for every request. + * Value: {@value}. + */ + public static final String CHECKSUM_CALCULATION_ENABLED = + "fs.s3a.checksum.calculation.enabled"; + + /** + * Default value of {@link #CHECKSUM_CALCULATION_ENABLED}. + * Value: {@value}. + */ + public static final boolean CHECKSUM_CALCULATION_ENABLED_DEFAULT = true; + /** * Indicates the algorithm used to create the checksum for the object * to be uploaded to S3. Unset by default. It supports the following values: - * 'CRC32', 'CRC32C', 'SHA1', and 'SHA256' + * 'CRC32', 'CRC32C', 'SHA1', 'SHA256', 'NONE', ''.. * value:{@value} */ public static final String CHECKSUM_ALGORITHM = "fs.s3a.create.checksum.algorithm"; + /** + * Default checksum algorithm: {@code "none"}. + */ + public static final String CHECKSUM_ALGORITHM_DEFAULT = + ChecksumSupport.NONE; + + /** + * Send a {@code Content-MD5 header} with every request. + * This is required when performing some operations with third party stores + * For example: bulk delete). + * It is supported by AWS S3, though has unexpected behavior with AWS S3 Express storage. + * See https://github.com/aws/aws-sdk-java-v2/issues/6459 for details. + * It will be automatically disabled there. + */ + public static final String MD5_HEADER_ENABLED = + "fs.s3a.md5.header.enabled"; + + /** + * Default value of {@link #MD5_HEADER_ENABLED}. + * Value: {@value}. 
+ */ + public static final boolean MD5_HEADER_ENABLED_DEFAULT = true; + + /** * Are extensions classes, such as {@code fs.s3a.aws.credentials.provider}, * going to be loaded from the same classloader that loaded diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java index e47ebccd66dc1..41ae6603ccc85 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java @@ -207,18 +207,22 @@ private , ClientT> Build // add a plugin to add a Content-MD5 header. // this is required when performing some operations with third party stores - // (for example: bulk delete), and is harmless when working with AWS S3. - builder.addPlugin(LegacyMd5Plugin.create()); + // (for example: bulk delete), and is somewhat harmless when working with AWS S3. + if (parameters.isMd5HeaderEnabled()) { + builder.addPlugin(LegacyMd5Plugin.create()); + } - // do not do request checksums as this causes third-party store problems. - builder.requestChecksumCalculation(RequestChecksumCalculation.WHEN_REQUIRED); + //when to calculate request checksums. + builder.requestChecksumCalculation( + parameters.isChecksumCalculationEnabled() + ? RequestChecksumCalculation.WHEN_SUPPORTED + : RequestChecksumCalculation.WHEN_REQUIRED); // response checksum validation. Slow, even with CRC32 checksums. if (parameters.isChecksumValidationEnabled()) { builder.responseChecksumValidation(ResponseChecksumValidation.WHEN_SUPPORTED); } - maybeApplyS3AccessGrantsConfigurations(builder, conf); S3Configuration serviceConfiguration = S3Configuration.builder() diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index 079b4022f2225..eb5d1c327bcd8 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -1173,10 +1173,14 @@ private ClientManager createClientManager(URI fsURI, boolean dtEnabled) throws I .withTransferManagerExecutor(unboundedThreadPool) .withRegion(configuredRegion) .withFipsEnabled(fipsEnabled) + .withS3ExpressStore(s3ExpressStore) .withExpressCreateSession( conf.getBoolean(S3EXPRESS_CREATE_SESSION, S3EXPRESS_CREATE_SESSION_DEFAULT)) .withChecksumValidationEnabled( conf.getBoolean(CHECKSUM_VALIDATION, CHECKSUM_VALIDATION_DEFAULT)) + .withChecksumCalculationEnabled( + conf.getBoolean(CHECKSUM_CALCULATION_ENABLED, CHECKSUM_CALCULATION_ENABLED_DEFAULT)) + .withMd5HeaderEnabled(conf.getBoolean(MD5_HEADER_ENABLED, MD5_HEADER_ENABLED_DEFAULT)) .withClientSideEncryptionEnabled(isCSEEnabled) .withClientSideEncryptionMaterials(cseMaterials) .withAnalyticsAcceleratorEnabled(isAnalyticsAcceleratorEnabled) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java index 559cd49c34582..ca0ed39a848df 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java @@ -187,6 +187,11 @@ final class S3ClientCreationParameters { */ private String region; + /** + * Is this an S3 Express 
store? + */ + private boolean s3ExpressStore; + /** * Enable S3Express create session. */ @@ -207,6 +212,17 @@ final class S3ClientCreationParameters { */ private boolean isAnalyticsAcceleratorEnabled; + /** + * Is the MD5 Header Enabled? + */ + private boolean md5HeaderEnabled; + + /** + * Is Checksum calculation Enabled? + */ + private boolean checksumCalculationEnabled; + + /** * List of execution interceptors to include in the chain * of interceptors in the SDK. @@ -536,6 +552,20 @@ public String getKmsRegion() { return kmsRegion; } + public boolean isS3ExpressStore() { + return s3ExpressStore; + } + + /** + * Set builder value. + * @param value new value + * @return the builder + */ + public S3ClientCreationParameters withS3ExpressStore(final boolean value) { + s3ExpressStore = value; + return this; + } + /** * Should s3express createSession be called? * @return true if the client should enable createSession. @@ -568,6 +598,34 @@ public boolean isChecksumValidationEnabled() { return checksumValidationEnabled; } + public boolean isMd5HeaderEnabled() { + return md5HeaderEnabled; + } + + /** + * Set builder value. + * @param value new value + * @return the builder + */ + public S3ClientCreationParameters withMd5HeaderEnabled(final boolean value) { + md5HeaderEnabled = value; + return this; + } + + public boolean isChecksumCalculationEnabled() { + return checksumCalculationEnabled; + } + + /** + * Set builder value. + * @param value new value + * @return the builder + */ + public S3ClientCreationParameters withChecksumCalculationEnabled(final boolean value) { + checksumCalculationEnabled = value; + return this; + } + @Override public String toString() { return "S3ClientCreationParameters{" + @@ -580,8 +638,10 @@ public String toString() { ", multiPartThreshold=" + multiPartThreshold + ", multipartCopy=" + multipartCopy + ", region='" + region + '\'' + + ", s3ExpressStore=" + s3ExpressStore + ", expressCreateSession=" + expressCreateSession + ", checksumValidationEnabled=" + checksumValidationEnabled + + ", md5HeaderEnabled=" + md5HeaderEnabled + '}'; } diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md index f6fea9338a424..a054f9e2b7545 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md @@ -42,6 +42,8 @@ The features which may be unavailable include: * Bucket lifecycle rules to clean up pending uploads. * Support for multipart uploads. * Conditional file creation. (`fs.s3a.create.conditional.enabled = false`) +* Variations in checksum calculation on uploads. +* Requirement for Content-MD5 headers. ### Disabling Change Detection @@ -174,6 +176,27 @@ false to disable use of these features. ``` +## Upload checksums and MD5 Headers + +It may be necessary to change checksums of uploads by +1. Restoring the attachment of a `Content-MD5 header` in requests +2. Changing checksum calculation from "always" to "when required" + +```xml + + fs.s3a.md5.header.enabled + false + re-enable calculation and inclusion of an MD5 HEADER on data upload operations (default: false) + + + + fs.s3a.checksum.calculation.enabled + false + Calculate and attach a message checksum on every operation. 
(default: true) + + +``` + # Troubleshooting From bead08e74f5557a39ea5467116e31730d4d636be Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Wed, 15 Oct 2025 16:17:11 +0100 Subject: [PATCH 10/24] HADOOP-19654. third party stores --- LICENSE-binary | 2 +- .../hadoop/fs/s3a/DefaultS3ClientFactory.java | 7 +++++-- .../s3a/ITestS3AContractMultipartUploader.java | 14 ++++++++++++++ .../hadoop/fs/s3a/ITestS3ABucketExistence.java | 15 +++++++++++++++ .../apache/hadoop/fs/s3a/ITestS3AChecksum.java | 11 ++++++++++- .../hadoop/fs/s3a/auth/ITestCustomSigner.java | 4 +++- .../hadoop/fs/s3a/commit/ITestUploadRecovery.java | 7 ++++++- .../fs/s3a/performance/ITestCreateFileCost.java | 12 +++++++++--- 8 files changed, 63 insertions(+), 9 deletions(-) diff --git a/LICENSE-binary b/LICENSE-binary index 9aaa3ad7abda3..5c433304b7666 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -352,7 +352,7 @@ io.reactivex:rxnetty:0.4.20 io.swagger:swagger-annotations:1.5.4 javax.inject:javax.inject:1 net.java.dev.jna:jna:5.2.0 -net.minidev:accessors-smart:1.2 +net.minidev:accessors-smart:1.21 org.apache.avro:avro:1.11.4 org.apache.commons:commons-compress:1.26.1 org.apache.commons:commons-configuration2:2.10.1 diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java index 41ae6603ccc85..344459d63d7f0 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java @@ -209,14 +209,17 @@ private , ClientT> Build // this is required when performing some operations with third party stores // (for example: bulk delete), and is somewhat harmless when working with AWS S3. if (parameters.isMd5HeaderEnabled()) { + LOG.debug("MD5 header enabled"); builder.addPlugin(LegacyMd5Plugin.create()); } //when to calculate request checksums. - builder.requestChecksumCalculation( + final RequestChecksumCalculation checksumCalculation = parameters.isChecksumCalculationEnabled() ? RequestChecksumCalculation.WHEN_SUPPORTED - : RequestChecksumCalculation.WHEN_REQUIRED); + : RequestChecksumCalculation.WHEN_REQUIRED; + LOG.debug("Using checksum calculation policy: {}", checksumCalculation); + builder.requestChecksumCalculation(checksumCalculation); // response checksum validation. Slow, even with CRC32 checksums. 
if (parameters.isChecksumValidationEnabled()) { diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java index 0afdf20595bcc..b78b0bff7fa5e 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java @@ -18,6 +18,8 @@ package org.apache.hadoop.fs.contract.s3a; +import java.io.FileNotFoundException; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.contract.AbstractContractMultipartUploaderTest; import org.apache.hadoop.fs.contract.AbstractFSContract; @@ -149,4 +151,16 @@ public void testConcurrentUploads() throws Throwable { "Analytics Accelerator currently does not support reading of over written files"); super.testConcurrentUploads(); } + + + @Test + @Override + public void testMultipartUploadAbort() throws Exception { + try { + super.testMultipartUploadAbort(); + } catch (FileNotFoundException e) { + LOG.info("Multipart upload not found in abort(). This is common on third-party stores: {}", e.toString()); + LOG.debug("Exception: ", e); + } + } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABucketExistence.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABucketExistence.java index 8b9a202f620e0..8d53748132a09 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABucketExistence.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABucketExistence.java @@ -39,12 +39,18 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.writeDataset; import static org.apache.hadoop.fs.s3a.Constants.AWS_REGION; import static org.apache.hadoop.fs.s3a.Constants.AWS_S3_ACCESSPOINT_REQUIRED; +import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_CALCULATION_ENABLED; +import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_CALCULATION_ENABLED_DEFAULT; import static org.apache.hadoop.fs.s3a.Constants.ENDPOINT; import static org.apache.hadoop.fs.s3a.Constants.FIPS_ENDPOINT; import static org.apache.hadoop.fs.s3a.Constants.FS_S3A; import static org.apache.hadoop.fs.s3a.Constants.PATH_STYLE_ACCESS; import static org.apache.hadoop.fs.s3a.Constants.S3A_BUCKET_PROBE; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestBucketName; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; +import static org.apache.hadoop.fs.s3a.S3AUtils.propagateBucketOptions; +import static org.apache.hadoop.fs.s3a.impl.NetworkBinding.isAwsEndpoint; import static org.apache.hadoop.test.LambdaTestUtils.intercept; /** @@ -59,6 +65,15 @@ public class ITestS3ABucketExistence extends AbstractS3ATestBase { private final URI uri = URI.create(FS_S3A + "://" + randomBucket + "/"); + @Override + protected Configuration createConfiguration() { + final Configuration conf = super.createConfiguration(); + String endpoint = propagateBucketOptions(conf, getTestBucketName(conf)).get(ENDPOINT, ""); + assume("Skipping existence probes", + isAwsEndpoint(endpoint)); + return conf; + } + @SuppressWarnings("deprecation") @Test public void testNoBucketProbing() throws Exception { diff --git 
a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java index 3525d89efc703..41c694d22318b 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java @@ -37,7 +37,12 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.skip; import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_ALGORITHM; +import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_CALCULATION_ENABLED; +import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_CALCULATION_ENABLED_DEFAULT; import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_VALIDATION; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestBucketName; +import static org.apache.hadoop.fs.s3a.S3AUtils.propagateBucketOptions; import static org.apache.hadoop.fs.s3a.audit.S3AAuditConstants.REJECT_OUT_OF_SPAN_OPERATIONS; /** @@ -59,6 +64,7 @@ public class ITestS3AChecksum extends AbstractS3ATestBase { public static Collection params() { return Arrays.asList(new Object[][]{ {"SHA256"}, + {"CRC32"}, {"CRC32C"}, {"SHA1"}, {UNKNOWN}, @@ -89,11 +95,14 @@ protected Configuration createConfiguration() { CHECKSUM_VALIDATION, REJECT_OUT_OF_SPAN_OPERATIONS); S3ATestUtils.disableFilesystemCaching(conf); - LOG.info("Using checksum algorithm {}", algorithmName); conf.set(CHECKSUM_ALGORITHM, algorithmName); conf.setBoolean(CHECKSUM_VALIDATION, true); conf.setBoolean(REJECT_OUT_OF_SPAN_OPERATIONS, false); checksumAlgorithm = ChecksumSupport.getChecksumAlgorithm(conf); + LOG.info("Using checksum algorithm {}/{}", algorithmName, checksumAlgorithm); + assume("Skipping checksum tests as " + CHECKSUM_CALCULATION_ENABLED + " is set", + propagateBucketOptions(conf, getTestBucketName(conf)) + .getBoolean(CHECKSUM_CALCULATION_ENABLED, CHECKSUM_CALCULATION_ENABLED_DEFAULT)); return conf; } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java index c90f032ae2764..206c751945e0e 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java @@ -359,7 +359,9 @@ public static void reset() { public static StoreValue getStoreValue(String bucketName, UserGroupInformation ugi) { StoreKey storeKey = new StoreKey(bucketName, ugi); - return knownStores.get(storeKey); + final StoreValue storeValue = knownStores.get(storeKey); + LOG.info("Getting store value for key {}: {}", storeKey, storeValue); + return storeValue; } private static class StoreKey { diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestUploadRecovery.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestUploadRecovery.java index c5144a466a935..a82fbfa1a719c 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestUploadRecovery.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestUploadRecovery.java @@ -19,6 +19,7 @@ package org.apache.hadoop.fs.s3a.commit; import java.io.File; +import java.io.FileNotFoundException; import java.util.Arrays; import java.util.Collection; import java.util.UUID; @@ -266,7 
+267,11 @@ public void testCommitOperations() throws Throwable { try (CommitContext commitContext = actions.createCommitContextForTesting(dest, JOB_ID, 0)) { - commitContext.commitOrFail(commit); + try { + commitContext.commitOrFail(commit); + } catch (FileNotFoundException e) { + LOG.info("S3 Store doesn't support retries on completed commits: {}", e.toString()); + } } // make sure the saved data is as expected verifyFileContents(fs, dest, dataset); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java index 9549a0c85f0ca..e202f0ce0f1d8 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java @@ -41,6 +41,7 @@ import static java.util.Objects.requireNonNull; import static org.apache.hadoop.fs.contract.ContractTestUtils.toChar; +import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CONDITIONAL_CREATE_ENABLED; import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_HEADER; import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_PERFORMANCE; import static org.apache.hadoop.fs.s3a.Constants.XA_HEADER_PREFIX; @@ -201,9 +202,14 @@ public void testCreateBuilderSequence() throws Throwable { () -> buildFile(testFile, false, true, GET_FILE_STATUS_ON_FILE)); } else { - // will trigger conditional create and throw RemoteFileChangedException - intercept(RemoteFileChangedException.class, - () -> buildFile(testFile, false, true, NO_HEAD_OR_LIST)); + if (getConfiguration().getBoolean(FS_S3A_CONDITIONAL_CREATE_ENABLED, true)) { + // will trigger conditional create and throw RemoteFileChangedException + intercept(RemoteFileChangedException.class, + () -> buildFile(testFile, false, true, NO_HEAD_OR_LIST)); + } else { + // third party store w/out conditional overwrite support + buildFile(testFile, false, true, NO_HEAD_OR_LIST); + } } } From 8d75f058723618bebf778c5a39e0fb6a88f0b299 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Wed, 15 Oct 2025 19:07:45 +0100 Subject: [PATCH 11/24] HADOOP-19654. third party stores and signing The problem here is not the signing -it's that the test couldn't cope with path-style-access as it is used the hostname of the http request as the bucket name. Fixed by adding a static field to note when path access is in use and to map the name to "bucket" in such a situation. --- .../hadoop/fs/s3a/auth/CustomSdkSigner.java | 1 + .../hadoop/fs/s3a/auth/ITestCustomSigner.java | 32 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/CustomSdkSigner.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/CustomSdkSigner.java index b378602165074..e374a1ad9fe5b 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/CustomSdkSigner.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/CustomSdkSigner.java @@ -104,6 +104,7 @@ public SdkHttpFullRequest sign(SdkHttpFullRequest request, /** * Parse the bucket name from the host. + * This does not work for path-style access; the hostname of the endpoint is returned. * @param host hostname * @return the parsed bucket name; if "kms" is KMS signing. 
*/ diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java index 206c751945e0e..f2ba4434f2763 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java @@ -25,6 +25,7 @@ import java.util.HashMap; import java.util.Map; import java.util.Objects; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import org.junit.jupiter.api.AfterEach; @@ -60,6 +61,7 @@ import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_VALIDATION; import static org.apache.hadoop.fs.s3a.Constants.CUSTOM_SIGNERS; import static org.apache.hadoop.fs.s3a.Constants.ENABLE_MULTI_DELETE; +import static org.apache.hadoop.fs.s3a.Constants.PATH_STYLE_ACCESS; import static org.apache.hadoop.fs.s3a.Constants.SIGNING_ALGORITHM_S3; import static org.apache.hadoop.fs.s3a.MultipartTestUtils.createMagicFile; import static org.apache.hadoop.fs.s3a.S3ATestUtils.disableFilesystemCaching; @@ -80,6 +82,13 @@ public class ITestCustomSigner extends AbstractS3ATestBase { private static final String TEST_ID_KEY = "TEST_ID_KEY"; private static final String TEST_REGION_KEY = "TEST_REGION_KEY"; + /** + * Is the store using path style access? + */ + private static final AtomicBoolean PATH_STYLE_ACCESS_IN_USE = new AtomicBoolean(false); + + public static final String BUCKET = "bucket"; + /** * Parameterization. */ @@ -121,6 +130,7 @@ public void setup() throws Exception { final S3AFileSystem fs = getFileSystem(); final Configuration conf = fs.getConf(); endpoint = conf.getTrimmed(Constants.ENDPOINT, Constants.CENTRAL_ENDPOINT); + PATH_STYLE_ACCESS_IN_USE.set(conf.getBoolean(PATH_STYLE_ACCESS, false)); LOG.info("Test endpoint is {}", endpoint); regionName = conf.getTrimmed(Constants.AWS_REGION, ""); if (regionName.isEmpty()) { @@ -168,6 +178,7 @@ private S3AFileSystem runStoreOperationsAndVerify(UserGroupInformation ugi, throws IOException, InterruptedException { Configuration conf = createTestConfig(identifier); return ugi.doAs((PrivilegedExceptionAction) () -> { + LOG.info("Performing store operations for {}", ugi.getShortUserName()); int instantiationCount = CustomSigner.getInstantiationCount(); int invocationCount = CustomSigner.getInvocationCount(); S3AFileSystem fs = (S3AFileSystem)finalPath.getFileSystem(conf); @@ -288,6 +299,9 @@ public SdkHttpFullRequest sign(SdkHttpFullRequest request, String host = request.host(); String bucketName = parseBucketFromHost(host); + if (PATH_STYLE_ACCESS_IN_USE.get()) { + bucketName = BUCKET; + } try { lastStoreValue = CustomSignerInitializer .getStoreValue(bucketName, UserGroupInformation.getCurrentUser()); @@ -330,11 +344,20 @@ public static String description() { public static final class CustomSignerInitializer implements AwsSignerInitializer { + /** + * Map of (bucket-name, ugi) -> store value. + *

    + * When working with buckets using path-style resolution, the store bucket name + * is just {@link #BUCKET}. + */ private static final Map knownStores = new HashMap<>(); @Override public void registerStore(String bucketName, Configuration storeConf, DelegationTokenProvider dtProvider, UserGroupInformation storeUgi) { + if (PATH_STYLE_ACCESS_IN_USE.get()) { + bucketName = BUCKET; + } StoreKey storeKey = new StoreKey(bucketName, storeUgi); StoreValue storeValue = new StoreValue(storeConf, dtProvider); LOG.info("Registering store {} with value {}", storeKey, storeValue); @@ -344,6 +367,9 @@ public void registerStore(String bucketName, Configuration storeConf, @Override public void unregisterStore(String bucketName, Configuration storeConf, DelegationTokenProvider dtProvider, UserGroupInformation storeUgi) { + if (PATH_STYLE_ACCESS_IN_USE.get()) { + bucketName = BUCKET; + } StoreKey storeKey = new StoreKey(bucketName, storeUgi); LOG.info("Unregistering store {}", storeKey); knownStores.remove(storeKey); @@ -364,6 +390,12 @@ public static StoreValue getStoreValue(String bucketName, return storeValue; } + /** + * The key for the signer map: bucket-name and UGI. + *

    + * In path-style-access the bucket name is mapped to {@link #BUCKET} so only + * one bucket per UGI instance is supported. + */ private static class StoreKey { private final String bucketName; private final UserGroupInformation ugi; From fff2adac22dece0523d2fb4424d743da7a2ccf7d Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Thu, 16 Oct 2025 15:11:02 +0100 Subject: [PATCH 12/24] HADOOP-19654. third party stores and MPU commit retries Third-party stores may remove an ongoing MPU after it is committed, so a retry will fail. On AWS S3 it is simply ignored. New option fs.s3a.mpu.commit.consumes.upload.id can be set to true to change assertions in tests which would otherwise fail. --- .../site/markdown/tools/hadoop-aws/testing.md | 40 +++++++++++++++++-- .../ITestS3AContractMultipartUploader.java | 10 ++++- .../hadoop/fs/s3a/S3ATestConstants.java | 18 +++++++++ .../fs/s3a/commit/ITestUploadRecovery.java | 12 ++++-- 4 files changed, 72 insertions(+), 8 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md index 4176b20e8f54a..049f27c2e5bdf 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md @@ -585,6 +585,19 @@ on third party stores. test.fs.s3a.performance.enabled false + + + + fs.s3a.ext.multipart.commit.consumes.upload.id + true + + + ``` See [Third Party Stores](third_party_stores.html) for more on this topic. @@ -736,9 +749,23 @@ For stores with stricter semantics, these test cases must be disabled. ``` +### Changing expectations on multipart upload retries: `ITestS3AContractMultipartUploader` and `ITestUploadRecovery` + +If the store reports errors when trying to list/abort completed multipart uploads, +expect failures in `ITestUploadRecovery` and `ITestS3AContractMultipartUploader`. +The tests can be reconfigured to expect failure by setting the option +`fs.s3a.ext.multipart.commit.consumes.upload.id` to true. + +Note how this can be set as a per-bucket option. + +```xml + + fs.s3a.ext.multipart.commit.consumes.upload.id + true + +``` ### Tests which may fail (and which you can ignore) -* `ITestS3AContractMultipartUploader` tests `testMultipartUploadAbort` and `testSingleUpload` raising `FileNotFoundException` * `ITestS3AMiscOperations.testEmptyFileChecksums`: if the FS encrypts data always. ## Debugging Test failures @@ -837,10 +864,15 @@ Key features of `AbstractS3ATestBase` * `getFileSystem()` returns the S3A Filesystem bonded to the contract test Filesystem defined in `fs.s3a.contract.test` * will automatically skip all tests if that URL is unset. -* Extends `AbstractFSContractTestBase` and `Assert` for all their methods. +* Extends `AbstractFSContractTestBase` +* Uses AssertJ for all assertions, _not_ those of JUnit5. Having shared base classes may help reduce future maintenance too. Please -use them/ +use them. + +We adopted AssertJ assertions long before the move to JUnit5. +While there are still many tests with legacy JUnit 1.x assertions, all new test cases +should use AssertJ assertions and MUST NOT use JUnit5. ### Secure @@ -873,7 +905,7 @@ against other regions, or with third party S3 implementations. Thus the URL can be overridden for testing elsewhere. -### Works With Other S3 Stored +### Works With Other S3 Stores Don't assume AWS S3 US-East only, do allow for working with external S3 implementations. 
Those may be behind the latest S3 API features, not support encryption, session diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java index b78b0bff7fa5e..a05d09bded5b6 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java @@ -31,9 +31,11 @@ import org.junit.jupiter.api.Test; import static org.apache.hadoop.fs.contract.ContractTestUtils.skip; +import static org.apache.hadoop.fs.s3a.S3ATestConstants.DEFAULT_MULTIPART_COMMIT_CONSUMES_UPLOAD_ID; import static org.apache.hadoop.fs.s3a.S3ATestConstants.DEFAULT_SCALE_TESTS_ENABLED; import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_HUGE_PARTITION_SIZE; import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_SCALE_TESTS_ENABLED; +import static org.apache.hadoop.fs.s3a.S3ATestConstants.MULTIPART_COMMIT_CONSUMES_UPLOAD_ID; import static org.apache.hadoop.fs.s3a.S3ATestConstants.SCALE_TEST_TIMEOUT_MILLIS; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeNotS3ExpressFileSystem; @@ -56,6 +58,8 @@ public class ITestS3AContractMultipartUploader extends private int partitionSize; + private boolean mpuCommitConsumesUploadId; + /** * S3 requires a minimum part size of 5MB (except the last part). * @return 5MB+ value @@ -97,7 +101,7 @@ protected boolean supportsConcurrentUploadsToSamePath() { @Override protected boolean finalizeConsumesUploadIdImmediately() { - return false; + return mpuCommitConsumesUploadId; } @BeforeEach @@ -115,6 +119,10 @@ public void setup() throws Exception { partitionSize = (int) getTestPropertyBytes(conf, KEY_HUGE_PARTITION_SIZE, DEFAULT_HUGE_PARTITION_SIZE); + mpuCommitConsumesUploadId = getFileSystem().getConf().getBoolean( + MULTIPART_COMMIT_CONSUMES_UPLOAD_ID, + DEFAULT_MULTIPART_COMMIT_CONSUMES_UPLOAD_ID); + LOG.debug("{} = {}", MULTIPART_COMMIT_CONSUMES_UPLOAD_ID, mpuCommitConsumesUploadId); } /** diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java index 8743907545b73..6c92dacebb92f 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java @@ -284,4 +284,22 @@ public interface S3ATestConstants { * Default policy on root tests: {@value}. */ boolean DEFAULT_ROOT_TESTS_ENABLED = true; + + /** + * Flag to set when testing third party stores: {@value}. + *
+   * Set to true when a completed MPU commit consumes the ID so it is no
+   * longer visible in list operations; and abort reports {@code NoSuchUploadException}.
+   * <p>
+   * This will change assertions in relevant tests.
+   * <p>
    + * Can be set as a per-bucket setting; test runner will pick this up. + */ + String MULTIPART_COMMIT_CONSUMES_UPLOAD_ID = + "fs.s3a.ext.multipart.commit.consumes.upload.id"; + + /** + * Default value of {@link #MULTIPART_COMMIT_CONSUMES_UPLOAD_ID}: {@value}. + */ + boolean DEFAULT_MULTIPART_COMMIT_CONSUMES_UPLOAD_ID = false; } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestUploadRecovery.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestUploadRecovery.java index a82fbfa1a719c..a00d6bb6bf630 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestUploadRecovery.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestUploadRecovery.java @@ -65,6 +65,7 @@ import static org.apache.hadoop.fs.s3a.commit.CommitConstants.BASE; import static org.apache.hadoop.fs.s3a.commit.CommitConstants.MAGIC_PATH_PREFIX; import static org.apache.hadoop.fs.s3a.test.SdkFaultInjector.setRequestFailureConditions; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; /** * Test upload recovery by injecting failures into the response chain. @@ -265,12 +266,17 @@ public void testCommitOperations() throws Throwable { setRequestFailureConditions(2, SdkFaultInjector::isCompleteMultipartUploadRequest); + boolean mpuCommitConsumesUploadId = getFileSystem().getConf().getBoolean( + MULTIPART_COMMIT_CONSUMES_UPLOAD_ID, + DEFAULT_MULTIPART_COMMIT_CONSUMES_UPLOAD_ID); try (CommitContext commitContext = actions.createCommitContextForTesting(dest, JOB_ID, 0)) { - try { + + if (mpuCommitConsumesUploadId) { + intercept(FileNotFoundException.class, () -> + commitContext.commitOrFail(commit)); + } else { commitContext.commitOrFail(commit); - } catch (FileNotFoundException e) { - LOG.info("S3 Store doesn't support retries on completed commits: {}", e.toString()); } } // make sure the saved data is as expected From 9b8161791acf7b999e607d9c88089ecb281ff55c Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Thu, 16 Oct 2025 19:56:42 +0100 Subject: [PATCH 13/24] HADOOP-19654. SDK upgrade: S3 Express remove mention of CRC32 as checksum algorithm; don't assert that it is null if requested version is is unknown. 
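For context: the algorithm exercised by these tests is still selected through
the existing create-time checksum option; an illustrative setting (example
value only, not part of this change) is:

```xml
<property>
  <!-- example value; any of the supported algorithms may be used -->
  <name>fs.s3a.create.checksum.algorithm</name>
  <value>CRC32C</value>
</property>
```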
--- ...tS3AAnalyticsAcceleratorStreamReading.java | 15 ++++++++---- .../hadoop/fs/s3a/ITestS3AChecksum.java | 6 ++--- .../hadoop/fs/s3a/ITestS3AEndpointRegion.java | 23 +++++++++++++------ 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAnalyticsAcceleratorStreamReading.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAnalyticsAcceleratorStreamReading.java index 8f8f90f9b1e65..be0d2cc5b20c2 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAnalyticsAcceleratorStreamReading.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAnalyticsAcceleratorStreamReading.java @@ -164,26 +164,33 @@ public void testMultiRowGroupParquet() throws Throwable { FileStatus fileStatus = getFileSystem().getFileStatus(dest); - byte[] buffer = new byte[3000]; + final int size = 3000; + byte[] buffer = new byte[size]; + int readLimit = Math.min(size, (int) fileStatus.getLen()); IOStatistics ioStats; + final IOStatistics fsIostats = getFileSystem().getIOStatistics(); + final long initialAuditCount = fsIostats.counters() + .getOrDefault(AUDIT_REQUEST_EXECUTION, 0L); + try (FSDataInputStream inputStream = getFileSystem().open(dest)) { ioStats = inputStream.getIOStatistics(); - inputStream.readFully(buffer, 0, (int) fileStatus.getLen()); + inputStream.readFully(buffer, 0, readLimit); } verifyStatisticCounterValue(ioStats, STREAM_READ_ANALYTICS_OPENED, 1); try (FSDataInputStream inputStream = getFileSystem().openFile(dest) + .withFileStatus(fileStatus) .must(FS_OPTION_OPENFILE_READ_POLICY, FS_OPTION_OPENFILE_READ_POLICY_PARQUET) .build().get()) { ioStats = inputStream.getIOStatistics(); - inputStream.readFully(buffer, 0, (int) fileStatus.getLen()); + inputStream.readFully(buffer, 0, readLimit); } verifyStatisticCounterValue(ioStats, STREAM_READ_ANALYTICS_OPENED, 1); - verifyStatisticCounterValue(getFileSystem().getIOStatistics(), AUDIT_REQUEST_EXECUTION, 4); + verifyStatisticCounterValue(fsIostats, AUDIT_REQUEST_EXECUTION, initialAuditCount + 2); } @Test diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java index 41c694d22318b..77d6e6a16e42b 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java @@ -64,7 +64,6 @@ public class ITestS3AChecksum extends AbstractS3ATestBase { public static Collection params() { return Arrays.asList(new Object[][]{ {"SHA256"}, - {"CRC32"}, {"CRC32C"}, {"SHA1"}, {UNKNOWN}, @@ -156,12 +155,11 @@ private void assertChecksum(Path path) throws IOException { break; case UNKNOWN_TO_SDK_VERSION: // expect values to be null + // this is brittle with different stores; crc32 assertions have been cut + // because S3 express always set them. 
Assertions.assertThat(headObject.checksumSHA256()) .describedAs("headObject.checksumSHA256()") .isNull(); - Assertions.assertThat(headObject.checksumCRC32()) - .describedAs("headObject.checksumCRC32()") - .isNull(); break; default: fail("Checksum algorithm not supported: " + checksumAlgorithm); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java index e237a199b9750..721f2247c6a1a 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java @@ -56,6 +56,7 @@ import static org.apache.hadoop.fs.s3a.Constants.S3_ENCRYPTION_ALGORITHM; import static org.apache.hadoop.fs.s3a.DefaultS3ClientFactory.ERROR_ENDPOINT_WITH_FIPS; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeNotS3ExpressFileSystem; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeStoreAwsHosted; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; import static org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils.DEFAULT_REQUESTER_PAYS_BUCKET_NAME; @@ -362,7 +363,7 @@ public void testCentralEndpointAndDifferentRegionThanBucket() throws Throwable { public void testWithOutCrossRegionAccess() throws Exception { describe("Verify cross region access fails when disabled"); // skip the test if the region is sa-east-1 - skipCrossRegionTest(); + assumeCrossRegionTestSupported(); final Configuration newConf = new Configuration(getConfiguration()); removeBaseAndBucketOverrides(newConf, ENDPOINT, @@ -383,7 +384,7 @@ public void testWithOutCrossRegionAccess() throws Exception { public void testWithCrossRegionAccess() throws Exception { describe("Verify cross region access succeed when enabled"); // skip the test if the region is sa-east-1 - skipCrossRegionTest(); + assumeCrossRegionTestSupported(); final Configuration newConf = new Configuration(getConfiguration()); removeBaseAndBucketOverrides(newConf, ENDPOINT, @@ -484,7 +485,7 @@ public void testCentralEndpointAndNullRegionWithCRUD() throws Throwable { describe("Access the test bucket using central endpoint and" + " null region, perform file system CRUD operations"); final Configuration conf = getConfiguration(); - assumeStoreAwsHosted(getFileSystem()); + assumeCrossRegionTestSupported(); final Configuration newConf = new Configuration(conf); @@ -507,7 +508,7 @@ public void testCentralEndpointAndNullRegionWithCRUD() throws Throwable { public void testCentralEndpointAndNullRegionFipsWithCRUD() throws Throwable { describe("Access the test bucket using central endpoint and" + " null region and fips enabled, perform file system CRUD operations"); - assumeStoreAwsHosted(getFileSystem()); + assumeCrossRegionTestSupported(); final String bucketLocation = getFileSystem().getBucketLocation(); assume("FIPS can be enabled to access buckets from US or Canada endpoints only", @@ -534,10 +535,18 @@ public void testCentralEndpointAndNullRegionFipsWithCRUD() throws Throwable { } /** - * Skip the test if the region is null or sa-east-1. 
+ * Skip the test if the region is null, sa-east-1, or otherwise + * not compatible with the test */ - private void skipCrossRegionTest() throws IOException { - String region = getFileSystem().getS3AInternals().getBucketMetadata().bucketRegion(); + private void assumeCrossRegionTestSupported() throws IOException { + final S3AFileSystem fs = getFileSystem(); + + // not S3 as the store URLs may not resolve. + assumeNotS3ExpressFileSystem(fs); + // aws hosted. + assumeStoreAwsHosted(fs); + + String region = fs.getS3AInternals().getBucketMetadata().bucketRegion(); if (region == null || SA_EAST_1.equals(region)) { skip("Skipping test since region is null or it is set to sa-east-1"); } From 2e4769045d15a65a92782f9b38b80a2278a0dda8 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Tue, 21 Oct 2025 14:57:01 +0100 Subject: [PATCH 14/24] Cloudstore: etag command; new version New command "etag". Retrieves and prints etags. Updated release version HADOOP-19654. Update third party documentation, especially on null etags Input stream MUST be set to classic unless/until support there changes. --- .../tools/hadoop-aws/third_party_stores.md | 160 +++++++++++++----- 1 file changed, 120 insertions(+), 40 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md index a054f9e2b7545..f9ea445d4ef36 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md @@ -45,18 +45,6 @@ The features which may be unavailable include: * Variations in checksum calculation on uploads. * Requirement for Content-MD5 headers. -### Disabling Change Detection - -The (default) etag-based change detection logic expects stores to provide an Etag header in HEAD/GET requests, -and to support it as a precondition in subsequent GET and COPY calls. -If a store does not do this, disable the checks. - -```xml - - fs.s3a.change.detection.mode - none - -``` ## Connecting to a third party object store over HTTPS The core setting for a third party store is to change the endpoint in `fs.s3a.endpoint`. @@ -89,7 +77,7 @@ then these must be set, either in XML or (preferred) in a JCEKS file. fs.s3a.endpoint.region - anything + anything except: sdk, auto, ec2 @@ -106,7 +94,14 @@ then these must be set, either in XML or (preferred) in a JCEKS file. If per-bucket settings are used here, then third-party stores and credentials may be used alongside an AWS store. +### region naming +AWS SDK requires the name of a region is supplied for signing, and that region match the endpoint used. + +Third-party stores don't normally care about the name of a region, *only that a region is supplied*. + +You should set `fs.s3a.endpoint.region` to anything except the following reserved names: `sdk`, `ec2` and `auto`. +We have plans for those. ## Other issues @@ -122,7 +117,7 @@ This can be addressed in two ways #### S3Guard uploads command -This can be executed on a schedule, or manually +This can be executed on a schedule, or manually: ``` hadoop s3guard uploads -abort -force s3a://bucket/ @@ -176,7 +171,7 @@ false to disable use of these features. ``` -## Upload checksums and MD5 Headers +## Controlling Upload Checksums and MD5 Headers It may be necessary to change checksums of uploads by 1. 
Restoring the attachment of a `Content-MD5 header` in requests @@ -194,13 +189,51 @@ It may be necessary to change checksums of uploads by false Calculate and attach a message checksum on every operation. (default: true) +``` + + +### Disabling Change Detection +The (default) etag-based change detection logic expects stores to provide an Etag header in HEAD/GET requests, +and to support it as a precondition in subsequent GET and COPY calls. +If a store does not do this, disable the checks. + +```xml + + fs.s3a.change.detection.mode + none + +``` + +## Handling Null Etags + +Some object stores do not support etags, that is: they return `null` or an empty string as the etag of an object on both HEAD and GET requests. + +This breaks version management in the classic input stream *and* metadata caching in the analytics stream. + +To work with such a store: +* Set `fs.s3a.input.stream.type` to `classic` +* Set `fs.s3a.change.detection.mode` to `none` + +```xml + + fs.s3a.input.stream.type + classic + + + + fs.s3a.change.detection.mode + none + ``` +Note: the [cloudstore](https://github.com/steveloughran/cloudstore) `etag` command will retrieve and print an object's etag, +and can be used to help debug this situation. +The etag value of a newly created object SHOULD be a non-empty string. # Troubleshooting -The most common problem when talking to third-party stores are +The most common problem when talking to third-party stores are: 1. The S3A client is still configured to talk to the AWS S3 endpoint. This leads to authentication failures and/or reports that the bucket is unknown. 2. Path access has not been enabled, the client is generating a host name for the target bucket and it does not exist. @@ -208,11 +241,12 @@ The most common problem when talking to third-party stores are 4. JVM HTTPS settings include the certificates needed to negotiate a TLS connection with the store. -## How to improve troubleshooting +## How to Troubleshoot problems + +### Log More Network Info -### log more network info +There are some very low level logs which can be printed. -There are some very low level logs. ```properties # Log all HTTP requests made; includes S3 interaction. This may # include sensitive information such as account IDs in HTTP headers. @@ -226,7 +260,7 @@ log4j.logger.io.netty.handler.logging=DEBUG log4j.logger.io.netty.handler.codec.http2.Http2FrameLogger=DEBUG ``` -### Cut back on retries, shorten timeouts +### Reduce on Retries; Shorten Timeouts By default, there's a lot of retries going on in the AWS connector (which even retries on DNS failures) and in the S3A code which invokes it. @@ -286,7 +320,7 @@ the AWS SDK itself still makes a limited attempt to retry. There's an external utility, [cloudstore](https://github.com/steveloughran/cloudstore) whose [storediag](https://github.com/steveloughran/cloudstore#command-storediag) exists to debug the connection settings to hadoop cloud storage. 
```bash -hadoop jar cloudstore-1.0.jar storediag s3a://nonexistent-bucket-example/ +hadoop jar cloudstore-1.1.jar storediag s3a://nonexistent-bucket-example/ ``` The main reason it's not an ASF release is that it allows for a rapid release cycle, sometimes hours; if anyone doesn't trust @@ -437,7 +471,51 @@ Fix: path style access ``` -# Connecting to Google Cloud Storage through the S3A connector +# Settings for Specific Stores + +## Dell ECS through the S3A Connector + +As of October 2025 and the 2.33.8 AWS SDK, the settings needed to interact with Dell ECS at [ECS Test Drive](https://portal.ecstestdrive.com/) were + +```xml + + fs.s3a.region + region + arbitrary name + + + + + fs.s3a.endpoint.region + region + + + + fs.s3a.path.style.access + true + + + + fs.s3a.create.conditional.enabled + false + + + + fs.s3a.md5.header.enabled + false + re-enable calculation and inclusion of an MD5 HEADER on data upload operations (default: false) + + + + fs.s3a.checksum.calculation.enabled + false + Calculate and attach a message checksum on every operation. (default: true) + + + +``` + +# Google Cloud Storage through the S3A connector It *is* possible to connect to google cloud storage through the S3A connector. However, Google provide their own [Cloud Storage connector](https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage). @@ -466,49 +544,49 @@ this makes renaming and deleting significantly slower. - fs.s3a.bucket.gcs-container.access.key + fs.s3a.access.key GOOG1EZ.... - fs.s3a.bucket.gcs-container.secret.key + fs.s3a.secret.key SECRETS - fs.s3a.bucket.gcs-container.endpoint + fs.s3a.endpoint https://storage.googleapis.com + + - fs.s3a.bucket.gcs-container.bucket.probe - 0 + fs.s3a.endpoint.region + gcs - + - fs.s3a.bucket.gcs-container.list.version - 1 + fs.s3a.path.style.access + true - + - fs.s3a.bucket.gcs-container.multiobjectdelete.enable - false + fs.s3a.bucket.probe + 0 - fs.s3a.bucket.gcs-container.path.style.access - true + fs.s3a.list.version + 1 - - fs.s3a.bucket.gcs-container.endpoint.region - gcs + fs.s3a.multiobjectdelete.enable + false - - fs.s3a.multipart.uploads.enabled + fs.s3a.committer.magic.enabled false @@ -522,6 +600,7 @@ this makes renaming and deleting significantly slower. fs.s3a.create.conditional.enabled false + ``` @@ -554,3 +633,4 @@ It is also a way to regression test foundational S3A third-party store compatibi _Note_ If anyone is set up to test this regularly, please let the hadoop developer team know if regressions do surface, as it is not a common test configuration. +We do use it to help test compatibility during SDK updates. From 1cd8a2820c4aadeca61f3a7449c7d98fd34bb9d8 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Tue, 21 Oct 2025 19:17:28 +0100 Subject: [PATCH 15/24] HADOOP-19654. Tests to skip as appropriate when MPU is disabled. This includes one production-side change: FS automatically declares that magic committers are disabled if MPU is disabled. 
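For reference, a sketch of the kind of store configuration these skips are
aimed at: multipart uploads turned off, with disk buffering selected so that
large writes still work (illustrative values only):

```xml
<property>
  <!-- illustrative: disable MPU for stores which do not support it -->
  <name>fs.s3a.multipart.uploads.enabled</name>
  <value>false</value>
</property>
<property>
  <!-- illustrative: buffer to disk when MPU is disabled -->
  <name>fs.s3a.fast.upload.buffer</name>
  <value>disk</value>
</property>
```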
--- .../fs/s3a/commit/MagicCommitIntegration.java | 2 +- .../tools/hadoop-aws/third_party_stores.md | 8 +- .../tools/hadoop-aws/troubleshooting_s3a.md | 126 +++++++++++++++++- .../ITestS3AContractMultipartUploader.java | 2 + .../fs/s3a/ITestS3ABlockOutputArray.java | 12 ++ .../hadoop/fs/s3a/ITestS3AMultipartUtils.java | 11 ++ .../apache/hadoop/fs/s3a/S3ATestUtils.java | 26 ++++ .../hadoop/fs/s3a/auth/ITestCustomSigner.java | 21 ++- .../hadoop/fs/s3a/auth/ITestHttpSigner.java | 12 +- .../s3a/commit/AbstractITCommitProtocol.java | 2 +- .../s3a/commit/ITestCommitOperationCost.java | 2 + .../fs/s3a/commit/ITestCommitOperations.java | 1 + .../s3a/commit/ITestS3ACommitterFactory.java | 2 + .../fs/s3a/commit/ITestUploadRecovery.java | 6 +- .../integration/ITestS3ACommitterMRJob.java | 2 + .../magic/ITestS3AHugeMagicCommits.java | 9 ++ .../commit/terasort/ITestTerasortOnS3A.java | 2 + .../fs/s3a/impl/ITestTreewalkProblems.java | 2 + ...ITestUploadPurgeOnDirectoryOperations.java | 2 + .../s3a/performance/ITestS3ADeleteCost.java | 6 +- .../fs/s3a/s3guard/ITestS3GuardTool.java | 5 + .../s3a/scale/AbstractSTestS3AHugeFiles.java | 11 ++ ...ITestS3ABlockOutputStreamInterruption.java | 6 + .../scale/ITestS3AHugeFilesArrayBlocks.java | 9 ++ .../ITestS3AHugeFilesByteBufferBlocks.java | 9 ++ .../ITestS3AMultipartUploadSizeLimits.java | 9 ++ .../fs/s3a/yarn/ITestS3AMiniYarnCluster.java | 2 + 27 files changed, 286 insertions(+), 21 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/MagicCommitIntegration.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/MagicCommitIntegration.java index ba1dd400f6d7b..9ada0d565a342 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/MagicCommitIntegration.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/MagicCommitIntegration.java @@ -64,7 +64,7 @@ public MagicCommitIntegration(S3AFileSystem owner, boolean magicCommitEnabled) { super(owner.createStoreContext()); this.owner = owner; - this.magicCommitEnabled = magicCommitEnabled; + this.magicCommitEnabled = magicCommitEnabled && owner.isMultipartUploadEnabled(); } /** diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md index f9ea445d4ef36..9288b307e2a43 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md @@ -515,7 +515,7 @@ As of October 2025 and the 2.33.8 AWS SDK, the settings needed to interact with ``` -# Google Cloud Storage through the S3A connector +## Google Cloud Storage through the S3A connector It *is* possible to connect to google cloud storage through the S3A connector. However, Google provide their own [Cloud Storage connector](https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage). @@ -570,6 +570,12 @@ this makes renaming and deleting significantly slower. true + + fs.s3a.checksum.calculation.enabled + false + Calculate and attach a message checksum on every operation. 
(default: true) + + fs.s3a.bucket.probe 0 diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md index e4a0ebff40396..323854bfd75b4 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md @@ -243,7 +243,7 @@ A credential provider listed in `fs.s3a.aws.credentials.provider` does not imple the interface `software.amazon.awssdk.auth.credentials.AwsCredentialsProvider`. ``` -InstantiationIOException: `s3a://stevel-gcs/': Class org.apache.hadoop.fs.s3a.S3ARetryPolicy does not implement +InstantiationIOException: `s3a://gcs/': Class org.apache.hadoop.fs.s3a.S3ARetryPolicy does not implement software.amazon.awssdk.auth.credentials.AwsCredentialsProvider (configuration key fs.s3a.aws.credentials.provider) at org.apache.hadoop.fs.s3a.impl.InstantiationIOException.isNotInstanceOf(InstantiationIOException.java:128) at org.apache.hadoop.fs.s3a.S3AUtils.getInstanceFromReflection(S3AUtils.java:604) @@ -478,6 +478,9 @@ java.nio.file.AccessDeniedException: bucket: doesBucketExist on bucket: ``` +If working with a third-party bucket, verify the `fs.s3a.endpoint` setting +points to the third-party store. + ### `AccessDeniedException` All access to this object has been disabled Caller has no permission to access the bucket at all. @@ -602,6 +605,94 @@ Glacier. If you want to access the file with S3A after writes, do not set `fs.s3a.create.storage.class` to `glacier` or `deep_archive`. +### `AccessDeniedException` with `SignatureDoesNotMatch` on a third party bucket. + +This can surface when trying to interact, especially write data, to a third-party bucket + +``` + Writing Object on example-file: software.amazon.awssdk.services.s3.model.S3Exception: Invalid argument. (Service: S3, Status Code: 403, Request ID: null) (SDK Attempt Count: 1):SignatureDoesNotMatch +``` + +The store does not recognize checksum calculation on every operation. +Fix: disable it by setting `fs.s3a.checksum.calculation.enabled` to `false`. + +``` + + fs.s3a.checksum.calculation.enabled + false + Calculate and attach a message checksum on every operation. (default: true) + +``` + +Full stack + +``` +> bin/hadoop fs -touchz s3a://gcs/example-file +2025-10-21 16:23:27,642 [main] WARN s3a.S3ABlockOutputStream (S3ABlockOutputStream.java:progressChanged(1335)) - Transfer failure of block FileBlock{index=1, destFile=/tmp/hadoop-stevel/s3a/s3ablock-0001-1358390699869033998.tmp, state=Upload, dataSize=0, limit=-1} +2025-10-21 16:23:27,645 [main] DEBUG shell.Command (Command.java:displayError(481)) - touchz failure +java.nio.file.AccessDeniedException: example-file: Writing Object on example-file: software.amazon.awssdk.services.s3.model.S3Exception: Invalid argument. 
(Service: S3, Status Code: 403, Request ID: null) (SDK Attempt Count: 1):SignatureDoesNotMatch + at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:271) + at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:124) + at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$4(Invoker.java:376) + at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:468) + at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:372) + at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:347) + at org.apache.hadoop.fs.s3a.WriteOperationHelper.retry(WriteOperationHelper.java:210) + at org.apache.hadoop.fs.s3a.WriteOperationHelper.putObject(WriteOperationHelper.java:534) + at org.apache.hadoop.fs.s3a.S3ABlockOutputStream.putObject(S3ABlockOutputStream.java:726) + at org.apache.hadoop.fs.s3a.S3ABlockOutputStream.close(S3ABlockOutputStream.java:518) + at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.close(FSDataOutputStream.java:77) + at org.apache.hadoop.fs.FSDataOutputStream.close(FSDataOutputStream.java:106) + at org.apache.hadoop.fs.shell.TouchCommands$Touchz.touchz(TouchCommands.java:89) + at org.apache.hadoop.fs.shell.TouchCommands$Touchz.processNonexistentPath(TouchCommands.java:85) + at org.apache.hadoop.fs.shell.Command.processArgument(Command.java:303) + at org.apache.hadoop.fs.shell.Command.processArguments(Command.java:285) + at org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:121) + at org.apache.hadoop.fs.shell.Command.run(Command.java:192) + at org.apache.hadoop.fs.FsShell.run(FsShell.java:327) + at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:82) + at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:97) + at org.apache.hadoop.fs.FsShell.main(FsShell.java:390) +Caused by: software.amazon.awssdk.services.s3.model.S3Exception: Invalid argument. 
(Service: S3, Status Code: 403, Request ID: null) (SDK Attempt Count: 1) + at software.amazon.awssdk.services.s3.model.S3Exception$BuilderImpl.build(S3Exception.java:113) + at software.amazon.awssdk.services.s3.model.S3Exception$BuilderImpl.build(S3Exception.java:61) + at software.amazon.awssdk.core.internal.http.pipeline.stages.utils.RetryableStageHelper.retryPolicyDisallowedRetryException(RetryableStageHelper.java:168) + at software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage.execute(RetryableStage.java:73) + at software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage.execute(RetryableStage.java:36) + at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206) + at software.amazon.awssdk.core.internal.http.StreamManagingStage.execute(StreamManagingStage.java:53) + at software.amazon.awssdk.core.internal.http.StreamManagingStage.execute(StreamManagingStage.java:35) + at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.executeWithTimer(ApiCallTimeoutTrackingStage.java:82) + at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.execute(ApiCallTimeoutTrackingStage.java:62) + at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.execute(ApiCallTimeoutTrackingStage.java:43) + at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:50) + at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:32) + at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206) + at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206) + at software.amazon.awssdk.core.internal.http.pipeline.stages.ExecutionFailureExceptionReportingStage.execute(ExecutionFailureExceptionReportingStage.java:37) + at software.amazon.awssdk.core.internal.http.pipeline.stages.ExecutionFailureExceptionReportingStage.execute(ExecutionFailureExceptionReportingStage.java:26) + at software.amazon.awssdk.core.internal.http.AmazonSyncHttpClient$RequestExecutionBuilderImpl.execute(AmazonSyncHttpClient.java:210) + at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.invoke(BaseSyncClientHandler.java:103) + at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.doExecute(BaseSyncClientHandler.java:173) + at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.lambda$execute$1(BaseSyncClientHandler.java:80) + at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.measureApiCallSuccess(BaseSyncClientHandler.java:182) + at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.execute(BaseSyncClientHandler.java:74) + at software.amazon.awssdk.core.client.handler.SdkSyncClientHandler.execute(SdkSyncClientHandler.java:45) + at software.amazon.awssdk.awscore.client.handler.AwsSyncClientHandler.execute(AwsSyncClientHandler.java:53) + at software.amazon.awssdk.services.s3.DefaultS3Client.putObject(DefaultS3Client.java:11883) + at software.amazon.awssdk.services.s3.DelegatingS3Client.lambda$putObject$89(DelegatingS3Client.java:9716) + at 
software.amazon.awssdk.services.s3.internal.crossregion.S3CrossRegionSyncClient.invokeOperation(S3CrossRegionSyncClient.java:67) + at software.amazon.awssdk.services.s3.DelegatingS3Client.putObject(DelegatingS3Client.java:9716) + at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$putObjectDirect$14(S3AFileSystem.java:3332) + at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfSupplier(IOStatisticsBinding.java:650) + at org.apache.hadoop.fs.s3a.S3AFileSystem.putObjectDirect(S3AFileSystem.java:3330) + at org.apache.hadoop.fs.s3a.WriteOperationHelper.lambda$putObject$7(WriteOperationHelper.java:535) + at org.apache.hadoop.fs.store.audit.AuditingFunctions.lambda$withinAuditSpan$0(AuditingFunctions.java:62) + at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:122) + ... 20 more + touchz: example-file: Writing Object on example-file: software.amazon.awssdk.services.s3.model.S3Exception: Invalid argument. (Service: S3, Status Code: 403, Request ID: null) (SDK Attempt Count: 1):SignatureDoesNotMatch +``` + ### "Unable to find a region via the region provider chain." when using session credentials. Region must be provided when requesting session credentials, or an exception will be thrown with the @@ -1393,6 +1484,39 @@ connections more frequently. Something has been trying to write data to "/". +### "Unable to create OutputStream with the given multipart upload and buffer configuration." + +This error is raised when an attemt it made to write to a store with +`fs.s3a.multipart.uploads.enabled` set to `false` and `fs.s3a.fast.upload.buffer` set to array. + +This is pre-emptively disabled before a write of so much data takes place that the process runs out of heap space. + +If the store doesn't support multipart uploads, _use disk for buffering_. +Nothing else is safe to use as it leads to a state where small jobs work, but those which generate large amounts of data fail. + +```xml + + fs.s3a.fast.upload.buffer + disk + +``` + +``` +org.apache.hadoop.fs.PathIOException: `s3a://gcs/a2a8c3e4-5788-40c0-ad66-fe3fe63f4507': Unable to create OutputStream with the given multipart upload and buffer configuration. 
+ at org.apache.hadoop.fs.s3a.S3AUtils.validateOutputStreamConfiguration(S3AUtils.java:985) + at org.apache.hadoop.fs.s3a.S3AFileSystem.innerCreateFile(S3AFileSystem.java:2201) + at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$create$5(S3AFileSystem.java:2068) + at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.invokeTrackingDuration(IOStatisticsBinding.java:546) + at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:527) + at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:448) + at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2881) + at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2900) + at org.apache.hadoop.fs.s3a.S3AFileSystem.create(S3AFileSystem.java:2067) + at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1233) + at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1210) + at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1091) +``` + ## Best Practises ### Enabling low-level logging diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java index a05d09bded5b6..61cd45e508879 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java @@ -38,6 +38,7 @@ import static org.apache.hadoop.fs.s3a.S3ATestConstants.MULTIPART_COMMIT_CONSUMES_UPLOAD_ID; import static org.apache.hadoop.fs.s3a.S3ATestConstants.SCALE_TEST_TIMEOUT_MILLIS; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeNotS3ExpressFileSystem; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestPropertyBool; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestPropertyBytes; @@ -116,6 +117,7 @@ public void setup() throws Exception { assume("Scale test disabled: to enable set property " + KEY_SCALE_TESTS_ENABLED, enabled); + assumeMultipartUploads(getFileSystem().getConf()); partitionSize = (int) getTestPropertyBytes(conf, KEY_HUGE_PARTITION_SIZE, DEFAULT_HUGE_PARTITION_SIZE); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABlockOutputArray.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABlockOutputArray.java index 18f665ecef2ce..0e43ee3f2ad61 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABlockOutputArray.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABlockOutputArray.java @@ -28,6 +28,7 @@ import org.apache.hadoop.io.IOUtils; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import java.io.IOException; @@ -36,6 +37,7 @@ import static org.apache.hadoop.fs.StreamCapabilities.ABORTABLE_STREAM; import static org.apache.hadoop.fs.s3a.Constants.*; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfNotEnabled; import static org.apache.hadoop.fs.s3a.test.ExtraAssertions.assertCompleteAbort; import static org.apache.hadoop.fs.s3a.test.ExtraAssertions.assertNoopAbort; @@ -66,6 +68,16 @@ protected Configuration 
createConfiguration() { return conf; } + @Override + @BeforeEach + public void setup() throws Exception { + super.setup(); + + skipIfNotEnabled(getFileSystem().getConf(), + MULTIPART_UPLOADS_ENABLED, + "Store has disabled multipart uploads; skipping tests"); + } + protected String getBlockOutputBufferName() { return FAST_UPLOAD_BUFFER_ARRAY; } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMultipartUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMultipartUtils.java index 1fcc41a3bde28..890c95aba97de 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMultipartUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMultipartUtils.java @@ -19,6 +19,7 @@ package org.apache.hadoop.fs.s3a; import org.assertj.core.api.Assertions; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import software.amazon.awssdk.services.s3.model.MultipartUpload; @@ -31,6 +32,9 @@ import java.util.HashSet; import java.util.Set; +import static org.apache.hadoop.fs.s3a.Constants.MULTIPART_UPLOADS_ENABLED; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfNotEnabled; import static org.apache.hadoop.util.functional.RemoteIterators.foreach; /** @@ -54,6 +58,13 @@ protected Configuration createConfiguration() { return conf; } + @Override + @BeforeEach + public void setup() throws Exception { + super.setup(); + assumeMultipartUploads(getFileSystem().getConf()); + } + /** * Main test case for upload part listing and iterator paging. * @throws Exception on failure. diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java index 07af6e6b1aa41..1eb302a0ca137 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java @@ -533,6 +533,32 @@ public static void skipIfNotEnabled(final Configuration configuration, } } + /** + * Skip a test suite/case if a configuration option is true. + * @param configuration configuration to probe + * @param key key to resolve + * @param defVal default value. + * @param message text for the skip report + */ + public static void skipIfEnabled(final Configuration configuration, + final String key, + final boolean defVal, + final String message) { + if (configuration.getBoolean(key, defVal)) { + skip(message); + } + } + + /** + * Require multipart uploads; skip tests if not enabled in the configuration. + * @param conf filesystem configuration. + */ + public static void assumeMultipartUploads(Configuration conf) { + skipIfNotEnabled(conf, + MULTIPART_UPLOADS_ENABLED, + "Store has disabled multipart uploads; skipping tests"); + } + /** * Skip a test if storage class tests are disabled, * or the bucket is an S3Express bucket. 
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java index f2ba4434f2763..3833e2c9eb9c8 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestCustomSigner.java @@ -66,6 +66,7 @@ import static org.apache.hadoop.fs.s3a.MultipartTestUtils.createMagicFile; import static org.apache.hadoop.fs.s3a.S3ATestUtils.disableFilesystemCaching; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfNotEnabled; /** * Tests for custom Signers and SignerInitializers. @@ -129,6 +130,9 @@ public void setup() throws Exception { super.setup(); final S3AFileSystem fs = getFileSystem(); final Configuration conf = fs.getConf(); + if (bulkDelete) { + skipIfNotEnabled(conf, ENABLE_MULTI_DELETE, "no bulk delete"); + } endpoint = conf.getTrimmed(Constants.ENDPOINT, Constants.CENTRAL_ENDPOINT); PATH_STYLE_ACCESS_IN_USE.set(conf.getBoolean(PATH_STYLE_ACCESS, false)); LOG.info("Test endpoint is {}", endpoint); @@ -212,11 +216,13 @@ private S3AFileSystem runStoreOperationsAndVerify(UserGroupInformation ugi, ContractTestUtils.touch(fs, new Path(subdir, "file1")); // create a magic file. - createMagicFile(fs, subdir); - ContentSummary summary = fs.getContentSummary(finalPath); - fs.getS3AInternals().abortMultipartUploads(subdir); - fs.rename(subdir, new Path(finalPath, "renamed")); - fs.delete(finalPath, true); + if (fs.isMagicCommitEnabled()) { + createMagicFile(fs, subdir); + ContentSummary summary = fs.getContentSummary(finalPath); + fs.getS3AInternals().abortMultipartUploads(subdir); + fs.rename(subdir, new Path(finalPath, "renamed")); + fs.delete(finalPath, true); + } return fs; }); } @@ -230,12 +236,13 @@ private S3AFileSystem runStoreOperationsAndVerify(UserGroupInformation ugi, private Configuration createTestConfig(String identifier) { Configuration conf = createConfiguration(); + // bulk delete is not disabled; if it has been set to false by the bucket + // then one of the test runs will be skipped. removeBaseAndBucketOverrides(conf, CHECKSUM_ALGORITHM, CHECKSUM_VALIDATION, CUSTOM_SIGNERS, - SIGNING_ALGORITHM_S3, - ENABLE_MULTI_DELETE); + SIGNING_ALGORITHM_S3); conf.set(CUSTOM_SIGNERS, "CustomS3Signer:" + CustomSigner.class.getName() + ":" + CustomSignerInitializer.class.getName()); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestHttpSigner.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestHttpSigner.java index 1a60c012ba7c8..a4ffae9077cb3 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestHttpSigner.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestHttpSigner.java @@ -144,11 +144,13 @@ private S3AFileSystem runStoreOperationsAndVerify(UserGroupInformation ugi, ContractTestUtils.touch(fs, new Path(subdir, "file1")); // create a magic file. 
- createMagicFile(fs, subdir); - ContentSummary summary = fs.getContentSummary(finalPath); - fs.getS3AInternals().abortMultipartUploads(subdir); - fs.rename(subdir, new Path(finalPath, "renamed")); - fs.delete(finalPath, true); + if (fs.isMagicCommitEnabled()) { + createMagicFile(fs, subdir); + ContentSummary summary = fs.getContentSummary(finalPath); + fs.getS3AInternals().abortMultipartUploads(subdir); + fs.rename(subdir, new Path(finalPath, "renamed")); + fs.delete(finalPath, true); + } return fs; }); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractITCommitProtocol.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractITCommitProtocol.java index 5fc5cf9eb99c2..b270b5de70342 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractITCommitProtocol.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractITCommitProtocol.java @@ -173,7 +173,7 @@ public void setup() throws Exception { taskAttempt0 = TaskAttemptID.forName(attempt0); attempt1 = "attempt_" + jobId + "_m_000001_0"; taskAttempt1 = TaskAttemptID.forName(attempt1); - + assumeMultipartUploads(getFileSystem().getConf()); outDir = path(getMethodName()); abortMultipartUploadsUnderPath(outDir); cleanupDestDir(); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestCommitOperationCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestCommitOperationCost.java index 9ad2c0625a094..fcc24102cc42a 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestCommitOperationCost.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestCommitOperationCost.java @@ -42,6 +42,7 @@ import org.apache.hadoop.fs.s3a.performance.AbstractS3ACostTest; import org.apache.hadoop.fs.statistics.IOStatisticsLogging; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfAnalyticsAcceleratorEnabled; import static org.apache.hadoop.fs.s3a.Statistic.ACTION_HTTP_GET_REQUEST; import static org.apache.hadoop.fs.s3a.Statistic.COMMITTER_MAGIC_FILES_CREATED; @@ -84,6 +85,7 @@ public class ITestCommitOperationCost extends AbstractS3ACostTest { @Override public void setup() throws Exception { super.setup(); + assumeMultipartUploads(getFileSystem().getConf()); testHelper = new CommitterTestHelper(getFileSystem()); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestCommitOperations.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestCommitOperations.java index ddd306ddc25ad..005235cd7516d 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestCommitOperations.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestCommitOperations.java @@ -94,6 +94,7 @@ protected Configuration createConfiguration() { public void setup() throws Exception { FileSystem.closeAll(); super.setup(); + assumeMultipartUploads(getFileSystem().getConf()); verifyIsMagicCommitFS(getFileSystem()); progress = new ProgressCounter(); progress.assertCount("progress", 0); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestS3ACommitterFactory.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestS3ACommitterFactory.java index 2956bfc3baa7e..fb18ccb904444 100644 --- 
a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestS3ACommitterFactory.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestS3ACommitterFactory.java @@ -47,6 +47,7 @@ import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; import org.apache.hadoop.security.UserGroupInformation; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; import static org.apache.hadoop.fs.s3a.commit.CommitConstants.*; import static org.apache.hadoop.fs.s3a.commit.InternalCommitterConstants.COMMITTER_NAME_STAGING; @@ -186,6 +187,7 @@ public void setup() throws Exception { // destroy all filesystems from previous runs. FileSystem.closeAllForUGI(UserGroupInformation.getCurrentUser()); super.setup(); + assumeMultipartUploads(getFileSystem().getConf()); jobId = randomJobId(); attempt0 = "attempt_" + jobId + "_m_000000_0"; taskAttempt0 = TaskAttemptID.forName(attempt0); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestUploadRecovery.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestUploadRecovery.java index a00d6bb6bf630..8e03deffc8faf 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestUploadRecovery.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/ITestUploadRecovery.java @@ -60,6 +60,7 @@ import static org.apache.hadoop.fs.s3a.Constants.FS_S3A_CREATE_PERFORMANCE; import static org.apache.hadoop.fs.s3a.Constants.MAX_ERROR_RETRIES; import static org.apache.hadoop.fs.s3a.Constants.RETRY_HTTP_5XX_ERRORS; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; import static org.apache.hadoop.fs.s3a.audit.S3AAuditConstants.AUDIT_EXECUTION_INTERCEPTORS; import static org.apache.hadoop.fs.s3a.commit.CommitConstants.BASE; @@ -161,6 +162,10 @@ public Configuration createConfiguration() { public void setup() throws Exception { SdkFaultInjector.resetFaultInjector(); super.setup(); + if (!FAST_UPLOAD_BUFFER_DISK.equals(buffer)) { + assumeMultipartUploads(getFileSystem().getConf()); + } + } @AfterEach @@ -169,7 +174,6 @@ public void teardown() throws Exception { // safety check in case the evaluation is failing any // request needed in cleanup. 
SdkFaultInjector.resetFaultInjector(); - super.teardown(); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/integration/ITestS3ACommitterMRJob.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/integration/ITestS3ACommitterMRJob.java index 8ef109d10a929..e091ee344df51 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/integration/ITestS3ACommitterMRJob.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/integration/ITestS3ACommitterMRJob.java @@ -73,6 +73,7 @@ import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.DurationInfo; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.S3ATestUtils.disableFilesystemCaching; import static org.apache.hadoop.fs.s3a.S3ATestUtils.lsR; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; @@ -184,6 +185,7 @@ public ITestS3ACommitterMRJob( @BeforeEach public void setup() throws Exception { super.setup(); + assumeMultipartUploads(getFileSystem().getConf()); // configure the test binding for this specific test case. committerTestBinding.setup(getClusterBinding(), getFileSystem()); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/magic/ITestS3AHugeMagicCommits.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/magic/ITestS3AHugeMagicCommits.java index edcc7bfbf8310..2b8ca4bed196d 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/magic/ITestS3AHugeMagicCommits.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/magic/ITestS3AHugeMagicCommits.java @@ -119,6 +119,15 @@ public void setup() throws Exception { pendingDataFile = new Path(jobDir, filename + PENDING_SUFFIX); } + /** + * Skip this test suite when MPUs are not available. + * @return true + */ + @Override + protected boolean requireMultipartUploads() { + return true; + } + /** * Returns the path to the commit metadata file, not that of the huge file. 
* @return a file in the job dir diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/terasort/ITestTerasortOnS3A.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/terasort/ITestTerasortOnS3A.java index 536a158dbb4b1..ae6bd591315fe 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/terasort/ITestTerasortOnS3A.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/terasort/ITestTerasortOnS3A.java @@ -57,6 +57,7 @@ import org.apache.hadoop.util.ToolRunner; import static java.util.Optional.empty; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.S3ATestUtils.lsR; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assumptions.assumeTrue; @@ -148,6 +149,7 @@ protected String committerName() { public void setup() throws Exception { super.setup(); requireScaleTestsEnabled(); + assumeMultipartUploads(getFileSystem().getConf()); prepareToTerasort(); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestTreewalkProblems.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestTreewalkProblems.java index 965512f3a0d3c..a2b016cf53326 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestTreewalkProblems.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestTreewalkProblems.java @@ -58,6 +58,7 @@ import static org.apache.hadoop.fs.s3a.MultipartTestUtils.clearAnyUploads; import static org.apache.hadoop.fs.s3a.MultipartTestUtils.createMagicFile; import static org.apache.hadoop.fs.s3a.MultipartTestUtils.magicPath; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; import static org.apache.hadoop.fs.s3a.S3ATestUtils.toPathList; import static org.apache.hadoop.fs.s3a.S3AUtils.HIDDEN_FILE_FILTER; @@ -106,6 +107,7 @@ public void setup() throws Exception { final S3AFileSystem fs = getFileSystem(); final Path path = methodPath(); assertHasPathCapabilities(fs, path, DIRECTORY_OPERATIONS_PURGE_UPLOADS); + assumeMultipartUploads(fs.getConf()); listingInconsistent = fs.hasPathCapability(path, DIRECTORY_LISTING_INCONSISTENT); clearAnyUploads(fs, path); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestUploadPurgeOnDirectoryOperations.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestUploadPurgeOnDirectoryOperations.java index 301f348981462..5c12dfedee626 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestUploadPurgeOnDirectoryOperations.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestUploadPurgeOnDirectoryOperations.java @@ -36,6 +36,7 @@ import static org.apache.hadoop.fs.s3a.Constants.DIRECTORY_OPERATIONS_PURGE_UPLOADS; import static org.apache.hadoop.fs.s3a.MultipartTestUtils.clearAnyUploads; import static org.apache.hadoop.fs.s3a.MultipartTestUtils.createMagicFile; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; import static org.apache.hadoop.fs.s3a.Statistic.MULTIPART_UPLOAD_LIST; import static org.apache.hadoop.fs.s3a.Statistic.OBJECT_MULTIPART_UPLOAD_ABORTED; @@ -65,6 +66,7 @@ public Configuration createConfiguration() { 
public void setup() throws Exception { super.setup(); final S3AFileSystem fs = getFileSystem(); + assumeMultipartUploads(fs.getConf()); assertHasPathCapabilities(fs, new Path("/"), DIRECTORY_OPERATIONS_PURGE_UPLOADS); clearAnyUploads(fs, methodPath()); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3ADeleteCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3ADeleteCost.java index 27276655541cb..63f4ec7f1bd00 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3ADeleteCost.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3ADeleteCost.java @@ -97,14 +97,12 @@ public void testDeleteSingleFileInDir() throws Throwable { FILESTATUS_FILE_PROBE_L + FILESTATUS_DIR_PROBE_L), with(DIRECTORIES_DELETED, 0), with(FILES_DELETED, 1), - // a single DELETE call is made to delete the object - probe(bulkDelete, OBJECT_DELETE_REQUEST, DELETE_OBJECT_REQUEST), - probe(!bulkDelete, OBJECT_DELETE_REQUEST, - DELETE_OBJECT_REQUEST + DELETE_MARKER_REQUEST), + with(OBJECT_DELETE_REQUEST, DELETE_OBJECT_REQUEST), // create no parent dirs or delete parents with(DIRECTORIES_CREATED, 0), + // even when bulk delete is enabled, there is no use of this. with(OBJECT_BULK_DELETE_REQUEST, 0) ); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java index 995a20ec6b6a9..dad1c87afa377 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardTool.java @@ -44,6 +44,7 @@ import static org.apache.hadoop.fs.s3a.MultipartTestUtils.clearAnyUploads; import static org.apache.hadoop.fs.s3a.MultipartTestUtils.countUploadsAt; import static org.apache.hadoop.fs.s3a.MultipartTestUtils.createPartUpload; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.BucketInfo; import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.E_BAD_STATE; @@ -129,6 +130,7 @@ public void testStoreInfoFips() throws Throwable { @Test public void testUploads() throws Throwable { + assumeMultipartUploads(getFileSystem().getConf()); S3AFileSystem fs = getFileSystem(); Path path = methodPath(); Path file = new Path(path, UPLOAD_NAME); @@ -173,14 +175,17 @@ public void testUploads() throws Throwable { } } + @Test public void testUploadListByAge() throws Throwable { S3AFileSystem fs = getFileSystem(); Path path = methodPath(); Path file = new Path(path, UPLOAD_NAME); + assumeMultipartUploads(getFileSystem().getConf()); describe("Cleaning up any leftover uploads from previous runs."); + // 1. 
Make sure key doesn't already exist clearAnyUploads(fs, path); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/AbstractSTestS3AHugeFiles.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/AbstractSTestS3AHugeFiles.java index 9c864d10dd37c..591aebfc7d60c 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/AbstractSTestS3AHugeFiles.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/AbstractSTestS3AHugeFiles.java @@ -108,6 +108,9 @@ public void setup() throws Exception { uploadBlockSize = uploadBlockSize(); filesize = getTestPropertyBytes(getConf(), KEY_HUGE_FILESIZE, DEFAULT_HUGE_FILESIZE); + if (requireMultipartUploads()) { + assumeMultipartUploads(getFileSystem().getConf()); + } } /** @@ -129,6 +132,14 @@ public String getTestSuiteName() { return getBlockOutputBufferName(); } + /** + * Override point: does this test suite require MPUs? + * @return true if the test suite must be skipped if MPUS are off. + */ + protected boolean requireMultipartUploads() { + return false; + } + /** * Note that this can get called before test setup. * @return the configuration to use. diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3ABlockOutputStreamInterruption.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3ABlockOutputStreamInterruption.java index c04261efc1390..1bb99ee139827 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3ABlockOutputStreamInterruption.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3ABlockOutputStreamInterruption.java @@ -56,6 +56,7 @@ import static org.apache.hadoop.fs.s3a.Constants.MAX_ERROR_RETRIES; import static org.apache.hadoop.fs.s3a.Constants.MULTIPART_SIZE; import static org.apache.hadoop.fs.s3a.Constants.RETRY_HTTP_5XX_ERRORS; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestPropertyInt; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; import static org.apache.hadoop.fs.s3a.Statistic.OBJECT_MULTIPART_UPLOAD_ABORTED; @@ -84,6 +85,9 @@ *

    * Marked as a scale test even though it tries to aggressively abort streams being written * and should, if working, complete fast. + *

+ * Assumes multipart uploads are enabled; single part upload interruptions aren't the complicated + * ones. */ @ParameterizedClass(name = "{0}-{1}") @MethodSource("params") @@ -171,6 +175,7 @@ protected Configuration createScaleConfiguration() { public void setup() throws Exception { SdkFaultInjector.resetFaultInjector(); super.setup(); + assumeMultipartUploads(getFileSystem().getConf()); } @AfterEach @@ -287,6 +292,7 @@ public void testAbortDuringUpload() throws Throwable { @Test public void testPartUploadFailure() throws Throwable { describe("Trigger a failure during a multipart upload"); + assumeMultipartUploads(getFileSystem().getConf()); int len = 6 * _1MB; final byte[] dataset = dataset(len, 'a', 'z' - 'a'); final String text = "Simulated failure"; diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AHugeFilesArrayBlocks.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AHugeFilesArrayBlocks.java index 33dfdc6db6aff..4b8322b002a2d 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AHugeFilesArrayBlocks.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AHugeFilesArrayBlocks.java @@ -30,4 +30,13 @@ public class ITestS3AHugeFilesArrayBlocks extends AbstractSTestS3AHugeFiles { protected String getBlockOutputBufferName() { return Constants.FAST_UPLOAD_BUFFER_ARRAY; } + + /** + * Skip this test suite when MPUs are not available. + * @return true + */ + @Override + protected boolean requireMultipartUploads() { + return true; + } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AHugeFilesByteBufferBlocks.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AHugeFilesByteBufferBlocks.java index 3b7b4caae9d8b..ff660bf35beef 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AHugeFilesByteBufferBlocks.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AHugeFilesByteBufferBlocks.java @@ -42,6 +42,15 @@ protected String getBlockOutputBufferName() { return FAST_UPLOAD_BYTEBUFFER; } + /** + * Skip this test suite when MPUs are not available. + * @return true + */ + @Override + protected boolean requireMultipartUploads() { + return true; + } + /** * Rename the parent directory, rather than the file itself. 
* @param src source file diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AMultipartUploadSizeLimits.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AMultipartUploadSizeLimits.java index cee917de0a486..a00126e4598b7 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AMultipartUploadSizeLimits.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AMultipartUploadSizeLimits.java @@ -21,6 +21,7 @@ import java.io.File; import org.assertj.core.api.Assertions; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.apache.commons.io.FileUtils; @@ -44,6 +45,7 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.verifyFileContents; import static org.apache.hadoop.fs.contract.ContractTestUtils.writeTextFile; import static org.apache.hadoop.fs.s3a.Constants.MULTIPART_SIZE; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; import static org.apache.hadoop.fs.s3a.Statistic.INVOCATION_ABORT; import static org.apache.hadoop.fs.s3a.Statistic.OBJECT_MULTIPART_UPLOAD_ABORTED; @@ -76,6 +78,13 @@ protected Configuration createScaleConfiguration() { return configuration; } + @Override + @BeforeEach + public void setup() throws Exception { + super.setup(); + assumeMultipartUploads(getFileSystem().getConf()); + } + /** * Uploads under the limit are valid. */ diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/yarn/ITestS3AMiniYarnCluster.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/yarn/ITestS3AMiniYarnCluster.java index c74faa24f9933..c816746b0799f 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/yarn/ITestS3AMiniYarnCluster.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/yarn/ITestS3AMiniYarnCluster.java @@ -46,6 +46,7 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.commit.CommitConstants.FS_S3A_COMMITTER_NAME; import static org.apache.hadoop.fs.s3a.commit.CommitConstants.FS_S3A_COMMITTER_STAGING_UNIQUE_FILENAMES; import static org.apache.hadoop.fs.s3a.commit.CommitConstants._SUCCESS; @@ -74,6 +75,7 @@ public void setup() throws Exception { super.setup(); S3AFileSystem fs = getFileSystem(); Configuration conf = getConfiguration(); + assumeMultipartUploads(conf); rootPath = path("MiniClusterWordCount"); Path workingDir = path("working"); fs.setWorkingDirectory(workingDir); From 96c3f38ea5e033636e1acdb8fe2ed4b398bedb08 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Tue, 21 Oct 2025 19:40:51 +0100 Subject: [PATCH 16/24] HADOOP-19654. Tests to skip as appropriate when MPU is disabled. Final(?) wrap-up of tests which fail on a store without multipart upload or bulk delete. 
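The skip mechanism shared by these suites is a setup-time assumption; as a representative sketch, mirroring the ITestS3AMultipartUploadSizeLimits hunk earlier in this series (the static import of the helper is assumed):

```java
// assumes: import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads;
@Override
@BeforeEach
public void setup() throws Exception {
  super.setup();
  // skip the whole suite when the target store has multipart uploads disabled
  assumeMultipartUploads(getFileSystem().getConf());
}
```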
--- .../fs/s3a/impl/ITestConnectionTimeouts.java | 29 ++++++++++--------- .../fs/s3a/yarn/ITestS3AMiniYarnCluster.java | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestConnectionTimeouts.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestConnectionTimeouts.java index 09911b339d2b9..3b64541e97c80 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestConnectionTimeouts.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestConnectionTimeouts.java @@ -259,20 +259,21 @@ public void testObjectUploadTimeouts() throws Throwable { // and try a multipart upload to verify that its requests also outlast // the short requests - SdkFaultInjector.setRequestFailureConditions(999, - SdkFaultInjector::isPartUpload); - Path magicFile = new Path(dir, MAGIC_PATH_PREFIX + "0001/__base/file2"); - totalSleepTime.set(0); - OperationDuration dur2 = new DurationInfo(LOG, "Creating File"); - ContractTestUtils.createFile(brittleFS, magicFile, true, DATASET); - dur2.finished(); - Assertions.assertThat(totalSleepTime.get()) - .describedAs("total sleep time of magic write") - .isGreaterThan(0); - Assertions.assertThat(dur2.asDuration()) - .describedAs("Duration of magic write") - .isGreaterThan(shortTimeout); - brittleFS.delete(dir, true); + if (fs.isMagicCommitEnabled()) { + SdkFaultInjector.setRequestFailureConditions(999, + SdkFaultInjector::isPartUpload); + Path magicFile = new Path(dir, MAGIC_PATH_PREFIX + "0001/__base/file2"); + totalSleepTime.set(0); + OperationDuration dur2 = new DurationInfo(LOG, "Creating File"); + ContractTestUtils.createFile(brittleFS, magicFile, true, DATASET); + dur2.finished(); + Assertions.assertThat(totalSleepTime.get()) + .describedAs("total sleep time of magic write") + .isGreaterThan(0); + Assertions.assertThat(dur2.asDuration()) + .describedAs("Duration of magic write") + .isGreaterThan(shortTimeout); + } } } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/yarn/ITestS3AMiniYarnCluster.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/yarn/ITestS3AMiniYarnCluster.java index c816746b0799f..9c8773b1d995e 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/yarn/ITestS3AMiniYarnCluster.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/yarn/ITestS3AMiniYarnCluster.java @@ -75,7 +75,7 @@ public void setup() throws Exception { super.setup(); S3AFileSystem fs = getFileSystem(); Configuration conf = getConfiguration(); - assumeMultipartUploads(conf); + assumeMultipartUploads(fs.getConf()); rootPath = path("MiniClusterWordCount"); Path workingDir = path("working"); fs.setWorkingDirectory(workingDir); From 1527ba88936c3f19ff857792495c787b72421792 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Wed, 22 Oct 2025 21:09:25 +0100 Subject: [PATCH 17/24] HADOOP-19654. Yetus Address style, trailing space and javadoc. Not addressed: the new spotbugs errors. These are not from this patch. 
--- .../hadoop/fs/s3a/AWSNoResponseException.java | 6 +++ .../apache/hadoop/fs/s3a/S3ClientFactory.java | 20 ++++++++++ .../hadoop/fs/s3a/WriteOperationHelper.java | 1 - .../hadoop/fs/s3a/auth/RolePolicies.java | 1 - .../tools/hadoop-aws/third_party_stores.md | 18 ++++----- .../tools/hadoop-aws/troubleshooting_s3a.md | 38 +++++++++---------- .../ITestS3AContractMultipartUploader.java | 4 +- .../fs/s3a/ITestS3ABucketExistence.java | 2 - .../hadoop/fs/s3a/ITestS3AEndpointRegion.java | 2 +- .../hadoop/fs/s3a/ITestS3AMultipartUtils.java | 2 - .../hadoop/fs/s3a/auth/ITestAssumeRole.java | 3 +- .../ITestS3APutIfMatchAndIfNoneMatch.java | 7 ++-- .../s3a/performance/ITestCreateFileCost.java | 2 +- 13 files changed, 62 insertions(+), 44 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSNoResponseException.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSNoResponseException.java index b8562714b1aae..49ebd3a42fdf4 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSNoResponseException.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSNoResponseException.java @@ -24,6 +24,12 @@ * Status code 443, no response from server. This is considered idempotent. */ public class AWSNoResponseException extends AWSServiceIOException { + + /** + * Constructor. + * @param operation operation in progress. + * @param cause inner cause + */ public AWSNoResponseException(String operation, AwsServiceException cause) { super(operation, cause); diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java index ca0ed39a848df..58d3813075695 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java @@ -271,10 +271,18 @@ public S3ClientCreationParameters withRequesterPays( return this; } + /** + * Is this a requester pays bucket? + * @return true if the bucket is requester pays. + */ public boolean isRequesterPays() { return requesterPays; } + /** + * Get the credentials. + * @return the credential provider. + */ public AwsCredentialsProvider getCredentialSet() { return credentialSet; } @@ -291,6 +299,10 @@ public S3ClientCreationParameters withCredentialSet( return this; } + /** + * Get UA suffix. + * @return suffix. + */ public String getUserAgentSuffix() { return userAgentSuffix; } @@ -594,10 +606,18 @@ public S3ClientCreationParameters withChecksumValidationEnabled(final boolean va return this; } + /** + * Is checksum validation on every request enabled? + * @return true if validation is on every request. + */ public boolean isChecksumValidationEnabled() { return checksumValidationEnabled; } + /** + * Should MD5 headers be added? + * @return true to always add an MD5 header. 
+ */ public boolean isMd5HeaderEnabled() { return md5HeaderEnabled; } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java index 3f7456b499910..f0a9583cf63fb 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java @@ -34,7 +34,6 @@ import software.amazon.awssdk.services.s3.model.MultipartUpload; import software.amazon.awssdk.services.s3.model.PutObjectRequest; import software.amazon.awssdk.services.s3.model.PutObjectResponse; -import software.amazon.awssdk.services.s3.model.S3Exception; import software.amazon.awssdk.services.s3.model.UploadPartRequest; import software.amazon.awssdk.services.s3.model.UploadPartResponse; import org.slf4j.Logger; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java index 697f788ccb8ab..9cd68a5ba8ea8 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java @@ -18,7 +18,6 @@ package org.apache.hadoop.fs.s3a.auth; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md index 9288b307e2a43..17cd7b0627808 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md @@ -55,7 +55,7 @@ path style access must also be enabled in `fs.s3a.path.style.access`. The v4 signing algorithm requires a region to be set in `fs.s3a.endpoint.region`. A non-empty value is generally sufficient, though some deployments may require -a specific value. +a specific value. *Important:* do not use `auto` or `sdk` as these may be used in the future for specific region binding algorithms. @@ -511,8 +511,6 @@ As of October 2025 and the 2.33.8 AWS SDK, the settings needed to interact with false Calculate and attach a message checksum on every operation. (default: true) - - ``` ## Google Cloud Storage through the S3A connector @@ -557,25 +555,25 @@ this makes renaming and deleting significantly slower. fs.s3a.endpoint https://storage.googleapis.com - + fs.s3a.endpoint.region gcs - + fs.s3a.path.style.access true - + fs.s3a.checksum.calculation.enabled false Calculate and attach a message checksum on every operation. (default: true) - + fs.s3a.bucket.probe 0 @@ -595,19 +593,17 @@ this makes renaming and deleting significantly slower. 
fs.s3a.committer.magic.enabled false - + fs.s3a.optimized.copy.from.local.enabled false - + fs.s3a.create.conditional.enabled false - - ``` diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md index 323854bfd75b4..f9ddb68cd31e0 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md @@ -645,49 +645,49 @@ java.nio.file.AccessDeniedException: example-file: Writing Object on example-fil at org.apache.hadoop.fs.FSDataOutputStream.close(FSDataOutputStream.java:106) at org.apache.hadoop.fs.shell.TouchCommands$Touchz.touchz(TouchCommands.java:89) at org.apache.hadoop.fs.shell.TouchCommands$Touchz.processNonexistentPath(TouchCommands.java:85) - at org.apache.hadoop.fs.shell.Command.processArgument(Command.java:303) - at org.apache.hadoop.fs.shell.Command.processArguments(Command.java:285) - at org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:121) + at org.apache.hadoop.fs.shell.Command.processArgument(Command.java:303) + at org.apache.hadoop.fs.shell.Command.processArguments(Command.java:285) + at org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:121) at org.apache.hadoop.fs.shell.Command.run(Command.java:192) at org.apache.hadoop.fs.FsShell.run(FsShell.java:327) at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:82) at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:97) at org.apache.hadoop.fs.FsShell.main(FsShell.java:390) -Caused by: software.amazon.awssdk.services.s3.model.S3Exception: Invalid argument. (Service: S3, Status Code: 403, Request ID: null) (SDK Attempt Count: 1) - at software.amazon.awssdk.services.s3.model.S3Exception$BuilderImpl.build(S3Exception.java:113) - at software.amazon.awssdk.services.s3.model.S3Exception$BuilderImpl.build(S3Exception.java:61) +Caused by: software.amazon.awssdk.services.s3.model.S3Exception: Invalid argument. 
(Service: S3, Status Code: 403, Request ID: null) (SDK Attempt Count: 1) + at software.amazon.awssdk.services.s3.model.S3Exception$BuilderImpl.build(S3Exception.java:113) + at software.amazon.awssdk.services.s3.model.S3Exception$BuilderImpl.build(S3Exception.java:61) at software.amazon.awssdk.core.internal.http.pipeline.stages.utils.RetryableStageHelper.retryPolicyDisallowedRetryException(RetryableStageHelper.java:168) at software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage.execute(RetryableStage.java:73) at software.amazon.awssdk.core.internal.http.pipeline.stages.RetryableStage.execute(RetryableStage.java:36) - at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206) + at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206) at software.amazon.awssdk.core.internal.http.StreamManagingStage.execute(StreamManagingStage.java:53) at software.amazon.awssdk.core.internal.http.StreamManagingStage.execute(StreamManagingStage.java:35) - at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.executeWithTimer(ApiCallTimeoutTrackingStage.java:82) + at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.executeWithTimer(ApiCallTimeoutTrackingStage.java:82) at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.execute(ApiCallTimeoutTrackingStage.java:62) at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallTimeoutTrackingStage.execute(ApiCallTimeoutTrackingStage.java:43) - at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:50) - at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:32) - at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206) - at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206) + at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:50) + at software.amazon.awssdk.core.internal.http.pipeline.stages.ApiCallMetricCollectionStage.execute(ApiCallMetricCollectionStage.java:32) + at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206) + at software.amazon.awssdk.core.internal.http.pipeline.RequestPipelineBuilder$ComposingRequestPipelineStage.execute(RequestPipelineBuilder.java:206) at software.amazon.awssdk.core.internal.http.pipeline.stages.ExecutionFailureExceptionReportingStage.execute(ExecutionFailureExceptionReportingStage.java:37) at software.amazon.awssdk.core.internal.http.pipeline.stages.ExecutionFailureExceptionReportingStage.execute(ExecutionFailureExceptionReportingStage.java:26) - at software.amazon.awssdk.core.internal.http.AmazonSyncHttpClient$RequestExecutionBuilderImpl.execute(AmazonSyncHttpClient.java:210) + at software.amazon.awssdk.core.internal.http.AmazonSyncHttpClient$RequestExecutionBuilderImpl.execute(AmazonSyncHttpClient.java:210) at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.invoke(BaseSyncClientHandler.java:103) - at 
software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.doExecute(BaseSyncClientHandler.java:173) + at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.doExecute(BaseSyncClientHandler.java:173) at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.lambda$execute$1(BaseSyncClientHandler.java:80) at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.measureApiCallSuccess(BaseSyncClientHandler.java:182) at software.amazon.awssdk.core.internal.handler.BaseSyncClientHandler.execute(BaseSyncClientHandler.java:74) - at software.amazon.awssdk.core.client.handler.SdkSyncClientHandler.execute(SdkSyncClientHandler.java:45) + at software.amazon.awssdk.core.client.handler.SdkSyncClientHandler.execute(SdkSyncClientHandler.java:45) at software.amazon.awssdk.awscore.client.handler.AwsSyncClientHandler.execute(AwsSyncClientHandler.java:53) at software.amazon.awssdk.services.s3.DefaultS3Client.putObject(DefaultS3Client.java:11883) at software.amazon.awssdk.services.s3.DelegatingS3Client.lambda$putObject$89(DelegatingS3Client.java:9716) - at software.amazon.awssdk.services.s3.internal.crossregion.S3CrossRegionSyncClient.invokeOperation(S3CrossRegionSyncClient.java:67) + at software.amazon.awssdk.services.s3.internal.crossregion.S3CrossRegionSyncClient.invokeOperation(S3CrossRegionSyncClient.java:67) at software.amazon.awssdk.services.s3.DelegatingS3Client.putObject(DelegatingS3Client.java:9716) at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$putObjectDirect$14(S3AFileSystem.java:3332) at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfSupplier(IOStatisticsBinding.java:650) at org.apache.hadoop.fs.s3a.S3AFileSystem.putObjectDirect(S3AFileSystem.java:3330) - at org.apache.hadoop.fs.s3a.WriteOperationHelper.lambda$putObject$7(WriteOperationHelper.java:535) - at org.apache.hadoop.fs.store.audit.AuditingFunctions.lambda$withinAuditSpan$0(AuditingFunctions.java:62) + at org.apache.hadoop.fs.s3a.WriteOperationHelper.lambda$putObject$7(WriteOperationHelper.java:535) + at org.apache.hadoop.fs.store.audit.AuditingFunctions.lambda$withinAuditSpan$0(AuditingFunctions.java:62) at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:122) ... 20 more touchz: example-file: Writing Object on example-file: software.amazon.awssdk.services.s3.model.S3Exception: Invalid argument. 
(Service: S3, Status Code: 403, Request ID: null) (SDK Attempt Count: 1):SignatureDoesNotMatch @@ -1377,7 +1377,7 @@ When working with S3 Express store buckets (unlike standard S3 buckets), follow ## Status Code: 200 + "PreconditionFailed: At least one of the pre-conditions you specified did not hold" ``` -software.amazon.awssdk.services.s3.model.S3Exception: At least one of the pre-conditions you specified did not hold +software.amazon.awssdk.services.s3.model.S3Exception: At least one of the pre-conditions you specified did not hold (Service: S3, Status Code: 200, Request ID: 01a396cff3000198cc0439e40509a95e33467bdc, Extended Request ID: TZrsG8pBzlmXoV) (SDK Attempt Count: 1): PreconditionFailed: At least one of the pre-conditions you specified did not hold ``` diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java index 61cd45e508879..6014494b31958 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java @@ -169,7 +169,9 @@ public void testMultipartUploadAbort() throws Exception { try { super.testMultipartUploadAbort(); } catch (FileNotFoundException e) { - LOG.info("Multipart upload not found in abort(). This is common on third-party stores: {}", e.toString()); + LOG.info("Multipart upload not found in abort()." + + " This is common on third-party stores: {}", + e.toString()); LOG.debug("Exception: ", e); } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABucketExistence.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABucketExistence.java index 8d53748132a09..a7010ef68e39d 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABucketExistence.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ABucketExistence.java @@ -39,8 +39,6 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.writeDataset; import static org.apache.hadoop.fs.s3a.Constants.AWS_REGION; import static org.apache.hadoop.fs.s3a.Constants.AWS_S3_ACCESSPOINT_REQUIRED; -import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_CALCULATION_ENABLED; -import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_CALCULATION_ENABLED_DEFAULT; import static org.apache.hadoop.fs.s3a.Constants.ENDPOINT; import static org.apache.hadoop.fs.s3a.Constants.FIPS_ENDPOINT; import static org.apache.hadoop.fs.s3a.Constants.FS_S3A; diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java index 721f2247c6a1a..c55344ab18fc1 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEndpointRegion.java @@ -536,7 +536,7 @@ public void testCentralEndpointAndNullRegionFipsWithCRUD() throws Throwable { /** * Skip the test if the region is null, sa-east-1, or otherwise - * not compatible with the test + * not compatible with the test. 
*/ private void assumeCrossRegionTestSupported() throws IOException { final S3AFileSystem fs = getFileSystem(); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMultipartUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMultipartUtils.java index 890c95aba97de..83a3def03768c 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMultipartUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMultipartUtils.java @@ -32,9 +32,7 @@ import java.util.HashSet; import java.util.Set; -import static org.apache.hadoop.fs.s3a.Constants.MULTIPART_UPLOADS_ENABLED; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; -import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfNotEnabled; import static org.apache.hadoop.util.functional.RemoteIterators.foreach; /** diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java index e0029714d2bec..58eb3b6ec26da 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java @@ -87,7 +87,8 @@ * Tests use of assumed roles. * Only run if an assumed role is provided. *

    - * S3Express buckets only support access restrictions at the bucket level, rather than at paths underneath. + * S3Express buckets only support access restrictions at the bucket level, + * rather than at paths underneath. * All partial permission tests are disabled. */ @SuppressWarnings("ThrowableNotThrown") diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java index 68ae6feead952..32413ebe609fe 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java @@ -26,7 +26,6 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import org.opentest4j.AssertionFailedError; import software.amazon.awssdk.services.s3.model.S3Exception; import org.apache.commons.io.IOUtils; @@ -38,7 +37,6 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.s3a.AWSS3IOException; import org.apache.hadoop.fs.s3a.AbstractS3ATestBase; import org.apache.hadoop.fs.s3a.RemoteFileChangedException; import org.apache.hadoop.fs.s3a.S3AFileStatus; @@ -339,7 +337,7 @@ public void testIfNoneMatchConflictOnMultipartUpload() throws Throwable { expectPreconditionFailure(() -> createFileWithFlags(fs, testFile, MULTIPART_FILE_BYTES, true, null, true)); - } + } @Test public void testIfNoneMatchMultipartUploadWithRaceCondition() throws Throwable { @@ -391,7 +389,8 @@ public void testIfNoneMatchTwoConcurrentMultipartUploads() throws Throwable { * @param eval closure to eval * @throws Exception any other failure. */ - private static void expectPreconditionFailure(final LambdaTestUtils.VoidCallable eval) throws Exception { + private static void expectPreconditionFailure(final LambdaTestUtils.VoidCallable eval) + throws Exception { RemoteFileChangedException exception = intercept(RemoteFileChangedException.class, eval); S3Exception s3Exception = (S3Exception) exception.getCause(); if (!(s3Exception.statusCode() == SC_412_PRECONDITION_FAILED diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java index e202f0ce0f1d8..81a7a4fa25ce0 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java @@ -206,7 +206,7 @@ public void testCreateBuilderSequence() throws Throwable { // will trigger conditional create and throw RemoteFileChangedException intercept(RemoteFileChangedException.class, () -> buildFile(testFile, false, true, NO_HEAD_OR_LIST)); - } else { + } else { // third party store w/out conditional overwrite support buildFile(testFile, false, true, NO_HEAD_OR_LIST); } From fa906dcf97ff8829f50184906bd7433bc2a0a73a Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Thu, 23 Oct 2025 16:52:37 +0100 Subject: [PATCH 18/24] HADOOP-19654. 
get the right config for behaviour --- .../hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java | 2 +- .../apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java index 32413ebe609fe..3e028578139b5 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/ITestS3APutIfMatchAndIfNoneMatch.java @@ -105,7 +105,7 @@ public Configuration createConfiguration() { @BeforeEach public void setup() throws Exception { super.setup(); - Configuration conf = getConfiguration(); + Configuration conf = getFileSystem().getConf(); assumeConditionalCreateEnabled(conf); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java index 81a7a4fa25ce0..29fa6d7035ba0 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestCreateFileCost.java @@ -202,7 +202,7 @@ public void testCreateBuilderSequence() throws Throwable { () -> buildFile(testFile, false, true, GET_FILE_STATUS_ON_FILE)); } else { - if (getConfiguration().getBoolean(FS_S3A_CONDITIONAL_CREATE_ENABLED, true)) { + if (getFileSystem().getConf().getBoolean(FS_S3A_CONDITIONAL_CREATE_ENABLED, true)) { // will trigger conditional create and throw RemoteFileChangedException intercept(RemoteFileChangedException.class, () -> buildFile(testFile, false, true, NO_HEAD_OR_LIST)); From f735b39be813a92aff0562f567c1ca0ec7ad23e3 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Thu, 23 Oct 2025 17:41:25 +0100 Subject: [PATCH 19/24] HADOOP-19654. remove intermittent and needless assertion failure --- .../apache/hadoop/fs/s3a/ITestS3APrefetchingInputStream.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3APrefetchingInputStream.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3APrefetchingInputStream.java index 5e66d6911cb83..05eb8c9497f91 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3APrefetchingInputStream.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3APrefetchingInputStream.java @@ -126,8 +126,6 @@ public void testReadLargeFileFully() throws Throwable { } // Verify that once stream is closed, all memory is freed verifyStatisticGaugeValue(ioStats, STREAM_READ_ACTIVE_MEMORY_IN_USE, 0); - assertThatStatisticMaximum(ioStats, - ACTION_EXECUTOR_ACQUIRED + SUFFIX_MAX).isGreaterThan(0); } @Test From c2b22b695490f4c2bc4e75d2ba822166e6d2fbfc Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Fri, 24 Oct 2025 17:21:50 +0100 Subject: [PATCH 20/24] HADOOP-19654. Upgrade SDK version to 2.35.4 This addresses the md5 header/s3 express issue we identified. Address MPU upload failures by tuning checksum options: fs.s3a.request.md5.header: true fs.s3a.checksum.generation: false fs.sea.crate.checksum.algorithm: CRC32C This seems to work everywhere. Documented. 
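An illustrative sketch only: those three settings, using the property names defined in
Constants.java in this patch (the creation checksum key is fs.s3a.create.checksum.algorithm),
as they would appear in core-site.xml. The values are the ones suggested above, not changed
defaults.

```xml
<property>
  <name>fs.s3a.request.md5.header</name>
  <value>true</value>
</property>
<property>
  <name>fs.s3a.checksum.generation</name>
  <value>false</value>
</property>
<property>
  <name>fs.s3a.create.checksum.algorithm</name>
  <value>CRC32C</value>
</property>
```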
--- hadoop-project/pom.xml | 2 +- .../org/apache/hadoop/fs/s3a/Constants.java | 26 ++--- .../apache/hadoop/fs/s3a/S3AFileSystem.java | 5 +- .../org/apache/hadoop/fs/s3a/S3AUtils.java | 22 +++- .../hadoop/fs/s3a/WriteOperationHelper.java | 16 +-- .../hadoop/fs/s3a/impl/ChecksumSupport.java | 24 +++-- .../site/markdown/tools/hadoop-aws/index.md | 102 +++++++++++++++--- .../site/markdown/tools/hadoop-aws/reading.md | 10 ++ .../tools/hadoop-aws/third_party_stores.md | 28 +++-- .../hadoop/fs/s3a/ITestS3AChecksum.java | 8 +- .../fs/s3a/TestS3AExceptionTranslation.java | 13 +++ .../fs/s3a/impl/TestChecksumSupport.java | 29 ++++- 12 files changed, 217 insertions(+), 68 deletions(-) diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index f2924cf9ff80c..7dbbd0b140b50 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -209,7 +209,7 @@ 1.0-beta-1 900 1.12.720 - 2.33.8 + 2.35.4 3.1.1 1.3.0 1.0.1 diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index 4f2200b2e4541..82da4242c8f56 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -1838,33 +1838,34 @@ private Constants() { public static final boolean CHECKSUM_VALIDATION_DEFAULT = false; /** - * Should checksums always be calculated? + * Should checksums always be generated? * Not all third-party stores like this being enabled for every request. * Value: {@value}. */ - public static final String CHECKSUM_CALCULATION_ENABLED = - "fs.s3a.checksum.calculation.enabled"; + public static final String CHECKSUM_GENERATION = + "fs.s3a.checksum.generation"; /** - * Default value of {@link #CHECKSUM_CALCULATION_ENABLED}. + * Default value of {@link #CHECKSUM_GENERATION}. * Value: {@value}. */ - public static final boolean CHECKSUM_CALCULATION_ENABLED_DEFAULT = true; + public static final boolean CHECKSUM_GENERATION_DEFAULT = false; /** * Indicates the algorithm used to create the checksum for the object * to be uploaded to S3. Unset by default. It supports the following values: - * 'CRC32', 'CRC32C', 'SHA1', 'SHA256', 'NONE', ''.. + * 'CRC32', 'CRC32C', 'SHA1', 'SHA256', 'CRC64_NVME 'NONE', ''. + * When checksum calculation is enabled this MUST be set to a valid algorithm. * value:{@value} */ public static final String CHECKSUM_ALGORITHM = "fs.s3a.create.checksum.algorithm"; /** - * Default checksum algorithm: {@code "none"}. + * Default checksum algorithm: {@code "CRC32C"}. */ public static final String CHECKSUM_ALGORITHM_DEFAULT = - ChecksumSupport.NONE; + ChecksumSupport.CRC32C; /** * Send a {@code Content-MD5 header} with every request. @@ -1872,16 +1873,15 @@ private Constants() { * For example: bulk delete). * It is supported by AWS S3, though has unexpected behavior with AWS S3 Express storage. * See https://github.com/aws/aws-sdk-java-v2/issues/6459 for details. - * It will be automatically disabled there. */ - public static final String MD5_HEADER_ENABLED = - "fs.s3a.md5.header.enabled"; + public static final String REQUEST_MD5_HEADER = + "fs.s3a.request.md5.header"; /** - * Default value of {@link #MD5_HEADER_ENABLED}. + * Default value of {@link #REQUEST_MD5_HEADER}. * Value: {@value}. 
*/ - public static final boolean MD5_HEADER_ENABLED_DEFAULT = true; + public static final boolean DEFAULT_REQUEST_MD5_HEADER = true; /** diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index eb5d1c327bcd8..a57396379a177 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -1179,8 +1179,9 @@ private ClientManager createClientManager(URI fsURI, boolean dtEnabled) throws I .withChecksumValidationEnabled( conf.getBoolean(CHECKSUM_VALIDATION, CHECKSUM_VALIDATION_DEFAULT)) .withChecksumCalculationEnabled( - conf.getBoolean(CHECKSUM_CALCULATION_ENABLED, CHECKSUM_CALCULATION_ENABLED_DEFAULT)) - .withMd5HeaderEnabled(conf.getBoolean(MD5_HEADER_ENABLED, MD5_HEADER_ENABLED_DEFAULT)) + conf.getBoolean(CHECKSUM_GENERATION, CHECKSUM_GENERATION_DEFAULT)) + .withMd5HeaderEnabled(conf.getBoolean(REQUEST_MD5_HEADER, + DEFAULT_REQUEST_MD5_HEADER)) .withClientSideEncryptionEnabled(isCSEEnabled) .withClientSideEncryptionMaterials(cseMaterials) .withAnalyticsAcceleratorEnabled(isAnalyticsAcceleratorEnabled) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java index 63ad42dab7adb..af4708120216d 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java @@ -18,6 +18,7 @@ package org.apache.hadoop.fs.s3a; +import software.amazon.awssdk.awscore.exception.AwsErrorDetails; import software.amazon.awssdk.awscore.exception.AwsServiceException; import software.amazon.awssdk.core.exception.AbortedException; import software.amazon.awssdk.core.exception.ApiCallAttemptTimeoutException; @@ -240,8 +241,13 @@ public static IOException translateException(@Nullable String operation, ? (S3Exception) ase : null; int status = ase.statusCode(); - if (ase.awsErrorDetails() != null) { - message = message + ":" + ase.awsErrorDetails().errorCode(); + // error details, may be null + final AwsErrorDetails errorDetails = ase.awsErrorDetails(); + // error code, will be null if errorDetails is null + String errorCode = ""; + if (errorDetails != null) { + errorCode = errorDetails.errorCode(); + message = message + ":" + errorCode; } // big switch on the HTTP status code. @@ -308,6 +314,8 @@ public static IOException translateException(@Nullable String operation, // precondition failure: the object is there, but the precondition // (e.g. etag) didn't match. Assume remote file change during // rename or status passed in to openfile had an etag which didn't match. + // See the SC_200 handler for the treatment of the S3 Express failure + // variant. case SC_412_PRECONDITION_FAILED: ioe = new RemoteFileChangedException(path, message, "", ase); break; @@ -352,6 +360,16 @@ public static IOException translateException(@Nullable String operation, return ((MultiObjectDeleteException) exception) .translateException(message); } + if (PRECONDITION_FAILED.equals(errorCode)) { + // S3 Express stores report conflict in conditional writes + // as a 200 + an error code of "PreconditionFailed". + // This is mapped to RemoteFileChangedException for consistency + // with SC_412_PRECONDITION_FAILED handling. 
+ return new RemoteFileChangedException(path, + operation, + exception.getMessage(), + exception); + } // other 200: FALL THROUGH default: diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java index f0a9583cf63fb..c8a3864d59e2c 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java @@ -54,8 +54,6 @@ import org.apache.hadoop.fs.store.audit.AuditSpanSource; import org.apache.hadoop.util.functional.CallableRaisingIOE; -import static org.apache.hadoop.fs.s3a.impl.InternalConstants.PRECONDITION_FAILED; -import static org.apache.hadoop.fs.s3a.impl.InternalConstants.SC_200_OK; import static org.apache.hadoop.util.Preconditions.checkNotNull; import static org.apache.hadoop.fs.s3a.Invoker.*; import static org.apache.hadoop.fs.store.audit.AuditingFunctions.withinAuditSpan; @@ -315,7 +313,8 @@ private CompleteMultipartUploadResponse finalizeMultipartUpload( } try (AuditSpan span = activateAuditSpan()) { CompleteMultipartUploadResponse uploadResult; - uploadResult = invoker.retry("Completing multipart upload", destKey, + uploadResult = invoker.retry("Completing multipart upload id " + uploadId, + destKey, true, retrying, () -> { @@ -324,17 +323,6 @@ private CompleteMultipartUploadResponse finalizeMultipartUpload( return writeOperationHelperCallbacks.completeMultipartUpload(requestBuilder.build()); }); return uploadResult; - } catch (AWSS3IOException e) { - // S3 express buckets report the create conflict differently - // recognize and convert. - if (e.statusCode() == SC_200_OK - && PRECONDITION_FAILED.equals(e.awsErrorDetails().errorCode())) { - throw new RemoteFileChangedException(destKey, - e.getOperation(), - e.getMessage(), - e.getCause()); - } - throw e; } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java index 714d65a98bb41..3f3dffaa92685 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java @@ -18,12 +18,12 @@ package org.apache.hadoop.fs.s3a.impl; +import java.util.Locale; import java.util.Set; import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableSet; import software.amazon.awssdk.services.s3.model.ChecksumAlgorithm; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.ConfigurationHelper; @@ -38,7 +38,11 @@ public final class ChecksumSupport { * Special checksum algorithm to declare that no checksum * is required: {@value}. 
*/ - public static final String NONE = "none"; + public static final String NONE = "NONE"; + + public static final String CRC32C = "CRC32C"; + + public static final String CRC64NVME = "CRC64NVME"; private ChecksumSupport() { } @@ -49,6 +53,7 @@ private ChecksumSupport() { private static final Set SUPPORTED_CHECKSUM_ALGORITHMS = ImmutableSet.of( ChecksumAlgorithm.CRC32, ChecksumAlgorithm.CRC32_C, + ChecksumAlgorithm.CRC64_NVME, ChecksumAlgorithm.SHA1, ChecksumAlgorithm.SHA256); @@ -64,14 +69,19 @@ public static ChecksumAlgorithm getChecksumAlgorithm(Configuration conf) { CHECKSUM_ALGORITHM, ChecksumAlgorithm.class, configValue -> { - if (StringUtils.isBlank(configValue) || NONE.equalsIgnoreCase(configValue)) { + // default values and handling algorithms names without underscores. + String val = configValue == null ? NONE : configValue.toUpperCase(Locale.ROOT); + switch (val) { + case "": + case NONE: return null; - } - if (ChecksumAlgorithm.CRC32_C.toString().equalsIgnoreCase(configValue)) { - // In case the configuration value is CRC32C, without underscore. + case CRC32C: return ChecksumAlgorithm.CRC32_C; + case CRC64NVME: + return ChecksumAlgorithm.CRC64_NVME; + default: + throw new IllegalArgumentException("Checksum algorithm is not supported: " + configValue); } - throw new IllegalArgumentException("Checksum algorithm is not supported: " + configValue); }); } } diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md index 7770c5576a513..90c6ddbdb086b 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md @@ -931,7 +931,9 @@ Here are some the S3A properties for use in production. Indicates the algorithm used to create the checksum for the object to be uploaded to S3. Unset by default. It supports the following values: - 'CRC32', 'CRC32C', 'SHA1', 'SHA256', "none" + 'CRC32', 'CRC32C', 'SHA1', 'SHA256', "CRC64_NVME", "none" + The CRC64_NVME option requires aws-crt on the classpath, and is still + tangibly slower than CRC32C, which has its own instruction on x86 and ARM. @@ -1433,6 +1435,9 @@ The "fast" output stream 1. Uploads blocks in parallel in background threads. 1. Begins uploading blocks as soon as the buffered data exceeds this partition size. +1. Uses any checksum set in `fs.s3a.create.checksum.algorithm` to calculate an upload + checksum on data written; this is included in the file/part upload and verified + on the store. This can be a source of third-party store compatibility issues. 1. When buffering data to disk, uses the directory/directories listed in `fs.s3a.buffer.dir`. The size of data which can be buffered is limited to the available disk space. @@ -1707,16 +1712,7 @@ rate. The best practise for using this option is to disable multipart purges in normal use of S3A, enabling only in manual/scheduled housekeeping operations. -### S3A "fadvise" input policy support - -The S3A Filesystem client supports the notion of input policies, similar -to that of the Posix `fadvise()` API call. This tunes the behavior of the S3A -client to optimise HTTP GET requests for the different use cases. - -See [Improving data input performance through fadvise](./performance.html#fadvise) -for the details. - -##Metrics +## Metrics S3A metrics can be monitored through Hadoop's metrics2 framework. 
S3A creates its own metrics system called s3a-file-system, and each instance of the client @@ -1754,7 +1750,89 @@ also get recorded, for example the following: Note that low-level metrics from the AWS SDK itself are not currently included in these metrics. -## Other Topics + +## Checksums + +The S3 Client can use checksums in its requests to an S3 store in a number of ways + +1. To provide a checksum of the request headers. +2. To provide a `Content-MD5` hash of the request headers +3. To provide a checksum of data being PUT/POSTed to the store. +4. To validate data downloaded from the store. + +The various options available can impact performance and compatibility. +To understand the risks and issues here know that +* Request checksum generation (item 1) and validation (4) can be done "when required" or "always". + The "always" option is stricter, but can result in third-party compatibility issues +* Some third-party stores require the `Content-MD5` header and will fail without it (item 2) +* Data upload checksums (item 3) can be computationally expensive and incompatible with third-party stores +* The most efficient data upload checksum is CRC32C; there are explicit opcodes for this in x86 and ARM CPUs, with the appropriate implementation circuitry. +* Data download validation checksums are also computationally expensive. + +| Option | Purpose | Values | Default | +|------------------------------------|------------------------------------------------|---------|----------| +| `fs.s3a.request.md5.header` | Enable MD5 header | boolean | `true` | +| `fs.s3a.checksum.generation` | Generate checksums on all requests | boolean | `false` | +| `fs.s3a.checksum.validation` | Validate checksums on download | boolean | `false` | +| `fs.s3a.create.checksum.algorithm` | Checksum Algorithm when creating/copying files | `NONE`, `CRC32`, `CRC32C`, `CRC32_C`, `CRC64NVME` , `CRC64_NVME` | `CRC32C` | + +### Content-MD5 Header on requests: `fs.s3a.request.md5.header` + +Send a `Content-MD5 header` with every request? + +This header is required when interacting with some third-party stores. +It is supported by AWS S3, though has shown some unexpected behavior with AWS S3 Express storage +[issue 6459](https://github.com/aws/aws-sdk-java-v2/issues/6459). +As that appears to have been fixed in the SDK, this option is enabled by default. + +### Request checksum generation: `fs.s3a.checksum.generation` + +Should checksums be generated for all requests made to the store? + +* Incompatible with some third-party stores +* If `true` then multipart upload (i.e. large file upload) may fail if `fs.s3a.create.checksum.algorithm` is set to `NONE`. + +Set to `false` by default to avoid problems. + +### Checksum validation `fs.s3a.checksum.validation` + +Should the checksums of downloaded data be validated? + +This hurts performance and should be only used if considered important. + +### Creation checksum `fs.s3a.create.checksum.algorithm` + + +This is the algorithm to use when checksumming data during file creation and copy. + +Options: `NONE`, `CRC32`, `CRC32C`, `CRC32_C`, `CRC64NVME` , `CRC64_NVME` + +The option `NONE` is new to Hadoop 3.4.3; previously an empty string was required for the same behavior. + +The `CRC64NVME`/`CRC64_NVME` option is also new to Hadoop 3.4.3 and requires the `aws-crt` module to be on the classpath, otherwise an error is printed: + +``` +java.lang.RuntimeException: Could not load software.amazon.awssdk.crt.checksums.CRC64NVME. 
+Add dependency on 'software.amazon.awssdk.crt:aws-crt' module to enable CRC64NVME feature. +``` + +Checksum/algorithm incompatibilities may surface as a failure in "Completing multipart upload"`. + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: Completing multipart upload id l8itQB. +5u7TcWqznqbGfTjHv06mxb4IlBNcZiDWrBAS0t1EMJGkr9J1QD2UAwDM5rLUZqypJfWCoPJtySxA3QK9QqKTBdKr3LXYjYJ_r9lRcGdzBRbnIJeI8tBr8yqtS on +test/testCommitEmptyFile/empty-commit.txt: +software.amazon.awssdk.services.s3.model.S3Exception: One or more of the specified parts could not be found. +The part may not have been uploaded, or the specified entity tag may not match the part's entity tag. +(Service: S3, Status Code: 400, Request ID: AQ0J4B66H626Y3FH, +Extended Request ID: g1zo25aQCZfqFh3vOzrzOBp9RjJEWmKImRcfWhvaeFHQ2hZo1xTm3GVMD03zN+d+cFB6oNeelNc=) +(SDK Attempt Count: 1):InvalidPart: One or more of the specified parts could not be found. +The part may not have been uploaded, or the specified entity tag may not match the part's entity tag. +(Service: S3, Status Code: 400, Request ID: AQ0J4B66H626Y3FH, Extended Request ID: +g1zo25aQCZfqFh3vOzrzOBp9RjJEWmKImRcfWhvaeFHQ2hZo1xTm3GVMD03zN+d+cFB6oNeelNc=) (SDK Attempt Count: 1) +``` + +## Other Topics ### Copying Data with distcp diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/reading.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/reading.md index fa4572bb165b1..99490bb4b3368 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/reading.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/reading.md @@ -163,6 +163,16 @@ configured to use the vector IO API, it is likely to be significantly faster to use the classic stream and its parallel reads. +## S3A "fadvise" input policy support: `fs.s3a.experimental.input.fadvise` + +The S3A Filesystem client supports the notion of input policies, similar +to that of the Posix `fadvise()` API call. This tunes the behavior of the S3A +client to optimise HTTP GET requests for the different use cases. + +See [Improving data input performance through fadvise](./performance.html#fadvise) +for the details. + + ## Developer Topics ### Stream IOStatistics diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md index 17cd7b0627808..a7151bbcb7f20 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/third_party_stores.md @@ -175,22 +175,32 @@ false to disable use of these features. It may be necessary to change checksums of uploads by 1. Restoring the attachment of a `Content-MD5 header` in requests -2. Changing checksum calculation from "always" to "when required" +2. Restricting checksum generation to only when required. ```xml - fs.s3a.md5.header.enabled + fs.s3a.request.md5.header + true + re-enable calculation and inclusion of an MD5 HEADER on data upload operations + + + + fs.s3a.checksum.generation false - re-enable calculation and inclusion of an MD5 HEADER on data upload operations (default: false) + Calculate and attach a message checksum on every operation. - fs.s3a.checksum.calculation.enabled + fs.s3a.checksum.validation false - Calculate and attach a message checksum on every operation. 
(default: true) + Validate data checksums on download + ``` +These options are set for best compatibility and performance by default; they may need tuning for specific stores. + +See [checksums](index.html#checksums) for more details. ### Disabling Change Detection @@ -501,15 +511,13 @@ As of October 2025 and the 2.33.8 AWS SDK, the settings needed to interact with - fs.s3a.md5.header.enabled + fs.s3a.request.md5.header false - re-enable calculation and inclusion of an MD5 HEADER on data upload operations (default: false) - fs.s3a.checksum.calculation.enabled + fs.s3a.checksum.generation false - Calculate and attach a message checksum on every operation. (default: true) ``` @@ -569,7 +577,7 @@ this makes renaming and deleting significantly slower. - fs.s3a.checksum.calculation.enabled + fs.s3a.checksum.generation false Calculate and attach a message checksum on every operation. (default: true) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java index 77d6e6a16e42b..4b4c4efc0f78f 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java @@ -37,8 +37,8 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.skip; import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_ALGORITHM; -import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_CALCULATION_ENABLED; -import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_CALCULATION_ENABLED_DEFAULT; +import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_GENERATION; +import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_GENERATION_DEFAULT; import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_VALIDATION; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestBucketName; @@ -99,9 +99,9 @@ protected Configuration createConfiguration() { conf.setBoolean(REJECT_OUT_OF_SPAN_OPERATIONS, false); checksumAlgorithm = ChecksumSupport.getChecksumAlgorithm(conf); LOG.info("Using checksum algorithm {}/{}", algorithmName, checksumAlgorithm); - assume("Skipping checksum tests as " + CHECKSUM_CALCULATION_ENABLED + " is set", + assume("Skipping checksum tests as " + CHECKSUM_GENERATION + " is set", propagateBucketOptions(conf, getTestBucketName(conf)) - .getBoolean(CHECKSUM_CALCULATION_ENABLED, CHECKSUM_CALCULATION_ENABLED_DEFAULT)); + .getBoolean(CHECKSUM_GENERATION, CHECKSUM_GENERATION_DEFAULT)); return conf; } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AExceptionTranslation.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AExceptionTranslation.java index 2e7ed1c8b76a5..9dbf92b792bca 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AExceptionTranslation.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AExceptionTranslation.java @@ -453,6 +453,19 @@ public void testOpenSSLErrorRetry() throws Throwable { sdkClientException(WFOPENSSL_0035_STREAM_IS_CLOSED, null)))); } + @Test + public void testS3ExpressPreconditionFailure() throws Throwable { + AwsServiceException ase = AwsServiceException.builder() + .message("unwind") + .statusCode(SC_200_OK) + .awsErrorDetails(AwsErrorDetails.builder() + .errorCode(PRECONDITION_FAILED) + .build()) + .build(); + 
verifyExceptionClass(RemoteFileChangedException.class, + translateException("commit", "/path", ase)); + } + /** * Create a shaded NoHttpResponseException. * @return an exception. diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestChecksumSupport.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestChecksumSupport.java index 43187e6e8d46c..296e0aa22a250 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestChecksumSupport.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/impl/TestChecksumSupport.java @@ -31,15 +31,37 @@ public class TestChecksumSupport { @ParameterizedTest - @EnumSource(value = ChecksumAlgorithm.class, names = {"CRC32", "CRC32_C", "SHA1", "SHA256"}) + @EnumSource(value = ChecksumAlgorithm.class, names = {"CRC32", "CRC32_C", "SHA1", "SHA256", "CRC64_NVME"}) public void testGetSupportedChecksumAlgorithm(ChecksumAlgorithm checksumAlgorithm) { - final Configuration conf = new Configuration(); - conf.set(CHECKSUM_ALGORITHM, checksumAlgorithm.toString()); + assertChecksumAlgorithm(checksumAlgorithm, checksumAlgorithm.toString()); + } + + /** + * Assert that a checksum algorithm string resolves to a value. + * @param checksumAlgorithm expected value + * @param algorithm algorithm name + */ + private static void assertChecksumAlgorithm(final ChecksumAlgorithm checksumAlgorithm, + final String algorithm) { + final Configuration conf = new Configuration(false); + conf.set(CHECKSUM_ALGORITHM, algorithm); Assertions.assertThat(ChecksumSupport.getChecksumAlgorithm(conf)) .describedAs("Checksum algorithm must match value set in the configuration") .isEqualTo(checksumAlgorithm); } + @Test + public void testCRC32C() throws Throwable { + assertChecksumAlgorithm(ChecksumAlgorithm.CRC32_C, "CRC32C" ); + assertChecksumAlgorithm(ChecksumAlgorithm.CRC32_C, "CRC32_C" ); + } + + @Test + public void testCRC64NVME() throws Throwable { + assertChecksumAlgorithm(ChecksumAlgorithm.CRC64_NVME, "CRC64_NVME" ); + assertChecksumAlgorithm(ChecksumAlgorithm.CRC64_NVME, "CRC64NVME" ); + } + @Test public void testGetChecksumAlgorithmWhenNull() { final Configuration conf = new Configuration(); @@ -57,4 +79,5 @@ public void testGetNotSupportedChecksumAlgorithm() { .describedAs("Invalid checksum algorithm should throw an exception") .isInstanceOf(IllegalArgumentException.class); } + } From 447e7bb246e6cca180a566d567654432a0413399 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Fri, 24 Oct 2025 18:18:14 +0100 Subject: [PATCH 21/24] HADOOP-19654. 
Transient test failure due to cached fs not having performance flags --- .../contract/s3a/ITestS3AContractMkdirWithCreatePerf.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMkdirWithCreatePerf.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMkdirWithCreatePerf.java index 4570320029d59..42d175a577674 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMkdirWithCreatePerf.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMkdirWithCreatePerf.java @@ -31,6 +31,7 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.createFile; import static org.apache.hadoop.fs.contract.ContractTestUtils.dataset; import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_PERFORMANCE_TESTS_ENABLED; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.disableFilesystemCaching; import static org.apache.hadoop.fs.s3a.S3ATestUtils.setPerformanceFlags; import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfNotEnabled; @@ -42,9 +43,9 @@ public class ITestS3AContractMkdirWithCreatePerf extends AbstractContractMkdirTe @Override protected Configuration createConfiguration() { - return setPerformanceFlags( - super.createConfiguration(), - "create,mkdir"); + final Configuration conf = super.createConfiguration(); + disableFilesystemCaching(conf); + return setPerformanceFlags(conf, "create,mkdir"); } @Override From 8418c9f80d409a2465aaa9099d8cc8a47574dc56 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Fri, 24 Oct 2025 19:18:49 +0100 Subject: [PATCH 22/24] HADOOP-19654. SDK checksum stabilisation * Now need to explicitly turn off checksum validation on downloads (slow) * Default fs.s3a.create.checksum.algorithm is "" again: nothing. Docs updated to try and explain this. --- .../org/apache/hadoop/fs/s3a/Constants.java | 8 +++--- .../hadoop/fs/s3a/DefaultS3ClientFactory.java | 9 ++++--- .../apache/hadoop/fs/s3a/S3AFileSystem.java | 2 +- .../site/markdown/tools/hadoop-aws/index.md | 23 +++++++++++++--- .../ITestS3AContractMultipartUploader.java | 26 ++++++++++++++++--- .../hadoop/fs/s3a/ITestS3AChecksum.java | 4 +-- 6 files changed, 56 insertions(+), 16 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index 82da4242c8f56..75db90092d151 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -1849,7 +1849,7 @@ private Constants() { * Default value of {@link #CHECKSUM_GENERATION}. * Value: {@value}. */ - public static final boolean CHECKSUM_GENERATION_DEFAULT = false; + public static final boolean DEFAULT_CHECKSUM_GENERATION = false; /** * Indicates the algorithm used to create the checksum for the object @@ -1862,10 +1862,10 @@ private Constants() { "fs.s3a.create.checksum.algorithm"; /** - * Default checksum algorithm: {@code "CRC32C"}. + * Default checksum algorithm: {@code "NONE"}. */ - public static final String CHECKSUM_ALGORITHM_DEFAULT = - ChecksumSupport.CRC32C; + public static final String DEFAULT_CHECKSUM_ALGORITHM = + ChecksumSupport.NONE; /** * Send a {@code Content-MD5 header} with every request. 
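For reference, an illustrative core-site.xml sketch of the opposite tuning: with the defaults
above (checksum generation off, creation checksum NONE), a deployment working only against
AWS S3 which wants upload checksums and download validation has to opt in explicitly.
The property names are those defined in this module; the values are an assumption for
illustration, not a recommendation.

```xml
<property>
  <name>fs.s3a.checksum.generation</name>
  <value>true</value>
</property>
<property>
  <name>fs.s3a.create.checksum.algorithm</name>
  <value>CRC32C</value>
</property>
<property>
  <name>fs.s3a.checksum.validation</name>
  <value>true</value>
</property>
```

Per the documentation changes below, CRC32C is the cheapest of the upload checksum
algorithms, while download validation remains the slow part.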
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java index 344459d63d7f0..41e904ec9de1b 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java @@ -222,9 +222,12 @@ private , ClientT> Build builder.requestChecksumCalculation(checksumCalculation); // response checksum validation. Slow, even with CRC32 checksums. - if (parameters.isChecksumValidationEnabled()) { - builder.responseChecksumValidation(ResponseChecksumValidation.WHEN_SUPPORTED); - } + final ResponseChecksumValidation checksumValidation; + checksumValidation = parameters.isChecksumValidationEnabled() + ? ResponseChecksumValidation.WHEN_SUPPORTED + : ResponseChecksumValidation.WHEN_REQUIRED; + LOG.debug("Using checksum validation policy: {}", checksumValidation); + builder.responseChecksumValidation(checksumValidation); maybeApplyS3AccessGrantsConfigurations(builder, conf); diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index a57396379a177..863a63f0c14a8 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -1179,7 +1179,7 @@ private ClientManager createClientManager(URI fsURI, boolean dtEnabled) throws I .withChecksumValidationEnabled( conf.getBoolean(CHECKSUM_VALIDATION, CHECKSUM_VALIDATION_DEFAULT)) .withChecksumCalculationEnabled( - conf.getBoolean(CHECKSUM_GENERATION, CHECKSUM_GENERATION_DEFAULT)) + conf.getBoolean(CHECKSUM_GENERATION, DEFAULT_CHECKSUM_GENERATION)) .withMd5HeaderEnabled(conf.getBoolean(REQUEST_MD5_HEADER, DEFAULT_REQUEST_MD5_HEADER)) .withClientSideEncryptionEnabled(isCSEEnabled) diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md index 90c6ddbdb086b..a8af1bc5a4664 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md @@ -1774,7 +1774,7 @@ To understand the risks and issues here know that | `fs.s3a.request.md5.header` | Enable MD5 header | boolean | `true` | | `fs.s3a.checksum.generation` | Generate checksums on all requests | boolean | `false` | | `fs.s3a.checksum.validation` | Validate checksums on download | boolean | `false` | -| `fs.s3a.create.checksum.algorithm` | Checksum Algorithm when creating/copying files | `NONE`, `CRC32`, `CRC32C`, `CRC32_C`, `CRC64NVME` , `CRC64_NVME` | `CRC32C` | +| `fs.s3a.create.checksum.algorithm` | Checksum Algorithm when creating/copying files | `NONE`, `CRC32`, `CRC32C`, `CRC32_C`, `CRC64NVME` , `CRC64_NVME`, `SHA256`, `SHA1` | `""` | ### Content-MD5 Header on requests: `fs.s3a.request.md5.header` @@ -1790,7 +1790,7 @@ As that appears to have been fixed in the SDK, this option is enabled by default Should checksums be generated for all requests made to the store? * Incompatible with some third-party stores -* If `true` then multipart upload (i.e. large file upload) may fail if `fs.s3a.create.checksum.algorithm` is set to `NONE`. +* If `true` then multipart upload (i.e. 
large file upload) may fail if `fs.s3a.create.checksum.algorithm` is not set to an algorithm other than `NONE`. Set to `false` by default to avoid problems. @@ -1805,7 +1805,7 @@ This hurts performance and should be only used if considered important. This is the algorithm to use when checksumming data during file creation and copy. -Options: `NONE`, `CRC32`, `CRC32C`, `CRC32_C`, `CRC64NVME` , `CRC64_NVME` +Options: `NONE`, `CRC32`, `CRC32C`, `CRC32_C`, `CRC64NVME` , `CRC64_NVME`, `SHA256`, `SHA1` The option `NONE` is new to Hadoop 3.4.3; previously an empty string was required for the same behavior. @@ -1818,6 +1818,7 @@ Add dependency on 'software.amazon.awssdk.crt:aws-crt' module to enable CRC64NVM Checksum/algorithm incompatibilities may surface as a failure in "Completing multipart upload"`. +First as a failure reported as a "missing part". ``` org.apache.hadoop.fs.s3a.AWSBadRequestException: Completing multipart upload id l8itQB. 5u7TcWqznqbGfTjHv06mxb4IlBNcZiDWrBAS0t1EMJGkr9J1QD2UAwDM5rLUZqypJfWCoPJtySxA3QK9QqKTBdKr3LXYjYJ_r9lRcGdzBRbnIJeI8tBr8yqtS on @@ -1832,6 +1833,22 @@ The part may not have been uploaded, or the specified entity tag may not match t g1zo25aQCZfqFh3vOzrzOBp9RjJEWmKImRcfWhvaeFHQ2hZo1xTm3GVMD03zN+d+cFB6oNeelNc=) (SDK Attempt Count: 1) ``` +Alternatively, as the failure of multipart uploads when a checksum algorithm is set and the part ordering is not in sequence. + +``` +org.apache.hadoop.fs.s3a.AWSStatus500Exception: + Completing multipart upload id A8rf256dBVbDtIVLr40KaMGKw9DY.rhgNP5zmn1ap97YjPaIO2Ac3XXL_T.2HCtIrGUpx5bdOTgvVeZzVHuoWI4pKv_MeMMVqBHJGP7u_q4PR8AxWvSq0Lsv724HT1fQ + on test/testMultipartUploadReverseOrderNonContiguousPartNumbers: +software.amazon.awssdk.services.s3.model.S3Exception: We encountered an internal error. +Please try again. 
+(Service: S3, Status Code: 500, Request ID: WTBY2FX76Q5F5YWB, +Extended Request ID: eWQWk8V8rmVmKImWVCI2rHyFS3XQSPgIkjfAyzzZCgVgyeRqox8mO8qO4ODMB6IUY0+rYqqsnOX2zXiQcRzFlb9p3nSkEEc+T0CYurLaH28=) +(SDK Attempt Count: 3) +``` + +This is only possible through the FileSystem multipart API; normal data writes including +those through the magic committer will not encounter it, + ## Other Topics ### Copying Data with distcp diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java index 6014494b31958..bb4cdf3a024d8 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractMultipartUploader.java @@ -24,6 +24,7 @@ import org.apache.hadoop.fs.contract.AbstractContractMultipartUploaderTest; import org.apache.hadoop.fs.contract.AbstractFSContract; import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.impl.ChecksumSupport; import org.apache.hadoop.test.tags.IntegrationTest; import org.apache.hadoop.test.tags.ScaleTest; @@ -31,6 +32,8 @@ import org.junit.jupiter.api.Test; import static org.apache.hadoop.fs.contract.ContractTestUtils.skip; +import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_ALGORITHM; +import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_GENERATION; import static org.apache.hadoop.fs.s3a.S3ATestConstants.DEFAULT_MULTIPART_COMMIT_CONSUMES_UPLOAD_ID; import static org.apache.hadoop.fs.s3a.S3ATestConstants.DEFAULT_SCALE_TESTS_ENABLED; import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_HUGE_PARTITION_SIZE; @@ -40,9 +43,12 @@ import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeMultipartUploads; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeNotS3ExpressFileSystem; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.disableFilesystemCaching; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestPropertyBool; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestPropertyBytes; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfAnalyticsAcceleratorEnabled; +import static org.apache.hadoop.fs.s3a.impl.ChecksumSupport.getChecksumAlgorithm; import static org.apache.hadoop.fs.s3a.scale.AbstractSTestS3AHugeFiles.DEFAULT_HUGE_PARTITION_SIZE; /** @@ -105,6 +111,17 @@ protected boolean finalizeConsumesUploadIdImmediately() { return mpuCommitConsumesUploadId; } + @Override + protected Configuration createConfiguration() { + final Configuration conf = super.createConfiguration(); + // use whatever the default checksum generation option is. 
+ removeBaseAndBucketOverrides(conf, CHECKSUM_GENERATION, CHECKSUM_ALGORITHM); + conf.setBoolean(CHECKSUM_GENERATION, false); + conf.set(CHECKSUM_ALGORITHM, ChecksumSupport.NONE); + disableFilesystemCaching(conf); + return conf; + } + @BeforeEach @Override public void setup() throws Exception { @@ -117,14 +134,16 @@ public void setup() throws Exception { assume("Scale test disabled: to enable set property " + KEY_SCALE_TESTS_ENABLED, enabled); - assumeMultipartUploads(getFileSystem().getConf()); + final Configuration fsConf = getFileSystem().getConf(); + assumeMultipartUploads(fsConf); partitionSize = (int) getTestPropertyBytes(conf, KEY_HUGE_PARTITION_SIZE, DEFAULT_HUGE_PARTITION_SIZE); - mpuCommitConsumesUploadId = getFileSystem().getConf().getBoolean( + mpuCommitConsumesUploadId = fsConf.getBoolean( MULTIPART_COMMIT_CONSUMES_UPLOAD_ID, DEFAULT_MULTIPART_COMMIT_CONSUMES_UPLOAD_ID); - LOG.debug("{} = {}", MULTIPART_COMMIT_CONSUMES_UPLOAD_ID, mpuCommitConsumesUploadId); + LOG.info("{} = {}", MULTIPART_COMMIT_CONSUMES_UPLOAD_ID, mpuCommitConsumesUploadId); + LOG.info("{} = {}", CHECKSUM_ALGORITHM, getChecksumAlgorithm(fsConf)); } /** @@ -146,6 +165,7 @@ public void testMultipartUploadReverseOrder() throws Exception { @Override public void testMultipartUploadReverseOrderNonContiguousPartNumbers() throws Exception { assumeNotS3ExpressFileSystem(getFileSystem()); + final Configuration fsConf = getFileSystem().getConf(); super.testMultipartUploadReverseOrderNonContiguousPartNumbers(); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java index 4b4c4efc0f78f..665703cc26f5d 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AChecksum.java @@ -38,7 +38,7 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.skip; import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_ALGORITHM; import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_GENERATION; -import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_GENERATION_DEFAULT; +import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_CHECKSUM_GENERATION; import static org.apache.hadoop.fs.s3a.Constants.CHECKSUM_VALIDATION; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestBucketName; @@ -101,7 +101,7 @@ protected Configuration createConfiguration() { LOG.info("Using checksum algorithm {}/{}", algorithmName, checksumAlgorithm); assume("Skipping checksum tests as " + CHECKSUM_GENERATION + " is set", propagateBucketOptions(conf, getTestBucketName(conf)) - .getBoolean(CHECKSUM_GENERATION, CHECKSUM_GENERATION_DEFAULT)); + .getBoolean(CHECKSUM_GENERATION, DEFAULT_CHECKSUM_GENERATION)); return conf; } From a5fd3db60f9f26e6d36781eb52c1cb482206b5cb Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Fri, 24 Oct 2025 20:44:47 +0100 Subject: [PATCH 23/24] HADOOP-19654. 
SDK checksum doc/javadoc tuning --- .../hadoop/fs/s3a/impl/ChecksumSupport.java | 10 +++++- .../site/markdown/tools/hadoop-aws/index.md | 33 +++++++++++++++---- .../tools/hadoop-aws/troubleshooting_s3a.md | 10 +++--- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java index 3f3dffaa92685..567eb571012a8 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChecksumSupport.java @@ -40,8 +40,14 @@ public final class ChecksumSupport { */ public static final String NONE = "NONE"; + /** + * CRC32C, mapped to CRC32_C algorithm class. + */ public static final String CRC32C = "CRC32C"; + /** + * CRC64NVME, mapped to CRC64_NVME algorithm class. + */ public static final String CRC64NVME = "CRC64NVME"; private ChecksumSupport() { @@ -70,7 +76,9 @@ public static ChecksumAlgorithm getChecksumAlgorithm(Configuration conf) { ChecksumAlgorithm.class, configValue -> { // default values and handling algorithms names without underscores. - String val = configValue == null ? NONE : configValue.toUpperCase(Locale.ROOT); + String val = configValue == null + ? NONE + : configValue.toUpperCase(Locale.ROOT); switch (val) { case "": case NONE: diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md index a8af1bc5a4664..a4a08f093ef1f 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md @@ -1753,7 +1753,7 @@ in these metrics. ## Checksums -The S3 Client can use checksums in its requests to an S3 store in a number of ways +The S3 Client can use checksums in its requests to an S3 store in a number of ways: 1. To provide a checksum of the request headers. 2. To provide a `Content-MD5` hash of the request headers @@ -1761,7 +1761,7 @@ The S3 Client can use checksums in its requests to an S3 store in a number of wa 4. To validate data downloaded from the store. The various options available can impact performance and compatibility. -To understand the risks and issues here know that +To understand the risks and issues here know that: * Request checksum generation (item 1) and validation (4) can be done "when required" or "always". The "always" option is stricter, but can result in third-party compatibility issues * Some third-party stores require the `Content-MD5` header and will fail without it (item 2) @@ -1776,23 +1776,44 @@ To understand the risks and issues here know that | `fs.s3a.checksum.validation` | Validate checksums on download | boolean | `false` | | `fs.s3a.create.checksum.algorithm` | Checksum Algorithm when creating/copying files | `NONE`, `CRC32`, `CRC32C`, `CRC32_C`, `CRC64NVME` , `CRC64_NVME`, `SHA256`, `SHA1` | `""` | + +Turning on checksum generation and validation may seem like obvious actions, but consider +this: you are communicating with an S3 store over an HTTPS channels, which includes +cryptographically strong HMAC checksums of every block transmitted. +These are far more robust than the CRC* algorithms, and the computational cost is already +being paid for: so why add more? 
+ +With TLS ensuring the network traffic isn't altered from the moment it is encrypted to when +it is decrypted, all extra checksum generation/validation does is ensure that there's no +accidental corruption between the data being generated and uploaded, or between being downloaded and read. + +This could potentially deal with memory/buffering/bus issues on the servers. +However this is what ECC RAM is for. +If you do suspect requests being corrupted during writing or reading, the options may +be worth considering. +As it is, they are off by default to avoid compatibility problems. + +Note: if you have a real example of where these checksum options have identified memory corruption, +please let us know. + ### Content-MD5 Header on requests: `fs.s3a.request.md5.header` Send a `Content-MD5 header` with every request? This header is required when interacting with some third-party stores. -It is supported by AWS S3, though has shown some unexpected behavior with AWS S3 Express storage +It is supported by AWS S3, though has has some unexpected behavior with AWS S3 Express storage [issue 6459](https://github.com/aws/aws-sdk-java-v2/issues/6459). -As that appears to have been fixed in the SDK, this option is enabled by default. +As that appears to have been fixed in the 2.35.4 SDK release, this option is enabled by default. ### Request checksum generation: `fs.s3a.checksum.generation` Should checksums be generated for all requests made to the store? * Incompatible with some third-party stores -* If `true` then multipart upload (i.e. large file upload) may fail if `fs.s3a.create.checksum.algorithm` is not set to an algorithm other than `NONE`. +* If `true` then multipart upload (i.e. large file upload) may fail if `fs.s3a.create.checksum.algorithm` + is not set to a valid algorithm (i.e. something other than `NONE`) -Set to `false` by default to avoid problems. +Set `fs.s3a.checksum.generation` to `false` by default to avoid these problems. ### Checksum validation `fs.s3a.checksum.validation` diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md index f9ddb68cd31e0..9b70f12dc8a6b 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md @@ -354,7 +354,7 @@ org.apache.hadoop.fs.s3a.AWSBadRequestException: upload part #1 upload ID 112233 This is an obscure failure which was encountered as part of [HADOOP-19221](https://issues.apache.org/jira/browse/HADOOP-19221) : an upload of part of a file could not -be succesfully retried after a failure was reported on the first attempt. +be successfully retried after a failure was reported on the first attempt. 1. It was only encountered during uploading files via the Staging Committers 2. And is a regression in the V2 SDK. @@ -364,7 +364,7 @@ be succesfully retried after a failure was reported on the first attempt. * If it is encountered on a release without the fix, please upgrade. It may be that the problem arises in the AWS SDK's "TransferManager", which is used for a -higher performance upload of data from the local fileystem. If this is the case. disable this feature: +higher performance upload of data from the local filesystem. If this is the case. 
disable this feature: ``` fs.s3a.optimized.copy.from.local.enabled @@ -614,13 +614,13 @@ This can surface when trying to interact, especially write data, to a third-part ``` The store does not recognize checksum calculation on every operation. -Fix: disable it by setting `fs.s3a.checksum.calculation.enabled` to `false`. +Fix: disable it by setting `fs.s3a.checksum.generation` to `false`. ``` - fs.s3a.checksum.calculation.enabled + fs.s3a.checksum.generation false - Calculate and attach a message checksum on every operation. (default: true) + Calculate and attach a message checksum on every operation. (default: false) ``` From 149e98291ada965d1dc2b85b4214f235bb4b5c5d Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Fri, 24 Oct 2025 21:34:34 +0100 Subject: [PATCH 24/24] HADOOP-19654. Fix AbstractContractUnbufferTest to read reliably --- .../fs/contract/AbstractContractUnbufferTest.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractUnbufferTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractUnbufferTest.java index 59eb57af8135e..0392c22fbee41 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractUnbufferTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractUnbufferTest.java @@ -18,6 +18,7 @@ package org.apache.hadoop.fs.contract; +import org.assertj.core.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -30,6 +31,7 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.createFile; import static org.apache.hadoop.fs.contract.ContractTestUtils.dataset; +import static org.apache.hadoop.fs.contract.ContractTestUtils.readNBytes; /** * Contract tests for {@link org.apache.hadoop.fs.CanUnbuffer#unbuffer}. @@ -145,10 +147,12 @@ protected void validateFileContents(FSDataInputStream stream, int length, int startIndex) throws IOException { byte[] streamData = new byte[length]; - assertEquals(length, stream.read(streamData), - "failed to read expected number of bytes from " - + "stream. This may be transient"); + final int read = readNBytes(stream, streamData, 0, length); + Assertions.assertThat(read) + .describedAs("failed to read expected number of bytes from stream. %s", stream) + .isEqualTo(length); byte[] validateFileBytes; + if (startIndex == 0 && length == fileBytes.length) { validateFileBytes = fileBytes; } else {