diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md index 2641b870d2e..e1761538f21 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md @@ -544,6 +544,18 @@ which address issues. In particular, we encourage testing of Hadoop release candidates, as these third-party endpoints get even less testing than the S3 endpoint itself. +### Public datasets used in tests + +Some tests rely on the presence of existing public datasets available on Amazon S3. +You may find a number of these in `org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils`. + +When testing against an endpoint which is not part of Amazon S3's standard commercial partition +(`aws`) such as third-party implementations or AWS's China regions, you should replace these +configurations with an empty space (` `) to disable the tests or an existing path in your object +store that supports these tests. + +An example of this might be the MarkerTools tests which require a bucket with a large number of +objects or the requester pays tests that require requester pays to be enabled for the bucket. 
### Disabling the encryption tests diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ARequesterPays.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ARequesterPays.java index b8cb321f61a..9a818d037e4 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ARequesterPays.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ARequesterPays.java @@ -26,11 +26,11 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils; import org.apache.hadoop.fs.statistics.IOStatisticAssertions; import org.apache.hadoop.fs.statistics.StreamStatisticNames; import static org.apache.hadoop.fs.s3a.Constants.ALLOW_REQUESTER_PAYS; -import static org.apache.hadoop.fs.s3a.Constants.ENDPOINT; import static org.apache.hadoop.fs.s3a.Constants.S3A_BUCKET_PROBE; import static org.apache.hadoop.test.LambdaTestUtils.intercept; @@ -42,10 +42,15 @@ public class ITestS3ARequesterPays extends AbstractS3ATestBase { @Override protected Configuration createConfiguration() { Configuration conf = super.createConfiguration(); - S3ATestUtils.removeBaseAndBucketOverrides(conf, + + Path requesterPaysPath = getRequesterPaysPath(conf); + String requesterPaysBucketName = requesterPaysPath.toUri().getHost(); + S3ATestUtils.removeBaseAndBucketOverrides( + requesterPaysBucketName, + conf, ALLOW_REQUESTER_PAYS, - ENDPOINT, S3A_BUCKET_PROBE); + return conf; } @@ -102,14 +107,8 @@ public class ITestS3ARequesterPays extends AbstractS3ATestBase { } } - private Path getRequesterPaysPath(Configuration conf) { - String requesterPaysFile = - conf.getTrimmed(KEY_REQUESTER_PAYS_FILE, DEFAULT_REQUESTER_PAYS_FILE); - S3ATestUtils.assume( - "Empty test property: " + KEY_REQUESTER_PAYS_FILE, - !requesterPaysFile.isEmpty() - ); - return new 
Path(requesterPaysFile); + private static Path getRequesterPaysPath(Configuration conf) { + return new Path(PublicDatasetTestUtils.getRequesterPaysObject(conf)); } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java index 742c22ac5a5..6a74338cdeb 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java @@ -20,6 +20,9 @@ package org.apache.hadoop.fs.s3a; import java.time.Duration; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils; + /** * Constants for S3A Testing. */ @@ -99,14 +102,19 @@ public interface S3ATestConstants { /** * Configuration key for an existing object in a requester pays bucket: {@value}. - * If not set, defaults to {@value DEFAULT_REQUESTER_PAYS_FILE}. + * + * Accessible via + * {@link PublicDatasetTestUtils#getRequesterPaysObject(Configuration)}. */ String KEY_REQUESTER_PAYS_FILE = TEST_FS_S3A + "requester.pays.file"; /** - * Default path for an S3 object inside a requester pays enabled bucket: {@value}. + * Configuration key for an existing bucket with many objects: {@value}. + * + * This is used for tests depending on buckets with a large number of keys. */ - String DEFAULT_REQUESTER_PAYS_FILE = "s3a://usgs-landsat/collection02/catalog.json"; + String KEY_BUCKET_WITH_MANY_OBJECTS + = TEST_FS_S3A + "bucket-with-many-objects"; /** * Name of the property to define the timeout for scale tests: {@value}. 
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/test/PublicDatasetTestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/test/PublicDatasetTestUtils.java
new file mode 100644
index 00000000000..669acd8b8bd
--- /dev/null
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/test/PublicDatasetTestUtils.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.test;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.s3a.S3ATestUtils;
+
+import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_BUCKET_WITH_MANY_OBJECTS;
+import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_REQUESTER_PAYS_FILE;
+
+/**
+ * Provides S3A filesystem URIs for public data sets for specific use cases.
+ *
+ * This allows for the contract between S3A tests and the existence of data sets
+ * to be explicit and also standardizes access and configuration of
+ * replacements.
+ *
+ * Bucket specific configuration such as endpoint or requester pays should be
+ * configured within "hadoop-tools/hadoop-aws/src/test/resources/core-site.xml".
+ *
+ * Warning: methods may mutate the configuration instance passed in.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public final class PublicDatasetTestUtils {
+
+  /**
+   * Default path for an object inside a requester pays bucket: {@value}.
+   */
+  private static final String DEFAULT_REQUESTER_PAYS_FILE
+      = "s3a://usgs-landsat/collection02/catalog.json";
+
+  /**
+   * Default bucket for an S3A file system with many objects: {@value}.
+   *
+   * We use a subdirectory to ensure we have permissions on all objects
+   * contained within as well as permission to inspect the directory itself.
+   */
+  private static final String DEFAULT_BUCKET_WITH_MANY_OBJECTS
+      = "s3a://usgs-landsat/collection02/level-1/";
+
+  /**
+   * Private constructor for utility class.
+   */
+  private PublicDatasetTestUtils() {}
+
+  /**
+   * Provide a URI for a directory containing many objects.
+   *
+   * Unless otherwise configured,
+   * this will be {@value #DEFAULT_BUCKET_WITH_MANY_OBJECTS}.
+   *
+   * @param conf Hadoop configuration
+   * @return S3A FS URI
+   */
+  public static String getBucketPrefixWithManyObjects(Configuration conf) {
+    return fetchFromConfig(conf,
+        KEY_BUCKET_WITH_MANY_OBJECTS, DEFAULT_BUCKET_WITH_MANY_OBJECTS);
+  }
+
+  /**
+   * Provide a URI to an object within a requester pays enabled bucket.
+   *
+   * Unless otherwise configured,
+   * this will be {@value #DEFAULT_REQUESTER_PAYS_FILE}.
+   *
+   * @param conf Hadoop configuration
+   * @return S3A FS URI
+   */
+  public static String getRequesterPaysObject(Configuration conf) {
+    return fetchFromConfig(conf,
+        KEY_REQUESTER_PAYS_FILE, DEFAULT_REQUESTER_PAYS_FILE);
+  }
+
+  /**
+   * Read a test property, trimmed, falling back to a default when unset.
+   *
+   * The result is checked with {@code S3ATestUtils.assume}: a property
+   * overridden with only whitespace trims to an empty string and fails
+   * that check (presumably skipping the calling test) instead of being
+   * returned as an unusable URI.
+   *
+   * @param conf Hadoop configuration
+   * @param key name of the test property
+   * @param defaultValue value used when the property is not set
+   * @return the trimmed, non-empty property value
+   */
+  private static String fetchFromConfig(
+      Configuration conf, String key, String defaultValue) {
+    String value = conf.getTrimmed(key, defaultValue);
+
+    S3ATestUtils.assume("Empty test property: " + key, !value.isEmpty());
+
+    return value;
+  }
+}
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/tools/ITestMarkerTool.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/tools/ITestMarkerTool.java
index fc1abc19dd8..127fcbab750 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/tools/ITestMarkerTool.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/tools/ITestMarkerTool.java
@@ -28,9 +28,11 @@ import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.contract.ContractTestUtils;
+import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
 import org.apache.hadoop.fs.s3a.S3AFileSystem;
 
 import static org.apache.hadoop.fs.s3a.Constants.DIRECTORY_MARKER_POLICY_AUTHORITATIVE;
@@ -307,22 +309,25 @@ public class ITestMarkerTool extends AbstractMarkerToolTest {
   }
 
   /**
-   * Run an audit against the landsat bucket.
+   * Run an audit against a bucket with a large number of objects.
    *

* This tests paging/scale against a larger bucket without * worrying about setup costs. */ @Test - public void testRunLimitedLandsatAudit() throws Throwable { - describe("Audit a few thousand landsat objects"); + public void testRunAuditManyObjectsInBucket() throws Throwable { + describe("Audit a few thousand objects"); final File audit = tempAuditFile(); + Configuration conf = super.createConfiguration(); + String bucketUri = PublicDatasetTestUtils.getBucketPrefixWithManyObjects(conf); + runToFailure(EXIT_INTERRUPTED, MARKERS, AUDIT, m(OPT_LIMIT), 3000, m(OPT_OUT), audit, - LANDSAT_BUCKET); + bucketUri); readOutput(audit); } diff --git a/hadoop-tools/hadoop-aws/src/test/resources/core-site.xml b/hadoop-tools/hadoop-aws/src/test/resources/core-site.xml index 300bd305fa7..1525f51d9d6 100644 --- a/hadoop-tools/hadoop-aws/src/test/resources/core-site.xml +++ b/hadoop-tools/hadoop-aws/src/test/resources/core-site.xml @@ -30,6 +30,8 @@ false + + fs.s3a.bucket.landsat-pds.endpoint ${central.endpoint} @@ -55,6 +57,31 @@ Do not add the referrer header to landsat operations + + + + fs.s3a.bucket.usgs-landsat.endpoint + ${central.endpoint} + + + + fs.s3a.bucket.usgs-landsat.requester.pays.enabled + true + usgs-landsat requires requester pays enabled + + + + fs.s3a.bucket.usgs-landsat.multipart.purge + false + Don't try to purge uploads in the read-only bucket, as + it will only create log noise. + + + + fs.s3a.bucket.usgs-landsat.audit.add.referrer.header + false + +