From 555a57adf2fa5f2d50c1141da907ae4ca032db2c Mon Sep 17 00:00:00 2001
From: Chris Nauroth <cnauroth@apache.org>
Date: Tue, 16 Aug 2016 17:05:52 -0700
Subject: [PATCH] HADOOP-13324. s3a tests don't authenticate with S3 frankfurt
 (or other V4 auth only endpoints). Contributed by Steve Loughran.

(cherry picked from commit 3808876c7397ea68906bc5cc18fdf690c9c42131)
---
 .../site/markdown/tools/hadoop-aws/index.md   | 247 +++++++++++++++---
 .../apache/hadoop/fs/s3a/S3ATestUtils.java    |  15 ++
 .../fs/s3a/TestS3AAWSCredentialsProvider.java |   1 +
 .../hadoop/fs/s3a/scale/S3AScaleTestBase.java |  15 +-
 .../scale/TestS3AInputStreamPerformance.java  |   2 +
 5 files changed, 247 insertions(+), 33 deletions(-)
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
index fc606d20998..3318c794a32 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
@@ -631,6 +631,60 @@ this capability.
       any call to setReadahead() is made to an open stream.</description>
     </property>
 
+### Working with buckets in different regions
+
+S3 Buckets are hosted in different regions, the default being US-East.
+The client talks to it by default, under the URL `s3.amazonaws.com`
+
+S3A can work with buckets from any region. Each region has its own
+S3 endpoint, documented [by Amazon](http://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region).
+
+1. Applications running in EC2 infrastructure do not pay for IO to/from
+*local S3 buckets*. They will be billed for access to remote buckets. Always
+use local buckets and local copies of data, wherever possible.
+1. The default S3 endpoint can support data IO with any bucket when the V1 request
+signing protocol is used.
+1. When the V4 signing protocol is used, AWS requires the explicit region endpoint
+to be used —hence S3A must be configured to use the specific endpoint. This
+is done in the configuration option `fs.s3a.endpoint`.
+1. All endpoints other than the default endpoint only support interaction
+with buckets local to that S3 instance.
+
+While it is generally simpler to use the default endpoint, working with
+V4-signing-only regions (Frankfurt, Seoul) requires the endpoint to be identified.
+Expect better performance from direct connections —traceroute will give you some insight.
+
+Examples:
+
+The default endpoint:
+
+```xml
+<property>
+  <name>fs.s3a.endpoint</name>
+  <value>s3.amazonaws.com</value>
+</property>
+```
+
+Frankfurt
+
+```xml
+<property>
+  <name>fs.s3a.endpoint</name>
+  <value>s3.eu-central-1.amazonaws.com</value>
+</property>
+```
+
+Seoul
+```xml
+<property>
+  <name>fs.s3a.endpoint</name>
+  <value>s3.ap-northeast-2.amazonaws.com</value>
+</property>
+```
+
+If the wrong endpoint is used, the request may fail. This may be reported as a 301/redirect error,
+or as a 400 Bad Request.
+
 ### S3AFastOutputStream
  **Warning: NEW in hadoop 2.7. UNSTABLE, EXPERIMENTAL: use at own risk**
 
@@ -816,8 +870,6 @@ of environment-variable authentication by attempting to use the `hdfs fs` comman
 to read or write data on S3. That is: comment out the `fs.s3a` secrets and rely on
 the environment variables.
 
-S3 Frankfurt is a special case. It uses the V4 authentication API.
-
 ### Authentication failures running on Java 8u60+
 
 A change in the Java 8 JVM broke some of the `toString()` string generation
@@ -826,6 +878,106 @@ generate authentication headers suitable for validation by S3.
 
 Fix: make sure that the version of Joda Time is 2.8.1 or later.
 
+### "Bad Request" exception when working with AWS S3 Frankfurt, Seoul, or elsewhere
+
+
+S3 Frankfurt and Seoul *only* support
+[the V4 authentication API](http://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-authenticating-requests.html).
+
+Requests using the V2 API will be rejected with 400 `Bad Request`
+
+```
+$ bin/hadoop fs -ls s3a://frankfurt/
+WARN s3a.S3AFileSystem: Client: Amazon S3 error 400: 400 Bad Request; Bad Request (retryable)
+
+com.amazonaws.services.s3.model.AmazonS3Exception: Bad Request (Service: Amazon S3; Status Code: 400; Error Code: 400 Bad Request; Request ID: 923C5D9E75E44C06), S3 Extended Request ID: HDwje6k+ANEeDsM6aJ8+D5gUmNAMguOk2BvZ8PH3g9z0gpH+IuwT7N19oQOnIr5CIx7Vqb/uThE=
+	at com.amazonaws.http.AmazonHttpClient.handleErrorResponse(AmazonHttpClient.java:1182)
+	at com.amazonaws.http.AmazonHttpClient.executeOneRequest(AmazonHttpClient.java:770)
+	at com.amazonaws.http.AmazonHttpClient.executeHelper(AmazonHttpClient.java:489)
+	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:310)
+	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3785)
+	at com.amazonaws.services.s3.AmazonS3Client.headBucket(AmazonS3Client.java:1107)
+	at com.amazonaws.services.s3.AmazonS3Client.doesBucketExist(AmazonS3Client.java:1070)
+	at org.apache.hadoop.fs.s3a.S3AFileSystem.verifyBucketExists(S3AFileSystem.java:307)
+	at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:284)
+	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2793)
+	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:101)
+	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2830)
+	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2812)
+	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:389)
+	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:356)
+	at org.apache.hadoop.fs.shell.PathData.expandAsGlob(PathData.java:325)
+	at org.apache.hadoop.fs.shell.Command.expandArgument(Command.java:235)
+	at org.apache.hadoop.fs.shell.Command.expandArguments(Command.java:218)
+	at org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:103)
+	at org.apache.hadoop.fs.shell.Command.run(Command.java:165)
+	at org.apache.hadoop.fs.FsShell.run(FsShell.java:315)
+	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)
+	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:90)
+	at org.apache.hadoop.fs.FsShell.main(FsShell.java:373)
+ls: doesBucketExist on frankfurt-new: com.amazonaws.services.s3.model.AmazonS3Exception:
+  Bad Request (Service: Amazon S3; Status Code: 400; Error Code: 400 Bad Request;
+```
+
+This happens when trying to work with any S3 service which only supports the
+"V4" signing API —and he client is configured to use the default S3A service
+endpoint.
+
+The S3A client needs to be given the endpoint to use via the `fs.s3a.endpoint`
+property.
+
+```xml
+<property>
+  <name>fs.s3a.endpoint</name>
+  <value>s3.eu-central-1.amazonaws.com</value>
+</property>
+```
+
+### Error message "The bucket you are attempting to access must be addressed using the specified endpoint"
+
+This surfaces when `fs.s3a.endpoint` is configured to use S3 service endpoint
+which is neither the original AWS one, `s3.amazonaws.com` , nor the one where
+the bucket is hosted.
+
+```
+org.apache.hadoop.fs.s3a.AWSS3IOException: purging multipart uploads on landsat-pds:
+ com.amazonaws.services.s3.model.AmazonS3Exception:
+  The bucket you are attempting to access must be addressed using the specified endpoint.
+  Please send all future requests to this endpoint.
+   (Service: Amazon S3; Status Code: 301; Error Code: PermanentRedirect; Request ID: 5B7A5D18BE596E4B),
+    S3 Extended Request ID: uE4pbbmpxi8Nh7rycS6GfIEi9UH/SWmJfGtM9IeKvRyBPZp/hN7DbPyz272eynz3PEMM2azlhjE=:
+
+	at com.amazonaws.http.AmazonHttpClient.handleErrorResponse(AmazonHttpClient.java:1182)
+	at com.amazonaws.http.AmazonHttpClient.executeOneRequest(AmazonHttpClient.java:770)
+	at com.amazonaws.http.AmazonHttpClient.executeHelper(AmazonHttpClient.java:489)
+	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:310)
+	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3785)
+	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3738)
+	at com.amazonaws.services.s3.AmazonS3Client.listMultipartUploads(AmazonS3Client.java:2796)
+	at com.amazonaws.services.s3.transfer.TransferManager.abortMultipartUploads(TransferManager.java:1217)
+	at org.apache.hadoop.fs.s3a.S3AFileSystem.initMultipartUploads(S3AFileSystem.java:454)
+	at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:289)
+	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2715)
+	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:96)
+	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2749)
+	at org.apache.hadoop.fs.FileSystem$Cache.getUnique(FileSystem.java:2737)
+	at org.apache.hadoop.fs.FileSystem.newInstance(FileSystem.java:430)
+```
+
+1. Use the [Specific endpoint of the bucket's S3 service](http://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region)
+1. If not using "V4" authentication (see above), the original S3 endpoint
+can be used:
+
+```
+    <property>
+      <name>fs.s3a.endpoint</name>
+      <value>s3.amazonaws.com</value>
+    </property>
+```
+
+Using the explicit endpoint for the region is recommended for speed and the
+ability to use the V4 signing API.
+
 ## Visible S3 Inconsistency
 
 Amazon S3 is *an eventually consistent object store*. That is: not a filesystem.
@@ -1155,7 +1307,43 @@ that the file `contract-test-options.xml` does not contain any
 secret credentials itself. As the auth keys XML file is kept out of the
 source code tree, it is not going to get accidentally committed.
 
-### Running Tests against non-AWS storage infrastructures
+
+### Running the Tests
+
+After completing the configuration, execute the test run through Maven.
+
+    mvn clean test
+
+It's also possible to execute multiple test suites in parallel by enabling the
+`parallel-tests` Maven profile.  The tests spend most of their time blocked on
+network I/O with the S3 service, so running in parallel tends to complete full
+test runs faster.
+
+    mvn -Pparallel-tests clean test
+
+Some tests must run with exclusive access to the S3 bucket, so even with the
+`parallel-tests` profile enabled, several test suites will run in serial in a
+separate Maven execution step after the parallel tests.
+
+By default, the `parallel-tests` profile runs 4 test suites concurrently.  This
+can be tuned by passing the `testsThreadCount` argument.
+
+    mvn -Pparallel-tests -DtestsThreadCount=8 clean test
+
+### Testing against different regions
+
+S3A can connect to different regions —the tests support this. Simply
+define the target region in `contract-tests.xml` or any `auth-keys.xml`
+file referenced.
+
+```xml
+<property>
+  <name>fs.s3a.endpoint</name>
+  <value>s3.eu-central-1.amazonaws.com</value>
+</property>
+```
+This is used for all tests expect for scale tests using a Public CSV.gz file
+(see below)
 
 ### S3A session tests
 
@@ -1180,14 +1368,14 @@ An alternate endpoint may be defined in `test.fs.s3a.sts.endpoint`.
 
 The default is ""; meaning "use the amazon default value".
 
-#### CSV Data source
+#### CSV Data source Tests
 
 The `TestS3AInputStreamPerformance` tests require read access to a multi-MB
 text file. The default file for these tests is one published by amazon,
 [s3a://landsat-pds.s3.amazonaws.com/scene_list.gz](http://landsat-pds.s3.amazonaws.com/scene_list.gz).
 This is a gzipped CSV index of other files which amazon serves for open use.
 
-The path to this object is set in the option `fs.s3a.scale.test.csvfile`:
+The path to this object is set in the option `fs.s3a.scale.test.csvfile`,
 
     <property>
       <name>fs.s3a.scale.test.csvfile</name>
@@ -1196,21 +1384,37 @@ The path to this object is set in the option `fs.s3a.scale.test.csvfile`:
 
 1. If the option is not overridden, the default value is used. This
 is hosted in Amazon's US-east datacenter.
-1. If the property is empty, tests which require it will be skipped.
+1. If `fs.s3a.scale.test.csvfile` is empty, tests which require it will be skipped.
 1. If the data cannot be read for any reason then the test will fail.
 1. If the property is set to a different path, then that data must be readable
 and "sufficiently" large.
 
 To test on different S3 endpoints, or alternate infrastructures supporting
-the same APIs, the option `fs.s3a.scale.test.csvfile` must therefore be
+the same APIs, the option `fs.s3a.scale.test.csvfile` must either be
 set to " ", or an object of at least 10MB is uploaded to the object store, and
 the `fs.s3a.scale.test.csvfile` option set to its path.
 
-      <property>
-        <name>fs.s3a.scale.test.csvfile</name>
-        <value> </value>
-      </property>
+```xml
+<property>
+  <name>fs.s3a.scale.test.csvfile</name>
+  <value> </value>
+</property>
+```
 
+(the reason the space or newline is needed is to add "an empty entry"; an empty
+`<value/>` would be considered undefined and pick up the default)
+
+*Note:* if using a test file in an S3 region requiring a different endpoint value
+set in `fs.s3a.endpoint`, define it in `fs.s3a.scale.test.csvfile.endpoint`.
+If the default CSV file is used, the tests will automatically use the us-east
+endpoint:
+
+```xml
+<property>
+  <name>fs.s3a.scale.test.csvfile.endpoint</name>
+  <value>s3.amazonaws.com</value>
+</property>
+```
 
 #### Scale test operation count
 
@@ -1249,27 +1453,6 @@ smaller to achieve faster test runs.
         <value>10240</value>
       </property>
 
-### Running the Tests
-
-After completing the configuration, execute the test run through Maven.
-
-    mvn clean test
-
-It's also possible to execute multiple test suites in parallel by enabling the
-`parallel-tests` Maven profile.  The tests spend most of their time blocked on
-network I/O with the S3 service, so running in parallel tends to complete full
-test runs faster.
-
-    mvn -Pparallel-tests clean test
-
-Some tests must run with exclusive access to the S3 bucket, so even with the
-`parallel-tests` profile enabled, several test suites will run in serial in a
-separate Maven execution step after the parallel tests.
-
-By default, the `parallel-tests` profile runs 4 test suites concurrently.  This
-can be tuned by passing the `testsThreadCount` argument.
-
-    mvn -Pparallel-tests -DtestsThreadCount=8 clean test
 
 ### Testing against non AWS S3 endpoints.
 
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java
index 04010d635bf..39c60287d06 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java
@@ -21,6 +21,7 @@
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileContext;
+import org.apache.hadoop.fs.s3a.scale.S3AScaleTestBase;
 import org.junit.Assert;
 import org.junit.internal.AssumptionViolatedException;
 import org.slf4j.Logger;
@@ -133,6 +134,20 @@ public static void eventually(int timeout, Callable<Void> callback)
     throw lastException;
   }
 
+  /**
+   * patch the endpoint option so that irrespective of where other tests
+   * are working, the IO performance tests can work with the landsat
+   * images.
+   * @param conf configuration to patch
+   */
+  public static void useCSVDataEndpoint(Configuration conf) {
+    String endpoint = conf.getTrimmed(S3AScaleTestBase.KEY_CSVTEST_ENDPOINT,
+        S3AScaleTestBase.DEFAULT_CSVTEST_ENDPOINT);
+    if (!endpoint.isEmpty()) {
+      conf.set(ENDPOINT, endpoint);
+    }
+  }
+
   /**
    * The exception to raise so as to exit fast from
    * {@link #eventually(int, Callable)}.
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java
index a25ca9cd36a..4ff84079fd4 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java
@@ -133,6 +133,7 @@ public void testAnonymousProvider() throws Exception {
         AnonymousAWSCredentialsProvider.class.getName());
     Path testFile = new Path(
         conf.getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE));
+    S3ATestUtils.useCSVDataEndpoint(conf);
     FileSystem fs = FileSystem.newInstance(testFile.toUri(), conf);
     assertNotNull(fs);
     assertTrue(fs instanceof S3AFileSystem);
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/S3AScaleTestBase.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/S3AScaleTestBase.java
index 21639b151e9..d861a16230e 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/S3AScaleTestBase.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/S3AScaleTestBase.java
@@ -82,13 +82,26 @@ public static void nameThread() {
    */
   public static final String KEY_CSVTEST_FILE =
       S3A_SCALE_TEST + "csvfile";
-
   /**
    * Default path for the multi MB test file: {@value}.
    */
   public static final String DEFAULT_CSVTEST_FILE
       = "s3a://landsat-pds/scene_list.gz";
 
+  /**
+   * Endpoint for the S3 CSV/scale tests. This defaults to
+   * being us-east.
+   */
+  public static final String KEY_CSVTEST_ENDPOINT =
+      S3A_SCALE_TEST + "csvfile.endpoint";
+
+  /**
+   * Endpoint for the S3 CSV/scale tests. This defaults to
+   * being us-east.
+   */
+  public static final String DEFAULT_CSVTEST_ENDPOINT =
+      "s3.amazonaws.com";
+
   /**
    * The default number of operations to perform: {@value}.
    */
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/TestS3AInputStreamPerformance.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/TestS3AInputStreamPerformance.java
index bddd8e26c88..d6d9d66745b 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/TestS3AInputStreamPerformance.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/TestS3AInputStreamPerformance.java
@@ -28,6 +28,7 @@
 import org.apache.hadoop.fs.s3a.S3AInputPolicy;
 import org.apache.hadoop.fs.s3a.S3AInputStream;
 import org.apache.hadoop.fs.s3a.S3AInstrumentation;
+import org.apache.hadoop.fs.s3a.S3ATestUtils;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.CompressionCodec;
@@ -79,6 +80,7 @@ public void openFS() throws IOException {
       assumptionMessage = "Empty test property: " + KEY_CSVTEST_FILE;
       testDataAvailable = false;
     } else {
+      S3ATestUtils.useCSVDataEndpoint(conf);
       testData = new Path(testFile);
       Path path = this.testData;
       bindS3aFS(path);