HADOOP-15426 Make S3guard client resilient to DDB throttle events and network failures (Contributed by Steve Loughran)

This commit is contained in:
Steve Loughran 2018-09-12 21:04:49 -07:00 committed by Aaron Fabbri
parent d32a8d5d58
commit d7c0a08a1c
17 changed files with 1181 additions and 258 deletions

View File

@ -1447,19 +1447,28 @@
<name>fs.s3a.s3guard.ddb.max.retries</name>
<value>9</value>
<description>
Max retries on batched DynamoDB operations before giving up and
throwing an IOException. Each retry is delayed with an exponential
Max retries on throttled/incomplete DynamoDB operations
before giving up and throwing an IOException.
Each retry is delayed with an exponential
backoff timer which starts at 100 milliseconds and approximately
doubles each time. The minimum wait before throwing an exception is
sum(100, 200, 400, 800, .. 100*2^N-1 ) == 100 * ((2^N)-1)
So N = 9 yields at least 51.1 seconds (51,100 milliseconds) of blocking
before throwing an IOException.
</description>
</property>
<property>
<name>fs.s3a.s3guard.ddb.throttle.retry.interval</name>
<value>100ms</value>
<description>
Initial interval to retry after a request is throttled;
the back-off policy is exponential until the number of retries in
fs.s3a.s3guard.ddb.max.retries is reached.
</description>
</property>
<property>
<name>fs.s3a.s3guard.ddb.background.sleep</name>
<value>25</value>
<value>25ms</value>
<description>
Length (in milliseconds) of pause between each batch of deletes when
pruning metadata. Prevents prune operations (which can typically be low

View File

@ -184,6 +184,8 @@
<exclude>**/ITestS3AFileContextStatistics.java</exclude>
<exclude>**/ITestS3AEncryptionSSEC*.java</exclude>
<exclude>**/ITestS3AHuge*.java</exclude>
<!-- this sets out to overload DynamoDB, so must be run standalone -->
<exclude>**/ITestDynamoDBMetadataStoreScale.java</exclude>
</excludes>
</configuration>
</execution>
@ -216,6 +218,8 @@
<include>**/ITestS3AFileContextStatistics.java</include>
<include>**/ITestS3AHuge*.java</include>
<include>**/ITestS3AEncryptionSSEC*.java</include>
<!-- this sets out to overload DynamoDB, so must be run standalone -->
<include>**/ITestDynamoDBMetadataStoreScale.java</include>
</includes>
</configuration>
</execution>

View File

@ -458,12 +458,20 @@ public final class Constants {
@InterfaceStability.Unstable
public static final String S3GUARD_DDB_MAX_RETRIES =
"fs.s3a.s3guard.ddb.max.retries";
/**
* Max retries on batched DynamoDB operations before giving up and
* Max retries on batched/throttled DynamoDB operations before giving up and
* throwing an IOException. Default is {@value}. See core-default.xml for
* more detail.
*/
public static final int S3GUARD_DDB_MAX_RETRIES_DEFAULT = 9;
public static final int S3GUARD_DDB_MAX_RETRIES_DEFAULT =
DEFAULT_MAX_ERROR_RETRIES;
@InterfaceStability.Unstable
public static final String S3GUARD_DDB_THROTTLE_RETRY_INTERVAL =
"fs.s3a.s3guard.ddb.throttle.retry.interval";
public static final String S3GUARD_DDB_THROTTLE_RETRY_INTERVAL_DEFAULT =
"100ms";
/**
* Period of time (in milliseconds) to sleep between batches of writes.

View File

@ -1131,6 +1131,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities {
/**
* Increment a statistic by 1.
* This increments both the instrumentation and storage statistics.
* @param statistic The operation to increment
*/
protected void incrementStatistic(Statistic statistic) {
@ -1139,6 +1140,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities {
/**
* Increment a statistic by a specific value.
* This increments both the instrumentation and storage statistics.
* @param statistic The operation to increment
* @param count the count to increment
*/
@ -1175,8 +1177,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities {
Statistic stat = isThrottleException(ex)
? STORE_IO_THROTTLED
: IGNORED_ERRORS;
instrumentation.incrementCounter(stat, 1);
storageStatistics.incrementCounter(stat, 1);
incrementStatistic(stat);
}
/**
@ -1197,6 +1198,11 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities {
/**
* Callback from {@link Invoker} when an operation against a metastore
* is retried.
* Always increments the {@link Statistic#S3GUARD_METADATASTORE_RETRY}
* statistic/counter;
* if it is a throttling exception will update the associated
* throttled metrics/statistics.
*
* @param ex exception
* @param retries number of retries
* @param idempotent is the method idempotent
@ -1205,6 +1211,11 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities {
int retries,
boolean idempotent) {
operationRetried(ex);
incrementStatistic(S3GUARD_METADATASTORE_RETRY);
if (isThrottleException(ex)) {
incrementStatistic(S3GUARD_METADATASTORE_THROTTLED);
instrumentation.addValueToQuantiles(S3GUARD_METADATASTORE_THROTTLE_RATE, 1);
}
}
/**

View File

@ -1032,15 +1032,14 @@ public class S3AInstrumentation implements Closeable, MetricsSource {
* Throttled request.
*/
public void throttled() {
incrementCounter(S3GUARD_METADATASTORE_THROTTLED, 1);
addValueToQuantiles(S3GUARD_METADATASTORE_THROTTLE_RATE, 1);
// counters are incremented by owner.
}
/**
* S3Guard is retrying after a (retryable) failure.
*/
public void retrying() {
incrementCounter(S3GUARD_METADATASTORE_RETRY, 1);
// counters are incremented by owner.
}
}

View File

@ -124,12 +124,7 @@ public class S3ARetryPolicy implements RetryPolicy {
// and a separate policy for throttle requests, which are considered
// repeatable, even for non-idempotent calls, as the service
// rejected the call entirely
throttlePolicy = exponentialBackoffRetry(
conf.getInt(RETRY_THROTTLE_LIMIT, RETRY_THROTTLE_LIMIT_DEFAULT),
conf.getTimeDuration(RETRY_THROTTLE_INTERVAL,
RETRY_THROTTLE_INTERVAL_DEFAULT,
TimeUnit.MILLISECONDS),
TimeUnit.MILLISECONDS);
throttlePolicy = createThrottleRetryPolicy(conf);
// client connectivity: fixed retries without care for idempotency
connectivityFailure = fixedRetries;
@ -139,6 +134,22 @@ public class S3ARetryPolicy implements RetryPolicy {
retryPolicy = retryByException(retryIdempotentCalls, policyMap);
}
/**
* Create the throttling policy.
* This will be called from the S3ARetryPolicy constructor, so
* subclasses must assume they are not initialized.
* @param conf configuration to use.
* @return the retry policy for throttling events.
*/
protected RetryPolicy createThrottleRetryPolicy(final Configuration conf) {
return exponentialBackoffRetry(
conf.getInt(RETRY_THROTTLE_LIMIT, RETRY_THROTTLE_LIMIT_DEFAULT),
conf.getTimeDuration(RETRY_THROTTLE_INTERVAL,
RETRY_THROTTLE_INTERVAL_DEFAULT,
TimeUnit.MILLISECONDS),
TimeUnit.MILLISECONDS);
}
/**
* Subclasses can override this like a constructor to change behavior: call
* superclass method, then modify it as needed, and return it.
@ -206,6 +217,7 @@ public class S3ARetryPolicy implements RetryPolicy {
int retries,
int failovers,
boolean idempotent) throws Exception {
Preconditions.checkArgument(exception != null, "Null exception");
Exception ex = exception;
if (exception instanceof AmazonClientException) {
// uprate the amazon client exception for the purpose of exception

View File

@ -27,6 +27,7 @@ import com.amazonaws.SdkBaseException;
import com.amazonaws.auth.AWSCredentialsProvider;
import com.amazonaws.auth.EnvironmentVariableCredentialsProvider;
import com.amazonaws.auth.InstanceProfileCredentialsProvider;
import com.amazonaws.retry.RetryUtils;
import com.amazonaws.services.dynamodbv2.model.AmazonDynamoDBException;
import com.amazonaws.services.dynamodbv2.model.LimitExceededException;
import com.amazonaws.services.dynamodbv2.model.ProvisionedThroughputExceededException;
@ -358,8 +359,10 @@ public final class S3AUtils {
/**
* Is the exception an instance of a throttling exception. That
* is an AmazonServiceException with a 503 response, any
* exception from DynamoDB for limits exceeded, or an
* {@link AWSServiceThrottledException}.
* exception from DynamoDB for limits exceeded, an
* {@link AWSServiceThrottledException},
* or anything which the AWS SDK's RetryUtils considers to be
* a throttling exception.
* @param ex exception to examine
* @return true if it is considered a throttling exception
*/
@ -368,7 +371,9 @@ public final class S3AUtils {
|| ex instanceof ProvisionedThroughputExceededException
|| ex instanceof LimitExceededException
|| (ex instanceof AmazonServiceException
&& 503 == ((AmazonServiceException)ex).getStatusCode());
&& 503 == ((AmazonServiceException)ex).getStatusCode())
|| (ex instanceof SdkBaseException
&& RetryUtils.isThrottlingException((SdkBaseException) ex));
}
/**

View File

@ -36,10 +36,12 @@ import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
import com.amazonaws.AmazonClientException;
import com.amazonaws.AmazonServiceException;
import com.amazonaws.auth.AWSCredentialsProvider;
import com.amazonaws.services.dynamodbv2.AmazonDynamoDB;
import com.amazonaws.services.dynamodbv2.document.BatchWriteItemOutcome;
@ -77,12 +79,12 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3a.AWSCredentialProviderList;
import org.apache.hadoop.fs.s3a.AWSServiceThrottledException;
import org.apache.hadoop.fs.s3a.Constants;
import org.apache.hadoop.fs.s3a.Invoker;
import org.apache.hadoop.fs.s3a.Retries;
import org.apache.hadoop.fs.s3a.S3AFileSystem;
import org.apache.hadoop.fs.s3a.S3AInstrumentation;
import org.apache.hadoop.fs.s3a.S3ARetryPolicy;
import org.apache.hadoop.fs.s3a.S3AUtils;
import org.apache.hadoop.fs.s3a.Tristate;
import org.apache.hadoop.fs.s3a.auth.RolePolicies;
@ -198,10 +200,6 @@ public class DynamoDBMetadataStore implements MetadataStore {
public static final String E_INCOMPATIBLE_VERSION
= "Database table is from an incompatible S3Guard version.";
/** Initial delay for retries when batched operations get throttled by
* DynamoDB. Value is {@value} msec. */
public static final long MIN_RETRY_SLEEP_MSEC = 100;
@VisibleForTesting
static final String DESCRIPTION
= "S3Guard metadata store in DynamoDB";
@ -214,6 +212,13 @@ public class DynamoDBMetadataStore implements MetadataStore {
@VisibleForTesting
static final String TABLE = "table";
@VisibleForTesting
static final String HINT_DDB_IOPS_TOO_LOW
= " This may be because the write threshold of DynamoDB is set too low.";
@VisibleForTesting
static final String THROTTLING = "Throttling";
private static ValueMap deleteTrackingValueMap =
new ValueMap().withBoolean(":false", false);
@ -226,7 +231,14 @@ public class DynamoDBMetadataStore implements MetadataStore {
private Configuration conf;
private String username;
private RetryPolicy dataAccessRetryPolicy;
/**
* This policy is mostly for batched writes, not for processing
* exceptions in invoke() calls.
* It also has a role in {@link #getVersionMarkerItem()};
* look at that method for the details.
*/
private RetryPolicy batchWriteRetryPolicy;
private S3AInstrumentation.S3GuardInstrumentation instrumentation;
/** Owner FS: only valid if configured with an owner FS. */
@ -237,8 +249,15 @@ public class DynamoDBMetadataStore implements MetadataStore {
Invoker.NO_OP
);
/** Data access can have its own policies. */
private Invoker dataAccess;
/** Invoker for read operations. */
private Invoker readOp;
/** Invoker for write operations. */
private Invoker writeOp;
private final AtomicLong readThrottleEvents = new AtomicLong(0);
private final AtomicLong writeThrottleEvents = new AtomicLong(0);
private final AtomicLong batchWriteCapacityExceededEvents = new AtomicLong(0);
/**
* Total limit on the number of throttle events after which
@ -292,10 +311,8 @@ public class DynamoDBMetadataStore implements MetadataStore {
Preconditions.checkNotNull(fs, "Null filesystem");
Preconditions.checkArgument(fs instanceof S3AFileSystem,
"DynamoDBMetadataStore only supports S3A filesystem.");
owner = (S3AFileSystem) fs;
instrumentation = owner.getInstrumentation().getS3GuardInstrumentation();
bindToOwnerFilesystem((S3AFileSystem) fs);
final String bucket = owner.getBucket();
conf = owner.getConf();
String confRegion = conf.getTrimmed(S3GUARD_DDB_REGION_KEY);
if (!StringUtils.isEmpty(confRegion)) {
region = confRegion;
@ -316,7 +333,6 @@ public class DynamoDBMetadataStore implements MetadataStore {
}
LOG.debug("Inferring DynamoDB region from S3 bucket: {}", region);
}
username = owner.getUsername();
credentials = owner.shareCredentials("s3guard");
dynamoDB = createDynamoDB(conf, region, bucket, credentials);
@ -325,7 +341,7 @@ public class DynamoDBMetadataStore implements MetadataStore {
initDataAccessRetries(conf);
// set up a full retry policy
invoker = new Invoker(new S3ARetryPolicy(conf),
invoker = new Invoker(new S3GuardDataAccessRetryPolicy(conf),
this::retryEvent
);
@ -334,6 +350,20 @@ public class DynamoDBMetadataStore implements MetadataStore {
instrumentation.initialized();
}
/**
* Declare that this table is owned by the specific S3A FS instance.
* This will bind some fields to the values provided by the owner,
* including wiring up the instrumentation.
* @param fs owner filesystem
*/
@VisibleForTesting
void bindToOwnerFilesystem(final S3AFileSystem fs) {
owner = fs;
conf = owner.getConf();
instrumentation = owner.getInstrumentation().getS3GuardInstrumentation();
username = owner.getUsername();
}
/**
* Performs one-time initialization of the metadata store via configuration.
*
@ -382,16 +412,23 @@ public class DynamoDBMetadataStore implements MetadataStore {
/**
* Set retry policy. This is driven by the value of
* {@link Constants#S3GUARD_DDB_MAX_RETRIES} with an exponential backoff
* between each attempt of {@link #MIN_RETRY_SLEEP_MSEC} milliseconds.
* between each attempt of {@link Constants#S3GUARD_DDB_THROTTLE_RETRY_INTERVAL}
* milliseconds.
* @param config configuration for data access
*/
private void initDataAccessRetries(Configuration config) {
int maxRetries = config.getInt(S3GUARD_DDB_MAX_RETRIES,
S3GUARD_DDB_MAX_RETRIES_DEFAULT);
dataAccessRetryPolicy = RetryPolicies
.exponentialBackoffRetry(maxRetries, MIN_RETRY_SLEEP_MSEC,
batchWriteRetryPolicy = RetryPolicies
.exponentialBackoffRetry(
config.getInt(S3GUARD_DDB_MAX_RETRIES,
S3GUARD_DDB_MAX_RETRIES_DEFAULT),
conf.getTimeDuration(S3GUARD_DDB_THROTTLE_RETRY_INTERVAL,
S3GUARD_DDB_THROTTLE_RETRY_INTERVAL_DEFAULT,
TimeUnit.MILLISECONDS),
TimeUnit.MILLISECONDS);
dataAccess = new Invoker(dataAccessRetryPolicy, this::retryEvent);
final RetryPolicy throttledRetryRetryPolicy
= new S3GuardDataAccessRetryPolicy(config);
readOp = new Invoker(throttledRetryRetryPolicy, this::readRetryEvent);
writeOp = new Invoker(throttledRetryRetryPolicy, this::writeRetryEvent);
}
@Override
@ -432,11 +469,17 @@ public class DynamoDBMetadataStore implements MetadataStore {
if (tombstone) {
Item item = PathMetadataDynamoDBTranslation.pathMetadataToItem(
new DDBPathMetadata(PathMetadata.tombstone(path)));
invoker.retry("Put tombstone", path.toString(), idempotent,
writeOp.retry(
"Put tombstone",
path.toString(),
idempotent,
() -> table.putItem(item));
} else {
PrimaryKey key = pathToKey(path);
invoker.retry("Delete key", path.toString(), idempotent,
writeOp.retry(
"Delete key",
path.toString(),
idempotent,
() -> table.deleteItem(key));
}
}
@ -460,28 +503,38 @@ public class DynamoDBMetadataStore implements MetadataStore {
}
}
@Retries.OnceRaw
private Item getConsistentItem(PrimaryKey key) {
/**
* Get a consistent view of an item.
* @param path path to look up in the database
* @param path entry
* @return the result
* @throws IOException failure
*/
@Retries.RetryTranslated
private Item getConsistentItem(final Path path) throws IOException {
PrimaryKey key = pathToKey(path);
final GetItemSpec spec = new GetItemSpec()
.withPrimaryKey(key)
.withConsistentRead(true); // strictly consistent read
return table.getItem(spec);
return readOp.retry("get",
path.toString(),
true,
() -> table.getItem(spec));
}
@Override
@Retries.OnceTranslated
@Retries.RetryTranslated
public DDBPathMetadata get(Path path) throws IOException {
return get(path, false);
}
@Override
@Retries.OnceTranslated
@Retries.RetryTranslated
public DDBPathMetadata get(Path path, boolean wantEmptyDirectoryFlag)
throws IOException {
checkPath(path);
LOG.debug("Get from table {} in region {}: {}", tableName, region, path);
return Invoker.once("get", path.toString(),
() -> innerGet(path, wantEmptyDirectoryFlag));
return innerGet(path, wantEmptyDirectoryFlag);
}
/**
@ -491,9 +544,8 @@ public class DynamoDBMetadataStore implements MetadataStore {
* MetadataStore that it should try to compute the empty directory flag.
* @return metadata for {@code path}, {@code null} if not found
* @throws IOException IO problem
* @throws AmazonClientException dynamo DB level problem
*/
@Retries.OnceRaw
@Retries.RetryTranslated
private DDBPathMetadata innerGet(Path path, boolean wantEmptyDirectoryFlag)
throws IOException {
final DDBPathMetadata meta;
@ -502,7 +554,7 @@ public class DynamoDBMetadataStore implements MetadataStore {
meta =
new DDBPathMetadata(makeDirStatus(username, path));
} else {
final Item item = getConsistentItem(pathToKey(path));
final Item item = getConsistentItem(path);
meta = itemToPathMetadata(item, username);
LOG.debug("Get from table {} in region {} returning for {}: {}",
tableName, region, path, meta);
@ -517,8 +569,10 @@ public class DynamoDBMetadataStore implements MetadataStore {
.withConsistentRead(true)
.withFilterExpression(IS_DELETED + " = :false")
.withValueMap(deleteTrackingValueMap);
final ItemCollection<QueryOutcome> items = table.query(spec);
boolean hasChildren = items.iterator().hasNext();
boolean hasChildren = readOp.retry("get/hasChildren",
path.toString(),
true,
() -> table.query(spec).iterator().hasNext());
// When this class has support for authoritative
// (fully-cached) directory listings, we may also be able to answer
// TRUE here. Until then, we don't know if we have full listing or
@ -545,13 +599,16 @@ public class DynamoDBMetadataStore implements MetadataStore {
}
@Override
@Retries.OnceTranslated
@Retries.RetryTranslated
public DirListingMetadata listChildren(final Path path) throws IOException {
checkPath(path);
LOG.debug("Listing table {} in region {}: {}", tableName, region, path);
// find the children in the table
return Invoker.once("listChildren", path.toString(),
return readOp.retry(
"listChildren",
path.toString(),
true,
() -> {
final QuerySpec spec = new QuerySpec()
.withHashKey(pathToParentKeyAttribute(path))
@ -610,7 +667,7 @@ public class DynamoDBMetadataStore implements MetadataStore {
}
@Override
@Retries.OnceTranslated
@Retries.RetryTranslated
public void move(Collection<Path> pathsToDelete,
Collection<PathMetadata> pathsToCreate) throws IOException {
if (pathsToDelete == null && pathsToCreate == null) {
@ -639,25 +696,25 @@ public class DynamoDBMetadataStore implements MetadataStore {
}
}
Invoker.once("move", tableName,
() -> processBatchWriteRequest(null, pathMetadataToItem(newItems)));
processBatchWriteRequest(null, pathMetadataToItem(newItems));
}
/**
* Helper method to issue a batch write request to DynamoDB.
*
* The retry logic here is limited to repeating the write operations
* until all items have been written; there is no other attempt
* at recovery/retry. Throttling is handled internally.
* As well as retrying on the operation invocation, incomplete
* batches are retried until all items have been written or deleted.
* @param keysToDelete primary keys to be deleted; can be null
* @param itemsToPut new items to be put; can be null
* @return the number of iterations needed to complete the call.
*/
@Retries.OnceRaw("Outstanding batch items are updated with backoff")
private void processBatchWriteRequest(PrimaryKey[] keysToDelete,
@Retries.RetryTranslated("Outstanding batch items are updated with backoff")
private int processBatchWriteRequest(PrimaryKey[] keysToDelete,
Item[] itemsToPut) throws IOException {
final int totalToDelete = (keysToDelete == null ? 0 : keysToDelete.length);
final int totalToPut = (itemsToPut == null ? 0 : itemsToPut.length);
int count = 0;
int batches = 0;
while (count < totalToDelete + totalToPut) {
final TableWriteItems writeItems = new TableWriteItems(tableName);
int numToDelete = 0;
@ -682,35 +739,66 @@ public class DynamoDBMetadataStore implements MetadataStore {
count += numToPut;
}
BatchWriteItemOutcome res = dynamoDB.batchWriteItem(writeItems);
// if there's a retry and another process updates things then it's not
// quite idempotent, but this was the case anyway
batches++;
BatchWriteItemOutcome res = writeOp.retry(
"batch write",
"",
true,
() -> dynamoDB.batchWriteItem(writeItems));
// Check for unprocessed keys in case of exceeding provisioned throughput
Map<String, List<WriteRequest>> unprocessed = res.getUnprocessedItems();
int retryCount = 0;
while (!unprocessed.isEmpty()) {
retryBackoff(retryCount++);
res = dynamoDB.batchWriteItemUnprocessed(unprocessed);
batchWriteCapacityExceededEvents.incrementAndGet();
batches++;
retryBackoffOnBatchWrite(retryCount++);
// use a different reference to keep the compiler quiet
final Map<String, List<WriteRequest>> upx = unprocessed;
res = writeOp.retry(
"batch write",
"",
true,
() -> dynamoDB.batchWriteItemUnprocessed(upx));
unprocessed = res.getUnprocessedItems();
}
}
return batches;
}
/**
* Put the current thread to sleep to implement exponential backoff
* depending on retryCount. If max retries are exceeded, throws an
* exception instead.
*
* @param retryCount number of retries so far
* @throws IOException when max retryCount is exceeded.
*/
private void retryBackoff(int retryCount) throws IOException {
private void retryBackoffOnBatchWrite(int retryCount) throws IOException {
try {
// Our RetryPolicy ignores everything but retryCount here.
RetryPolicy.RetryAction action = dataAccessRetryPolicy.shouldRetry(null,
RetryPolicy.RetryAction action = batchWriteRetryPolicy.shouldRetry(
null,
retryCount, 0, true);
if (action.action == RetryPolicy.RetryAction.RetryDecision.FAIL) {
throw new IOException(
String.format("Max retries exceeded (%d) for DynamoDB. This may be"
+ " because write threshold of DynamoDB is set too low.",
retryCount));
// Create an AWSServiceThrottledException, with a fake inner cause
// which we fill in to look like a real exception so
// error messages look sensible
AmazonServiceException cause = new AmazonServiceException(
"Throttling");
cause.setServiceName("S3Guard");
cause.setStatusCode(AWSServiceThrottledException.STATUS_CODE);
cause.setErrorCode(THROTTLING); // used in real AWS errors
cause.setErrorType(AmazonServiceException.ErrorType.Service);
cause.setErrorMessage(THROTTLING);
cause.setRequestId("n/a");
throw new AWSServiceThrottledException(
String.format("Max retries during batch write exceeded"
+ " (%d) for DynamoDB."
+ HINT_DDB_IOPS_TOO_LOW,
retryCount),
cause);
} else {
LOG.debug("Sleeping {} msec before next retry", action.delayMillis);
Thread.sleep(action.delayMillis);
@ -720,12 +808,12 @@ public class DynamoDBMetadataStore implements MetadataStore {
} catch (IOException e) {
throw e;
} catch (Exception e) {
throw new IOException("Unexpected exception", e);
throw new IOException("Unexpected exception " + e, e);
}
}
@Override
@Retries.OnceRaw
@Retries.RetryTranslated
public void put(PathMetadata meta) throws IOException {
// For a deeply nested path, this method will automatically create the full
// ancestry and save respective item in DynamoDB table.
@ -741,7 +829,7 @@ public class DynamoDBMetadataStore implements MetadataStore {
}
@Override
@Retries.OnceRaw
@Retries.RetryTranslated
public void put(Collection<PathMetadata> metas) throws IOException {
innerPut(pathMetaToDDBPathMeta(metas));
}
@ -757,8 +845,9 @@ public class DynamoDBMetadataStore implements MetadataStore {
/**
* Helper method to get full path of ancestors that are nonexistent in table.
*/
@Retries.OnceRaw
private Collection<DDBPathMetadata> fullPathsToPut(DDBPathMetadata meta)
@VisibleForTesting
@Retries.RetryTranslated
Collection<DDBPathMetadata> fullPathsToPut(DDBPathMetadata meta)
throws IOException {
checkPathMetadata(meta);
final Collection<DDBPathMetadata> metasToPut = new ArrayList<>();
@ -771,7 +860,7 @@ public class DynamoDBMetadataStore implements MetadataStore {
// first existent ancestor
Path path = meta.getFileStatus().getPath().getParent();
while (path != null && !path.isRoot()) {
final Item item = getConsistentItem(pathToKey(path));
final Item item = getConsistentItem(path);
if (!itemExists(item)) {
final FileStatus status = makeDirStatus(path, username);
metasToPut.add(new DDBPathMetadata(status, Tristate.FALSE, false,
@ -810,7 +899,7 @@ public class DynamoDBMetadataStore implements MetadataStore {
* @throws IOException IO problem
*/
@Override
@Retries.OnceTranslated("retry(listFullPaths); once(batchWrite)")
@Retries.RetryTranslated
public void put(DirListingMetadata meta) throws IOException {
LOG.debug("Saving to table {} in region {}: {}", tableName, region, meta);
@ -821,15 +910,12 @@ public class DynamoDBMetadataStore implements MetadataStore {
false, meta.isAuthoritative());
// First add any missing ancestors...
final Collection<DDBPathMetadata> metasToPut = invoker.retry(
"paths to put", path.toString(), true,
() -> fullPathsToPut(ddbPathMeta));
final Collection<DDBPathMetadata> metasToPut = fullPathsToPut(ddbPathMeta);
// next add all children of the directory
metasToPut.addAll(pathMetaToDDBPathMeta(meta.getListing()));
Invoker.once("put", path.toString(),
() -> processBatchWriteRequest(null, pathMetadataToItem(metasToPut)));
processBatchWriteRequest(null, pathMetadataToItem(metasToPut));
}
@Override
@ -847,10 +933,10 @@ public class DynamoDBMetadataStore implements MetadataStore {
closeAutocloseables(LOG, credentials);
credentials = null;
}
}
}
@Override
@Retries.OnceTranslated
@Retries.RetryTranslated
public void destroy() throws IOException {
if (table == null) {
LOG.info("In destroy(): no table to delete");
@ -859,10 +945,11 @@ public class DynamoDBMetadataStore implements MetadataStore {
LOG.info("Deleting DynamoDB table {} in region {}", tableName, region);
Preconditions.checkNotNull(dynamoDB, "Not connected to DynamoDB");
try {
table.delete();
invoker.retry("delete", null, true,
() -> table.delete());
table.waitForDelete();
} catch (ResourceNotFoundException rnfe) {
LOG.info("ResourceNotFoundException while deleting DynamoDB table {} in "
} catch (FileNotFoundException rnfe) {
LOG.info("FileNotFoundException while deleting DynamoDB table {} in "
+ "region {}. This may indicate that the table does not exist, "
+ "or has been deleted by another concurrent thread or process.",
tableName, region);
@ -872,39 +959,50 @@ public class DynamoDBMetadataStore implements MetadataStore {
tableName, ie);
throw new InterruptedIOException("Table " + tableName
+ " in region " + region + " has not been deleted");
} catch (AmazonClientException e) {
throw translateException("destroy", tableName, e);
}
}
@Retries.OnceRaw
@Retries.RetryTranslated
private ItemCollection<ScanOutcome> expiredFiles(long modTime,
String keyPrefix) {
String keyPrefix) throws IOException {
String filterExpression =
"mod_time < :mod_time and begins_with(parent, :parent)";
String projectionExpression = "parent,child";
ValueMap map = new ValueMap()
.withLong(":mod_time", modTime)
.withString(":parent", keyPrefix);
return table.scan(filterExpression, projectionExpression, null, map);
return readOp.retry(
"scan",
keyPrefix,
true,
() -> table.scan(filterExpression, projectionExpression, null, map));
}
@Override
@Retries.OnceRaw("once(batchWrite)")
@Retries.RetryTranslated
public void prune(long modTime) throws IOException {
prune(modTime, "/");
}
/**
* Prune files, in batches. There's a sleep between each batch.
* @param modTime Oldest modification time to allow
* @param keyPrefix The prefix for the keys that should be removed
* @throws IOException Any IO/DDB failure.
* @throws InterruptedIOException if the prune was interrupted
*/
@Override
@Retries.OnceRaw("once(batchWrite)")
@Retries.RetryTranslated
public void prune(long modTime, String keyPrefix) throws IOException {
int itemCount = 0;
try {
Collection<Path> deletionBatch =
new ArrayList<>(S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT);
int delay = conf.getInt(S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_KEY,
S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_DEFAULT);
Set<Path> parentPathSet = new HashSet<>();
long delay = conf.getTimeDuration(
S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_KEY,
S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_DEFAULT,
TimeUnit.MILLISECONDS);
Set<Path> parentPathSet = new HashSet<>();
for (Item item : expiredFiles(modTime, keyPrefix)) {
DDBPathMetadata md = PathMetadataDynamoDBTranslation
.itemToPathMetadata(item, username);
@ -929,7 +1027,8 @@ public class DynamoDBMetadataStore implements MetadataStore {
deletionBatch.clear();
}
}
if (deletionBatch.size() > 0) {
// final batch of deletes
if (!deletionBatch.isEmpty()) {
Thread.sleep(delay);
processBatchWriteRequest(pathToKey(deletionBatch), null);
@ -1093,19 +1192,34 @@ public class DynamoDBMetadataStore implements MetadataStore {
* Get the version mark item in the existing DynamoDB table.
*
* As the version marker item may be created by another concurrent thread or
* process, we sleep and retry a limited times before we fail to get it.
* This does not include handling any failure other than "item not found",
* so this method is tagged as "OnceRaw"
* process, we sleep and retry a limited number of times if the lookup returns
* with a null value.
* DDB throttling is always retried.
*/
@Retries.OnceRaw
private Item getVersionMarkerItem() throws IOException {
@VisibleForTesting
@Retries.RetryTranslated
Item getVersionMarkerItem() throws IOException {
final PrimaryKey versionMarkerKey =
createVersionMarkerPrimaryKey(VERSION_MARKER);
int retryCount = 0;
Item versionMarker = table.getItem(versionMarkerKey);
// look for a version marker, with usual throttling/failure retries.
Item versionMarker = queryVersionMarker(versionMarkerKey);
while (versionMarker == null) {
// The marker was null.
// Two possibilities
// 1. This isn't a S3Guard table.
// 2. This is a S3Guard table in construction; another thread/process
// is about to write/actively writing the version marker.
// So that state #2 is handled, batchWriteRetryPolicy is used to manage
// retries.
// This will mean that if the cause is actually #1, failure will not
// be immediate. As this will ultimately result in a failure to
// init S3Guard and the S3A FS, this isn't going to be a performance
// bottleneck; simply a slightly slower failure report than would otherwise
// be seen.
// "if your settings are broken, performance is not your main issue"
try {
RetryPolicy.RetryAction action = dataAccessRetryPolicy.shouldRetry(null,
RetryPolicy.RetryAction action = batchWriteRetryPolicy.shouldRetry(null,
retryCount, 0, true);
if (action.action == RetryPolicy.RetryAction.RetryDecision.FAIL) {
break;
@ -1114,14 +1228,29 @@ public class DynamoDBMetadataStore implements MetadataStore {
Thread.sleep(action.delayMillis);
}
} catch (Exception e) {
throw new IOException("initTable: Unexpected exception", e);
throw new IOException("initTable: Unexpected exception " + e, e);
}
retryCount++;
versionMarker = table.getItem(versionMarkerKey);
versionMarker = queryVersionMarker(versionMarkerKey);
}
return versionMarker;
}
/**
* Issue the query to get the version marker, with throttling for overloaded
* DDB tables.
* @param versionMarkerKey key to look up
* @return the marker
* @throws IOException failure
*/
@Retries.RetryTranslated
private Item queryVersionMarker(final PrimaryKey versionMarkerKey)
throws IOException {
return readOp.retry("getVersionMarkerItem",
VERSION_MARKER, true,
() -> table.getItem(versionMarkerKey));
}
/**
* Verify that a table version is compatible with this S3Guard client.
* @param tableName name of the table (for error messages)
@ -1207,7 +1336,7 @@ public class DynamoDBMetadataStore implements MetadataStore {
* @return the outcome.
*/
@Retries.OnceRaw
PutItemOutcome putItem(Item item) {
private PutItemOutcome putItem(Item item) {
LOG.debug("Putting item {}", item);
return table.putItem(item);
}
@ -1254,6 +1383,11 @@ public class DynamoDBMetadataStore implements MetadataStore {
return region;
}
@VisibleForTesting
public String getTableName() {
return tableName;
}
@VisibleForTesting
DynamoDB getDynamoDB() {
return dynamoDB;
@ -1312,8 +1446,8 @@ public class DynamoDBMetadataStore implements MetadataStore {
}
map.put("description", DESCRIPTION);
map.put("region", region);
if (dataAccessRetryPolicy != null) {
map.put("retryPolicy", dataAccessRetryPolicy.toString());
if (batchWriteRetryPolicy != null) {
map.put("retryPolicy", batchWriteRetryPolicy.toString());
}
return map;
}
@ -1368,6 +1502,38 @@ public class DynamoDBMetadataStore implements MetadataStore {
}
}
/**
* Callback on a read operation retried.
* @param text text of the operation
* @param ex exception
* @param attempts number of attempts
* @param idempotent is the method idempotent (this is assumed to be true)
*/
void readRetryEvent(
String text,
IOException ex,
int attempts,
boolean idempotent) {
readThrottleEvents.incrementAndGet();
retryEvent(text, ex, attempts, true);
}
/**
* Callback on a write operation retried.
* @param text text of the operation
* @param ex exception
* @param attempts number of attempts
* @param idempotent is the method idempotent (this is assumed to be true)
*/
void writeRetryEvent(
String text,
IOException ex,
int attempts,
boolean idempotent) {
writeThrottleEvents.incrementAndGet();
retryEvent(text, ex, attempts, idempotent);
}
/**
* Callback from {@link Invoker} when an operation is retried.
* @param text text of the operation
@ -1410,4 +1576,31 @@ public class DynamoDBMetadataStore implements MetadataStore {
}
}
/**
* Get the count of read throttle events.
* @return the current count of read throttle events.
*/
@VisibleForTesting
public long getReadThrottleEventCount() {
return readThrottleEvents.get();
}
/**
* Get the count of write throttle events.
* @return the current count of write throttle events.
*/
@VisibleForTesting
public long getWriteThrottleEventCount() {
return writeThrottleEvents.get();
}
@VisibleForTesting
public long getBatchWriteCapacityExceededCount() {
return batchWriteCapacityExceededEvents.get();
}
@VisibleForTesting
public Invoker getInvoker() {
return invoker;
}
}

View File

@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a.s3guard;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.s3a.S3ARetryPolicy;
import org.apache.hadoop.io.retry.RetryPolicy;
import static org.apache.hadoop.fs.s3a.Constants.*;
import static org.apache.hadoop.io.retry.RetryPolicies.exponentialBackoffRetry;
/**
* A Retry policy whose throttling comes from the S3Guard config options.
*/
public class S3GuardDataAccessRetryPolicy extends S3ARetryPolicy {
public S3GuardDataAccessRetryPolicy(final Configuration conf) {
super(conf);
}
protected RetryPolicy createThrottleRetryPolicy(final Configuration conf) {
return exponentialBackoffRetry(
conf.getInt(S3GUARD_DDB_MAX_RETRIES, S3GUARD_DDB_MAX_RETRIES_DEFAULT),
conf.getTimeDuration(S3GUARD_DDB_THROTTLE_RETRY_INTERVAL,
S3GUARD_DDB_THROTTLE_RETRY_INTERVAL_DEFAULT,
TimeUnit.MILLISECONDS),
TimeUnit.MILLISECONDS);
}
}
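For context, this is the policy which `DynamoDBMetadataStore.initDataAccessRetries()` wires into its `readOp` and `writeOp` invokers. A minimal sketch of the same wiring outside the metadata store, assuming a Hadoop `Configuration` and a DynamoDB `Table` handle are already available (the class name and the logging callback are illustrative, not part of this patch):
```java
import java.io.IOException;

import com.amazonaws.services.dynamodbv2.document.Item;
import com.amazonaws.services.dynamodbv2.document.PrimaryKey;
import com.amazonaws.services.dynamodbv2.document.Table;
import com.amazonaws.services.dynamodbv2.document.spec.GetItemSpec;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.s3a.Invoker;
import org.apache.hadoop.fs.s3a.s3guard.S3GuardDataAccessRetryPolicy;

/** Illustrative wrapper: a single throttle-aware DynamoDB read. */
public class ThrottledReadExample {

  /**
   * Read one item with the S3Guard throttle retry policy.
   * Reads are idempotent, so retries are always permitted.
   */
  public static Item getWithRetries(Configuration conf, Table table,
      PrimaryKey key) throws IOException {
    // The callback is where DynamoDBMetadataStore.readRetryEvent() would
    // update its throttle counters; here it just logs the retry.
    Invoker readOp = new Invoker(new S3GuardDataAccessRetryPolicy(conf),
        (text, e, retries, idempotent) ->
            System.out.println("retry " + retries + " of " + text + ": " + e));
    GetItemSpec spec = new GetItemSpec()
        .withPrimaryKey(key)
        .withConsistentRead(true);    // strictly consistent read
    return readOp.retry("get", key.toString(), true, () -> table.getItem(spec));
  }
}
```
In the metadata store itself the callbacks are `readRetryEvent()`/`writeRetryEvent()`, which is how the new throttle statistics are updated on each retry.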

View File

@ -298,8 +298,9 @@ rates.
</property>
```
Attempting to perform more IO than the capacity requested simply throttles the
IO; small capacity numbers are recommended when initially experimenting
Attempting to perform more IO than the capacity requested throttles the
IO, and may result in operations failing. Larger IO capacities cost more.
We recommend using small read and write capacities when initially experimenting
with S3Guard.
## Authenticating with S3Guard
@ -327,7 +328,7 @@ to the options `fs.s3a.KEY` *for that bucket only*.
As an example, here is a configuration to use different metadata stores
and tables for different buckets
First, we define shortcuts for the metadata store classnames
First, we define shortcuts for the metadata store classnames:
```xml
@ -343,7 +344,7 @@ First, we define shortcuts for the metadata store classnames
```
Next, Amazon's public landsat database is configured with no
metadata store
metadata store:
```xml
<property>
@ -355,7 +356,7 @@ metadata store
```
Next the `ireland-2` and `ireland-offline` buckets are configured with
DynamoDB as the store, and a shared table `production-table`
DynamoDB as the store, and a shared table `production-table`:
```xml
@ -716,10 +717,10 @@ Metadata Store Diagnostics:
```
After the update, the table status changes to `UPDATING`; this is a sign that
the capacity has been changed
the capacity has been changed.
Repeating the same command will not change the capacity, as both read and
write values match that already in use
write values match that already in use.
```
2017-08-30 16:24:35,337 [main] INFO s3guard.DynamoDBMetadataStore (DynamoDBMetadataStore.java:updateParameters(1090)) - Table capacity unchanged at read: 20, write: 20
@ -736,6 +737,9 @@ Metadata Store Diagnostics:
write-capacity=20
```
*Note*: There is a limit to how many times in a 24 hour period the capacity
of a table can be changed, either through this command or the AWS console.
## Debugging and Error Handling
If you run into network connectivity issues, or have a machine failure in the
@ -817,6 +821,97 @@ are only made after successful file creation, deletion and rename, the
store is *unlikely* to get out of sync, it is still something which
merits more testing before it could be considered reliable.
## Managing DynamoDB IO Capacity
DynamoDB is not only billed on use (data and IO requests), it is billed
on allocated IO Capacity.
When an application makes more requests than
the allocated capacity permits, the request is rejected; it is up to
the calling application to detect when it is being so throttled and
react. S3Guard does this, but as a result: when the client is being
throttled, operations are slower. This capacity throttling is averaged
over a few minutes: a briefly overloaded table will not be throttled,
but the rate cannot be sustained.
The load on a table is visible in the AWS console: go to the
DynamoDB page for the table and select the "metrics" tab.
If the graphs of throttled read or write
requests show that a lot of throttling has taken place, then there is not
enough allocated capacity for the applications making use of the table.
Similarly, if the capacity graphs show that the read or write loads are
low compared to the allocated capacities, then the table *may* be overprovisioned
for the current workload.
The S3Guard connector to DynamoDB can be configured to make
multiple attempts to repeat a throttled request, with an exponential
backoff between them.
The relevant settings for managing retries in the connector are:
```xml
<property>
<name>fs.s3a.s3guard.ddb.max.retries</name>
<value>9</value>
<description>
Max retries on throttled/incomplete DynamoDB operations
before giving up and throwing an IOException.
Each retry is delayed with an exponential
backoff timer which starts at 100 milliseconds and approximately
doubles each time. The minimum wait before throwing an exception is
sum(100, 200, 400, 800, .. 100*2^N-1 ) == 100 * ((2^N)-1)
</description>
</property>
<property>
<name>fs.s3a.s3guard.ddb.throttle.retry.interval</name>
<value>100ms</value>
<description>
Initial interval to retry after a request is throttled;
the back-off policy is exponential until the number of retries in
fs.s3a.s3guard.ddb.max.retries is reached.
</description>
</property>
<property>
<name>fs.s3a.s3guard.ddb.background.sleep</name>
<value>25ms</value>
<description>
Length (in milliseconds) of pause between each batch of deletes when
pruning metadata. Prevents prune operations (which can typically be low
priority background operations) from overly interfering with other I/O
operations.
</description>
</property>
```
Having a large value for `fs.s3a.s3guard.ddb.max.retries` will ensure
that clients of an overloaded table will not fail immediately. However,
queries may be unexpectedly slow.
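To see what the worst case looks like for a given configuration, the cumulative delay can be worked out directly from the two settings above. A small illustrative calculation (the class and method names are ours, not part of S3A):
```java
public class ThrottleBackoffMath {

  /**
   * Lower bound on the total time a fully throttled operation blocks
   * before failing: interval + 2*interval + ... == interval * (2^retries - 1).
   */
  static long minimumBlockingMillis(long initialIntervalMillis, int maxRetries) {
    long total = 0;
    long delay = initialIntervalMillis;
    for (int i = 0; i < maxRetries; i++) {
      total += delay;   // 100 + 200 + 400 + ...
      delay *= 2;       // the backoff approximately doubles each time
    }
    return total;
  }

  public static void main(String[] args) {
    // Defaults: fs.s3a.s3guard.ddb.throttle.retry.interval = 100ms,
    // fs.s3a.s3guard.ddb.max.retries = 9 => 51,100 ms, i.e. ~51 seconds
    // of blocking before the operation finally fails with an IOException.
    System.out.println(minimumBlockingMillis(100, 9));
  }
}
```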
If operations, especially directory operations, are slow, check the AWS
console. It is also possible to set up AWS alerts for capacity limits
being exceeded.
[DynamoDB Auto Scaling](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/AutoScaling.html)
can automatically increase and decrease the allocated capacity.
This is good for keeping capacity high when needed, but avoiding large
bills when it is not.
Experiments with S3Guard and DynamoDB Auto Scaling have shown that any Auto Scaling
operation will only take place after callers have been throttled for a period of
time. The clients will still need to be configured to retry when overloaded
until any extra capacity is allocated. Furthermore, as this retrying will
block the threads from performing other operations (including more IO),
the autoscaling may not scale fast enough.
We recommend experimenting with this, based on usage information collected
from previous days, and choosing a combination of
retry counts and an interval which allows the clients to cope with
some throttling, but does not cause other applications to time out.
## Troubleshooting
### Error: `S3Guard table lacks version marker.`
@ -857,12 +952,49 @@ or the configuration is preventing S3Guard from finding the table.
region as the bucket being used.
1. Create the table if necessary.
### Error `"The level of configured provisioned throughput for the table was exceeded"`
```
org.apache.hadoop.fs.s3a.AWSServiceThrottledException: listFiles on s3a://bucket/10/d1/d2/d3:
com.amazonaws.services.dynamodbv2.model.ProvisionedThroughputExceededException:
The level of configured provisioned throughput for the table was exceeded.
Consider increasing your provisioning level with the UpdateTable API.
(Service: AmazonDynamoDBv2; Status Code: 400;
Error Code: ProvisionedThroughputExceededException;
```
The IO load of clients of the (shared) DynamoDB table was exceeded.
Currently S3Guard doesn't do any throttling and retries here; the way to address
this is to increase capacity via the AWS console or the `set-capacity` command.
1. Increase the capacity of the DynamoDB table.
1. Increase the retry count and/or sleep time of S3Guard on throttle events.
1. Enable capacity autoscaling for the table in the AWS console.
### Error `Max retries exceeded`
The I/O load of clients of the (shared) DynamoDB table was exceeded, and
the number of attempts to retry the operation exceeded the configured amount.
1. Increase the capacity of the DynamoDB table.
1. Increase the retry count and/or sleep time of S3Guard on throttle events.
1. Enable capacity autoscaling for the table in the AWS console.
### Error when running `set-capacity`: `org.apache.hadoop.fs.s3a.AWSServiceThrottledException: ProvisionTable`
```
org.apache.hadoop.fs.s3a.AWSServiceThrottledException: ProvisionTable on s3guard-example:
com.amazonaws.services.dynamodbv2.model.LimitExceededException:
Subscriber limit exceeded: Provisioned throughput decreases are limited within a given UTC day.
After the first 4 decreases, each subsequent decrease in the same UTC day can be performed at most once every 3600 seconds.
Number of decreases today: 6.
Last decrease at Wednesday, July 25, 2018 8:48:14 PM UTC.
Next decrease can be made at Wednesday, July 25, 2018 9:48:14 PM UTC
```
There's a limit on how often you can change the capacity of a DynamoDB table;
if you call `set-capacity` too often, it fails. Wait until after the time indicated
and try again.
## Other Topics

View File

@ -742,6 +742,54 @@ sequential one afterwards. The IO heavy ones must also be subclasses of
This is invaluable for debugging test failures.
### Keeping AWS Costs down
Most of the base S3 tests are designed to use public AWS data
(the landsat-pds bucket) for read IO, so you don't have to pay for bytes
downloaded or long term storage costs. The scale tests do work with more data
so will cost more as well as generally take more time to execute.
You are however billed for
1. Data left in S3 after test runs.
2. DynamoDB capacity reserved by S3Guard tables.
3. HTTP operations on files (HEAD, LIST, GET).
4. In-progress multipart uploads from bulk IO or S3A committer tests.
5. Encryption/decryption using AWS KMS keys.
The GET/decrypt costs are incurred on each partial read of a file,
so random IO can cost more than sequential IO; the speedup of queries with
columnar data usually justifies this.
The DynamoDB costs come from the number of entries stored and the allocated capacity.
How to keep costs down:
* Don't run the scale tests with large datasets; keep `fs.s3a.scale.test.huge.filesize` unset, or a few MB (minimum: 5).
* Remove all files in the filesystem. The root tests usually do this, but
it can be manually done:
hadoop fs -rm -r -f -skipTrash s3a://test-bucket/\*
* Abort all outstanding uploads:
hadoop s3guard uploads -abort -force s3a://test-bucket/
* If you don't need it, destroy the S3Guard DDB table.
hadoop s3guard destroy s3a://hwdev-steve-ireland-new/
The S3Guard tests will automatically create the DynamoDB table in runs with
`-Ds3guard -Ddynamodb` set; the default capacity of these test tables
is very small; it keeps costs down at the expense of IO performance
and, for test runs in or near the S3/DDB stores, throttling events.
If you want to manage capacity, use `s3guard set-capacity` to increase it
(performance) or decrease it (costs).
For remote `hadoop-aws` test runs, the read/write capacities of "10" each should suffice;
increase it if parallel test run logs warn of throttling.
Tip: for agility, use DynamoDB autoscaling, setting the minimum to something very low (e.g. 5 units) and the maximum to the largest amount you are willing to pay.
This will automatically reduce capacity when you are not running tests against
the bucket, and slowly increase it over multiple test runs if the load justifies it.
## <a name="tips"></a> Tips
@ -985,9 +1033,13 @@ are included in the scale tests executed when `-Dscale` is passed to
the maven command line.
The two S3Guard scale tests are `ITestDynamoDBMetadataStoreScale` and
`ITestLocalMetadataStoreScale`. To run the DynamoDB test, you will need to
define your table name and region in your test configuration. For example,
the following settings allow us to run `ITestDynamoDBMetadataStoreScale` with
`ITestLocalMetadataStoreScale`.
To run these tests, your DynamoDB table needs to be of limited capacity;
the values in `ITestDynamoDBMetadataStoreScale` currently require a read capacity
of 10 or less, and a write capacity of 15 or less.
The following settings allow us to run `ITestDynamoDBMetadataStoreScale` with
artificially low read and write capacity provisioned, so we can judge the
effects of being throttled by the DynamoDB service:
@ -1008,24 +1060,49 @@ effects of being throttled by the DynamoDB service:
<name>fs.s3a.s3guard.ddb.table</name>
<value>my-scale-test</value>
</property>
<property>
<name>fs.s3a.s3guard.ddb.region</name>
<value>us-west-2</value>
</property>
<property>
<name>fs.s3a.s3guard.ddb.table.create</name>
<value>true</value>
</property>
<property>
<name>fs.s3a.s3guard.ddb.table.capacity.read</name>
<value>10</value>
<value>5</value>
</property>
<property>
<name>fs.s3a.s3guard.ddb.table.capacity.write</name>
<value>10</value>
<value>5</value>
</property>
```
These tests verify that the invoked operations can trigger retries in the
S3Guard code, rather than just at the AWS SDK level, showing that if
SDK operations fail, they are retried. They also verify that the filesystem
statistics are updated to record that throttling took place.
*Do not panic if these tests fail to detect throttling!*
These tests are unreliable as they need certain conditions to be met
to repeatedly fail:
1. You must have a low-enough latency connection to the DynamoDB store that,
for the capacity allocated, you can overload it.
1. The AWS Console can give you a view of what is happening here.
1. Running a single test on its own is less likely to trigger an overload
than trying to run the whole test suite.
1. And running the test suite more than once, back-to-back, can also help
overload the cluster.
1. Stepping through with a debugger will reduce load, so may not trigger
failures.
If the tests fail, it *probably* just means you aren't putting enough load
on the table.
These tests do not verify that the entire set of DynamoDB calls made
during the use of a S3Guarded S3A filesystem are wrapped by retry logic.
*The best way to verify resilience is to run the entire `hadoop-aws` test suite,
or even a real application, with throttling enabled.*
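For readers writing their own checks, here is a sketch of the kind of probe these tests perform, using the `@VisibleForTesting` throttle counters added by this patch (the class name, thread counts and path are illustrative; the counters only move if the table is genuinely overloaded):
```java
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3a.s3guard.DynamoDBMetadataStore;

/** Illustrative probe: hammer a low-capacity table, then read the counters. */
public final class ThrottleProbe {

  public static void probe(DynamoDBMetadataStore ms, Path path,
      int threads, int operationsPerThread) throws Exception {
    ExecutorService pool = Executors.newFixedThreadPool(threads);
    for (int t = 0; t < threads; t++) {
      pool.submit(() -> {
        for (int i = 0; i < operationsPerThread; i++) {
          ms.get(path);   // each read is retried internally when throttled
        }
        return null;
      });
    }
    pool.shutdown();
    pool.awaitTermination(10, TimeUnit.MINUTES);
    // These counters only increase if DynamoDB actually throttled the requests.
    System.out.println("read throttle events: "
        + ms.getReadThrottleEventCount());
    System.out.println("write throttle events: "
        + ms.getWriteThrottleEventCount());
    System.out.println("batch write capacity exceeded events: "
        + ms.getBatchWriteCapacityExceededCount());
  }
}
```
The scale tests in this patch wrap the same idea in a `ThrottleTracker` helper and assert on the results.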
### Testing only: Local Metadata Store
There is an in-memory Metadata Store for testing.

View File

@ -54,6 +54,11 @@ public class ITestS3AFileSystemContract extends FileSystemContractBaseTest {
Thread.currentThread().setName("JUnit-" + methodName.getMethodName());
}
@Override
protected int getGlobalTimeout() {
return S3ATestConstants.S3A_TEST_TIMEOUT;
}
@Before
public void setUp() throws Exception {
nameThread();

View File

@ -95,7 +95,7 @@ public class ITestDynamoDBMetadataStore extends MetadataStoreTestBase {
private static DynamoDBMetadataStore ddbmsStatic;
private static String TEST_DYNAMODB_TABLE_NAME;
private static String testDynamoDBTableName;
/**
* Create a path under the test path provided by
@ -113,8 +113,8 @@ public class ITestDynamoDBMetadataStore extends MetadataStoreTestBase {
Configuration conf = prepareTestConfiguration(new Configuration());
assertThatDynamoMetadataStoreImpl(conf);
Assume.assumeTrue("Test DynamoDB table name should be set to run "
+ "integration tests.", TEST_DYNAMODB_TABLE_NAME != null);
conf.set(S3GUARD_DDB_TABLE_NAME_KEY, TEST_DYNAMODB_TABLE_NAME);
+ "integration tests.", testDynamoDBTableName != null);
conf.set(S3GUARD_DDB_TABLE_NAME_KEY, testDynamoDBTableName);
s3AContract = new S3AContract(conf);
s3AContract.init();
@ -140,10 +140,10 @@ public class ITestDynamoDBMetadataStore extends MetadataStoreTestBase {
public static void beforeClassSetup() throws IOException {
Configuration conf = prepareTestConfiguration(new Configuration());
assertThatDynamoMetadataStoreImpl(conf);
TEST_DYNAMODB_TABLE_NAME = conf.get(S3GUARD_DDB_TEST_TABLE_NAME_KEY);
testDynamoDBTableName = conf.get(S3GUARD_DDB_TEST_TABLE_NAME_KEY);
Assume.assumeTrue("Test DynamoDB table name should be set to run "
+ "integration tests.", TEST_DYNAMODB_TABLE_NAME != null);
conf.set(S3GUARD_DDB_TABLE_NAME_KEY, TEST_DYNAMODB_TABLE_NAME);
+ "integration tests.", testDynamoDBTableName != null);
conf.set(S3GUARD_DDB_TABLE_NAME_KEY, testDynamoDBTableName);
LOG.debug("Creating static ddbms which will be shared between tests.");
ddbmsStatic = new DynamoDBMetadataStore();

View File

@ -18,46 +18,176 @@
package org.apache.hadoop.fs.s3a.s3guard;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nullable;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import com.amazonaws.services.dynamodbv2.document.DynamoDB;
import com.amazonaws.services.dynamodbv2.document.Table;
import com.amazonaws.services.dynamodbv2.model.ProvisionedThroughputDescription;
import org.junit.FixMethodOrder;
import org.junit.Test;
import org.junit.internal.AssumptionViolatedException;
import org.junit.runners.MethodSorters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.StorageStatistics;
import org.apache.hadoop.fs.contract.ContractTestUtils;
import org.apache.hadoop.fs.s3a.AWSServiceThrottledException;
import org.apache.hadoop.fs.s3a.S3AFileStatus;
import org.apache.hadoop.fs.s3a.S3AFileSystem;
import org.apache.hadoop.fs.s3a.S3AStorageStatistics;
import org.apache.hadoop.fs.s3a.Statistic;
import org.apache.hadoop.fs.s3a.scale.AbstractITestS3AMetadataStoreScale;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.LambdaTestUtils;
import static org.apache.hadoop.fs.s3a.s3guard.MetadataStoreTestBase.basicFileStatus;
import static org.apache.hadoop.fs.s3a.Constants.*;
import static org.apache.hadoop.fs.s3a.s3guard.MetadataStoreTestBase.basicFileStatus;
import static org.junit.Assume.*;
/**
* Scale test for DynamoDBMetadataStore.
*
* The throttle tests aren't quite trying to verify that throttling can
* be recovered from, because that makes for very slow tests: you have
* to overload the system and then have them back off until they finally complete.
* Instead
*/
@FixMethodOrder(MethodSorters.NAME_ASCENDING)
public class ITestDynamoDBMetadataStoreScale
extends AbstractITestS3AMetadataStoreScale {
private static final long BATCH_SIZE = 25;
private static final long SMALL_IO_UNITS = BATCH_SIZE / 4;
private static final Logger LOG = LoggerFactory.getLogger(
ITestDynamoDBMetadataStoreScale.class);
private static final long BATCH_SIZE = 25;
/**
* IO Units for batch size; this sets the size to use for IO capacity.
* Value: {@value}.
*/
private static final long MAXIMUM_READ_CAPACITY = 10;
private static final long MAXIMUM_WRITE_CAPACITY = 15;
private DynamoDBMetadataStore ddbms;
private DynamoDB ddb;
private Table table;
private String tableName;
/** was the provisioning changed in test_001_limitCapacity()? */
private boolean isOverProvisionedForTest;
private ProvisionedThroughputDescription originalCapacity;
private static final int THREADS = 40;
private static final int OPERATIONS_PER_THREAD = 50;
/**
* Create the metadata store. The table and region are determined from
* the attributes of the FS used in the tests.
* @return a new metadata store instance
* @throws IOException failure to instantiate
* @throws AssumptionViolatedException if the FS isn't running S3Guard + DDB.
*/
@Override
public MetadataStore createMetadataStore() throws IOException {
Configuration conf = getFileSystem().getConf();
String ddbTable = conf.get(S3GUARD_DDB_TABLE_NAME_KEY);
assumeNotNull("DynamoDB table is configured", ddbTable);
String ddbEndpoint = conf.get(S3GUARD_DDB_REGION_KEY);
assumeNotNull("DynamoDB endpoint is configured", ddbEndpoint);
S3AFileSystem fs = getFileSystem();
assumeTrue("S3Guard is disabled for " + fs.getUri(),
fs.hasMetadataStore());
MetadataStore store = fs.getMetadataStore();
assumeTrue("Metadata store for " + fs.getUri() + " is " + store
+ " -not DynamoDBMetadataStore",
store instanceof DynamoDBMetadataStore);
DynamoDBMetadataStore fsStore = (DynamoDBMetadataStore) store;
Configuration conf = new Configuration(fs.getConf());
tableName = fsStore.getTableName();
assertTrue("Null/Empty tablename in " + fsStore,
StringUtils.isNotEmpty(tableName));
String region = fsStore.getRegion();
assertTrue("Null/Empty region in " + fsStore,
StringUtils.isNotEmpty(region));
// create a new metastore configured to fail fast if throttling
// happens.
conf.set(S3GUARD_DDB_TABLE_NAME_KEY, tableName);
conf.set(S3GUARD_DDB_REGION_KEY, region);
conf.set(S3GUARD_DDB_THROTTLE_RETRY_INTERVAL, "50ms");
conf.set(S3GUARD_DDB_MAX_RETRIES, "2");
conf.set(MAX_ERROR_RETRIES, "1");
conf.set(S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_KEY, "5ms");
DynamoDBMetadataStore ms = new DynamoDBMetadataStore();
ms.initialize(getFileSystem().getConf());
ms.initialize(conf);
// wire up the owner FS so that we can make assertions about throttle
// events
ms.bindToOwnerFilesystem(fs);
return ms;
}
@Override
public void setup() throws Exception {
super.setup();
ddbms = (DynamoDBMetadataStore) createMetadataStore();
tableName = ddbms.getTableName();
assertNotNull("table has no name", tableName);
ddb = ddbms.getDynamoDB();
table = ddb.getTable(tableName);
originalCapacity = table.describe().getProvisionedThroughput();
// If you set the same provisioned I/O as already set it throws an
// exception, avoid that.
isOverProvisionedForTest = (
originalCapacity.getReadCapacityUnits() > MAXIMUM_READ_CAPACITY
|| originalCapacity.getWriteCapacityUnits() > MAXIMUM_WRITE_CAPACITY);
assumeFalse("Table has too much capacity: " + originalCapacity.toString(),
isOverProvisionedForTest);
}
@Override
public void teardown() throws Exception {
IOUtils.cleanupWithLogger(LOG, ddbms);
super.teardown();
}
/**
* The subclass expects the superclass to be throttled; sometimes it is.
*/
@Test
@Override
public void test_020_Moves() throws Throwable {
ThrottleTracker tracker = new ThrottleTracker();
try {
// if this doesn't throttle, all is well.
super.test_020_Moves();
} catch (AWSServiceThrottledException ex) {
      // if the service was throttled, we expect the exception text
GenericTestUtils.assertExceptionContains(
DynamoDBMetadataStore.HINT_DDB_IOPS_TOO_LOW,
ex,
"Expected throttling message");
} finally {
LOG.info("Statistics {}", tracker);
}
}
/**
* Though the AWS SDK claims in documentation to handle retries and
@ -70,92 +200,298 @@ public class ITestDynamoDBMetadataStoreScale
* correctly, retrying w/ smaller batch instead of surfacing exceptions.
*/
@Test
  public void test_030_BatchedWrite() throws Exception {

    final int iterations = 15;
    final ArrayList<PathMetadata> toCleanup = new ArrayList<>();
    toCleanup.ensureCapacity(THREADS * iterations);

    // Fail if someone changes a constant we depend on
    assertTrue("Maximum batch size must be big enough to run this test",
        S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT >= BATCH_SIZE);

    // We know the dynamodb metadata store will expand a put of a path
    // of depth N into a batch of N writes (all ancestors are written
    // separately up to the root). (Ab)use this for an easy way to write
    // a batch of stuff that is bigger than the provisioned write units
    try {
      describe("Running %d iterations of batched put, size %d", iterations,
          BATCH_SIZE);
      // run the workload once per worker thread; execute() asserts that
      // throttling was detected somewhere along the way
      ThrottleTracker result = execute("prune",
          1,
          true,
          () -> {
            ThrottleTracker tracker = new ThrottleTracker();
            long pruneItems = 0;
            for (long i = 0; i < iterations; i++) {
              Path longPath = pathOfDepth(BATCH_SIZE, String.valueOf(i));
              FileStatus status = basicFileStatus(longPath, 0, false, 12345,
                  12345);
              PathMetadata pm = new PathMetadata(status);
              synchronized (toCleanup) {
                toCleanup.add(pm);
              }

              ddbms.put(pm);

              pruneItems++;
              // Having hard time reproducing Exceeded exception with put, also
              // try occasional prune, which was the only stack trace I've seen
              // (on JIRA)
              if (pruneItems == BATCH_SIZE) {
                describe("pruning files");
                ddbms.prune(Long.MAX_VALUE /* all files */);
                pruneItems = 0;
              }
              if (tracker.probe()) {
                // fail fast
                break;
              }
            }
          });
      assertNotEquals("No batch retries in " + result,
          0, result.batchThrottles);
    } finally {
      describe("Cleaning up table %s", tableName);
      for (PathMetadata pm : toCleanup) {
        cleanupMetadata(ddbms, pm);
      }
    }
  }
/**
* Test Get throttling including using
* {@link MetadataStore#get(Path, boolean)},
* as that stresses more of the code.
*/
@Test
public void test_040_get() throws Throwable {
// attempt to create many many get requests in parallel.
Path path = new Path("s3a://example.org/get");
S3AFileStatus status = new S3AFileStatus(true, path, "alice");
PathMetadata metadata = new PathMetadata(status);
ddbms.put(metadata);
try {
execute("get",
OPERATIONS_PER_THREAD,
true,
() -> ddbms.get(path, true)
);
} finally {
retryingDelete(path);
}
}
/**
* Ask for the version marker, which is where table init can be overloaded.
*/
@Test
public void test_050_getVersionMarkerItem() throws Throwable {
execute("get",
OPERATIONS_PER_THREAD * 2,
true,
() -> ddbms.getVersionMarkerItem()
);
}
/**
* Cleanup with an extra bit of retry logic around it, in case things
* are still over the limit.
* @param path path
*/
private void retryingDelete(final Path path) {
try {
ddbms.getInvoker().retry("Delete ", path.toString(), true,
() -> ddbms.delete(path));
} catch (IOException e) {
LOG.warn("Failed to delete {}: ", path, e);
}
}
@Test
public void test_060_list() throws Throwable {
    // attempt to create many many list requests in parallel.
Path path = new Path("s3a://example.org/list");
S3AFileStatus status = new S3AFileStatus(true, path, "alice");
PathMetadata metadata = new PathMetadata(status);
ddbms.put(metadata);
try {
Path parent = path.getParent();
execute("list",
OPERATIONS_PER_THREAD,
true,
() -> ddbms.listChildren(parent)
);
} finally {
retryingDelete(path);
}
}
@Test
public void test_070_putDirMarker() throws Throwable {
    // attempt to create many many directory-marker put requests in parallel.
Path path = new Path("s3a://example.org/putDirMarker");
S3AFileStatus status = new S3AFileStatus(true, path, "alice");
PathMetadata metadata = new PathMetadata(status);
ddbms.put(metadata);
DirListingMetadata children = ddbms.listChildren(path.getParent());
try {
execute("list",
OPERATIONS_PER_THREAD,
true,
() -> ddbms.put(children)
);
} finally {
retryingDelete(path);
}
}
@Test
public void test_080_fullPathsToPut() throws Throwable {
    // attempt to create many many fullPathsToPut requests in parallel.
Path base = new Path("s3a://example.org/test_080_fullPathsToPut");
Path child = new Path(base, "child");
List<PathMetadata> pms = new ArrayList<>();
ddbms.put(new PathMetadata(makeDirStatus(base)));
ddbms.put(new PathMetadata(makeDirStatus(child)));
ddbms.getInvoker().retry("set up directory tree",
base.toString(),
true,
() -> ddbms.put(pms));
try {
DDBPathMetadata dirData = ddbms.get(child, true);
execute("list",
OPERATIONS_PER_THREAD,
true,
() -> ddbms.fullPathsToPut(dirData)
);
} finally {
retryingDelete(base);
}
}
@Test
public void test_900_instrumentation() throws Throwable {
describe("verify the owner FS gets updated after throttling events");
// we rely on the FS being shared
S3AFileSystem fs = getFileSystem();
String fsSummary = fs.toString();
S3AStorageStatistics statistics = fs.getStorageStatistics();
for (StorageStatistics.LongStatistic statistic : statistics) {
LOG.info("{}", statistic.toString());
}
String retryKey = Statistic.S3GUARD_METADATASTORE_RETRY.getSymbol();
assertTrue("No increment of " + retryKey + " in " + fsSummary,
statistics.getLong(retryKey) > 0);
String throttledKey = Statistic.S3GUARD_METADATASTORE_THROTTLED.getSymbol();
assertTrue("No increment of " + throttledKey + " in " + fsSummary,
statistics.getLong(throttledKey) > 0);
}
/**
* Execute a set of operations in parallel, collect throttling statistics
* and return them.
* This execution will complete as soon as throttling is detected.
* This ensures that the tests do not run for longer than they should.
* @param operation string for messages.
* @param operationsPerThread number of times per thread to invoke the action.
* @param expectThrottling is throttling expected (and to be asserted on?)
* @param action action to invoke.
* @return the throttle statistics
*/
public ThrottleTracker execute(String operation,
int operationsPerThread,
final boolean expectThrottling,
LambdaTestUtils.VoidCallable action)
throws Exception {
final ContractTestUtils.NanoTimer timer = new ContractTestUtils.NanoTimer();
final ThrottleTracker tracker = new ThrottleTracker();
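    // the tracker is shared with every worker thread so that all of them
    // can stop early once any throttling has been observed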
final ExecutorService executorService = Executors.newFixedThreadPool(
THREADS);
final List<Callable<ExecutionOutcome>> tasks = new ArrayList<>(THREADS);
final AtomicInteger throttleExceptions = new AtomicInteger(0);
for (int i = 0; i < THREADS; i++) {
tasks.add(
() -> {
final ExecutionOutcome outcome = new ExecutionOutcome();
final ContractTestUtils.NanoTimer t
= new ContractTestUtils.NanoTimer();
for (int j = 0; j < operationsPerThread; j++) {
if (tracker.isThrottlingDetected()) {
outcome.skipped = true;
return outcome;
}
try {
action.call();
outcome.completed++;
} catch (AWSServiceThrottledException e) {
// this is possibly OK
LOG.info("Operation [{}] raised a throttled exception " + e, j, e);
LOG.debug(e.toString(), e);
throttleExceptions.incrementAndGet();
// consider it completed
outcome.throttleExceptions.add(e);
outcome.throttled++;
} catch (Exception e) {
LOG.error("Failed to execute {}", operation, e);
outcome.exceptions.add(e);
break;
}
tracker.probe();
}
LOG.info("Thread completed {} with in {} ms with outcome {}: {}",
operation, t.elapsedTimeMs(), outcome, tracker);
return outcome;
}
);
}
final List<Future<ExecutionOutcome>> futures =
executorService.invokeAll(tasks,
getTestTimeoutMillis(), TimeUnit.MILLISECONDS);
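    // by this point invokeAll() has either run every task to completion or
    // hit the test timeout and cancelled the stragglers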
long elapsedMs = timer.elapsedTimeMs();
LOG.info("Completed {} with {}", operation, tracker);
LOG.info("time to execute: {} millis", elapsedMs);
for (Future<ExecutionOutcome> future : futures) {
assertTrue("Future timed out", future.isDone());
}
tracker.probe();
if (expectThrottling) {
tracker.assertThrottlingDetected();
}
for (Future<ExecutionOutcome> future : futures) {
ExecutionOutcome outcome = future.get();
if (!outcome.exceptions.isEmpty()) {
throw outcome.exceptions.get(0);
}
if (!outcome.skipped) {
assertEquals("Future did not complete all operations",
operationsPerThread, outcome.completed + outcome.throttled);
}
}
return tracker;
}
/**
* Attempt to delete metadata, suppressing any errors, and retrying on
* throttle events just in case some are still surfacing.
* @param ms store
* @param pm path to clean up
*/
private void cleanupMetadata(MetadataStore ms, PathMetadata pm) {
Path path = pm.getFileStatus().getPath();
try {
ddbms.getInvoker().retry("clean up", path.toString(), true,
() -> ms.forgetMetadata(path));
} catch (IOException ioe) {
// Ignore.
LOG.info("Ignoring error while cleaning up {} in database", path, ioe);
}
}
@ -164,11 +500,114 @@ public class ITestDynamoDBMetadataStoreScale
for (long i = 0; i < n; i++) {
sb.append(i == 0 ? "/" + this.getClass().getSimpleName() : "lvl");
sb.append(i);
if (i == n-1 && fileSuffix != null) {
if (i == n - 1 && fileSuffix != null) {
sb.append(fileSuffix);
}
sb.append("/");
}
return new Path(getFileSystem().getUri().toString(), sb.toString());
}
/**
* Something to track throttles.
* The constructor sets the counters to the current count in the
* DDB table; a call to {@link #reset()} will set it to the latest values.
* The {@link #probe()} will pick up the latest values to compare them with
* the original counts.
*/
private class ThrottleTracker {
private long writeThrottleEventOrig = ddbms.getWriteThrottleEventCount();
private long readThrottleEventOrig = ddbms.getReadThrottleEventCount();
private long batchWriteThrottleCountOrig =
ddbms.getBatchWriteCapacityExceededCount();
private long readThrottles;
private long writeThrottles;
private long batchThrottles;
ThrottleTracker() {
reset();
}
/**
* Reset the counters.
*/
private synchronized void reset() {
writeThrottleEventOrig
= ddbms.getWriteThrottleEventCount();
readThrottleEventOrig
= ddbms.getReadThrottleEventCount();
batchWriteThrottleCountOrig
= ddbms.getBatchWriteCapacityExceededCount();
}
/**
* Update the latest throttle count; synchronized.
* @return true if throttling has been detected.
*/
private synchronized boolean probe() {
readThrottles = ddbms.getReadThrottleEventCount() - readThrottleEventOrig;
writeThrottles = ddbms.getWriteThrottleEventCount()
- writeThrottleEventOrig;
batchThrottles = ddbms.getBatchWriteCapacityExceededCount()
- batchWriteThrottleCountOrig;
return isThrottlingDetected();
}
@Override
public String toString() {
return String.format(
"Tracker with read throttle events = %d;"
+ " write events = %d;"
+ " batch throttles = %d",
readThrottles, writeThrottles, batchThrottles);
}
/**
* Assert that throttling has been detected.
*/
void assertThrottlingDetected() {
assertTrue("No throttling detected in " + this +
" against " + ddbms.toString(),
isThrottlingDetected());
}
/**
* Has there been any throttling on an operation?
* @return true iff read, write or batch operations were throttled.
*/
private boolean isThrottlingDetected() {
return readThrottles > 0 || writeThrottles > 0 || batchThrottles > 0;
}
}
/**
* Outcome of a thread's execution operation.
*/
private static class ExecutionOutcome {
private int completed;
private int throttled;
private boolean skipped;
private final List<Exception> exceptions = new ArrayList<>(1);
private final List<Exception> throttleExceptions = new ArrayList<>(1);
@Override
public String toString() {
final StringBuilder sb = new StringBuilder(
"ExecutionOutcome{");
sb.append("completed=").append(completed);
sb.append(", skipped=").append(skipped);
sb.append(", throttled=").append(throttled);
sb.append(", exception count=").append(exceptions.size());
sb.append('}');
return sb.toString();
}
}
}

View File

@ -26,7 +26,6 @@ import java.util.Objects;
import java.util.Random;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicInteger;
import com.amazonaws.services.dynamodbv2.document.DynamoDB;
import com.amazonaws.services.dynamodbv2.document.Table;
@ -275,38 +274,7 @@ public class ITestS3GuardToolDynamoDB extends AbstractS3GuardToolTestBase {
// that call does not change the values
original.checkEquals("unchanged", getCapacities());
// now update the value
long readCap = original.getRead();
long writeCap = original.getWrite();
long rc2 = readCap + 1;
long wc2 = writeCap + 1;
Capacities desired = new Capacities(rc2, wc2);
capacityOut = exec(newSetCapacity(),
S3GuardTool.SetCapacity.NAME,
"-" + READ_FLAG, Long.toString(rc2),
"-" + WRITE_FLAG, Long.toString(wc2),
fsURI);
LOG.info("Set Capacity output=\n{}", capacityOut);
// to avoid race conditions, spin for the state change
AtomicInteger c = new AtomicInteger(0);
LambdaTestUtils.eventually(60000,
new LambdaTestUtils.VoidCallable() {
@Override
public void call() throws Exception {
c.incrementAndGet();
Map<String, String> diags = getMetadataStore().getDiagnostics();
Capacities updated = getCapacities(diags);
String tableInfo = String.format("[%02d] table state: %s",
c.intValue(), diags.get(STATUS));
LOG.info("{}; capacities {}",
tableInfo, updated);
desired.checkEquals(tableInfo, updated);
}
},
new LambdaTestUtils.ProportionalRetryInterval(500, 5000));
// Destroy MetadataStore
Destroy destroyCmd = new Destroy(fs.getConf());
String destroyed = exec(destroyCmd,

View File

@ -22,7 +22,10 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3a.S3AFileStatus;
import org.apache.hadoop.fs.s3a.s3guard.MetadataStore;
import org.apache.hadoop.fs.s3a.s3guard.PathMetadata;
import org.junit.FixMethodOrder;
import org.junit.Test;
import org.junit.runners.MethodSorters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -38,6 +41,7 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.NanoTimer;
* Could be separated from S3A code, but we're using the S3A scale test
* framework for convenience.
*/
@FixMethodOrder(MethodSorters.NAME_ASCENDING)
public abstract class AbstractITestS3AMetadataStoreScale extends
S3AScaleTestBase {
private static final Logger LOG = LoggerFactory.getLogger(
@ -60,7 +64,7 @@ public abstract class AbstractITestS3AMetadataStoreScale extends
public abstract MetadataStore createMetadataStore() throws IOException;
@Test
public void test_010_Put() throws Throwable {
describe("Test workload of put() operations");
// As described in hadoop-aws site docs, count parameter is used for
@ -83,7 +87,7 @@ public abstract class AbstractITestS3AMetadataStoreScale extends
}
@Test
public void test_020_Moves() throws Throwable {
describe("Test workload of batched move() operations");
// As described in hadoop-aws site docs, count parameter is used for
@ -140,7 +144,7 @@ public abstract class AbstractITestS3AMetadataStoreScale extends
* Create a copy of given list of PathMetadatas with the paths moved from
* src to dest.
*/
private List<PathMetadata> moveMetas(List<PathMetadata> metas, Path src,
protected List<PathMetadata> moveMetas(List<PathMetadata> metas, Path src,
Path dest) throws IOException {
List<PathMetadata> moved = new ArrayList<>(metas.size());
for (PathMetadata srcMeta : metas) {
@ -151,7 +155,7 @@ public abstract class AbstractITestS3AMetadataStoreScale extends
return moved;
}
protected Path movePath(Path p, Path src, Path dest) {
String srcStr = src.toUri().getPath();
String pathStr = p.toUri().getPath();
// Strip off src dir
@ -160,7 +164,7 @@ public abstract class AbstractITestS3AMetadataStoreScale extends
return new Path(dest, pathStr);
}
protected S3AFileStatus copyStatus(S3AFileStatus status) {
if (status.isDirectory()) {
return new S3AFileStatus(status.isEmptyDirectory(), status.getPath(),
status.getOwner());
@ -185,7 +189,7 @@ public abstract class AbstractITestS3AMetadataStoreScale extends
return count;
}
protected void clearMetadataStore(MetadataStore ms, long count)
throws IOException {
describe("Recursive deletion");
NanoTimer deleteTimer = new NanoTimer();
@ -202,15 +206,15 @@ public abstract class AbstractITestS3AMetadataStoreScale extends
msecPerOp, op, count));
}
protected static S3AFileStatus makeFileStatus(Path path) throws IOException {
return new S3AFileStatus(SIZE, ACCESS_TIME, path, BLOCK_SIZE, OWNER);
}
protected static S3AFileStatus makeDirStatus(Path p) throws IOException {
return new S3AFileStatus(false, p, OWNER);
}
private List<Path> metasToPaths(List<PathMetadata> metas) {
protected List<Path> metasToPaths(List<PathMetadata> metas) {
List<Path> paths = new ArrayList<>(metas.size());
for (PathMetadata meta : metas) {
paths.add(meta.getFileStatus().getPath());
@ -225,7 +229,7 @@ public abstract class AbstractITestS3AMetadataStoreScale extends
* @param width Number of files (and directories, if depth > 0) per directory.
* @param paths List to add generated paths to.
*/
protected static void createDirTree(Path parent, int depth, int width,
Collection<PathMetadata> paths) throws IOException {
// Create files

View File

@ -150,6 +150,16 @@
<value>simple</value>
</property>
<!-- Reduce DDB capacity on auto-created tables, to keep bills down. -->
<property>
<name>fs.s3a.s3guard.ddb.table.capacity.read</name>
<value>10</value>
</property>
<property>
<name>fs.s3a.s3guard.ddb.table.capacity.write</name>
<value>10</value>
</property>
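  <!-- Read capacity 10 and write capacity 10 stay within the limits the
       DynamoDB scale test accepts, so its throttle tests are not skipped
       as over-provisioned. -->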
<!--
To run these tests.