HADOOP-13560. S3ABlockOutputStream to support huge (many GB) file writes. Contributed by Steve Loughran

Steve Loughran 2016-10-18 19:33:38 +01:00
parent b733a6f862
commit 6c348c5691
37 changed files with 4614 additions and 873 deletions


@ -994,8 +994,8 @@
<property>
<name>fs.s3a.threads.max</name>
<value>10</value>
<description> Maximum number of concurrent active (part)uploads,
which each use a thread from the threadpool.</description>
<description>The total number of threads available in the filesystem for data
uploads *or any other queued filesystem operation*.</description>
</property>
<property>
@ -1008,8 +1008,7 @@
<property>
<name>fs.s3a.max.total.tasks</name>
<value>5</value>
<description>Number of (part)uploads allowed to the queue before
blocking additional uploads.</description>
<description>The number of operations which can be queued for execution</description>
</property>
<property>
@ -1047,13 +1046,21 @@
<name>fs.s3a.multipart.purge</name>
<value>false</value>
<description>True if you want to purge existing multipart uploads that may not have been
completed/aborted correctly</description>
completed/aborted correctly. The corresponding purge age is defined in
fs.s3a.multipart.purge.age.
If set, when the filesystem is instantiated then all outstanding uploads
older than the purge age will be terminated, across the entire bucket.
This will impact multipart uploads by other applications and users, so it should
be used sparingly, with an age value chosen to stop failed uploads, without
breaking ongoing operations.
</description>
</property>
<property>
<name>fs.s3a.multipart.purge.age</name>
<value>86400</value>
<description>Minimum age in seconds of multipart uploads to purge</description>
<description>Minimum age in seconds of multipart uploads to purge.
</description>
</property>
<property>
@ -1086,10 +1093,50 @@
<property>
<name>fs.s3a.fast.upload</name>
<value>false</value>
<description>Upload directly from memory instead of buffering to
disk first. Memory usage and parallelism can be controlled as up to
fs.s3a.multipart.size memory is consumed for each (part)upload actively
uploading (fs.s3a.threads.max) or queueing (fs.s3a.max.total.tasks)</description>
<description>
Use the incremental block-based fast upload mechanism with
the buffering mechanism set in fs.s3a.fast.upload.buffer.
</description>
</property>
<property>
<name>fs.s3a.fast.upload.buffer</name>
<value>disk</value>
<description>
The buffering mechanism to use when using S3A fast upload
(fs.s3a.fast.upload=true). Values: disk, array, bytebuffer.
This configuration option has no effect if fs.s3a.fast.upload is false.
"disk" will use the directories listed in fs.s3a.buffer.dir as
the location(s) to save data prior to being uploaded.
"array" uses arrays in the JVM heap
"bytebuffer" uses off-heap memory within the JVM.
Both "array" and "bytebuffer" will consume memory in a single stream up to the number
of blocks set by:
fs.s3a.multipart.size * fs.s3a.fast.upload.active.blocks.
If using either of these mechanisms, keep this value low.
The total number of threads performing work across all streams is set by
fs.s3a.threads.max, with fs.s3a.max.total.tasks setting the number of queued
work items.
</description>
</property>
<property>
<name>fs.s3a.fast.upload.active.blocks</name>
<value>4</value>
<description>
Maximum number of blocks a single output stream can have
active (uploading, or queued to the central FileSystem
instance's pool of queued operations).
This stops a single stream overloading the shared thread pool.
</description>
</property>
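To make the relationship between these options concrete, here is a minimal, hypothetical client-side sketch which sets the same properties programmatically; the property names come from the entries above, while the values are arbitrary examples rather than recommendations.

import org.apache.hadoop.conf.Configuration;

public class FastUploadConfigSketch {
  public static Configuration fastUploadConf() {
    Configuration conf = new Configuration();
    conf.setBoolean("fs.s3a.fast.upload", true);         // enable block-based upload
    conf.set("fs.s3a.fast.upload.buffer", "disk");       // disk | array | bytebuffer
    conf.setInt("fs.s3a.fast.upload.active.blocks", 4);  // per-stream block limit
    conf.setInt("fs.s3a.threads.max", 10);               // shared upload thread pool
    conf.setInt("fs.s3a.max.total.tasks", 5);            // queued operations
    return conf;
  }
}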
<property>
@ -1100,13 +1147,6 @@
any call to setReadahead() is made to an open stream.</description>
</property>
<property>
<name>fs.s3a.fast.buffer.size</name>
<value>1048576</value>
<description>Size of initial memory buffer in bytes allocated for an
upload. No effect if fs.s3a.fast.upload is false.</description>
</property>
<property>
<name>fs.s3a.user.agent.prefix</name>
<value></value>


@ -965,7 +965,7 @@ public static void bandwidth(NanoTimer timer, long bytes) {
* @return the number of megabytes/second of the recorded operation
*/
public static double bandwidthMBs(long bytes, long durationNS) {
return (bytes * 1000.0) / durationNS;
return bytes / (1024.0 * 1024) * 1.0e9 / durationNS;
}
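As a quick sanity check of the corrected formula (bytes divided by 1024 * 1024 to get megabytes, nanoseconds scaled to seconds), the snippet below is illustrative and not part of the patch: 100 MiB transferred in two seconds should report 50 MB/s.

public class BandwidthCheck {
  public static void main(String[] args) {
    long bytes = 100L * 1024 * 1024;    // 100 MiB transferred
    long durationNS = 2_000_000_000L;   // 2 seconds, in nanoseconds
    double mbs = bytes / (1024.0 * 1024) * 1.0e9 / durationNS;
    System.out.println(mbs);            // prints 50.0
  }
}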
/**
@ -1415,6 +1415,14 @@ public long duration() {
return endTime - startTime;
}
/**
* Intermediate duration of the operation.
* @return how much time has passed since the start (in nanos).
*/
public long elapsedTime() {
return now() - startTime;
}
public double bandwidth(long bytes) {
return bandwidthMBs(bytes, duration());
}
@ -1422,10 +1430,12 @@ public double bandwidth(long bytes) {
/**
* Bandwidth as bytes per second.
* @param bytes bytes in
* @return the number of bytes per second this operation timed.
* @return the number of bytes per second of this operation;
* 0 if duration == 0.
*/
public double bandwidthBytes(long bytes) {
return (bytes * 1.0) / duration();
double duration = duration();
return duration > 0 ? bytes / duration : 0;
}
/**


@ -35,6 +35,15 @@
<file.encoding>UTF-8</file.encoding>
<downloadSources>true</downloadSources>
<hadoop.tmp.dir>${project.build.directory}/test</hadoop.tmp.dir>
<!-- are scale tests enabled ? -->
<fs.s3a.scale.test.enabled>unset</fs.s3a.scale.test.enabled>
<!-- Size in MB of huge files. -->
<fs.s3a.scale.test.huge.filesize>unset</fs.s3a.scale.test.huge.filesize>
<!-- Size in MB of partitions in huge file uploads. -->
<fs.s3a.scale.test.huge.partitionsize>unset</fs.s3a.scale.test.huge.partitionsize>
<!-- Timeout in seconds for scale tests.-->
<fs.s3a.scale.test.timeout>3600</fs.s3a.scale.test.timeout>
</properties>
<profiles>
@ -115,6 +124,11 @@
<!-- substitution. Putting a prefix in front of it like -->
<!-- "fork-" makes it work. -->
<test.unique.fork.id>fork-${surefire.forkNumber}</test.unique.fork.id>
<!-- Propagate scale parameters -->
<fs.s3a.scale.test.enabled>${fs.s3a.scale.test.enabled}</fs.s3a.scale.test.enabled>
<fs.s3a.scale.test.huge.filesize>${fs.s3a.scale.test.huge.filesize}</fs.s3a.scale.test.huge.filesize>
<fs.s3a.scale.test.huge.partitionsize>${fs.s3a.scale.test.huge.partitionsize}</fs.s3a.scale.test.huge.partitionsize>
<fs.s3a.scale.test.timeout>${fs.s3a.scale.test.timeout}</fs.s3a.scale.test.timeout>
</systemPropertyVariables>
</configuration>
</plugin>
@ -132,7 +146,10 @@
<forkCount>${testsThreadCount}</forkCount>
<reuseForks>false</reuseForks>
<argLine>${maven-surefire-plugin.argLine} -DminiClusterDedicatedDirs=true</argLine>
<forkedProcessTimeoutInSeconds>${fs.s3a.scale.test.timeout}</forkedProcessTimeoutInSeconds>
<systemPropertyVariables>
<!-- Tell tests that they are being executed in parallel -->
<test.parallel.execution>true</test.parallel.execution>
<test.build.data>${test.build.data}/${surefire.forkNumber}</test.build.data>
<test.build.dir>${test.build.dir}/${surefire.forkNumber}</test.build.dir>
<hadoop.tmp.dir>${hadoop.tmp.dir}/${surefire.forkNumber}</hadoop.tmp.dir>
@ -142,6 +159,11 @@
<!-- substitution. Putting a prefix in front of it like -->
<!-- "fork-" makes it work. -->
<test.unique.fork.id>fork-${surefire.forkNumber}</test.unique.fork.id>
<!-- Propagate scale parameters -->
<fs.s3a.scale.test.enabled>${fs.s3a.scale.test.enabled}</fs.s3a.scale.test.enabled>
<fs.s3a.scale.test.huge.filesize>${fs.s3a.scale.test.huge.filesize}</fs.s3a.scale.test.huge.filesize>
<fs.s3a.scale.test.huge.partitionsize>${fs.s3a.scale.test.huge.partitionsize}</fs.s3a.scale.test.huge.partitionsize>
<fs.s3a.scale.test.timeout>${fs.s3a.scale.test.timeout}</fs.s3a.scale.test.timeout>
</systemPropertyVariables>
<!-- Some tests cannot run in parallel. Tests that cover -->
<!-- access to the root directory must run in isolation -->
@ -160,10 +182,11 @@
<excludes>
<exclude>**/ITestJets3tNativeS3FileSystemContract.java</exclude>
<exclude>**/ITestS3ABlockingThreadPool.java</exclude>
<exclude>**/ITestS3AFastOutputStream.java</exclude>
<exclude>**/ITestS3AFileSystemContract.java</exclude>
<exclude>**/ITestS3AMiniYarnCluster.java</exclude>
<exclude>**/ITest*Root*.java</exclude>
<exclude>**/ITestS3AFileContextStatistics.java</exclude>
<exclude>**/ITestS3AHuge*.java</exclude>
</excludes>
</configuration>
</execution>
@ -174,6 +197,16 @@
<goal>verify</goal>
</goals>
<configuration>
<forkedProcessTimeoutInSeconds>${fs.s3a.scale.test.timeout}</forkedProcessTimeoutInSeconds>
<systemPropertyVariables>
<!-- Tell tests that they are being executed sequentially -->
<test.parallel.execution>false</test.parallel.execution>
<!-- Propagate scale parameters -->
<fs.s3a.scale.test.enabled>${fs.s3a.scale.test.enabled}</fs.s3a.scale.test.enabled>
<fs.s3a.scale.test.huge.filesize>${fs.s3a.scale.test.huge.filesize}</fs.s3a.scale.test.huge.filesize>
<fs.s3a.scale.test.huge.partitionsize>${fs.s3a.scale.test.huge.partitionsize}</fs.s3a.scale.test.huge.partitionsize>
<fs.s3a.scale.test.timeout>${fs.s3a.scale.test.timeout}</fs.s3a.scale.test.timeout>
</systemPropertyVariables>
<!-- Do a sequential run for tests that cannot handle -->
<!-- parallel execution. -->
<includes>
@ -183,6 +216,8 @@
<include>**/ITestS3AFileSystemContract.java</include>
<include>**/ITestS3AMiniYarnCluster.java</include>
<include>**/ITest*Root*.java</include>
<include>**/ITestS3AFileContextStatistics.java</include>
<include>**/ITestS3AHuge*.java</include>
</includes>
</configuration>
</execution>
@ -210,7 +245,13 @@
<goal>verify</goal>
</goals>
<configuration>
<forkedProcessTimeoutInSeconds>3600</forkedProcessTimeoutInSeconds>
<systemPropertyVariables>
<!-- Propagate scale parameters -->
<fs.s3a.scale.test.enabled>${fs.s3a.scale.test.enabled}</fs.s3a.scale.test.enabled>
<fs.s3a.scale.test.huge.filesize>${fs.s3a.scale.test.huge.filesize}</fs.s3a.scale.test.huge.filesize>
<fs.s3a.scale.test.timeout>${fs.s3a.scale.test.timeout}</fs.s3a.scale.test.timeout>
</systemPropertyVariables>
<forkedProcessTimeoutInSeconds>${fs.s3a.scale.test.timeout}</forkedProcessTimeoutInSeconds>
</configuration>
</execution>
</executions>
@ -218,6 +259,19 @@
</plugins>
</build>
</profile>
<!-- Turn on scale tests-->
<profile>
<id>scale</id>
<activation>
<property>
<name>scale</name>
</property>
</activation>
<properties >
<fs.s3a.scale.test.enabled>true</fs.s3a.scale.test.enabled>
</properties>
</profile>
</profiles>
<build>


@ -18,30 +18,21 @@
package org.apache.hadoop.fs.s3a;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.Semaphore;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.util.concurrent.ForwardingListeningExecutorService;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import org.apache.hadoop.classification.InterfaceAudience;
/**
* This ExecutorService blocks the submission of new tasks when its queue is
* already full by using a semaphore. Task submissions require permits, task
@ -50,17 +41,17 @@
* This is inspired by <a href="https://github.com/apache/incubator-s4/blob/master/subprojects/s4-comm/src/main/java/org/apache/s4/comm/staging/BlockingThreadPoolExecutorService.java">
* this s4 threadpool</a>
*/
public class BlockingThreadPoolExecutorService
extends ForwardingListeningExecutorService {
@InterfaceAudience.Private
final class BlockingThreadPoolExecutorService
extends SemaphoredDelegatingExecutor {
private static Logger LOG = LoggerFactory
.getLogger(BlockingThreadPoolExecutorService.class);
private Semaphore queueingPermits;
private ListeningExecutorService executorDelegatee;
private static final AtomicInteger POOLNUMBER = new AtomicInteger(1);
private final ThreadPoolExecutor eventProcessingExecutor;
/**
* Returns a {@link java.util.concurrent.ThreadFactory} that names each
* created thread uniquely,
@ -69,7 +60,7 @@ public class BlockingThreadPoolExecutorService
* @param prefix The prefix of every created Thread's name
* @return a {@link java.util.concurrent.ThreadFactory} that names threads
*/
public static ThreadFactory getNamedThreadFactory(final String prefix) {
static ThreadFactory getNamedThreadFactory(final String prefix) {
SecurityManager s = System.getSecurityManager();
final ThreadGroup threadGroup = (s != null) ? s.getThreadGroup() :
Thread.currentThread().getThreadGroup();
@ -113,6 +104,12 @@ public Thread newThread(Runnable r) {
};
}
private BlockingThreadPoolExecutorService(int permitCount,
ThreadPoolExecutor eventProcessingExecutor) {
super(MoreExecutors.listeningDecorator(eventProcessingExecutor),
permitCount, false);
this.eventProcessingExecutor = eventProcessingExecutor;
}
/**
* A thread pool that blocks clients submitting additional tasks if
@ -125,10 +122,12 @@ public Thread newThread(Runnable r) {
* @param unit time unit
* @param prefixName prefix of name for threads
*/
public BlockingThreadPoolExecutorService(int activeTasks, int waitingTasks,
long keepAliveTime, TimeUnit unit, String prefixName) {
super();
queueingPermits = new Semaphore(waitingTasks + activeTasks, false);
public static BlockingThreadPoolExecutorService newInstance(
int activeTasks,
int waitingTasks,
long keepAliveTime, TimeUnit unit,
String prefixName) {
/* Although we generally only expect up to waitingTasks tasks in the
queue, we need to be able to buffer all tasks in case dequeueing is
slower than enqueueing. */
@ -147,126 +146,25 @@ public void rejectedExecution(Runnable r,
}
});
eventProcessingExecutor.allowCoreThreadTimeOut(true);
executorDelegatee =
MoreExecutors.listeningDecorator(eventProcessingExecutor);
}
@Override
protected ListeningExecutorService delegate() {
return executorDelegatee;
}
@Override
public <T> ListenableFuture<T> submit(Callable<T> task) {
try {
queueingPermits.acquire();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return Futures.immediateFailedCheckedFuture(e);
}
return super.submit(new CallableWithPermitRelease<T>(task));
}
@Override
public <T> ListenableFuture<T> submit(Runnable task, T result) {
try {
queueingPermits.acquire();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return Futures.immediateFailedCheckedFuture(e);
}
return super.submit(new RunnableWithPermitRelease(task), result);
}
@Override
public ListenableFuture<?> submit(Runnable task) {
try {
queueingPermits.acquire();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return Futures.immediateFailedCheckedFuture(e);
}
return super.submit(new RunnableWithPermitRelease(task));
}
@Override
public void execute(Runnable command) {
try {
queueingPermits.acquire();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
super.execute(new RunnableWithPermitRelease(command));
return new BlockingThreadPoolExecutorService(waitingTasks + activeTasks,
eventProcessingExecutor);
}
/**
* Releases a permit after the task is executed.
* Get the actual number of active threads.
* @return the active thread count
*/
class RunnableWithPermitRelease implements Runnable {
private Runnable delegatee;
public RunnableWithPermitRelease(Runnable delegatee) {
this.delegatee = delegatee;
}
@Override
public void run() {
try {
delegatee.run();
} finally {
queueingPermits.release();
}
}
}
/**
* Releases a permit after the task is completed.
*/
class CallableWithPermitRelease<T> implements Callable<T> {
private Callable<T> delegatee;
public CallableWithPermitRelease(Callable<T> delegatee) {
this.delegatee = delegatee;
}
@Override
public T call() throws Exception {
try {
return delegatee.call();
} finally {
queueingPermits.release();
}
}
int getActiveCount() {
return eventProcessingExecutor.getActiveCount();
}
@Override
public <T> List<Future<T>> invokeAll(Collection<? extends Callable<T>> tasks)
throws InterruptedException {
throw new RuntimeException("Not implemented");
public String toString() {
final StringBuilder sb = new StringBuilder(
"BlockingThreadPoolExecutorService{");
sb.append(super.toString());
sb.append(", activeCount=").append(getActiveCount());
sb.append('}');
return sb.toString();
}
@Override
public <T> List<Future<T>> invokeAll(Collection<? extends Callable<T>> tasks,
long timeout, TimeUnit unit) throws InterruptedException {
throw new RuntimeException("Not implemented");
}
@Override
public <T> T invokeAny(Collection<? extends Callable<T>> tasks)
throws InterruptedException, ExecutionException {
throw new RuntimeException("Not implemented");
}
@Override
public <T> T invokeAny(Collection<? extends Callable<T>> tasks, long timeout,
TimeUnit unit)
throws InterruptedException, ExecutionException, TimeoutException {
throw new RuntimeException("Not implemented");
}
}
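Since the public constructor is gone, callers now obtain an instance through the static factory shown above. A minimal sketch of that usage follows; because the class is now package-private it is assumed to live in org.apache.hadoop.fs.s3a, and the pool sizes and thread-name prefix are illustrative values, not taken from the patch.

import java.util.concurrent.TimeUnit;

class BlockingPoolSketch {
  static BlockingThreadPoolExecutorService newSharedPool() {
    // 10 threads may run at once; 5 more tasks may queue before submit() blocks
    return BlockingThreadPoolExecutorService.newInstance(
        10,                     // activeTasks
        5,                      // waitingTasks
        60L, TimeUnit.SECONDS,  // keep-alive for idle threads
        "s3a-transfer-shared"); // thread name prefix (illustrative)
  }
}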


@ -35,6 +35,9 @@ public final class Constants {
private Constants() {
}
/** The minimum multipart size which S3 supports. */
public static final int MULTIPART_MIN_SIZE = 5 * 1024 * 1024;
// s3 access key
public static final String ACCESS_KEY = "fs.s3a.access.key";
@ -124,14 +127,72 @@ private Constants() {
// comma separated list of directories
public static final String BUFFER_DIR = "fs.s3a.buffer.dir";
// should we upload directly from memory rather than using a file buffer
// switch to the fast block-by-block upload mechanism
public static final String FAST_UPLOAD = "fs.s3a.fast.upload";
public static final boolean DEFAULT_FAST_UPLOAD = false;
//initial size of memory buffer for a fast upload
@Deprecated
public static final String FAST_BUFFER_SIZE = "fs.s3a.fast.buffer.size";
public static final int DEFAULT_FAST_BUFFER_SIZE = 1048576; //1MB
/**
* What buffer to use.
* Default is {@link #FAST_UPLOAD_BUFFER_DISK}
* Value: {@value}
*/
@InterfaceStability.Unstable
public static final String FAST_UPLOAD_BUFFER =
"fs.s3a.fast.upload.buffer";
/**
* Buffer blocks to disk: {@value}.
* Capacity is limited to available disk space.
*/
@InterfaceStability.Unstable
public static final String FAST_UPLOAD_BUFFER_DISK = "disk";
/**
* Use an in-memory array. Fast but will run out of heap rapidly: {@value}.
*/
@InterfaceStability.Unstable
public static final String FAST_UPLOAD_BUFFER_ARRAY = "array";
/**
* Use a byte buffer. May be more memory efficient than the
* {@link #FAST_UPLOAD_BUFFER_ARRAY}: {@value}.
*/
@InterfaceStability.Unstable
public static final String FAST_UPLOAD_BYTEBUFFER = "bytebuffer";
/**
* Default buffer option: {@value}.
*/
@InterfaceStability.Unstable
public static final String DEFAULT_FAST_UPLOAD_BUFFER =
FAST_UPLOAD_BUFFER_DISK;
/**
* Maximum number of blocks a single output stream can have
* active (uploading, or queued to the central FileSystem
* instance's pool of queued operations).
* This stops a single stream overloading the shared thread pool.
* {@value}
* <p>
* Default is {@link #DEFAULT_FAST_UPLOAD_ACTIVE_BLOCKS}
*/
@InterfaceStability.Unstable
public static final String FAST_UPLOAD_ACTIVE_BLOCKS =
"fs.s3a.fast.upload.active.blocks";
/**
* Limit of queued block upload operations before writes
* block. Value: {@value}
*/
@InterfaceStability.Unstable
public static final int DEFAULT_FAST_UPLOAD_ACTIVE_BLOCKS = 4;
// Private | PublicRead | PublicReadWrite | AuthenticatedRead |
// LogDeliveryWrite | BucketOwnerRead | BucketOwnerFullControl
public static final String CANNED_ACL = "fs.s3a.acl.default";
@ -145,7 +206,7 @@ private Constants() {
// purge any multipart uploads older than this number of seconds
public static final String PURGE_EXISTING_MULTIPART_AGE =
"fs.s3a.multipart.purge.age";
public static final long DEFAULT_PURGE_EXISTING_MULTIPART_AGE = 14400;
public static final long DEFAULT_PURGE_EXISTING_MULTIPART_AGE = 86400;
// s3 server-side encryption
public static final String SERVER_SIDE_ENCRYPTION_ALGORITHM =
@ -215,4 +276,10 @@ private Constants() {
public static final Class<? extends S3ClientFactory>
DEFAULT_S3_CLIENT_FACTORY_IMPL =
S3ClientFactory.DefaultS3ClientFactory.class;
/**
* Maximum number of partitions in a multipart upload: {@value}.
*/
@InterfaceAudience.Private
public static final int MAX_MULTIPART_COUNT = 10000;
}
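For illustration only, the sketch below shows one way the new constants could be read back from a Configuration; this is not the actual S3AFileSystem initialization code, just a hedged example of the option/default pairs defined above.

import org.apache.hadoop.conf.Configuration;
import static org.apache.hadoop.fs.s3a.Constants.*;

class FastUploadSettingsSketch {
  static String describe(Configuration conf) {
    boolean fastUpload = conf.getBoolean(FAST_UPLOAD, DEFAULT_FAST_UPLOAD);
    String buffer = conf.getTrimmed(FAST_UPLOAD_BUFFER, DEFAULT_FAST_UPLOAD_BUFFER);
    int activeBlocks = conf.getInt(FAST_UPLOAD_ACTIVE_BLOCKS,
        DEFAULT_FAST_UPLOAD_ACTIVE_BLOCKS);
    return "fast upload=" + fastUpload
        + ", buffer=" + buffer
        + ", active blocks per stream=" + activeBlocks;
  }
}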


@ -0,0 +1,703 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import com.amazonaws.AmazonClientException;
import com.amazonaws.event.ProgressEvent;
import com.amazonaws.event.ProgressEventType;
import com.amazonaws.event.ProgressListener;
import com.amazonaws.services.s3.model.CompleteMultipartUploadResult;
import com.amazonaws.services.s3.model.PartETag;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.amazonaws.services.s3.model.PutObjectResult;
import com.amazonaws.services.s3.model.UploadPartRequest;
import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.util.Progressable;
import static org.apache.hadoop.fs.s3a.S3AUtils.*;
import static org.apache.hadoop.fs.s3a.Statistic.*;
/**
* Upload files/parts directly via different buffering mechanisms,
* including memory and disk.
*
* If the stream is closed and no upload has started, then the upload
* is instead done as a single PUT operation.
*
* Unstable: statistics and error handling might evolve.
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
class S3ABlockOutputStream extends OutputStream {
private static final Logger LOG =
LoggerFactory.getLogger(S3ABlockOutputStream.class);
/** Owner FileSystem. */
private final S3AFileSystem fs;
/** Object being uploaded. */
private final String key;
/** Size of all blocks. */
private final int blockSize;
/** Callback for progress. */
private final ProgressListener progressListener;
private final ListeningExecutorService executorService;
/**
* Retry policy for multipart commits; not all AWS SDK versions retry that.
*/
private final RetryPolicy retryPolicy =
RetryPolicies.retryUpToMaximumCountWithProportionalSleep(
5,
2000,
TimeUnit.MILLISECONDS);
/**
* Factory for blocks.
*/
private final S3ADataBlocks.BlockFactory blockFactory;
/** Preallocated byte buffer for writing single characters. */
private final byte[] singleCharWrite = new byte[1];
/** Multipart upload details; null means none started. */
private MultiPartUpload multiPartUpload;
/** Closed flag. */
private final AtomicBoolean closed = new AtomicBoolean(false);
/** Current data block. Null means none currently active */
private S3ADataBlocks.DataBlock activeBlock;
/** Count of blocks uploaded. */
private long blockCount = 0;
/** Statistics to build up. */
private final S3AInstrumentation.OutputStreamStatistics statistics;
/**
* Write operation helper; encapsulation of the filesystem operations.
*/
private final S3AFileSystem.WriteOperationHelper writeOperationHelper;
/**
* An S3A output stream which uploads partitions in a separate pool of
* threads; different {@link S3ADataBlocks.BlockFactory}
* instances can control where data is buffered.
*
* @param fs S3AFilesystem
* @param key S3 object to work on.
* @param executorService the executor service to use to schedule work
* @param progress report progress in order to prevent timeouts. If
* this object implements {@code ProgressListener} then it will be
* directly wired up to the AWS client, so receive detailed progress
* information.
* @param blockSize size of a single block.
* @param blockFactory factory for creating stream destinations
* @param statistics stats for this stream
* @param writeOperationHelper state of the write operation.
* @throws IOException on any problem
*/
S3ABlockOutputStream(S3AFileSystem fs,
String key,
ExecutorService executorService,
Progressable progress,
long blockSize,
S3ADataBlocks.BlockFactory blockFactory,
S3AInstrumentation.OutputStreamStatistics statistics,
S3AFileSystem.WriteOperationHelper writeOperationHelper)
throws IOException {
this.fs = fs;
this.key = key;
this.blockFactory = blockFactory;
this.blockSize = (int) blockSize;
this.statistics = statistics;
this.writeOperationHelper = writeOperationHelper;
Preconditions.checkArgument(blockSize >= Constants.MULTIPART_MIN_SIZE,
"Block size is too small: %s", blockSize);
this.executorService = MoreExecutors.listeningDecorator(executorService);
this.multiPartUpload = null;
this.progressListener = (progress instanceof ProgressListener) ?
(ProgressListener) progress
: new ProgressableListener(progress);
// create that first block. This guarantees that an open + close sequence
// writes a 0-byte entry.
createBlockIfNeeded();
LOG.debug("Initialized S3ABlockOutputStream for {}" +
" output to {}", writeOperationHelper, activeBlock);
}
/**
* Demand create a destination block.
* @return the active block; null if there isn't one.
* @throws IOException on any failure to create
*/
private synchronized S3ADataBlocks.DataBlock createBlockIfNeeded()
throws IOException {
if (activeBlock == null) {
blockCount++;
if (blockCount >= Constants.MAX_MULTIPART_COUNT) {
LOG.error("Number of partitions in stream exceeds limit for S3: "
+ Constants.MAX_MULTIPART_COUNT + "; write may fail.");
}
activeBlock = blockFactory.create(this.blockSize);
}
return activeBlock;
}
/**
* Synchronized accessor to the active block.
* @return the active block; null if there isn't one.
*/
private synchronized S3ADataBlocks.DataBlock getActiveBlock() {
return activeBlock;
}
/**
* Predicate to query whether or not there is an active block.
* @return true if there is an active block.
*/
private synchronized boolean hasActiveBlock() {
return activeBlock != null;
}
/**
* Clear the active block.
*/
private void clearActiveBlock() {
LOG.debug("Clearing active block");
synchronized (this) {
activeBlock = null;
}
}
/**
* Check for the filesystem being open.
* @throws IOException if the filesystem is closed.
*/
void checkOpen() throws IOException {
if (closed.get()) {
throw new IOException("Filesystem " + writeOperationHelper + " closed");
}
}
/**
* The flush operation does not trigger an upload; that awaits
* the next block being full. What it does do is call {@code flush() }
* on the current block, leaving it to choose how to react.
* @throws IOException Any IO problem.
*/
@Override
public synchronized void flush() throws IOException {
checkOpen();
S3ADataBlocks.DataBlock dataBlock = getActiveBlock();
if (dataBlock != null) {
dataBlock.flush();
}
}
/**
* Writes a byte to the destination. If this causes the buffer to reach
* its limit, the actual upload is submitted to the threadpool.
* @param b the int of which the lowest byte is written
* @throws IOException on any problem
*/
@Override
public synchronized void write(int b) throws IOException {
singleCharWrite[0] = (byte)b;
write(singleCharWrite, 0, 1);
}
/**
* Writes a range of bytes to the memory buffer. If this causes the
* buffer to reach its limit, the actual upload is submitted to the
* threadpool and the remainder of the array is written to memory
* (recursively).
* @param source byte array containing data
* @param offset offset in array where to start
* @param len number of bytes to be written
* @throws IOException on any problem
*/
@Override
public synchronized void write(byte[] source, int offset, int len)
throws IOException {
S3ADataBlocks.validateWriteArgs(source, offset, len);
checkOpen();
if (len == 0) {
return;
}
S3ADataBlocks.DataBlock block = createBlockIfNeeded();
int written = block.write(source, offset, len);
int remainingCapacity = block.remainingCapacity();
if (written < len) {
// not everything was written: the block has run out of capacity.
// Trigger an upload then process the remainder.
LOG.debug("writing more data than the block has capacity for; triggering upload");
uploadCurrentBlock();
// tail recursion is mildly expensive, but given buffer sizes must be MBs,
// it's unlikely to recurse very deeply.
this.write(source, offset + written, len - written);
} else {
if (remainingCapacity == 0) {
// the whole buffer is done, trigger an upload
uploadCurrentBlock();
}
}
}
/**
* Start an asynchronous upload of the current block.
* @throws IOException Problems opening the destination for upload
* or initializing the upload.
*/
private synchronized void uploadCurrentBlock() throws IOException {
Preconditions.checkState(hasActiveBlock(), "No active block");
LOG.debug("Writing block # {}", blockCount);
if (multiPartUpload == null) {
LOG.debug("Initiating Multipart upload");
multiPartUpload = new MultiPartUpload();
}
try {
multiPartUpload.uploadBlockAsync(getActiveBlock());
} finally {
// set the block to null, so the next write will create a new block.
clearActiveBlock();
}
}
/**
* Close the stream.
*
* This will not return until the upload is complete
* or the attempt to perform the upload has failed.
* Exceptions raised in this method are indicative that the write has
* failed and data is at risk of being lost.
* @throws IOException on any failure.
*/
@Override
public void close() throws IOException {
if (closed.getAndSet(true)) {
// already closed
LOG.debug("Ignoring close() as stream is already closed");
return;
}
S3ADataBlocks.DataBlock block = getActiveBlock();
boolean hasBlock = hasActiveBlock();
LOG.debug("{}: Closing block #{}: current block= {}",
this,
blockCount,
hasBlock ? block : "(none)");
try {
if (multiPartUpload == null) {
if (hasBlock) {
// no uploads of data have taken place, put the single block up.
// This must happen even if there is no data, so that 0 byte files
// are created.
putObject();
}
} else {
// there has already been at least one block scheduled for upload;
// put up the current then wait
if (hasBlock && block.hasData()) {
//send last part
uploadCurrentBlock();
}
// wait for the partial uploads to finish
final List<PartETag> partETags =
multiPartUpload.waitForAllPartUploads();
// then complete the operation
multiPartUpload.complete(partETags);
}
LOG.debug("Upload complete for {}", writeOperationHelper);
} catch (IOException ioe) {
writeOperationHelper.writeFailed(ioe);
throw ioe;
} finally {
LOG.debug("Closing block and factory");
IOUtils.closeStream(block);
IOUtils.closeStream(blockFactory);
LOG.debug("Statistics: {}", statistics);
IOUtils.closeStream(statistics);
clearActiveBlock();
}
// All end of write operations, including deleting fake parent directories
writeOperationHelper.writeSuccessful();
}
/**
* Upload the current block as a single PUT request; if the buffer
* is empty a 0-byte PUT will be invoked, as it is needed to create an
* entry at the far end.
* @throws IOException any problem.
*/
private void putObject() throws IOException {
LOG.debug("Executing regular upload for {}", writeOperationHelper);
final S3ADataBlocks.DataBlock block = getActiveBlock();
int size = block.dataSize();
final PutObjectRequest putObjectRequest =
writeOperationHelper.newPutRequest(
block.startUpload(),
size);
long transferQueueTime = now();
BlockUploadProgress callback =
new BlockUploadProgress(
block, progressListener, transferQueueTime);
putObjectRequest.setGeneralProgressListener(callback);
statistics.blockUploadQueued(size);
ListenableFuture<PutObjectResult> putObjectResult =
executorService.submit(new Callable<PutObjectResult>() {
@Override
public PutObjectResult call() throws Exception {
PutObjectResult result = fs.putObjectDirect(putObjectRequest);
block.close();
return result;
}
});
clearActiveBlock();
//wait for completion
try {
putObjectResult.get();
} catch (InterruptedException ie) {
LOG.warn("Interrupted object upload", ie);
Thread.currentThread().interrupt();
} catch (ExecutionException ee) {
throw extractException("regular upload", key, ee);
}
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder(
"S3ABlockOutputStream{");
sb.append(writeOperationHelper.toString());
sb.append(", blockSize=").append(blockSize);
// unsynced access; risks consistency in exchange for no risk of deadlock.
S3ADataBlocks.DataBlock block = activeBlock;
if (block != null) {
sb.append(", activeBlock=").append(block);
}
sb.append('}');
return sb.toString();
}
private void incrementWriteOperations() {
fs.incrementWriteOperations();
}
/**
* Current time in milliseconds.
* @return time
*/
private long now() {
return System.currentTimeMillis();
}
/**
* Multiple partition upload.
*/
private class MultiPartUpload {
private final String uploadId;
private final List<ListenableFuture<PartETag>> partETagsFutures;
public MultiPartUpload() throws IOException {
this.uploadId = writeOperationHelper.initiateMultiPartUpload();
this.partETagsFutures = new ArrayList<>(2);
LOG.debug("Initiated multi-part upload for {} with " +
"id '{}'", writeOperationHelper, uploadId);
}
/**
* Upload a block of data.
* This will take the block and queue its upload as the next part.
* @param block block to upload
* @throws IOException upload failure
*/
private void uploadBlockAsync(final S3ADataBlocks.DataBlock block)
throws IOException {
LOG.debug("Queueing upload of {}", block);
final int size = block.dataSize();
final InputStream uploadStream = block.startUpload();
final int currentPartNumber = partETagsFutures.size() + 1;
final UploadPartRequest request =
writeOperationHelper.newUploadPartRequest(
uploadId,
uploadStream,
currentPartNumber,
size);
long transferQueueTime = now();
BlockUploadProgress callback =
new BlockUploadProgress(
block, progressListener, transferQueueTime);
request.setGeneralProgressListener(callback);
statistics.blockUploadQueued(block.dataSize());
ListenableFuture<PartETag> partETagFuture =
executorService.submit(new Callable<PartETag>() {
@Override
public PartETag call() throws Exception {
// this is the queued upload operation
LOG.debug("Uploading part {} for id '{}'", currentPartNumber,
uploadId);
// do the upload
PartETag partETag = fs.uploadPart(request).getPartETag();
LOG.debug("Completed upload of {}", block);
LOG.debug("Stream statistics of {}", statistics);
// close the block
block.close();
return partETag;
}
});
partETagsFutures.add(partETagFuture);
}
/**
* Block awaiting all outstanding uploads to complete.
* @return list of results
* @throws IOException IO Problems
*/
private List<PartETag> waitForAllPartUploads() throws IOException {
LOG.debug("Waiting for {} uploads to complete", partETagsFutures.size());
try {
return Futures.allAsList(partETagsFutures).get();
} catch (InterruptedException ie) {
LOG.warn("Interrupted partUpload", ie);
Thread.currentThread().interrupt();
return null;
} catch (ExecutionException ee) {
//there is no way of recovering so abort
//cancel all partUploads
LOG.debug("While waiting for upload completion", ee);
LOG.debug("Cancelling futures");
for (ListenableFuture<PartETag> future : partETagsFutures) {
future.cancel(true);
}
//abort multipartupload
this.abort();
throw extractException("Multi-part upload with id '" + uploadId
+ "' to " + key, key, ee);
}
}
/**
* This completes a multipart upload.
* Sometimes it fails; here retries are handled to avoid losing all data
* on a transient failure.
* @param partETags list of partial uploads
* @throws IOException on any problem
*/
private CompleteMultipartUploadResult complete(List<PartETag> partETags)
throws IOException {
int retryCount = 0;
AmazonClientException lastException;
String operation =
String.format("Completing multi-part upload for key '%s'," +
" id '%s' with %s partitions ",
key, uploadId, partETags.size());
do {
try {
LOG.debug(operation);
return writeOperationHelper.completeMultipartUpload(
uploadId,
partETags);
} catch (AmazonClientException e) {
lastException = e;
statistics.exceptionInMultipartComplete();
}
} while (shouldRetry(operation, lastException, retryCount++));
// this point is only reached if the operation failed more than
// the allowed retry count
throw translateException(operation, key, lastException);
}
/**
* Abort a multi-part upload. Retries are attempted on failures.
* IOExceptions are caught; this is expected to be run as a cleanup process.
*/
public void abort() {
int retryCount = 0;
AmazonClientException lastException;
fs.incrementStatistic(OBJECT_MULTIPART_UPLOAD_ABORTED);
String operation =
String.format("Aborting multi-part upload for '%s', id '%s",
writeOperationHelper, uploadId);
do {
try {
LOG.debug(operation);
writeOperationHelper.abortMultipartUpload(uploadId);
return;
} catch (AmazonClientException e) {
lastException = e;
statistics.exceptionInMultipartAbort();
}
} while (shouldRetry(operation, lastException, retryCount++));
// this point is only reached if the operation failed more than
// the allowed retry count
LOG.warn("Unable to abort multipart upload, you may need to purge " +
"uploaded parts", lastException);
}
/**
* Predicate to determine whether a failed operation should
* be attempted again.
* If a retry is advised, the exception is automatically logged and
* the filesystem statistic {@link Statistic#IGNORED_ERRORS} incremented.
* The method then sleeps for the sleep time suggested by the sleep policy;
* if the sleep is interrupted then the current thread's interrupt flag is set
* and false is returned.
*
* @param operation operation for log message
* @param e exception raised.
* @param retryCount number of retries already attempted
* @return true if another attempt should be made
*/
private boolean shouldRetry(String operation,
AmazonClientException e,
int retryCount) {
try {
RetryPolicy.RetryAction retryAction =
retryPolicy.shouldRetry(e, retryCount, 0, true);
boolean retry = retryAction == RetryPolicy.RetryAction.RETRY;
if (retry) {
fs.incrementStatistic(IGNORED_ERRORS);
LOG.info("Retrying {} after exception ", operation, e);
Thread.sleep(retryAction.delayMillis);
}
return retry;
} catch (InterruptedException ex) {
Thread.currentThread().interrupt();
return false;
} catch (Exception ignored) {
return false;
}
}
}
/**
* The upload progress listener registered for events returned
* during the upload of a single block.
* It updates statistics and handles the end of the upload.
* Transfer failures are logged at WARN.
*/
private final class BlockUploadProgress implements ProgressListener {
private final S3ADataBlocks.DataBlock block;
private final ProgressListener nextListener;
private final long transferQueueTime;
private long transferStartTime;
/**
* Track the progress of a single block upload.
* @param block block to monitor
* @param nextListener optional next progress listener
* @param transferQueueTime time the block was transferred
* into the queue
*/
private BlockUploadProgress(S3ADataBlocks.DataBlock block,
ProgressListener nextListener,
long transferQueueTime) {
this.block = block;
this.transferQueueTime = transferQueueTime;
this.nextListener = nextListener;
}
@Override
public void progressChanged(ProgressEvent progressEvent) {
ProgressEventType eventType = progressEvent.getEventType();
long bytesTransferred = progressEvent.getBytesTransferred();
int size = block.dataSize();
switch (eventType) {
case REQUEST_BYTE_TRANSFER_EVENT:
// bytes uploaded
statistics.bytesTransferred(bytesTransferred);
break;
case TRANSFER_PART_STARTED_EVENT:
transferStartTime = now();
statistics.blockUploadStarted(transferStartTime - transferQueueTime,
size);
incrementWriteOperations();
break;
case TRANSFER_PART_COMPLETED_EVENT:
statistics.blockUploadCompleted(now() - transferStartTime, size);
break;
case TRANSFER_PART_FAILED_EVENT:
statistics.blockUploadFailed(now() - transferStartTime, size);
LOG.warn("Transfer failure of block {}", block);
break;
default:
// nothing
}
if (nextListener != null) {
nextListener.progressChanged(progressEvent);
}
}
}
/**
* Bridge from AWS {@code ProgressListener} to Hadoop {@link Progressable}.
*/
private static class ProgressableListener implements ProgressListener {
private final Progressable progress;
public ProgressableListener(Progressable progress) {
this.progress = progress;
}
public void progressChanged(ProgressEvent progressEvent) {
if (progress != null) {
progress.progress();
}
}
}
}
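From an application's point of view the stream is reached through the ordinary FileSystem API; the hedged sketch below shows a large write with fast upload enabled, where each filled block is uploaded as a multipart part while the caller keeps writing, and close() completes the upload. The bucket name and path are placeholders.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

class HugeWriteSketch {
  static void writeHugeFile(byte[] chunk, long chunks) throws Exception {
    Configuration conf = new Configuration();
    conf.setBoolean("fs.s3a.fast.upload", true);
    FileSystem fs = FileSystem.get(new URI("s3a://example-bucket/"), conf);
    try (FSDataOutputStream out = fs.create(new Path("/datasets/huge.bin"))) {
      for (long i = 0; i < chunks; i++) {
        out.write(chunk);  // may block briefly once the queued-block limit is hit
      }
    }                      // close() uploads the final block and completes the upload
  }
}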


@ -0,0 +1,821 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import com.google.common.base.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FSExceptionMessages;
import org.apache.hadoop.util.DirectBufferPool;
import static org.apache.hadoop.fs.s3a.S3ADataBlocks.DataBlock.DestState.*;
/**
* Set of classes to support output streaming into blocks which are then
* uploaded as partitions.
*/
final class S3ADataBlocks {
private static final Logger LOG =
LoggerFactory.getLogger(S3ADataBlocks.class);
private S3ADataBlocks() {
}
/**
* Validate args to a write command. These are the same validation checks
* expected for any implementation of {@code OutputStream.write()}.
* @param b byte array containing data
* @param off offset in array where to start
* @param len number of bytes to be written
* @throws NullPointerException for a null buffer
* @throws IndexOutOfBoundsException if indices are out of range
*/
static void validateWriteArgs(byte[] b, int off, int len)
throws IOException {
Preconditions.checkNotNull(b);
if ((off < 0) || (off > b.length) || (len < 0) ||
((off + len) > b.length) || ((off + len) < 0)) {
throw new IndexOutOfBoundsException(
"write (b[" + b.length + "], " + off + ", " + len + ')');
}
}
/**
* Create a factory.
* @param owner factory owner
* @param name factory name -the option from {@link Constants}.
* @return the factory, ready to be initialized.
* @throws IllegalArgumentException if the name is unknown.
*/
static BlockFactory createFactory(S3AFileSystem owner,
String name) {
switch (name) {
case Constants.FAST_UPLOAD_BUFFER_ARRAY:
return new ArrayBlockFactory(owner);
case Constants.FAST_UPLOAD_BUFFER_DISK:
return new DiskBlockFactory(owner);
case Constants.FAST_UPLOAD_BYTEBUFFER:
return new ByteBufferBlockFactory(owner);
default:
throw new IllegalArgumentException("Unsupported block buffer" +
" \"" + name + '"');
}
}
/**
* Base class for block factories.
*/
static abstract class BlockFactory implements Closeable {
private final S3AFileSystem owner;
protected BlockFactory(S3AFileSystem owner) {
this.owner = owner;
}
/**
* Create a block.
* @param limit limit of the block.
* @return a new block.
*/
abstract DataBlock create(int limit) throws IOException;
/**
* Implement any close/cleanup operation.
* Base class is a no-op
* @throws IOException -ideally, it shouldn't.
*/
@Override
public void close() throws IOException {
}
/**
* Owner.
*/
protected S3AFileSystem getOwner() {
return owner;
}
}
/**
* This represents a block being uploaded.
*/
static abstract class DataBlock implements Closeable {
enum DestState {Writing, Upload, Closed}
private volatile DestState state = Writing;
/**
* Atomically enter a state, verifying current state.
* @param current current state. null means "no check"
* @param next next state
* @throws IllegalStateException if the current state is not as expected
*/
protected synchronized final void enterState(DestState current,
DestState next)
throws IllegalStateException {
verifyState(current);
LOG.debug("{}: entering state {}", this, next);
state = next;
}
/**
* Verify that the block is in the declared state.
* @param expected expected state.
* @throws IllegalStateException if the DataBlock is in the wrong state
*/
protected final void verifyState(DestState expected)
throws IllegalStateException {
if (expected != null && state != expected) {
throw new IllegalStateException("Expected stream state " + expected
+ " -but actual state is " + state + " in " + this);
}
}
/**
* Current state.
* @return the current state.
*/
final DestState getState() {
return state;
}
/**
* Return the current data size.
* @return the size of the data
*/
abstract int dataSize();
/**
* Predicate to verify that the block has the capacity to write
* the given set of bytes.
* @param bytes number of bytes desired to be written.
* @return true if there is enough space.
*/
abstract boolean hasCapacity(long bytes);
/**
* Predicate to check if there is data in the block.
* @return true if there is
*/
boolean hasData() {
return dataSize() > 0;
}
/**
* The remaining capacity in the block before it is full.
* @return the number of bytes remaining.
*/
abstract int remainingCapacity();
/**
* Write a series of bytes from the buffer, from the offset.
* Returns the number of bytes written.
* Only valid in the state {@code Writing}.
* Base class verifies the state but does no writing.
* @param buffer buffer
* @param offset offset
* @param length length of write
* @return number of bytes written
* @throws IOException trouble
*/
int write(byte[] buffer, int offset, int length) throws IOException {
verifyState(Writing);
Preconditions.checkArgument(buffer != null, "Null buffer");
Preconditions.checkArgument(length >= 0, "length is negative");
Preconditions.checkArgument(offset >= 0, "offset is negative");
Preconditions.checkArgument(
!(buffer.length - offset < length),
"buffer shorter than amount of data to write");
return 0;
}
/**
* Flush the output.
* Only valid in the state {@code Writing}.
* In the base class, this is a no-op
* @throws IOException any IO problem.
*/
void flush() throws IOException {
verifyState(Writing);
}
/**
* Switch to the upload state and return a stream for uploading.
* Base class calls {@link #enterState(DestState, DestState)} to
* manage the state machine.
* @return the stream
* @throws IOException trouble
*/
InputStream startUpload() throws IOException {
LOG.debug("Start datablock upload");
enterState(Writing, Upload);
return null;
}
/**
* Enter the closed state.
* @return true if the class was in any other state, implying that
* the subclass should do its close operations
*/
protected synchronized boolean enterClosedState() {
if (!state.equals(Closed)) {
enterState(null, Closed);
return true;
} else {
return false;
}
}
@Override
public void close() throws IOException {
if (enterClosedState()) {
LOG.debug("Closed {}", this);
innerClose();
}
}
/**
* Inner close logic for subclasses to implement.
*/
protected void innerClose() throws IOException {
}
}
// ====================================================================
/**
* Use byte arrays on the heap for storage.
*/
static class ArrayBlockFactory extends BlockFactory {
ArrayBlockFactory(S3AFileSystem owner) {
super(owner);
}
@Override
DataBlock create(int limit) throws IOException {
return new ByteArrayBlock(limit);
}
}
/**
* Stream to memory via a {@code ByteArrayOutputStream}.
*
* This was taken from {@code S3AFastOutputStream} and has the
* same problem which surfaced there: it can consume a lot of heap space
* proportional to the mismatch between writes to the stream and
* the JVM-wide upload bandwidth to the S3 endpoint.
* The memory consumption can be limited by tuning the filesystem settings
* to restrict the number of queued/active uploads.
*/
static class ByteArrayBlock extends DataBlock {
private ByteArrayOutputStream buffer;
private final int limit;
// cache data size so that it is consistent after the buffer is reset.
private Integer dataSize;
ByteArrayBlock(int limit) {
this.limit = limit;
buffer = new ByteArrayOutputStream();
}
/**
* Get the amount of data; if there is no buffer then the size is 0.
* @return the amount of data available to upload.
*/
@Override
int dataSize() {
return dataSize != null ? dataSize : buffer.size();
}
@Override
InputStream startUpload() throws IOException {
super.startUpload();
dataSize = buffer.size();
ByteArrayInputStream bufferData = new ByteArrayInputStream(
buffer.toByteArray());
buffer = null;
return bufferData;
}
@Override
boolean hasCapacity(long bytes) {
return dataSize() + bytes <= limit;
}
@Override
int remainingCapacity() {
return limit - dataSize();
}
@Override
int write(byte[] b, int offset, int len) throws IOException {
super.write(b, offset, len);
int written = Math.min(remainingCapacity(), len);
buffer.write(b, offset, written);
return written;
}
@Override
protected void innerClose() {
buffer = null;
}
@Override
public String toString() {
return "ByteArrayBlock{" +
"state=" + getState() +
", limit=" + limit +
", dataSize=" + dataSize +
'}';
}
}
// ====================================================================
/**
* Stream via Direct ByteBuffers; these are allocated off heap
* via {@link DirectBufferPool}.
* This is actually the most complex of all the block factories,
* due to the need to explicitly recycle buffers; in comparison, the
* {@link DiskBlock} buffer delegates the work of deleting files to
* the {@link DiskBlock.FileDeletingInputStream}. Here the
* input stream {@link ByteBufferInputStream} has a similar task, along
* with the foundational work of streaming data from a byte array.
*/
static class ByteBufferBlockFactory extends BlockFactory {
private final DirectBufferPool bufferPool = new DirectBufferPool();
private final AtomicInteger buffersOutstanding = new AtomicInteger(0);
ByteBufferBlockFactory(S3AFileSystem owner) {
super(owner);
}
@Override
ByteBufferBlock create(int limit) throws IOException {
return new ByteBufferBlock(limit);
}
private ByteBuffer requestBuffer(int limit) {
LOG.debug("Requesting buffer of size {}", limit);
buffersOutstanding.incrementAndGet();
return bufferPool.getBuffer(limit);
}
private void releaseBuffer(ByteBuffer buffer) {
LOG.debug("Releasing buffer");
bufferPool.returnBuffer(buffer);
buffersOutstanding.decrementAndGet();
}
/**
* Get count of outstanding buffers.
* @return the current buffer count
*/
public int getOutstandingBufferCount() {
return buffersOutstanding.get();
}
@Override
public String toString() {
return "ByteBufferBlockFactory{"
+ "buffersOutstanding=" + buffersOutstanding +
'}';
}
/**
* A DataBlock which requests a buffer from pool on creation; returns
* it when the output stream is closed.
*/
class ByteBufferBlock extends DataBlock {
private ByteBuffer buffer;
private final int bufferSize;
// cache data size so that it is consistent after the buffer is reset.
private Integer dataSize;
/**
* Instantiate. This will request a ByteBuffer of the desired size.
* @param bufferSize buffer size
*/
ByteBufferBlock(int bufferSize) {
this.bufferSize = bufferSize;
buffer = requestBuffer(bufferSize);
}
/**
* Get the amount of data; if there is no buffer then the size is 0.
* @return the amount of data available to upload.
*/
@Override
int dataSize() {
return dataSize != null ? dataSize : bufferCapacityUsed();
}
@Override
ByteBufferInputStream startUpload() throws IOException {
super.startUpload();
dataSize = bufferCapacityUsed();
// set the buffer up for reading from the beginning
buffer.limit(buffer.position());
buffer.position(0);
return new ByteBufferInputStream(dataSize, buffer);
}
@Override
public boolean hasCapacity(long bytes) {
return bytes <= remainingCapacity();
}
@Override
public int remainingCapacity() {
return buffer != null ? buffer.remaining() : 0;
}
private int bufferCapacityUsed() {
return buffer.capacity() - buffer.remaining();
}
@Override
int write(byte[] b, int offset, int len) throws IOException {
super.write(b, offset, len);
int written = Math.min(remainingCapacity(), len);
buffer.put(b, offset, written);
return written;
}
@Override
protected void innerClose() {
buffer = null;
}
@Override
public String toString() {
return "ByteBufferBlock{"
+ "state=" + getState() +
", dataSize=" + dataSize() +
", limit=" + bufferSize +
", remainingCapacity=" + remainingCapacity() +
'}';
}
}
/**
* Provide an input stream from a byte buffer; supporting
* {@link #mark(int)}, which is required to enable replay of failed
* PUT attempts.
* This input stream returns the buffer to the pool afterwards.
*/
class ByteBufferInputStream extends InputStream {
private final int size;
private ByteBuffer byteBuffer;
ByteBufferInputStream(int size, ByteBuffer byteBuffer) {
LOG.debug("Creating ByteBufferInputStream of size {}", size);
this.size = size;
this.byteBuffer = byteBuffer;
}
/**
* Return the buffer to the pool after the stream is closed.
*/
@Override
public synchronized void close() {
if (byteBuffer != null) {
LOG.debug("releasing buffer");
releaseBuffer(byteBuffer);
byteBuffer = null;
}
}
/**
* Verify that the stream is open.
* @throws IOException if the stream is closed
*/
private void verifyOpen() throws IOException {
if (byteBuffer == null) {
throw new IOException(FSExceptionMessages.STREAM_IS_CLOSED);
}
}
public synchronized int read() throws IOException {
if (available() > 0) {
return byteBuffer.get() & 0xFF;
} else {
return -1;
}
}
@Override
public synchronized long skip(long offset) throws IOException {
verifyOpen();
long newPos = position() + offset;
if (newPos < 0) {
throw new EOFException(FSExceptionMessages.NEGATIVE_SEEK);
}
if (newPos > size) {
throw new EOFException(FSExceptionMessages.CANNOT_SEEK_PAST_EOF);
}
byteBuffer.position((int) newPos);
return newPos;
}
@Override
public synchronized int available() {
Preconditions.checkState(byteBuffer != null,
FSExceptionMessages.STREAM_IS_CLOSED);
return byteBuffer.remaining();
}
/**
* Get the current buffer position.
* @return the buffer position
*/
public synchronized int position() {
return byteBuffer.position();
}
/**
* Check if there is data left.
* @return true if there is data remaining in the buffer.
*/
public synchronized boolean hasRemaining() {
return byteBuffer.hasRemaining();
}
@Override
public synchronized void mark(int readlimit) {
LOG.debug("mark at {}", position());
byteBuffer.mark();
}
@Override
public synchronized void reset() throws IOException {
LOG.debug("reset");
byteBuffer.reset();
}
@Override
public boolean markSupported() {
return true;
}
/**
* Read in data.
* @param buffer destination buffer
* @param offset offset within the buffer
* @param length length of bytes to read
* @throws EOFException if the position is negative
* @throws IndexOutOfBoundsException if there isn't space for the
* amount of data requested.
* @throws IllegalArgumentException other arguments are invalid.
*/
@SuppressWarnings("NullableProblems")
public synchronized int read(byte[] buffer, int offset, int length)
throws IOException {
Preconditions.checkArgument(length >= 0, "length is negative");
Preconditions.checkArgument(buffer != null, "Null buffer");
if (buffer.length - offset < length) {
throw new IndexOutOfBoundsException(
FSExceptionMessages.TOO_MANY_BYTES_FOR_DEST_BUFFER
+ ": request length =" + length
+ ", with offset =" + offset
+ "; buffer capacity =" + (buffer.length - offset));
}
verifyOpen();
if (!hasRemaining()) {
return -1;
}
int toRead = Math.min(length, available());
byteBuffer.get(buffer, offset, toRead);
return toRead;
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder(
"ByteBufferInputStream{");
sb.append("size=").append(size);
ByteBuffer buffer = this.byteBuffer;
if (buffer != null) {
sb.append(", available=").append(buffer.remaining());
}
sb.append('}');
return sb.toString();
}
}
}
// ====================================================================
/**
* Buffer blocks to disk.
*/
static class DiskBlockFactory extends BlockFactory {
DiskBlockFactory(S3AFileSystem owner) {
super(owner);
}
/**
* Create a temp file and a block which writes to it.
* @param limit limit of the block.
* @return the new block
* @throws IOException IO problems
*/
@Override
DataBlock create(int limit) throws IOException {
File destFile = getOwner()
.createTmpFileForWrite("s3ablock", limit, getOwner().getConf());
return new DiskBlock(destFile, limit);
}
}
/**
* Stream to a file.
* This will stop at the limit; the caller is expected to create a new block.
*/
static class DiskBlock extends DataBlock {
private int bytesWritten;
private final File bufferFile;
private final int limit;
private BufferedOutputStream out;
private InputStream uploadStream;
DiskBlock(File bufferFile, int limit)
throws FileNotFoundException {
this.limit = limit;
this.bufferFile = bufferFile;
out = new BufferedOutputStream(new FileOutputStream(bufferFile));
}
@Override
int dataSize() {
return bytesWritten;
}
@Override
boolean hasCapacity(long bytes) {
return dataSize() + bytes <= limit;
}
@Override
int remainingCapacity() {
return limit - bytesWritten;
}
@Override
int write(byte[] b, int offset, int len) throws IOException {
super.write(b, offset, len);
int written = Math.min(remainingCapacity(), len);
out.write(b, offset, written);
bytesWritten += written;
return written;
}
@Override
InputStream startUpload() throws IOException {
super.startUpload();
try {
out.flush();
} finally {
out.close();
out = null;
}
uploadStream = new FileInputStream(bufferFile);
return new FileDeletingInputStream(uploadStream);
}
/**
* The close operation will delete the destination file if it still
* exists.
* @throws IOException IO problems
*/
@Override
protected void innerClose() throws IOException {
final DestState state = getState();
LOG.debug("Closing {}", this);
switch (state) {
case Writing:
if (bufferFile.exists()) {
// file was not uploaded
LOG.debug("Deleting buffer file as upload did not start");
boolean deleted = bufferFile.delete();
if (!deleted && bufferFile.exists()) {
LOG.warn("Failed to delete buffer file {}", bufferFile);
}
}
break;
case Upload:
LOG.debug("Buffer file {} exists —close upload stream", bufferFile);
break;
case Closed:
// no-op
break;
default:
// this state can never be reached, but checkstyle complains, so
// it is here.
}
}
/**
* Flush operation will flush to disk.
* @throws IOException IOE raised on FileOutputStream
*/
@Override
void flush() throws IOException {
super.flush();
out.flush();
}
@Override
public String toString() {
String sb = "FileBlock{"
+ "destFile=" + bufferFile +
", state=" + getState() +
", dataSize=" + dataSize() +
", limit=" + limit +
'}';
return sb;
}
/**
* An input stream which deletes the buffer file when closed.
*/
private final class FileDeletingInputStream extends FilterInputStream {
private final AtomicBoolean closed = new AtomicBoolean(false);
FileDeletingInputStream(InputStream source) {
super(source);
}
/**
* Delete the input file when closed.
* @throws IOException IO problem
*/
@Override
public void close() throws IOException {
try {
super.close();
} finally {
if (!closed.getAndSet(true)) {
if (!bufferFile.delete()) {
LOG.warn("delete({}) returned false",
bufferFile.getAbsoluteFile());
}
}
}
}
}
}
}
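
The factories and blocks above form a small per-block state machine (Writing, Upload, Closed). Below is a minimal sketch, not part of the patch, of how a caller in the same package might drive one block through that cycle; it assumes the base DataBlock class exposes a close() entry point behind the innerClose() overrides shown above, and the 1 MB block limit and class name are illustrative.

package org.apache.hadoop.fs.s3a;

import java.io.IOException;
import java.io.InputStream;

/** Hypothetical illustration only; not part of the patch. */
final class DataBlockUsageSketch {

  private DataBlockUsageSketch() {
  }

  /**
   * Buffer data into a single block, then switch it to the upload phase.
   * @param factory one of the disk/array/bytebuffer factories
   * @param data bytes to buffer; assumed here to fit in one block
   */
  static void bufferAndUpload(S3ADataBlocks.BlockFactory factory, byte[] data)
      throws IOException {
    S3ADataBlocks.DataBlock block = factory.create(1024 * 1024);
    int offset = 0;
    while (offset < data.length && block.remainingCapacity() > 0) {
      // write() takes at most remainingCapacity() bytes and returns the
      // count actually accepted; a real stream rolls over to a new block
      // from the factory whenever the count is short.
      offset += block.write(data, offset, data.length - offset);
    }
    // Switch the block from Writing to Upload. The returned stream supports
    // mark()/reset() for PUT retries (memory buffers), or deletes its
    // temporary file when closed (disk buffers).
    InputStream uploadData = block.startUpload();
    try {
      // ... hand uploadData to a PutObjectRequest / UploadPartRequest ...
    } finally {
      uploadData.close();
      block.close();  // assumed public close() delegating to innerClose()
    }
  }
}

The real consumer is the new S3ABlockOutputStream, which asks the factory for a fresh block whenever write() reports a short count and hands the upload stream to a PUT or multipart part upload.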

View File

@ -1,410 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a;
import com.amazonaws.AmazonClientException;
import com.amazonaws.event.ProgressEvent;
import com.amazonaws.event.ProgressListener;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.AbortMultipartUploadRequest;
import com.amazonaws.services.s3.model.CannedAccessControlList;
import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest;
import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PartETag;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.amazonaws.services.s3.model.PutObjectResult;
import com.amazonaws.services.s3.model.UploadPartRequest;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import static org.apache.hadoop.fs.s3a.S3AUtils.*;
import static org.apache.hadoop.fs.s3a.Statistic.*;
/**
* Upload files/parts asap directly from a memory buffer (instead of buffering
* to a file).
* <p>
* Uploads are managed low-level rather than through the AWS TransferManager.
* This allows for uploading each part of a multi-part upload as soon as
* the bytes are in memory, rather than waiting until the file is closed.
* <p>
* Unstable: statistics and error handling might evolve
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class S3AFastOutputStream extends OutputStream {
private static final Logger LOG = S3AFileSystem.LOG;
private final String key;
private final String bucket;
private final AmazonS3 client;
private final int partSize;
private final int multiPartThreshold;
private final S3AFileSystem fs;
private final CannedAccessControlList cannedACL;
private final ProgressListener progressListener;
private final ListeningExecutorService executorService;
private MultiPartUpload multiPartUpload;
private boolean closed;
private ByteArrayOutputStream buffer;
private int bufferLimit;
/**
* Creates a fast OutputStream that uploads to S3 from memory.
* For MultiPartUploads, as soon as sufficient bytes have been written to
* the stream a part is uploaded immediately (by using the low-level
* multi-part upload API on the AmazonS3Client).
*
* @param client AmazonS3Client used for S3 calls
* @param fs S3AFilesystem
* @param bucket S3 bucket name
* @param key S3 key name
* @param progress report progress in order to prevent timeouts
* @param cannedACL used CannedAccessControlList
* @param partSize size of a single part in a multi-part upload (except
* last part)
* @param multiPartThreshold files at least this size use multi-part upload
* @param threadPoolExecutor thread factory
* @throws IOException on any problem
*/
public S3AFastOutputStream(AmazonS3 client,
S3AFileSystem fs,
String bucket,
String key,
Progressable progress,
CannedAccessControlList cannedACL,
long partSize,
long multiPartThreshold,
ExecutorService threadPoolExecutor)
throws IOException {
this.bucket = bucket;
this.key = key;
this.client = client;
this.fs = fs;
this.cannedACL = cannedACL;
//Ensure limit as ByteArrayOutputStream size cannot exceed Integer.MAX_VALUE
if (partSize > Integer.MAX_VALUE) {
this.partSize = Integer.MAX_VALUE;
LOG.warn("s3a: MULTIPART_SIZE capped to ~2.14GB (maximum allowed size " +
"when using 'FAST_UPLOAD = true')");
} else {
this.partSize = (int) partSize;
}
if (multiPartThreshold > Integer.MAX_VALUE) {
this.multiPartThreshold = Integer.MAX_VALUE;
LOG.warn("s3a: MIN_MULTIPART_THRESHOLD capped to ~2.14GB (maximum " +
"allowed size when using 'FAST_UPLOAD = true')");
} else {
this.multiPartThreshold = (int) multiPartThreshold;
}
this.bufferLimit = this.multiPartThreshold;
this.closed = false;
int initialBufferSize = this.fs.getConf()
.getInt(Constants.FAST_BUFFER_SIZE, Constants.DEFAULT_FAST_BUFFER_SIZE);
if (initialBufferSize < 0) {
LOG.warn("s3a: FAST_BUFFER_SIZE should be a positive number. Using " +
"default value");
initialBufferSize = Constants.DEFAULT_FAST_BUFFER_SIZE;
} else if (initialBufferSize > this.bufferLimit) {
LOG.warn("s3a: automatically adjusting FAST_BUFFER_SIZE to not " +
"exceed MIN_MULTIPART_THRESHOLD");
initialBufferSize = this.bufferLimit;
}
this.buffer = new ByteArrayOutputStream(initialBufferSize);
this.executorService = MoreExecutors.listeningDecorator(threadPoolExecutor);
this.multiPartUpload = null;
this.progressListener = new ProgressableListener(progress);
LOG.debug("Initialized S3AFastOutputStream for bucket '{}' key '{}'",
bucket, key);
}
/**
* Writes a byte to the memory buffer. If this causes the buffer to reach
* its limit, the actual upload is submitted to the threadpool.
* @param b the int of which the lowest byte is written
* @throws IOException on any problem
*/
@Override
public synchronized void write(int b) throws IOException {
buffer.write(b);
if (buffer.size() == bufferLimit) {
uploadBuffer();
}
}
/**
* Writes a range of bytes to the memory buffer. If this causes the
* buffer to reach its limit, the actual upload is submitted to the
* threadpool and the remainder of the array is written to memory
* (recursively).
* @param b byte array containing the data to write
* @param off offset in array where to start
* @param len number of bytes to be written
* @throws IOException on any problem
*/
@Override
public synchronized void write(byte[] b, int off, int len)
throws IOException {
if (b == null) {
throw new NullPointerException();
} else if ((off < 0) || (off > b.length) || (len < 0) ||
((off + len) > b.length) || ((off + len) < 0)) {
throw new IndexOutOfBoundsException();
} else if (len == 0) {
return;
}
if (buffer.size() + len < bufferLimit) {
buffer.write(b, off, len);
} else {
int firstPart = bufferLimit - buffer.size();
buffer.write(b, off, firstPart);
uploadBuffer();
this.write(b, off + firstPart, len - firstPart);
}
}
private synchronized void uploadBuffer() throws IOException {
if (multiPartUpload == null) {
multiPartUpload = initiateMultiPartUpload();
/* Upload the existing buffer if it exceeds partSize. This possibly
requires multiple parts! */
final byte[] allBytes = buffer.toByteArray();
buffer = null; //earlier gc?
LOG.debug("Total length of initial buffer: {}", allBytes.length);
int processedPos = 0;
while ((multiPartThreshold - processedPos) >= partSize) {
LOG.debug("Initial buffer: processing from byte {} to byte {}",
processedPos, (processedPos + partSize - 1));
multiPartUpload.uploadPartAsync(new ByteArrayInputStream(allBytes,
processedPos, partSize), partSize);
processedPos += partSize;
}
//resize and reset stream
bufferLimit = partSize;
buffer = new ByteArrayOutputStream(bufferLimit);
buffer.write(allBytes, processedPos, multiPartThreshold - processedPos);
} else {
//upload next part
multiPartUpload.uploadPartAsync(new ByteArrayInputStream(buffer
.toByteArray()), partSize);
buffer.reset();
}
}
/**
* Close the stream. This will not return until the upload is complete
* or the attempt to perform the upload has failed.
* Exceptions raised in this method are indicative that the write has
* failed and data is at risk of being lost.
* @throws IOException on any failure.
*/
@Override
public synchronized void close() throws IOException {
if (closed) {
return;
}
closed = true;
try {
if (multiPartUpload == null) {
putObject();
} else {
int size = buffer.size();
if (size > 0) {
fs.incrementPutStartStatistics(size);
//send last part
multiPartUpload.uploadPartAsync(new ByteArrayInputStream(buffer
.toByteArray()), size);
}
final List<PartETag> partETags = multiPartUpload
.waitForAllPartUploads();
multiPartUpload.complete(partETags);
}
// This will delete unnecessary fake parent directories
fs.finishedWrite(key);
LOG.debug("Upload complete for bucket '{}' key '{}'", bucket, key);
} finally {
buffer = null;
super.close();
}
}
/**
* Create the default metadata for a multipart upload operation.
* @return the metadata to use/extend.
*/
private ObjectMetadata createDefaultMetadata() {
return fs.newObjectMetadata();
}
private MultiPartUpload initiateMultiPartUpload() throws IOException {
final InitiateMultipartUploadRequest initiateMPURequest =
new InitiateMultipartUploadRequest(bucket,
key,
createDefaultMetadata());
initiateMPURequest.setCannedACL(cannedACL);
try {
return new MultiPartUpload(
client.initiateMultipartUpload(initiateMPURequest).getUploadId());
} catch (AmazonClientException ace) {
throw translateException("initiate MultiPartUpload", key, ace);
}
}
private void putObject() throws IOException {
LOG.debug("Executing regular upload for bucket '{}' key '{}'",
bucket, key);
final ObjectMetadata om = createDefaultMetadata();
final int size = buffer.size();
om.setContentLength(size);
final PutObjectRequest putObjectRequest =
fs.newPutObjectRequest(key,
om,
new ByteArrayInputStream(buffer.toByteArray()));
putObjectRequest.setGeneralProgressListener(progressListener);
ListenableFuture<PutObjectResult> putObjectResult =
executorService.submit(new Callable<PutObjectResult>() {
@Override
public PutObjectResult call() throws Exception {
fs.incrementPutStartStatistics(size);
return client.putObject(putObjectRequest);
}
});
//wait for completion
try {
putObjectResult.get();
} catch (InterruptedException ie) {
LOG.warn("Interrupted object upload: {}", ie, ie);
Thread.currentThread().interrupt();
} catch (ExecutionException ee) {
throw extractException("regular upload", key, ee);
}
}
private class MultiPartUpload {
private final String uploadId;
private final List<ListenableFuture<PartETag>> partETagsFutures;
public MultiPartUpload(String uploadId) {
this.uploadId = uploadId;
this.partETagsFutures = new ArrayList<ListenableFuture<PartETag>>();
LOG.debug("Initiated multi-part upload for bucket '{}' key '{}' with " +
"id '{}'", bucket, key, uploadId);
}
private void uploadPartAsync(ByteArrayInputStream inputStream,
int partSize) {
final int currentPartNumber = partETagsFutures.size() + 1;
final UploadPartRequest request =
new UploadPartRequest().withBucketName(bucket).withKey(key)
.withUploadId(uploadId).withInputStream(inputStream)
.withPartNumber(currentPartNumber).withPartSize(partSize);
request.setGeneralProgressListener(progressListener);
ListenableFuture<PartETag> partETagFuture =
executorService.submit(new Callable<PartETag>() {
@Override
public PartETag call() throws Exception {
LOG.debug("Uploading part {} for id '{}'", currentPartNumber,
uploadId);
return fs.uploadPart(request).getPartETag();
}
});
partETagsFutures.add(partETagFuture);
}
private List<PartETag> waitForAllPartUploads() throws IOException {
try {
return Futures.allAsList(partETagsFutures).get();
} catch (InterruptedException ie) {
LOG.warn("Interrupted partUpload: {}", ie, ie);
Thread.currentThread().interrupt();
return null;
} catch (ExecutionException ee) {
//there is no way of recovering so abort
//cancel all partUploads
for (ListenableFuture<PartETag> future : partETagsFutures) {
future.cancel(true);
}
//abort multipartupload
this.abort();
throw extractException("Multi-part upload with id '" + uploadId + "'",
key, ee);
}
}
private void complete(List<PartETag> partETags) throws IOException {
try {
LOG.debug("Completing multi-part upload for key '{}', id '{}'",
key, uploadId);
client.completeMultipartUpload(
new CompleteMultipartUploadRequest(bucket,
key,
uploadId,
partETags));
} catch (AmazonClientException e) {
throw translateException("Completing multi-part upload", key, e);
}
}
public void abort() {
LOG.warn("Aborting multi-part upload with id '{}'", uploadId);
try {
fs.incrementStatistic(OBJECT_MULTIPART_UPLOAD_ABORTED);
client.abortMultipartUpload(new AbortMultipartUploadRequest(bucket,
key, uploadId));
} catch (Exception e2) {
LOG.warn("Unable to abort multipart upload, you may need to purge " +
"uploaded parts: {}", e2, e2);
}
}
}
private static class ProgressableListener implements ProgressListener {
private final Progressable progress;
public ProgressableListener(Progressable progress) {
this.progress = progress;
}
public void progressChanged(ProgressEvent progressEvent) {
if (progress != null) {
progress.progress();
}
}
}
}

View File

@ -1,4 +1,4 @@
/**
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@ -37,14 +37,20 @@
import com.amazonaws.AmazonClientException;
import com.amazonaws.AmazonServiceException;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.AbortMultipartUploadRequest;
import com.amazonaws.services.s3.model.AmazonS3Exception;
import com.amazonaws.services.s3.model.CannedAccessControlList;
import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest;
import com.amazonaws.services.s3.model.CompleteMultipartUploadResult;
import com.amazonaws.services.s3.model.CopyObjectRequest;
import com.amazonaws.services.s3.model.DeleteObjectsRequest;
import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PartETag;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.amazonaws.services.s3.model.CopyObjectRequest;
import com.amazonaws.services.s3.model.PutObjectResult;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.amazonaws.services.s3.model.UploadPartRequest;
import com.amazonaws.services.s3.model.UploadPartResult;
@ -55,6 +61,8 @@
import com.amazonaws.event.ProgressListener;
import com.amazonaws.event.ProgressEvent;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.ListeningExecutorService;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.classification.InterfaceAudience;
@ -68,6 +76,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.GlobalStorageStatistics;
import org.apache.hadoop.fs.InvalidRequestException;
import org.apache.hadoop.fs.LocalDirAllocator;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
@ -118,9 +127,12 @@ public class S3AFileSystem extends FileSystem {
private long partSize;
private boolean enableMultiObjectsDelete;
private TransferManager transfers;
private ExecutorService threadPoolExecutor;
private ListeningExecutorService threadPoolExecutor;
private long multiPartThreshold;
public static final Logger LOG = LoggerFactory.getLogger(S3AFileSystem.class);
private static final Logger PROGRESS =
LoggerFactory.getLogger("org.apache.hadoop.fs.s3a.S3AFileSystem.Progress");
private LocalDirAllocator directoryAllocator;
private CannedAccessControlList cannedACL;
private String serverSideEncryptionAlgorithm;
private S3AInstrumentation instrumentation;
@ -131,6 +143,10 @@ public class S3AFileSystem extends FileSystem {
// The maximum number of entries that can be deleted in any call to s3
private static final int MAX_ENTRIES_TO_DELETE = 1000;
private boolean blockUploadEnabled;
private String blockOutputBuffer;
private S3ADataBlocks.BlockFactory blockFactory;
private int blockOutputActiveBlocks;
/** Called after a new FileSystem instance is constructed.
* @param name a uri whose authority section names the host, port, etc.
@ -157,18 +173,11 @@ public void initialize(URI name, Configuration conf) throws IOException {
maxKeys = intOption(conf, MAX_PAGING_KEYS, DEFAULT_MAX_PAGING_KEYS, 1);
listing = new Listing(this);
partSize = conf.getLong(MULTIPART_SIZE, DEFAULT_MULTIPART_SIZE);
if (partSize < 5 * 1024 * 1024) {
LOG.error(MULTIPART_SIZE + " must be at least 5 MB");
partSize = 5 * 1024 * 1024;
}
partSize = getMultipartSizeProperty(conf,
MULTIPART_SIZE, DEFAULT_MULTIPART_SIZE);
multiPartThreshold = getMultipartSizeProperty(conf,
MIN_MULTIPART_THRESHOLD, DEFAULT_MIN_MULTIPART_THRESHOLD);
multiPartThreshold = conf.getLong(MIN_MULTIPART_THRESHOLD,
DEFAULT_MIN_MULTIPART_THRESHOLD);
if (multiPartThreshold < 5 * 1024 * 1024) {
LOG.error(MIN_MULTIPART_THRESHOLD + " must be at least 5 MB");
multiPartThreshold = 5 * 1024 * 1024;
}
//check but do not store the block size
longOption(conf, FS_S3A_BLOCK_SIZE, DEFAULT_BLOCKSIZE, 1);
enableMultiObjectsDelete = conf.getBoolean(ENABLE_MULTI_DELETE, true);
@ -189,14 +198,14 @@ public StorageStatistics provide() {
LOG.warn(MAX_THREADS + " must be at least 2: forcing to 2.");
maxThreads = 2;
}
int totalTasks = conf.getInt(MAX_TOTAL_TASKS, DEFAULT_MAX_TOTAL_TASKS);
if (totalTasks < 1) {
LOG.warn(MAX_TOTAL_TASKS + "must be at least 1: forcing to 1.");
totalTasks = 1;
}
long keepAliveTime = conf.getLong(KEEPALIVE_TIME, DEFAULT_KEEPALIVE_TIME);
threadPoolExecutor = new BlockingThreadPoolExecutorService(maxThreads,
maxThreads + totalTasks, keepAliveTime, TimeUnit.SECONDS,
int totalTasks = intOption(conf,
MAX_TOTAL_TASKS, DEFAULT_MAX_TOTAL_TASKS, 1);
long keepAliveTime = longOption(conf, KEEPALIVE_TIME,
DEFAULT_KEEPALIVE_TIME, 0);
threadPoolExecutor = BlockingThreadPoolExecutorService.newInstance(
maxThreads,
maxThreads + totalTasks,
keepAliveTime, TimeUnit.SECONDS,
"s3a-transfer-shared");
initTransferManager();
@ -209,8 +218,25 @@ public StorageStatistics provide() {
serverSideEncryptionAlgorithm =
conf.getTrimmed(SERVER_SIDE_ENCRYPTION_ALGORITHM);
LOG.debug("Using encryption {}", serverSideEncryptionAlgorithm);
inputPolicy = S3AInputPolicy.getPolicy(
conf.getTrimmed(INPUT_FADVISE, INPUT_FADV_NORMAL));
blockUploadEnabled = conf.getBoolean(FAST_UPLOAD, DEFAULT_FAST_UPLOAD);
if (blockUploadEnabled) {
blockOutputBuffer = conf.getTrimmed(FAST_UPLOAD_BUFFER,
DEFAULT_FAST_UPLOAD_BUFFER);
partSize = ensureOutputParameterInRange(MULTIPART_SIZE, partSize);
blockFactory = S3ADataBlocks.createFactory(this, blockOutputBuffer);
blockOutputActiveBlocks = intOption(conf,
FAST_UPLOAD_ACTIVE_BLOCKS, DEFAULT_FAST_UPLOAD_ACTIVE_BLOCKS, 1);
LOG.debug("Using S3ABlockOutputStream with buffer = {}; block={};" +
" queue limit={}",
blockOutputBuffer, partSize, blockOutputActiveBlocks);
} else {
LOG.debug("Using S3AOutputStream");
}
} catch (AmazonClientException e) {
throw translateException("initializing ", new Path(name), e);
}
@ -336,6 +362,33 @@ public S3AInputPolicy getInputPolicy() {
return inputPolicy;
}
/**
* Demand create the directory allocator, then create a temporary file via
* {@link LocalDirAllocator#createTmpFileForWrite(String, long, Configuration)}.
* @param pathStr prefix for the temporary file
* @param size the size of the file that is going to be written
* @param conf the Configuration object
* @return a unique temporary file
* @throws IOException IO problems
*/
synchronized File createTmpFileForWrite(String pathStr, long size,
Configuration conf) throws IOException {
if (directoryAllocator == null) {
String bufferDir = conf.get(BUFFER_DIR) != null
? BUFFER_DIR : "hadoop.tmp.dir";
directoryAllocator = new LocalDirAllocator(bufferDir);
}
return directoryAllocator.createTmpFileForWrite(pathStr, size, conf);
}
/**
* Get the bucket of this filesystem.
* @return the bucket
*/
public String getBucket() {
return bucket;
}
/**
* Change the input policy for this FS.
* @param inputPolicy new policy
@ -460,6 +513,7 @@ public FSDataInputStream open(Path f, int bufferSize)
* @see #setPermission(Path, FsPermission)
*/
@Override
@SuppressWarnings("IOResourceOpenedButNotSafelyClosed")
public FSDataOutputStream create(Path f, FsPermission permission,
boolean overwrite, int bufferSize, short replication, long blockSize,
Progressable progress) throws IOException {
@ -484,28 +538,33 @@ public FSDataOutputStream create(Path f, FsPermission permission,
}
instrumentation.fileCreated();
if (getConf().getBoolean(FAST_UPLOAD, DEFAULT_FAST_UPLOAD)) {
return new FSDataOutputStream(
new S3AFastOutputStream(s3,
this,
bucket,
FSDataOutputStream output;
if (blockUploadEnabled) {
output = new FSDataOutputStream(
new S3ABlockOutputStream(this,
key,
new SemaphoredDelegatingExecutor(threadPoolExecutor,
blockOutputActiveBlocks, true),
progress,
cannedACL,
partSize,
multiPartThreshold,
threadPoolExecutor),
statistics);
blockFactory,
instrumentation.newOutputStreamStatistics(),
new WriteOperationHelper(key)
),
null);
} else {
// We pass null to FSDataOutputStream so it won't count writes that
// are being buffered to a file
output = new FSDataOutputStream(
new S3AOutputStream(getConf(),
this,
key,
progress
),
null);
}
// We pass null to FSDataOutputStream so it won't count writes that
// are being buffered to a file
return new FSDataOutputStream(
new S3AOutputStream(getConf(),
this,
key,
progress
),
null);
return output;
}
/**
@ -749,6 +808,33 @@ protected void incrementStatistic(Statistic statistic, long count) {
storageStatistics.incrementCounter(statistic, count);
}
/**
* Decrement a gauge by a specific value.
* @param statistic The operation to decrement
* @param count the count to decrement
*/
protected void decrementGauge(Statistic statistic, long count) {
instrumentation.decrementGauge(statistic, count);
}
/**
* Increment a gauge by a specific value.
* @param statistic The operation to increment
* @param count the count to increment
*/
protected void incrementGauge(Statistic statistic, long count) {
instrumentation.incrementGauge(statistic, count);
}
/**
* Get the storage statistics of this filesystem.
* @return the storage statistics
*/
@Override
public S3AStorageStatistics getStorageStatistics() {
return storageStatistics;
}
/**
* Request object metadata; increments counters in the process.
* @param key key
@ -896,7 +982,9 @@ public ObjectMetadata newObjectMetadata() {
*/
public ObjectMetadata newObjectMetadata(long length) {
final ObjectMetadata om = newObjectMetadata();
om.setContentLength(length);
if (length >= 0) {
om.setContentLength(length);
}
return om;
}
@ -918,7 +1006,41 @@ public Upload putObject(PutObjectRequest putObjectRequest) {
len = putObjectRequest.getMetadata().getContentLength();
}
incrementPutStartStatistics(len);
return transfers.upload(putObjectRequest);
try {
Upload upload = transfers.upload(putObjectRequest);
incrementPutCompletedStatistics(true, len);
return upload;
} catch (AmazonClientException e) {
incrementPutCompletedStatistics(false, len);
throw e;
}
}
/**
* PUT an object directly (i.e. not via the transfer manager).
* Byte length is calculated from the file length, or, if there is no
* file, from the content length of the header.
* @param putObjectRequest the request
* @return the upload initiated
* @throws AmazonClientException on problems
*/
public PutObjectResult putObjectDirect(PutObjectRequest putObjectRequest)
throws AmazonClientException {
long len;
if (putObjectRequest.getFile() != null) {
len = putObjectRequest.getFile().length();
} else {
len = putObjectRequest.getMetadata().getContentLength();
}
incrementPutStartStatistics(len);
try {
PutObjectResult result = s3.putObject(putObjectRequest);
incrementPutCompletedStatistics(true, len);
return result;
} catch (AmazonClientException e) {
incrementPutCompletedStatistics(false, len);
throw e;
}
}
/**
@ -926,10 +1048,20 @@ public Upload putObject(PutObjectRequest putObjectRequest) {
* Increments the write and put counters
* @param request request
* @return the result of the operation.
* @throws AmazonClientException on problems
*/
public UploadPartResult uploadPart(UploadPartRequest request) {
incrementPutStartStatistics(request.getPartSize());
return s3.uploadPart(request);
public UploadPartResult uploadPart(UploadPartRequest request)
throws AmazonClientException {
long len = request.getPartSize();
incrementPutStartStatistics(len);
try {
UploadPartResult uploadPartResult = s3.uploadPart(request);
incrementPutCompletedStatistics(true, len);
return uploadPartResult;
} catch (AmazonClientException e) {
incrementPutCompletedStatistics(false, len);
throw e;
}
}
/**
@ -942,9 +1074,28 @@ public void incrementPutStartStatistics(long bytes) {
LOG.debug("PUT start {} bytes", bytes);
incrementWriteOperations();
incrementStatistic(OBJECT_PUT_REQUESTS);
incrementGauge(OBJECT_PUT_REQUESTS_ACTIVE, 1);
if (bytes > 0) {
incrementGauge(OBJECT_PUT_BYTES_PENDING, bytes);
}
}
/**
* At the end of a put/multipart upload operation, update the
* relevant counters and gauges.
*
* @param success did the operation succeed?
* @param bytes bytes in the request.
*/
public void incrementPutCompletedStatistics(boolean success, long bytes) {
LOG.debug("PUT completed success={}; {} bytes", success, bytes);
incrementWriteOperations();
if (bytes > 0) {
incrementStatistic(OBJECT_PUT_BYTES, bytes);
decrementGauge(OBJECT_PUT_BYTES_PENDING, bytes);
}
incrementStatistic(OBJECT_PUT_REQUESTS_COMPLETED);
decrementGauge(OBJECT_PUT_REQUESTS_ACTIVE, 1);
}
/**
@ -955,7 +1106,7 @@ public void incrementPutStartStatistics(long bytes) {
* @param bytes bytes successfully uploaded.
*/
public void incrementPutProgressStatistics(String key, long bytes) {
LOG.debug("PUT {}: {} bytes", key, bytes);
PROGRESS.debug("PUT {}: {} bytes", key, bytes);
incrementWriteOperations();
if (bytes > 0) {
statistics.incrementBytesWritten(bytes);
@ -1475,7 +1626,7 @@ private void innerCopyFromLocalFile(boolean delSrc, boolean overwrite,
LocalFileSystem local = getLocal(getConf());
File srcfile = local.pathToFile(src);
final ObjectMetadata om = newObjectMetadata();
final ObjectMetadata om = newObjectMetadata(srcfile.length());
PutObjectRequest putObjectRequest = newPutObjectRequest(key, om, srcfile);
Upload up = putObject(putObjectRequest);
ProgressableProgressListener listener = new ProgressableProgressListener(
@ -1743,6 +1894,10 @@ public String toString() {
.append(serverSideEncryptionAlgorithm)
.append('\'');
}
if (blockFactory != null) {
sb.append(", blockFactory=").append(blockFactory);
}
sb.append(", executor=").append(threadPoolExecutor);
sb.append(", statistics {")
.append(statistics)
.append("}");
@ -1950,4 +2105,163 @@ LocatedFileStatus toLocatedFileStatus(FileStatus status)
getFileBlockLocations(status, 0, status.getLen())
: null);
}
/**
* Helper for an ongoing write operation.
* <p>
* It hides direct access to the S3 API from the output stream,
* and is a location where the object upload process can be evolved/enhanced.
* <p>
* Features
* <ul>
* <li>Methods to create and submit requests to S3, so avoiding
* all direct interaction with the AWS APIs.</li>
* <li>Some extra preflight checks of arguments, so failing fast on
* errors.</li>
* <li>Callbacks to let the FS know of events in the output stream
* upload process.</li>
* </ul>
*
* Each instance of this helper is unique to a single output stream.
*/
final class WriteOperationHelper {
private final String key;
private WriteOperationHelper(String key) {
this.key = key;
}
/**
* Create a {@link PutObjectRequest} request.
* The metadata is assumed to have been configured with the size of the
* operation.
* @param inputStream source data.
* @param length size, if known. Use -1 for not known
* @return the request
*/
PutObjectRequest newPutRequest(InputStream inputStream, long length) {
return newPutObjectRequest(key, newObjectMetadata(length), inputStream);
}
/**
* Callback on a successful write.
*/
void writeSuccessful() {
finishedWrite(key);
}
/**
* Callback on a write failure.
* @param e Any exception raised which triggered the failure.
*/
void writeFailed(Exception e) {
LOG.debug("Write to {} failed", this, e);
}
/**
* Create a new object metadata instance.
* Any standard metadata headers are added here, for example:
* encryption.
* @param length size, if known. Use -1 for not known
* @return a new metadata instance
*/
public ObjectMetadata newObjectMetadata(long length) {
return S3AFileSystem.this.newObjectMetadata(length);
}
/**
* Start the multipart upload process.
* @return the upload result containing the ID
* @throws IOException IO problem
*/
String initiateMultiPartUpload() throws IOException {
LOG.debug("Initiating Multipart upload");
final InitiateMultipartUploadRequest initiateMPURequest =
new InitiateMultipartUploadRequest(bucket,
key,
newObjectMetadata(-1));
initiateMPURequest.setCannedACL(cannedACL);
try {
return s3.initiateMultipartUpload(initiateMPURequest)
.getUploadId();
} catch (AmazonClientException ace) {
throw translateException("initiate MultiPartUpload", key, ace);
}
}
/**
* Complete a multipart upload operation.
* @param uploadId multipart operation Id
* @param partETags list of partial uploads
* @return the result
* @throws AmazonClientException on problems.
*/
CompleteMultipartUploadResult completeMultipartUpload(String uploadId,
List<PartETag> partETags) throws AmazonClientException {
Preconditions.checkNotNull(uploadId);
Preconditions.checkNotNull(partETags);
Preconditions.checkArgument(!partETags.isEmpty(),
"No partitions have been uploaded");
return s3.completeMultipartUpload(
new CompleteMultipartUploadRequest(bucket,
key,
uploadId,
partETags));
}
/**
* Abort a multipart upload operation.
* @param uploadId multipart operation Id
* @return the result
* @throws AmazonClientException on problems.
*/
void abortMultipartUpload(String uploadId) throws AmazonClientException {
s3.abortMultipartUpload(
new AbortMultipartUploadRequest(bucket, key, uploadId));
}
/**
* Create and initialize a part request of a multipart upload.
* @param uploadId ID of ongoing upload
* @param uploadStream source of data to upload
* @param partNumber current part number of the upload
* @param size amount of data
* @return the request.
*/
UploadPartRequest newUploadPartRequest(String uploadId,
InputStream uploadStream,
int partNumber,
int size) {
Preconditions.checkNotNull(uploadId);
Preconditions.checkNotNull(uploadStream);
Preconditions.checkArgument(size > 0, "Invalid partition size %s", size);
Preconditions.checkArgument(partNumber > 0 && partNumber <= 10000,
"partNumber must be between 1 and 10000 inclusive, but is %s",
partNumber);
LOG.debug("Creating part upload request for {} #{} size {}",
uploadId, partNumber, size);
return new UploadPartRequest()
.withBucketName(bucket)
.withKey(key)
.withUploadId(uploadId)
.withInputStream(uploadStream)
.withPartNumber(partNumber)
.withPartSize(size);
}
/**
* The toString method is intended to be used in logging/toString calls.
* @return a string description.
*/
@Override
public String toString() {
final StringBuilder sb = new StringBuilder(
"{bucket=").append(bucket);
sb.append(", key='").append(key).append('\'');
sb.append('}');
return sb.toString();
}
}
}
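
The WriteOperationHelper is only constructed by the filesystem itself (see the create() path earlier in this diff), so the sketch below, which is not part of the patch, takes an existing helper and shows the multipart round trip it supports; the class name, part list, and part size are hypothetical.

package org.apache.hadoop.fs.s3a;

import com.amazonaws.AmazonClientException;
import com.amazonaws.services.s3.model.PartETag;
import com.amazonaws.services.s3.model.UploadPartRequest;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

/** Hypothetical illustration only; not part of the patch. */
final class MultipartFlowSketch {

  private final S3AFileSystem fs;
  private final S3AFileSystem.WriteOperationHelper writer;

  MultipartFlowSketch(S3AFileSystem fs,
      S3AFileSystem.WriteOperationHelper writer) {
    this.fs = fs;
    this.writer = writer;  // created by the FS for one destination key
  }

  /**
   * Upload a sequence of already-buffered parts as one multipart object.
   * @param parts one stream per part, each supplying partSize bytes
   * @param partSize bytes per part (at least the S3 5 MB minimum)
   */
  void upload(List<InputStream> parts, int partSize) throws IOException {
    String uploadId = writer.initiateMultiPartUpload();
    List<PartETag> partETags = new ArrayList<>(parts.size());
    try {
      int partNumber = 1;
      for (InputStream partData : parts) {
        UploadPartRequest request = writer.newUploadPartRequest(
            uploadId, partData, partNumber++, partSize);
        // uploadPart() updates the PUT statistics and returns the part ETag.
        partETags.add(fs.uploadPart(request).getPartETag());
      }
      writer.completeMultipartUpload(uploadId, partETags);
      writer.writeSuccessful();
    } catch (AmazonClientException e) {
      // Nothing to recover: abort so the parts do not linger in the bucket.
      writer.abortMultipartUpload(uploadId);
      writer.writeFailed(e);
      throw e;
    }
  }
}

Completion and abort map directly onto the CompleteMultipartUpload and AbortMultipartUpload requests wrapped by the helper above.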

View File

@ -18,7 +18,9 @@
package org.apache.hadoop.fs.s3a;
import com.google.common.base.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.metrics2.MetricStringBuilder;
@ -29,10 +31,12 @@
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
import org.apache.hadoop.metrics2.lib.MutableMetric;
import java.io.Closeable;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicLong;
import static org.apache.hadoop.fs.s3a.Statistic.*;
@ -50,6 +54,9 @@
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class S3AInstrumentation {
private static final Logger LOG = LoggerFactory.getLogger(
S3AInstrumentation.class);
public static final String CONTEXT = "S3AFileSystem";
private final MetricsRegistry registry =
new MetricsRegistry("S3AFileSystem").setContext(CONTEXT);
@ -100,7 +107,23 @@ public class S3AInstrumentation {
OBJECT_METADATA_REQUESTS,
OBJECT_MULTIPART_UPLOAD_ABORTED,
OBJECT_PUT_BYTES,
OBJECT_PUT_REQUESTS
OBJECT_PUT_REQUESTS,
OBJECT_PUT_REQUESTS_COMPLETED,
STREAM_WRITE_FAILURES,
STREAM_WRITE_BLOCK_UPLOADS,
STREAM_WRITE_BLOCK_UPLOADS_COMMITTED,
STREAM_WRITE_BLOCK_UPLOADS_ABORTED,
STREAM_WRITE_TOTAL_TIME,
STREAM_WRITE_TOTAL_DATA,
};
private static final Statistic[] GAUGES_TO_CREATE = {
OBJECT_PUT_REQUESTS_ACTIVE,
OBJECT_PUT_BYTES_PENDING,
STREAM_WRITE_BLOCK_UPLOADS_ACTIVE,
STREAM_WRITE_BLOCK_UPLOADS_PENDING,
STREAM_WRITE_BLOCK_UPLOADS_DATA_PENDING,
};
public S3AInstrumentation(URI name) {
@ -143,6 +166,9 @@ public S3AInstrumentation(URI name) {
for (Statistic statistic : COUNTERS_TO_CREATE) {
counter(statistic);
}
for (Statistic statistic : GAUGES_TO_CREATE) {
gauge(statistic.getSymbol(), statistic.getDescription());
}
}
/**
@ -254,13 +280,13 @@ public long getCounterValue(String name) {
* Lookup a counter by name. Return null if it is not known.
* @param name counter name
* @return the counter
* @throws IllegalStateException if the metric is not a counter
*/
private MutableCounterLong lookupCounter(String name) {
MutableMetric metric = lookupMetric(name);
if (metric == null) {
return null;
}
Preconditions.checkNotNull(metric, "not found: " + name);
if (!(metric instanceof MutableCounterLong)) {
throw new IllegalStateException("Metric " + name
+ " is not a MutableCounterLong: " + metric);
@ -268,6 +294,20 @@ private MutableCounterLong lookupCounter(String name) {
return (MutableCounterLong) metric;
}
/**
* Look up a gauge.
* @param name gauge name
* @return the gauge or null
* @throws ClassCastException if the metric is not a Gauge.
*/
public MutableGaugeLong lookupGauge(String name) {
MutableMetric metric = lookupMetric(name);
if (metric == null) {
LOG.debug("No gauge {}", name);
}
return (MutableGaugeLong) metric;
}
/**
* Look up a metric from both the registered set and the lighter weight
* stream entries.
@ -349,6 +389,47 @@ public void incrementCounter(Statistic op, long count) {
counter.incr(count);
}
}
/**
* Increment a specific counter.
* No-op if not defined.
* @param op operation
* @param count atomic long containing value
*/
public void incrementCounter(Statistic op, AtomicLong count) {
incrementCounter(op, count.get());
}
/**
* Increment a specific gauge.
* No-op if not defined.
* @param op operation
* @param count increment value
* @throws ClassCastException if the metric is of the wrong type
*/
public void incrementGauge(Statistic op, long count) {
MutableGaugeLong gauge = lookupGauge(op.getSymbol());
if (gauge != null) {
gauge.incr(count);
} else {
LOG.debug("No Gauge: "+ op);
}
}
/**
* Decrement a specific gauge.
* No-op if not defined.
* @param op operation
* @param count increment value
* @throws ClassCastException if the metric is of the wrong type
*/
public void decrementGauge(Statistic op, long count) {
MutableGaugeLong gauge = lookupGauge(op.getSymbol());
if (gauge != null) {
gauge.decr(count);
} else {
LOG.debug("No Gauge: " + op);
}
}
/**
* Create a stream input statistics instance.
@ -553,4 +634,165 @@ public String toString() {
return sb.toString();
}
}
/**
* Create a stream output statistics instance.
* @return the new instance
*/
OutputStreamStatistics newOutputStreamStatistics() {
return new OutputStreamStatistics();
}
/**
* Merge in the statistics of a single output stream into
* the filesystem-wide statistics.
* @param statistics stream statistics
*/
private void mergeOutputStreamStatistics(OutputStreamStatistics statistics) {
incrementCounter(STREAM_WRITE_TOTAL_TIME, statistics.totalUploadDuration());
incrementCounter(STREAM_WRITE_QUEUE_DURATION, statistics.queueDuration);
incrementCounter(STREAM_WRITE_TOTAL_DATA, statistics.bytesUploaded);
incrementCounter(STREAM_WRITE_BLOCK_UPLOADS,
statistics.blockUploadsCompleted);
}
/**
* Statistics updated by an output stream during its actual operation.
* Some of these stats may be relayed immediately. However, as a block upload
* spans multiple threads and queued operations, the remainder are only merged
* into the filesystem-wide statistics when the stream is closed.
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
public final class OutputStreamStatistics implements Closeable {
private final AtomicLong blocksSubmitted = new AtomicLong(0);
private final AtomicLong blocksInQueue = new AtomicLong(0);
private final AtomicLong blocksActive = new AtomicLong(0);
private final AtomicLong blockUploadsCompleted = new AtomicLong(0);
private final AtomicLong blockUploadsFailed = new AtomicLong(0);
private final AtomicLong bytesPendingUpload = new AtomicLong(0);
private final AtomicLong bytesUploaded = new AtomicLong(0);
private final AtomicLong transferDuration = new AtomicLong(0);
private final AtomicLong queueDuration = new AtomicLong(0);
private final AtomicLong exceptionsInMultipartFinalize = new AtomicLong(0);
/**
* Block is queued for upload.
*/
void blockUploadQueued(int blockSize) {
blocksSubmitted.incrementAndGet();
blocksInQueue.incrementAndGet();
bytesPendingUpload.addAndGet(blockSize);
incrementGauge(STREAM_WRITE_BLOCK_UPLOADS_PENDING, 1);
incrementGauge(STREAM_WRITE_BLOCK_UPLOADS_DATA_PENDING, blockSize);
}
/** Queued block has been scheduled for upload. */
void blockUploadStarted(long duration, int blockSize) {
queueDuration.addAndGet(duration);
blocksInQueue.decrementAndGet();
blocksActive.incrementAndGet();
incrementGauge(STREAM_WRITE_BLOCK_UPLOADS_PENDING, -1);
incrementGauge(STREAM_WRITE_BLOCK_UPLOADS_ACTIVE, 1);
}
/** A block upload has completed. */
void blockUploadCompleted(long duration, int blockSize) {
this.transferDuration.addAndGet(duration);
incrementGauge(STREAM_WRITE_BLOCK_UPLOADS_ACTIVE, -1);
blocksActive.decrementAndGet();
blockUploadsCompleted.incrementAndGet();
}
/**
* A block upload has failed.
* A final transfer completed event is still expected, so this
* does not decrement the active block counter.
*/
void blockUploadFailed(long duration, int blockSize) {
blockUploadsFailed.incrementAndGet();
}
/** Intermediate report of bytes uploaded. */
void bytesTransferred(long byteCount) {
bytesUploaded.addAndGet(byteCount);
bytesPendingUpload.addAndGet(-byteCount);
incrementGauge(STREAM_WRITE_BLOCK_UPLOADS_DATA_PENDING, -byteCount);
}
/**
* Note an exception in a multipart complete.
*/
void exceptionInMultipartComplete() {
exceptionsInMultipartFinalize.incrementAndGet();
}
/**
* Note an exception in a multipart abort.
*/
void exceptionInMultipartAbort() {
exceptionsInMultipartFinalize.incrementAndGet();
}
/**
* Get the number of bytes pending upload.
* @return the number of bytes in the pending upload state.
*/
public long getBytesPendingUpload() {
return bytesPendingUpload.get();
}
/**
* Output stream has closed.
* Trigger merge in of all statistics not updated during operation.
*/
@Override
public void close() {
if (bytesPendingUpload.get() > 0) {
LOG.warn("Closing output stream statistics while data is still marked" +
" as pending upload in {}", this);
}
mergeOutputStreamStatistics(this);
}
long averageQueueTime() {
return blocksSubmitted.get() > 0 ?
(queueDuration.get() / blocksSubmitted.get()) : 0;
}
double effectiveBandwidth() {
double duration = totalUploadDuration() / 1000.0;
return duration > 0 ?
(bytesUploaded.get() / duration) : 0;
}
long totalUploadDuration() {
return queueDuration.get() + transferDuration.get();
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder(
"OutputStreamStatistics{");
sb.append("blocksSubmitted=").append(blocksSubmitted);
sb.append(", blocksInQueue=").append(blocksInQueue);
sb.append(", blocksActive=").append(blocksActive);
sb.append(", blockUploadsCompleted=").append(blockUploadsCompleted);
sb.append(", blockUploadsFailed=").append(blockUploadsFailed);
sb.append(", bytesPendingUpload=").append(bytesPendingUpload);
sb.append(", bytesUploaded=").append(bytesUploaded);
sb.append(", exceptionsInMultipartFinalize=").append(
exceptionsInMultipartFinalize);
sb.append(", transferDuration=").append(transferDuration).append(" ms");
sb.append(", queueDuration=").append(queueDuration).append(" ms");
sb.append(", averageQueueTime=").append(averageQueueTime()).append(" ms");
sb.append(", totalUploadDuration=").append(totalUploadDuration())
.append(" ms");
sb.append(", effectiveBandwidth=").append(effectiveBandwidth())
.append(" bytes/s");
sb.append('}');
return sb.toString();
}
}
}
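
A hedged sketch, not part of the patch, of the call sequence a block output stream is expected to follow against the per-stream statistics above. Constructing S3AInstrumentation directly, and the chosen durations and block size, are purely illustrative; in the patch the instrumentation is owned by the filesystem.

package org.apache.hadoop.fs.s3a;

import java.net.URI;

/** Hypothetical illustration only; not part of the patch. */
final class OutputStreamStatisticsSketch {

  private OutputStreamStatisticsSketch() {
  }

  static void demo() throws Exception {
    S3AInstrumentation instrumentation =
        new S3AInstrumentation(new URI("s3a://example-bucket/"));
    S3AInstrumentation.OutputStreamStatistics stats =
        instrumentation.newOutputStreamStatistics();

    int blockSize = 8 * 1024 * 1024;             // one 8 MB block
    stats.blockUploadQueued(blockSize);          // handed to the executor
    stats.blockUploadStarted(120, blockSize);    // spent 120 ms in the queue
    stats.bytesTransferred(blockSize);           // progress callback(s)
    stats.blockUploadCompleted(2500, blockSize); // transfer took 2500 ms

    // close() merges the per-stream values (total/queue time, bytes and
    // blocks uploaded) into the filesystem-wide counters.
    stats.close();
  }
}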

View File

@ -1,4 +1,4 @@
/**
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@ -35,8 +35,8 @@
import java.io.IOException;
import java.io.InterruptedIOException;
import java.io.OutputStream;
import java.util.concurrent.atomic.AtomicBoolean;
import static org.apache.hadoop.fs.s3a.Constants.*;
import static org.apache.hadoop.fs.s3a.S3AUtils.*;
/**
@ -45,37 +45,27 @@
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class S3AOutputStream extends OutputStream {
private OutputStream backupStream;
private File backupFile;
private boolean closed;
private String key;
private Progressable progress;
private long partSize;
private long partSizeThreshold;
private S3AFileSystem fs;
private LocalDirAllocator lDirAlloc;
private final OutputStream backupStream;
private final File backupFile;
private final AtomicBoolean closed = new AtomicBoolean(false);
private final String key;
private final Progressable progress;
private final S3AFileSystem fs;
public static final Logger LOG = S3AFileSystem.LOG;
public S3AOutputStream(Configuration conf,
S3AFileSystem fs, String key, Progressable progress)
S3AFileSystem fs,
String key,
Progressable progress)
throws IOException {
this.key = key;
this.progress = progress;
this.fs = fs;
partSize = fs.getPartitionSize();
partSizeThreshold = fs.getMultiPartThreshold();
if (conf.get(BUFFER_DIR, null) != null) {
lDirAlloc = new LocalDirAllocator(BUFFER_DIR);
} else {
lDirAlloc = new LocalDirAllocator("${hadoop.tmp.dir}/s3a");
}
backupFile = lDirAlloc.createTmpFileForWrite("output-",
backupFile = fs.createTmpFileForWrite("output-",
LocalDirAllocator.SIZE_UNKNOWN, conf);
closed = false;
LOG.debug("OutputStream for key '{}' writing to tempfile: {}",
key, backupFile);
@ -84,25 +74,33 @@ public S3AOutputStream(Configuration conf,
new FileOutputStream(backupFile));
}
/**
* Check for the filesystem being open.
* @throws IOException if the filesystem is closed.
*/
void checkOpen() throws IOException {
if (closed.get()) {
throw new IOException("Output Stream closed");
}
}
@Override
public void flush() throws IOException {
checkOpen();
backupStream.flush();
}
@Override
public synchronized void close() throws IOException {
if (closed) {
public void close() throws IOException {
if (closed.getAndSet(true)) {
return;
}
backupStream.close();
LOG.debug("OutputStream for key '{}' closed. Now beginning upload", key);
LOG.debug("Minimum upload part size: {} threshold {}" , partSize,
partSizeThreshold);
try {
final ObjectMetadata om = fs.newObjectMetadata();
final ObjectMetadata om = fs.newObjectMetadata(backupFile.length());
Upload upload = fs.putObject(
fs.newPutObjectRequest(
key,
@ -126,18 +124,19 @@ public synchronized void close() throws IOException {
LOG.warn("Could not delete temporary s3a file: {}", backupFile);
}
super.close();
closed = true;
}
LOG.debug("OutputStream for key '{}' upload complete", key);
}
@Override
public void write(int b) throws IOException {
checkOpen();
backupStream.write(b);
}
@Override
public void write(byte[] b, int off, int len) throws IOException {
checkOpen();
backupStream.write(b, off, len);
}

View File

@ -49,6 +49,7 @@
import static org.apache.hadoop.fs.s3a.Constants.ACCESS_KEY;
import static org.apache.hadoop.fs.s3a.Constants.AWS_CREDENTIALS_PROVIDER;
import static org.apache.hadoop.fs.s3a.Constants.ENDPOINT;
import static org.apache.hadoop.fs.s3a.Constants.MULTIPART_MIN_SIZE;
import static org.apache.hadoop.fs.s3a.Constants.SECRET_KEY;
/**
@ -460,4 +461,42 @@ static long longOption(Configuration conf,
key, v, min));
return v;
}
/**
* Get a size property from the configuration: this property must
* be at least equal to {@link Constants#MULTIPART_MIN_SIZE}.
* If it is too small, it is rounded up to that minimum, and a warning
* printed.
* @param conf configuration
* @param property property name
* @param defVal default value
* @return the value, guaranteed to be above the minimum size
*/
public static long getMultipartSizeProperty(Configuration conf,
String property, long defVal) {
long partSize = conf.getLong(property, defVal);
if (partSize < MULTIPART_MIN_SIZE) {
LOG.warn("{} must be at least 5 MB; configured value is {}",
property, partSize);
partSize = MULTIPART_MIN_SIZE;
}
return partSize;
}
/**
* Ensure that the long value is in the range of an integer.
* @param name property name for error messages
* @param size original size
* @return the size, guaranteed to be less than or equal to the max
* value of an integer.
*/
public static int ensureOutputParameterInRange(String name, long size) {
if (size > Integer.MAX_VALUE) {
LOG.warn("s3a: {} capped to ~2.14GB" +
" (maximum allowed size with current output mechanism)", name);
return Integer.MAX_VALUE;
} else {
return (int)size;
}
}
}
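
A short sketch of how the two size helpers above behave; it assumes they live in S3AUtils, as the static imports elsewhere in this patch indicate, and the 1 MB and 3 GB values are illustrative.

package org.apache.hadoop.fs.s3a;

import org.apache.hadoop.conf.Configuration;

/** Hypothetical illustration only; not part of the patch. */
final class MultipartSizeSketch {

  private MultipartSizeSketch() {
  }

  public static void main(String[] args) {
    Configuration conf = new Configuration(false);
    conf.setLong("fs.s3a.multipart.size", 1024 * 1024);  // 1 MB: too small

    // Rounded up to the 5 MB minimum, with a warning logged.
    long partSize = S3AUtils.getMultipartSizeProperty(conf,
        "fs.s3a.multipart.size", 100 * 1024 * 1024);
    System.out.println("effective part size = " + partSize);

    // Values beyond Integer.MAX_VALUE are capped for the block output stream.
    int capped = S3AUtils.ensureOutputParameterInRange(
        "fs.s3a.multipart.size", 3L * 1024 * 1024 * 1024);
    System.out.println("capped part size = " + capped);
  }
}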

View File

@ -0,0 +1,230 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a;
import com.google.common.util.concurrent.ForwardingListeningExecutorService;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import org.apache.hadoop.classification.InterfaceAudience;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
/**
* This ExecutorService blocks the submission of new tasks when its queue is
* already full by using a semaphore. Task submissions require permits, task
* completions release permits.
* <p>
* This is a refactoring of {@link BlockingThreadPoolExecutorService}; that code
* contains the thread pool logic, whereas this isolates the semaphore
* and submit logic for use with other thread pools and delegation models.
* In particular, it <i>permits multiple per stream executors to share a
* single per-FS-instance executor; the latter to throttle overall
* load from the the FS, the others to limit the amount of load which
* a single output stream can generate.</i>
* <p>
* This is inspired by <a href="https://github.com/apache/incubator-s4/blob/master/subprojects/s4-comm/src/main/java/org/apache/s4/comm/staging/BlockingThreadPoolExecutorService.java">
* this s4 threadpool</a>
*/
@SuppressWarnings("NullableProblems")
@InterfaceAudience.Private
class SemaphoredDelegatingExecutor extends
ForwardingListeningExecutorService {
private final Semaphore queueingPermits;
private final ListeningExecutorService executorDelegatee;
private final int permitCount;
/**
* Instantiate.
* @param executorDelegatee Executor to delegate to
* @param permitCount number of permits into the queue permitted
* @param fair should the semaphore be "fair"
*/
SemaphoredDelegatingExecutor(ListeningExecutorService executorDelegatee,
int permitCount,
boolean fair) {
this.permitCount = permitCount;
queueingPermits = new Semaphore(permitCount, fair);
this.executorDelegatee = executorDelegatee;
}
@Override
protected ListeningExecutorService delegate() {
return executorDelegatee;
}
@Override
public <T> List<Future<T>> invokeAll(Collection<? extends Callable<T>> tasks)
throws InterruptedException {
throw new RuntimeException("Not implemented");
}
@Override
public <T> List<Future<T>> invokeAll(Collection<? extends Callable<T>> tasks,
long timeout, TimeUnit unit) throws InterruptedException {
throw new RuntimeException("Not implemented");
}
@Override
public <T> T invokeAny(Collection<? extends Callable<T>> tasks)
throws InterruptedException, ExecutionException {
throw new RuntimeException("Not implemented");
}
@Override
public <T> T invokeAny(Collection<? extends Callable<T>> tasks, long timeout,
TimeUnit unit)
throws InterruptedException, ExecutionException, TimeoutException {
throw new RuntimeException("Not implemented");
}
@Override
public <T> ListenableFuture<T> submit(Callable<T> task) {
try {
queueingPermits.acquire();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return Futures.immediateFailedCheckedFuture(e);
}
return super.submit(new CallableWithPermitRelease<>(task));
}
@Override
public <T> ListenableFuture<T> submit(Runnable task, T result) {
try {
queueingPermits.acquire();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return Futures.immediateFailedCheckedFuture(e);
}
return super.submit(new RunnableWithPermitRelease(task), result);
}
@Override
public ListenableFuture<?> submit(Runnable task) {
try {
queueingPermits.acquire();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return Futures.immediateFailedCheckedFuture(e);
}
return super.submit(new RunnableWithPermitRelease(task));
}
@Override
public void execute(Runnable command) {
try {
queueingPermits.acquire();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
super.execute(new RunnableWithPermitRelease(command));
}
/**
* Get the number of permits available; guaranteed to be
* {@code 0 <= availablePermits <= size}.
* @return the number of permits available at the time of invocation.
*/
public int getAvailablePermits() {
return queueingPermits.availablePermits();
}
/**
* Get the number of threads waiting to acquire a permit.
* @return snapshot of the length of the queue of blocked threads.
*/
public int getWaitingCount() {
return queueingPermits.getQueueLength();
}
/**
* Total number of permits.
* @return the number of permits as set in the constructor
*/
public int getPermitCount() {
return permitCount;
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder(
"SemaphoredDelegatingExecutor{");
sb.append("permitCount=").append(getPermitCount());
sb.append(", available=").append(getAvailablePermits());
sb.append(", waiting=").append(getWaitingCount());
sb.append('}');
return sb.toString();
}
/**
* Releases a permit after the task is executed.
*/
class RunnableWithPermitRelease implements Runnable {
private Runnable delegatee;
public RunnableWithPermitRelease(Runnable delegatee) {
this.delegatee = delegatee;
}
@Override
public void run() {
try {
delegatee.run();
} finally {
queueingPermits.release();
}
}
}
/**
* Releases a permit after the task is completed.
*/
class CallableWithPermitRelease<T> implements Callable<T> {
private Callable<T> delegatee;
public CallableWithPermitRelease(Callable<T> delegatee) {
this.delegatee = delegatee;
}
@Override
public T call() throws Exception {
try {
return delegatee.call();
} finally {
queueingPermits.release();
}
}
}
}
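
A hedged sketch, not part of the patch, of the delegation pattern described above: a shared pool is wrapped once per stream so that no single stream can queue more than its permitted number of blocks. The pool size and permit count are illustrative; the patch itself builds the shared pool with BlockingThreadPoolExecutorService.newInstance().

package org.apache.hadoop.fs.s3a;

import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;

import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/** Hypothetical illustration only; not part of the patch. */
final class SemaphoredExecutorSketch {

  private SemaphoredExecutorSketch() {
  }

  static void demo() {
    // Shared, filesystem-wide pool.
    ExecutorService shared = Executors.newFixedThreadPool(10);
    ListeningExecutorService listening =
        MoreExecutors.listeningDecorator(shared);

    // Per-stream view: at most 4 tasks may be queued or running at once;
    // the fifth submit() blocks until a permit is released.
    SemaphoredDelegatingExecutor perStream =
        new SemaphoredDelegatingExecutor(listening, 4, true);

    for (int block = 0; block < 8; block++) {
      final int id = block;
      perStream.submit(new Callable<Void>() {
        @Override
        public Void call() {
          System.out.println("uploading block " + id);  // simulated upload
          return null;
        }
      });
    }
    shared.shutdown();
  }
}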

View File

@ -81,10 +81,16 @@ public enum Statistic {
"Object multipart upload aborted"),
OBJECT_PUT_REQUESTS("object_put_requests",
"Object put/multipart upload count"),
OBJECT_PUT_REQUESTS_COMPLETED("object_put_requests_completed",
"Object put/multipart upload completed count"),
OBJECT_PUT_REQUESTS_ACTIVE("object_put_requests_active",
"Current number of active put requests"),
OBJECT_PUT_BYTES("object_put_bytes", "number of bytes uploaded"),
OBJECT_PUT_BYTES_PENDING("object_put_bytes_pending",
"number of bytes queued for upload/being actively uploaded"),
STREAM_ABORTED("stream_aborted",
"Count of times the TCP stream was aborted"),
STREAM_BACKWARD_SEEK_OPERATIONS("stream_backward_seek_pperations",
STREAM_BACKWARD_SEEK_OPERATIONS("stream_backward_seek_operations",
"Number of executed seek operations which went backwards in a stream"),
STREAM_CLOSED("streamClosed", "Count of times the TCP stream was closed"),
STREAM_CLOSE_OPERATIONS("stream_close_operations",
@@ -112,7 +118,29 @@ public enum Statistic {
STREAM_CLOSE_BYTES_READ("stream_bytes_read_in_close",
"Count of bytes read when closing streams during seek operations."),
STREAM_ABORT_BYTES_DISCARDED("stream_bytes_discarded_in_abort",
"Count of bytes discarded by aborting the stream");
"Count of bytes discarded by aborting the stream"),
STREAM_WRITE_FAILURES("stream_write_failures",
"Count of stream write failures reported"),
STREAM_WRITE_BLOCK_UPLOADS("stream_write_block_uploads",
"Count of block/partition uploads completed"),
STREAM_WRITE_BLOCK_UPLOADS_ACTIVE("stream_write_block_uploads_active",
"Count of block/partition uploads completed"),
STREAM_WRITE_BLOCK_UPLOADS_COMMITTED("stream_write_block_uploads_committed",
"Count of number of block uploads committed"),
STREAM_WRITE_BLOCK_UPLOADS_ABORTED("stream_write_block_uploads_aborted",
"Count of number of block uploads aborted"),
STREAM_WRITE_BLOCK_UPLOADS_PENDING("stream_write_block_uploads_pending",
"Gauge of block/partitions uploads queued to be written"),
STREAM_WRITE_BLOCK_UPLOADS_DATA_PENDING(
"stream_write_block_uploads_data_pending",
"Gauge of block/partitions data uploads queued to be written"),
STREAM_WRITE_TOTAL_TIME("stream_write_total_time",
"Count of total time taken for uploads to complete"),
STREAM_WRITE_TOTAL_DATA("stream_write_total_data",
"Count of total data uploaded in block output"),
STREAM_WRITE_QUEUE_DURATION("stream_write_queue_duration",
"Total queue duration of all block uploads");
private static final Map<String, Statistic> SYMBOL_MAP =
new HashMap<>(Statistic.values().length);

View File

@@ -1,3 +1,4 @@
<!---
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -852,40 +853,361 @@ Seoul
If the wrong endpoint is used, the request may fail. This may be reported as a 301/redirect error,
or as a 400 Bad Request.
### S3AFastOutputStream
**Warning: NEW in hadoop 2.7. UNSTABLE, EXPERIMENTAL: use at own risk**
<property>
<name>fs.s3a.fast.upload</name>
<value>false</value>
<description>Upload directly from memory instead of buffering to
disk first. Memory usage and parallelism can be controlled as up to
fs.s3a.multipart.size memory is consumed for each (part)upload actively
uploading (fs.s3a.threads.max) or queueing (fs.s3a.max.total.tasks)</description>
</property>
<property>
<name>fs.s3a.fast.buffer.size</name>
<value>1048576</value>
<description>Size (in bytes) of initial memory buffer allocated for an
upload. No effect if fs.s3a.fast.upload is false.</description>
</property>
### <a name="s3a_fast_upload"></a>Stabilizing: S3A Fast Upload
Writes are buffered in memory instead of to a file on local disk. This
removes the throughput bottleneck of the local disk write and read cycle
before starting the actual upload. Furthermore, it allows handling files that
are larger than the remaining local disk space.
However, non-trivial memory tuning is needed for optimal results and careless
settings could cause memory overflow. Up to `fs.s3a.threads.max` parallel
(part)uploads are active. Furthermore, up to `fs.s3a.max.total.tasks`
additional part(uploads) can be waiting (and thus memory buffers are created).
The memory buffer is uploaded as a single upload if it is not larger than
`fs.s3a.multipart.threshold`. Else, a multi-part upload is initiated and
parts of size `fs.s3a.multipart.size` are used to protect against overflowing
the available memory. These settings should be tuned to the envisioned
workflow (some large files, many small ones, ...) and the physical
limitations of the machine and cluster (memory, network bandwidth).
**New in Hadoop 2.7; significantly enhanced in Hadoop 2.9**
Because of the nature of the S3 object store, data written to an S3A `OutputStream`
is not written incrementally —instead, by default, it is buffered to disk
until the stream is closed in its `close()` method.
This can make output slow:
* The execution time for `OutputStream.close()` is proportional to the amount of data
buffered and inversely proportional to the bandwidth. That is `O(data/bandwidth)`.
* The bandwidth is that available from the host to S3: other work in the same
process, server or network at the time of upload may increase the upload time,
hence the duration of the `close()` call.
* If a process uploading data fails before `OutputStream.close()` is called,
all data is lost.
* The disks hosting temporary directories defined in `fs.s3a.buffer.dir` must
have the capacity to store the entire buffered file.
Put succinctly: the further the process is from the S3 endpoint, or the smaller
the EC2-hosted VM is, the longer it will take for the work to complete.
This can create problems in application code:
* Code often assumes that the `close()` call is fast;
the delays can create bottlenecks in operations.
* Very slow uploads sometimes cause applications to time out (generally,
threads blocked during the upload stop reporting progress, and so trigger timeouts).
* Streaming very large amounts of data may consume all disk space before the upload begins.
Work to address this began in Hadoop 2.7 with the `S3AFastOutputStream`
[HADOOP-11183](https://issues.apache.org/jira/browse/HADOOP-11183), and
has continued with `S3ABlockOutputStream`
[HADOOP-13560](https://issues.apache.org/jira/browse/HADOOP-13560).
This adds an alternative output stream, "S3A Fast Upload", which:
1. Always uploads large files as blocks with the size set by
`fs.s3a.multipart.size`. That is: the threshold at which multipart uploads
begin and the size of each upload are identical.
1. Buffers blocks to disk (default) or in on-heap or off-heap memory.
1. Uploads blocks in parallel in background threads.
1. Begins uploading blocks as soon as the buffered data exceeds this partition
size.
1. When buffering data to disk, uses the directory/directories listed in
`fs.s3a.buffer.dir`. The size of data which can be buffered is limited
to the available disk space.
1. Generates output statistics as metrics on the filesystem, including
statistics of active and pending block uploads.
1. Has the time to `close()` set by the amount of remaining data to upload, rather
than the total size of the file.
With incremental writes of blocks, "S3A fast upload" offers an upload
time at least as fast as the "classic" mechanism, with significant benefits
on long-lived output streams, and when very large amounts of data are generated.
The in-memory buffering mechanisms may also offer a speedup when running adjacent to
S3 endpoints, as disks are not used for intermediate data storage.
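For applications which configure the filesystem programmatically, here is a minimal
sketch (not part of this patch; the bucket name and path are placeholders) of enabling
the block output stream before writing:

```java
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FastUploadExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.setBoolean("fs.s3a.fast.upload", true);      // use the block output stream
    conf.set("fs.s3a.fast.upload.buffer", "disk");    // disk, array or bytebuffer

    try (FileSystem fs = FileSystem.get(new URI("s3a://example-bucket/"), conf);
         FSDataOutputStream out = fs.create(new Path("/datasets/large-file.bin"))) {
      byte[] chunk = new byte[8 * 1024 * 1024];
      for (int i = 0; i < 32; i++) {
        // data is buffered into blocks; each block is queued for upload
        // once it reaches fs.s3a.multipart.size
        out.write(chunk);
      }
    } // close() waits only for the remaining queued/active block uploads
  }
}
```

The same settings can equally be declared in the configuration file: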
```xml
<property>
<name>fs.s3a.fast.upload</name>
<value>true</value>
<description>
Use the incremental block upload mechanism with
the buffering mechanism set in fs.s3a.fast.upload.buffer.
The number of threads performing uploads in the filesystem is defined
by fs.s3a.threads.max; the queue of waiting uploads limited by
fs.s3a.max.total.tasks.
The size of each buffer is set by fs.s3a.multipart.size.
</description>
</property>
<property>
<name>fs.s3a.fast.upload.buffer</name>
<value>disk</value>
<description>
The buffering mechanism to use when using S3A fast upload
(fs.s3a.fast.upload=true). Values: disk, array, bytebuffer.
This configuration option has no effect if fs.s3a.fast.upload is false.
"disk" will use the directories listed in fs.s3a.buffer.dir as
the location(s) to save data prior to being uploaded.
"array" uses arrays in the JVM heap
"bytebuffer" uses off-heap memory within the JVM.
Both "array" and "bytebuffer" will consume memory in a single stream up to the number
of blocks set by:
fs.s3a.multipart.size * fs.s3a.fast.upload.active.blocks.
If using either of these mechanisms, keep this value low
The total number of threads performing work across all threads is set by
fs.s3a.threads.max, with fs.s3a.max.total.tasks values setting the number of queued
work items.
</description>
</property>
<property>
<name>fs.s3a.multipart.size</name>
<value>104857600</value>
<description>
How big (in bytes) to split upload or copy operations up into.
</description>
</property>
<property>
<name>fs.s3a.fast.upload.active.blocks</name>
<value>8</value>
<description>
Maximum Number of blocks a single output stream can have
active (uploading, or queued to the central FileSystem
instance's pool of queued operations.
This stops a single stream overloading the shared thread pool.
</description>
</property>
```
**Notes**
* If the amount of data written to a stream is below that set in `fs.s3a.multipart.size`,
the upload is performed in the `OutputStream.close()` operation —as with
the original output stream.
* The published Hadoop metrics include the live queue length and
upload operation counts, making it possible to identify when there is a backlog of
work or a mismatch between data generation rates and network bandwidth. Per-stream
statistics can also be logged by calling `toString()` on the current stream;
see the sketch after this list.
* Incremental writes are not visible; the object can only be listed
or read when the multipart operation completes in the `close()` call, which
will block until the upload is completed.
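A sketch of the latter (assuming an existing S3A filesystem instance and an SLF4J
logger; this helper is illustrative and not part of the patch):

```java
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class StreamStatsLogging {
  private static final Logger LOG = LoggerFactory.getLogger(StreamStatsLogging.class);

  static void writeAndLog(FileSystem fs, Path path, byte[] data) throws Exception {
    try (FSDataOutputStream out = fs.create(path)) {
      out.write(data);
      // FSDataOutputStream.getWrappedStream() exposes the underlying S3A stream,
      // whose toString() reports the per-stream upload statistics.
      LOG.info("Stream state: {}", out.getWrappedStream());
    }
  }
}
```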
#### <a name="s3a_fast_upload_disk"></a>Fast Upload with Disk Buffers `fs.s3a.fast.upload.buffer=disk`
When `fs.s3a.fast.upload.buffer` is set to `disk`, all data is buffered
to local hard disks prior to upload. This minimizes the amount of memory
consumed, and so eliminates heap size as the limiting factor in queued uploads
—exactly as the original "direct to disk" buffering used when
`fs.s3a.fast.upload=false`.
```xml
<property>
<name>fs.s3a.fast.upload</name>
<value>true</value>
</property>
<property>
<name>fs.s3a.fast.upload.buffer</name>
<value>disk</value>
</property>
```
#### <a name="s3a_fast_upload_bytebuffer"></a>Fast Upload with ByteBuffers: `fs.s3a.fast.upload.buffer=bytebuffer`
When `fs.s3a.fast.upload.buffer` is set to `bytebuffer`, all data is buffered
in "Direct" ByteBuffers prior to upload. This *may* be faster than buffering to disk,
and, if disk space is small (for example, tiny EC2 VMs), there may not
be much disk space to buffer with.
The ByteBuffers are created in the memory of the JVM, but not in the Java Heap itself.
The amount of data which can be buffered is
limited by the Java runtime, the operating system, and, for YARN applications,
the amount of memory requested for each container.
The slower the write bandwidth to S3, the greater the risk of running out
of memory —and so the more care is needed in
[tuning the upload settings](#s3a_fast_upload_thread_tuning).
```xml
<property>
<name>fs.s3a.fast.upload</name>
<value>true</value>
</property>
<property>
<name>fs.s3a.fast.upload.buffer</name>
<value>bytebuffer</value>
</property>
```
#### <a name="s3a_fast_upload_array"></a>Fast Upload with Arrays: `fs.s3a.fast.upload.buffer=array`
When `fs.s3a.fast.upload.buffer` is set to `array`, all data is buffered
in byte arrays in the JVM's heap prior to upload.
This *may* be faster than buffering to disk.
This `array` option is similar to the in-memory-only stream offered in
Hadoop 2.7 with `fs.s3a.fast.upload=true`.
The amount of data which can be buffered is limited by the available
size of the JVM heap. The slower the write bandwidth to S3, the greater
the risk of heap overflows. This risk can be mitigated by
[tuning the upload settings](#s3a_fast_upload_thread_tuning).
```xml
<property>
<name>fs.s3a.fast.upload</name>
<value>true</value>
</property>
<property>
<name>fs.s3a.fast.upload.buffer</name>
<value>array</value>
</property>
```
#### <a name="s3a_fast_upload_thread_tuning"></a>S3A Fast Upload Thread Tuning
Both the [Array](#s3a_fast_upload_array) and [Byte buffer](#s3a_fast_upload_bytebuffer)
buffer mechanisms can consume very large amounts of memory, on-heap or
off-heap respectively. The [disk buffer](#s3a_fast_upload_disk) mechanism
does not use up much memory, but will consume hard disk capacity.
If there are many output streams being written to in a single process, the
amount of memory or disk used is the sum of all the streams' active memory/disk use.
Careful tuning may be needed to reduce the risk of running out of memory, especially
if the data is buffered in memory.
There are a number of parameters which can be tuned:
1. The total number of threads available in the filesystem for data
uploads *or any other queued filesystem operation*. This is set in
`fs.s3a.threads.max`
1. The number of operations which can be queued for execution, *awaiting
a thread*: `fs.s3a.max.total.tasks`
1. The number of blocks which a single output stream can have active,
that is: being uploaded by a thread, or queued in the filesystem thread queue:
`fs.s3a.fast.upload.active.blocks`
1. How long an idle thread can stay in the thread pool before it is retired: `fs.s3a.threads.keepalivetime`
When the maximum allowed number of active blocks of a single stream is reached,
no more blocks can be uploaded from that stream until one or more of those active
blocks' uploads completes. That is: a `write()` call which would trigger an upload
of a now-full data block will instead block until there is capacity in the queue.
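The sketch below uses only plain JDK concurrency classes, not the S3A internals, to
illustrate this back-pressure: a semaphore sized at `fs.s3a.fast.upload.active.blocks`
gates how many block uploads may be active or queued at any time.

```java
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

public class ActiveBlockLimitSketch {
  public static void main(String[] args) throws Exception {
    int activeBlocks = 4;                      // fs.s3a.fast.upload.active.blocks
    Semaphore permits = new Semaphore(activeBlocks);
    ExecutorService uploaders = Executors.newFixedThreadPool(2);   // shared pool

    for (int block = 0; block < 10; block++) {
      permits.acquire();                       // pauses once 4 blocks are active/queued
      final int id = block;
      uploaders.submit(() -> {
        try {
          Thread.sleep(500);                   // simulate uploading one block
          System.out.println("uploaded block " + id);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
        } finally {
          permits.release();                   // frees capacity for the next block
        }
      });
    }
    uploaders.shutdown();
    uploaders.awaitTermination(1, TimeUnit.MINUTES);
  }
}
```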
How does that come together?
* As the pool of threads set in `fs.s3a.threads.max` is shared (and intended
to be used across all threads), a larger number here can allow for more
parallel operations. However, as uploads require network bandwidth, adding more
threads does not guarantee speedup.
* The extra queue of tasks for the thread pool (`fs.s3a.max.total.tasks`)
covers all ongoing background S3A operations (future plans include: parallelized
rename operations, asynchronous directory operations).
* When using memory buffering, a small value of `fs.s3a.fast.upload.active.blocks`
limits the amount of memory which can be consumed per stream.
* When using disk buffering a larger value of `fs.s3a.fast.upload.active.blocks`
does not consume much memory, but it may result in a large number of blocks
competing with other filesystem operations.
We recommend a low value of `fs.s3a.fast.upload.active.blocks`; enough
to start background upload without overloading other parts of the system,
then experiment to see if higher values deliver more throughput —especially
from VMs running on EC2.
```xml
<property>
<name>fs.s3a.fast.upload.active.blocks</name>
<value>4</value>
<description>
Maximum Number of blocks a single output stream can have
active (uploading, or queued to the central FileSystem
instance's pool of queued operations.
This stops a single stream overloading the shared thread pool.
</description>
</property>
<property>
<name>fs.s3a.threads.max</name>
<value>10</value>
<description>The total number of threads available in the filesystem for data
uploads *or any other queued filesystem operation*.</description>
</property>
<property>
<name>fs.s3a.max.total.tasks</name>
<value>5</value>
<description>The number of operations which can be queued for execution</description>
</property>
<property>
<name>fs.s3a.threads.keepalivetime</name>
<value>60</value>
<description>Number of seconds a thread can be idle before being
terminated.</description>
</property>
```
#### <a name="s3a_multipart_purge"></a>Cleaning up After Incremental Upload Failures: `fs.s3a.multipart.purge`
If an incremental streaming operation is interrupted, there may be
intermediate partitions uploaded to S3 —data which will be billed for.
These charges can be reduced by enabling `fs.s3a.multipart.purge`,
and setting a purge time in seconds, such as 86400 seconds —24 hours.
When an S3A FileSystem instance is instantiated with the purge time greater
than zero, it will, on startup, delete all outstanding partition requests
older than this time.
```xml
<property>
<name>fs.s3a.multipart.purge</name>
<value>true</value>
<description>True if you want to purge existing multipart uploads that may not have been
completed/aborted correctly</description>
</property>
<property>
<name>fs.s3a.multipart.purge.age</name>
<value>86400</value>
<description>Minimum age in seconds of multipart uploads to purge</description>
</property>
```
If an S3A client is instantiated with `fs.s3a.multipart.purge=true`,
it will delete all out-of-date uploads *in the entire bucket*. That is: it will affect all
multipart uploads to that bucket, from all applications.
Leaving `fs.s3a.multipart.purge` at its default, `false`,
means that the client will not make any attempt to delete outstanding
multipart uploads.
The best practice for using this option is to disable multipart purges in
normal use of S3A, enabling them only in manual/scheduled housekeeping operations.
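A hedged sketch of such a housekeeping run; the bucket URI is a placeholder, and the
purge is triggered simply by instantiating a filesystem client with these options:

```java
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class MultipartHousekeeping {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.setBoolean("fs.s3a.multipart.purge", true);
    conf.setInt("fs.s3a.multipart.purge.age", 86400);   // abort uploads older than 24h

    // Instantiating the filesystem starts the purge of old multipart uploads.
    try (FileSystem fs = FileSystem.newInstance(new URI("s3a://example-bucket/"), conf)) {
      System.out.println("Purge requested for " + fs.getUri());
    }
  }
}
```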
### S3A Experimental "fadvise" input policy support
@@ -1221,7 +1543,143 @@ can be used:
Using the explicit endpoint for the region is recommended for speed and the
ability to use the V4 signing API.
## Visible S3 Inconsistency
### "Timeout waiting for connection from pool" when writing to S3A
This happens when using the block output stream (`fs.s3a.fast.upload=true`) and
the thread pool runs out of capacity.
```
[s3a-transfer-shared-pool1-t20] INFO http.AmazonHttpClient (AmazonHttpClient.java:executeHelper(496)) - Unable to execute HTTP request: Timeout waiting for connection from pool
org.apache.http.conn.ConnectionPoolTimeoutException: Timeout waiting for connection from pool
at org.apache.http.impl.conn.PoolingClientConnectionManager.leaseConnection(PoolingClientConnectionManager.java:230)
at org.apache.http.impl.conn.PoolingClientConnectionManager$1.getConnection(PoolingClientConnectionManager.java:199)
at sun.reflect.GeneratedMethodAccessor13.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.amazonaws.http.conn.ClientConnectionRequestFactory$Handler.invoke(ClientConnectionRequestFactory.java:70)
at com.amazonaws.http.conn.$Proxy10.getConnection(Unknown Source)
at org.apache.http.impl.client.DefaultRequestDirector.execute(DefaultRequestDirector.java:424)
at org.apache.http.impl.client.AbstractHttpClient.doExecute(AbstractHttpClient.java:884)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:82)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:55)
at com.amazonaws.http.AmazonHttpClient.executeOneRequest(AmazonHttpClient.java:728)
at com.amazonaws.http.AmazonHttpClient.executeHelper(AmazonHttpClient.java:489)
at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:310)
at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3785)
at com.amazonaws.services.s3.AmazonS3Client.doUploadPart(AmazonS3Client.java:2921)
at com.amazonaws.services.s3.AmazonS3Client.uploadPart(AmazonS3Client.java:2906)
at org.apache.hadoop.fs.s3a.S3AFileSystem.uploadPart(S3AFileSystem.java:1025)
at org.apache.hadoop.fs.s3a.S3ABlockOutputStream$MultiPartUpload$1.call(S3ABlockOutputStream.java:360)
at org.apache.hadoop.fs.s3a.S3ABlockOutputStream$MultiPartUpload$1.call(S3ABlockOutputStream.java:355)
at org.apache.hadoop.fs.s3a.BlockingThreadPoolExecutorService$CallableWithPermitRelease.call(BlockingThreadPoolExecutorService.java:239)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
```
Make sure that `fs.s3a.connection.maximum` is larger
than `fs.s3a.threads.max`.
```xml
<property>
<name>fs.s3a.threads.max</name>
<value>20</value>
</property>
<property>
<name>fs.s3a.connection.maximum</name>
<value>30</value>
</property>
```
### "Timeout waiting for connection from pool" when reading from S3A
This happens when more threads are trying to read from an S3A system than
the maximum number of allocated HTTP connections.
Set `fs.s3a.connection.maximum` to a larger value (and at least as large as
`fs.s3a.threads.max`).
### Out of heap memory when writing to S3A via Fast Upload
This can happen when using the fast upload mechanism (`fs.s3a.fast.upload=true`)
and in-memory buffering (either `fs.s3a.fast.upload.buffer=array` or
`fs.s3a.fast.upload.buffer=bytebuffer`).
More data is being generated in the JVM than can be uploaded to S3 —and
so much data has been buffered that the JVM has run out of memory.
Consult [S3A Fast Upload Thread Tuning](#s3a_fast_upload_thread_tuning) for
detail on this issue and options to address it. Consider also buffering to
disk, rather than memory.
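One mitigation, sketched below with illustrative values, is to keep the fast upload path
but buffer blocks to disk and lower the number of blocks a single stream may hold:

```java
import org.apache.hadoop.conf.Configuration;

public class DiskBufferingFallback {
  public static Configuration diskBufferedConf() {
    Configuration conf = new Configuration();
    conf.setBoolean("fs.s3a.fast.upload", true);
    conf.set("fs.s3a.fast.upload.buffer", "disk");       // spill to fs.s3a.buffer.dir
    conf.setInt("fs.s3a.fast.upload.active.blocks", 2);  // fewer blocks held per stream
    return conf;
  }
}
```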
### When writing to S3A: "java.io.FileNotFoundException: Completing multi-part upload"
```
java.io.FileNotFoundException: Completing multi-part upload on fork-5/test/multipart/1c397ca6-9dfb-4ac1-9cf7-db666673246b: com.amazonaws.services.s3.model.AmazonS3Exception: The specified upload does not exist. The upload ID may be invalid, or the upload may have been aborted or completed. (Service: Amazon S3; Status Code: 404; Error Code: NoSuchUpload; Request ID: 84FF8057174D9369), S3 Extended Request ID: Ij5Yn6Eq/qIERH4Z6Io3YL2t9/qNZ7z9gjPb1FrTtTovZ8k1MXqh+zCYYjqmfJ/fCY6E1+JR9jA=
at com.amazonaws.http.AmazonHttpClient.handleErrorResponse(AmazonHttpClient.java:1182)
at com.amazonaws.http.AmazonHttpClient.executeOneRequest(AmazonHttpClient.java:770)
at com.amazonaws.http.AmazonHttpClient.executeHelper(AmazonHttpClient.java:489)
at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:310)
at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3785)
at com.amazonaws.services.s3.AmazonS3Client.completeMultipartUpload(AmazonS3Client.java:2705)
at org.apache.hadoop.fs.s3a.S3ABlockOutputStream$MultiPartUpload.complete(S3ABlockOutputStream.java:473)
at org.apache.hadoop.fs.s3a.S3ABlockOutputStream$MultiPartUpload.access$200(S3ABlockOutputStream.java:382)
at org.apache.hadoop.fs.s3a.S3ABlockOutputStream.close(S3ABlockOutputStream.java:272)
at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.close(FSDataOutputStream.java:72)
at org.apache.hadoop.fs.FSDataOutputStream.close(FSDataOutputStream.java:106)
```
This surfaces if, while a multipart upload was taking place, all outstanding multipart
uploads were garbage collected. The upload operation cannot complete because
the data uploaded has been deleted.
Consult [Cleaning up After Incremental Upload Failures](#s3a_multipart_purge) for
details on how the multipart purge timeout can be set. If multipart uploads
are failing with the message above, it may be a sign that this value is too low.
### When writing to S3A, HTTP Exceptions logged at info from `AmazonHttpClient`
```
[s3a-transfer-shared-pool4-t6] INFO http.AmazonHttpClient (AmazonHttpClient.java:executeHelper(496)) - Unable to execute HTTP request: hwdev-steve-ireland-new.s3.amazonaws.com:443 failed to respond
org.apache.http.NoHttpResponseException: bucket.s3.amazonaws.com:443 failed to respond
at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:143)
at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:57)
at org.apache.http.impl.io.AbstractMessageParser.parse(AbstractMessageParser.java:261)
at org.apache.http.impl.AbstractHttpClientConnection.receiveResponseHeader(AbstractHttpClientConnection.java:283)
at org.apache.http.impl.conn.DefaultClientConnection.receiveResponseHeader(DefaultClientConnection.java:259)
at org.apache.http.impl.conn.ManagedClientConnectionImpl.receiveResponseHeader(ManagedClientConnectionImpl.java:209)
at org.apache.http.protocol.HttpRequestExecutor.doReceiveResponse(HttpRequestExecutor.java:272)
at com.amazonaws.http.protocol.SdkHttpRequestExecutor.doReceiveResponse(SdkHttpRequestExecutor.java:66)
at org.apache.http.protocol.HttpRequestExecutor.execute(HttpRequestExecutor.java:124)
at org.apache.http.impl.client.DefaultRequestDirector.tryExecute(DefaultRequestDirector.java:686)
at org.apache.http.impl.client.DefaultRequestDirector.execute(DefaultRequestDirector.java:488)
at org.apache.http.impl.client.AbstractHttpClient.doExecute(AbstractHttpClient.java:884)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:82)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:55)
at com.amazonaws.http.AmazonHttpClient.executeOneRequest(AmazonHttpClient.java:728)
at com.amazonaws.http.AmazonHttpClient.executeHelper(AmazonHttpClient.java:489)
at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:310)
at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:3785)
at com.amazonaws.services.s3.AmazonS3Client.copyPart(AmazonS3Client.java:1731)
at com.amazonaws.services.s3.transfer.internal.CopyPartCallable.call(CopyPartCallable.java:41)
at com.amazonaws.services.s3.transfer.internal.CopyPartCallable.call(CopyPartCallable.java:28)
at org.apache.hadoop.fs.s3a.BlockingThreadPoolExecutorService$CallableWithPermitRelease.call(BlockingThreadPoolExecutorService.java:239)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
```
These are HTTP I/O exceptions caught and logged inside the AWS SDK. The client
will attempt to retry the operation; it may just be a transient event. If there
are many such exceptions in logs, it may be a symptom of connectivity or network
problems.
### Visible S3 Inconsistency
Amazon S3 is *an eventually consistent object store*. That is: not a filesystem.
@@ -1564,7 +2022,7 @@ tests or the `it.test` property for integration tests.
mvn clean test -Dtest=TestS3AInputPolicies
mvn clean verify -Dit.test=ITestS3AFileContextStatistics
mvn clean verify -Dit.test=ITestS3AFileContextStatistics -Dtest=none
mvn clean verify -Dtest=TestS3A* -Dit.test=ITestS3A*
@@ -1614,7 +2072,7 @@ An alternate endpoint may be defined in `test.fs.s3a.sts.endpoint`.
The default is ""; meaning "use the amazon default value".
#### CSV Data source Tests
### CSV Data source Tests
The `TestS3AInputStreamPerformance` tests require read access to a multi-MB
text file. The default file for these tests is one published by amazon,
@@ -1661,18 +2119,89 @@ endpoint:
<value>s3.amazonaws.com</value>
</property>
```
### Viewing Integration Test Reports
#### Scale test operation count
Integration test results and logs are stored in `target/failsafe-reports/`.
An HTML report can be generated during site generation, or with the `surefire-report`
plugin:
```
mvn surefire-report:failsafe-report-only
```
### Scale Tests
There is a set of tests designed to measure the scalability and performance
at scale of the S3A client, the *Scale Tests*. Tests include: creating
and traversing directory trees, uploading large files, renaming them,
deleting them, seeking through the files, performing random IO, and others.
This makes them a foundational part of the benchmarking.
By their very nature they are slow. And, as their execution time is often
limited by bandwidth between the computer running the tests and the S3 endpoint,
parallel execution does not speed these tests up.
#### Enabling the Scale Tests
The tests are enabled if the `scale` property is set in the maven build;
this can be done regardless of whether or not the parallel test profile
is used:
```bash
mvn verify -Dscale
mvn verify -Dparallel-tests -Dscale -DtestsThreadCount=8
```
The most bandwidth-intensive tests (those which upload data) always run
sequentially; those which are slow due to HTTPS setup costs or server-side
actions are included in the set of parallelized tests.
#### Maven build tuning options
Some of the tests can be tuned from the maven build or from the
configuration file used to run the tests.
```bash
mvn verify -Dscale -Dfs.s3a.scale.test.huge.filesize=128M
```
The algorithm is:
1. The value is queried from the configuration file, using a default value if
it is not set.
1. The value is queried from the JVM System Properties, where it is passed
down by maven.
1. If the system property is null, empty, or it has the value `unset`, then
the configuration value is used. The `unset` option is used to
[work round a quirk in maven property propagation](http://stackoverflow.com/questions/7773134/null-versus-empty-arguments-in-maven).
Only a few properties can be set this way; more will be added.
| Property | Meaning |
|-----------|-------------|
| `fs.s3a.scale.test.timeout`| Timeout in seconds for scale tests |
| `fs.s3a.scale.test.huge.filesize`| Size for huge file uploads |
| `fs.s3a.scale.test.huge.partitionsize`| Size for partitions in huge file uploads |
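This resolution order is essentially what `S3ATestUtils.getTestProperty()` (shown later
in this patch) implements; a condensed sketch:

```java
import org.apache.hadoop.conf.Configuration;

final class TestPropertyResolution {
  // Configuration value first, then the maven-passed system property; the literal
  // "unset" means "keep the configuration value".
  static String resolve(Configuration conf, String key, String defVal) {
    String confVal = conf != null ? conf.getTrimmed(key, defVal) : defVal;
    String sysProp = System.getProperty(key);
    boolean overridden = sysProp != null && !sysProp.isEmpty()
        && !"unset".equals(sysProp);
    return overridden ? sysProp : confVal;
  }
}
```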
The file and partition sizes are numeric values with a k/m/g/t/p suffix depending
on the desired size. For example: 128M, 128m, 2G, 2g, 4T or even 1P.
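These suffixes are parsed with Hadoop's `StringUtils.TraditionalBinaryPrefix.string2long()`,
the helper behind `getTestPropertyBytes()` in `S3ATestUtils`; for example:

```java
import org.apache.hadoop.util.StringUtils;

public class SuffixParsing {
  public static void main(String[] args) {
    // Binary prefixes: "128M" is 128 * 2^20 bytes, "6G" is 6 * 2^30 bytes.
    System.out.println(StringUtils.TraditionalBinaryPrefix.string2long("128M")); // 134217728
    System.out.println(StringUtils.TraditionalBinaryPrefix.string2long("6G"));   // 6442450944
  }
}
```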
#### Scale test configuration options
Some scale tests perform multiple operations (such as creating many directories).
The exact number of operations to perform is configurable in the option
`scale.test.operation.count`
<property>
<name>scale.test.operation.count</name>
<value>10</value>
</property>
```xml
<property>
<name>scale.test.operation.count</name>
<value>10</value>
</property>
```
Larger values generate more load, and are recommended when testing locally,
or in batch runs.
@@ -1685,19 +2214,64 @@ the width and depth of tests creating recursive directories. Larger
values create exponentially more directories, with consequent performance
impact.
<property>
<name>scale.test.directory.count</name>
<value>2</value>
</property>
```xml
<property>
<name>scale.test.directory.count</name>
<value>2</value>
</property>
```
DistCp tests targeting S3A support a configurable file size. The default is
10 MB, but the configuration value is expressed in KB so that it can be tuned
smaller to achieve faster test runs.
<property>
<name>scale.test.distcp.file.size.kb</name>
<value>10240</value>
</property>
```xml
<property>
<name>scale.test.distcp.file.size.kb</name>
<value>10240</value>
</property>
```
S3A specific scale test properties are
##### `fs.s3a.scale.test.huge.filesize`: size in MB for "Huge file tests".
The Huge File tests validate S3A's ability to handle large files —the property
`fs.s3a.scale.test.huge.filesize` declares the file size to use.
```xml
<property>
<name>fs.s3a.scale.test.huge.filesize</name>
<value>200M</value>
</property>
```
Amazon S3 handles files larger than 5GB differently than smaller ones.
Setting the huge filesize to a number greater than that validates support
for huge files.
```xml
<property>
<name>fs.s3a.scale.test.huge.filesize</name>
<value>6G</value>
</property>
```
Tests at this scale are slow: they are best executed from hosts running in
the cloud infrastructure where the S3 endpoint is based.
Otherwise, set a large timeout in `fs.s3a.scale.test.timeout`:
```xml
<property>
<name>fs.s3a.scale.test.timeout</name>
<value>432000</value>
</property>
```
The tests are executed in an order that only cleans up created files after
all the tests have completed. If the tests are interrupted, the test data will remain.
### Testing against non AWS S3 endpoints.

View File

@@ -18,24 +18,26 @@
package org.apache.hadoop.fs.contract.s3a;
import static org.apache.hadoop.fs.s3a.Constants.MIN_MULTIPART_THRESHOLD;
import static org.apache.hadoop.fs.s3a.Constants.MULTIPART_SIZE;
import static org.apache.hadoop.fs.s3a.Constants.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.tools.contract.AbstractContractDistCpTest;
/**
* Contract test suite covering S3A integration with DistCp.
* Uses the block output stream, buffered to disk. This is the
* recommended output mechanism for DistCP due to its scalability.
*/
public class ITestS3AContractDistCp extends AbstractContractDistCpTest {
private static final long MULTIPART_SETTING = 8 * 1024 * 1024; // 8 MB
private static final long MULTIPART_SETTING = MULTIPART_MIN_SIZE;
@Override
protected Configuration createConfiguration() {
Configuration newConf = super.createConfiguration();
newConf.setLong(MIN_MULTIPART_THRESHOLD, MULTIPART_SETTING);
newConf.setLong(MULTIPART_SIZE, MULTIPART_SETTING);
newConf.setBoolean(FAST_UPLOAD, true);
newConf.set(FAST_UPLOAD_BUFFER, FAST_UPLOAD_BUFFER_DISK);
return newConf;
}

View File

@@ -48,6 +48,7 @@ protected AbstractFSContract createContract(Configuration conf) {
@Override
public void teardown() throws Exception {
super.teardown();
describe("closing file system");
IOUtils.closeStream(getFileSystem());
}

View File

@@ -1,4 +1,4 @@
/**
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -20,18 +20,23 @@
import com.google.common.util.concurrent.ListenableFuture;
import org.apache.hadoop.util.StopWatch;
import org.junit.*;
import org.junit.AfterClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.Timeout;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
/**
* Basic unit test for S3A's blocking executor service.
* Basic test for S3A's blocking executor service.
*/
public class ITestBlockingThreadPoolExecutorService {
@@ -47,7 +52,10 @@ public class ITestBlockingThreadPoolExecutorService {
private static final Integer SOME_VALUE = 1337;
private static BlockingThreadPoolExecutorService tpe = null;
private static BlockingThreadPoolExecutorService tpe;
@Rule
public Timeout testTimeout = new Timeout(60 * 1000);
@AfterClass
public static void afterClass() throws Exception {
@@ -71,13 +79,23 @@ public void testSubmitCallable() throws Exception {
@Test
public void testSubmitRunnable() throws Exception {
ensureCreated();
int totalTasks = NUM_ACTIVE_TASKS + NUM_WAITING_TASKS;
verifyQueueSize(tpe, NUM_ACTIVE_TASKS + NUM_WAITING_TASKS);
}
/**
* Verify the size of the executor's queue, by verifying that the first
* submission to block is {@code expectedQueueSize + 1}.
* @param executorService executor service to test
* @param expectedQueueSize size of queue
*/
protected void verifyQueueSize(ExecutorService executorService,
int expectedQueueSize) {
StopWatch stopWatch = new StopWatch().start();
for (int i = 0; i < totalTasks; i++) {
tpe.submit(sleeper);
for (int i = 0; i < expectedQueueSize; i++) {
executorService.submit(sleeper);
assertDidntBlock(stopWatch);
}
tpe.submit(sleeper);
executorService.submit(sleeper);
assertDidBlock(stopWatch);
}
@@ -93,6 +111,15 @@ public void testShutdown() throws Exception {
ensureDestroyed();
}
@Test
public void testChainedQueue() throws Throwable {
ensureCreated();
int size = 2;
ExecutorService wrapper = new SemaphoredDelegatingExecutor(tpe,
size, true);
verifyQueueSize(wrapper, size);
}
// Helper functions, etc.
private void assertDidntBlock(StopWatch sw) {
@@ -141,8 +168,9 @@ public Integer call() throws Exception {
private static void ensureCreated() throws Exception {
if (tpe == null) {
LOG.debug("Creating thread pool");
tpe = new BlockingThreadPoolExecutorService(NUM_ACTIVE_TASKS,
NUM_WAITING_TASKS, 1, TimeUnit.SECONDS, "btpetest");
tpe = BlockingThreadPoolExecutorService.newInstance(
NUM_ACTIVE_TASKS, NUM_WAITING_TASKS,
1, TimeUnit.SECONDS, "btpetest");
}
}

View File

@@ -0,0 +1,90 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.contract.ContractTestUtils;
import org.apache.hadoop.io.IOUtils;
import org.junit.Test;
import java.io.IOException;
import static org.apache.hadoop.fs.s3a.Constants.*;
/**
* Tests small file upload functionality for
* {@link S3ABlockOutputStream} with the blocks buffered in byte arrays.
*
* File sizes are kept small to reduce test duration on slow connections;
* multipart tests are kept in scale tests.
*/
public class ITestS3ABlockOutputArray extends AbstractS3ATestBase {
@Override
protected Configuration createConfiguration() {
Configuration conf = super.createConfiguration();
S3ATestUtils.disableFilesystemCaching(conf);
conf.setLong(MIN_MULTIPART_THRESHOLD, MULTIPART_MIN_SIZE);
conf.setInt(MULTIPART_SIZE, MULTIPART_MIN_SIZE);
conf.setBoolean(Constants.FAST_UPLOAD, true);
conf.set(FAST_UPLOAD_BUFFER, getBlockOutputBufferName());
return conf;
}
protected String getBlockOutputBufferName() {
return FAST_UPLOAD_BUFFER_ARRAY;
}
@Test
public void testZeroByteUpload() throws IOException {
verifyUpload("0", 0);
}
@Test
public void testRegularUpload() throws IOException {
verifyUpload("regular", 1024);
}
@Test(expected = IOException.class)
public void testDoubleStreamClose() throws Throwable {
Path dest = path("testDoubleStreamClose");
describe(" testDoubleStreamClose");
FSDataOutputStream stream = getFileSystem().create(dest, true);
byte[] data = ContractTestUtils.dataset(16, 'a', 26);
try {
stream.write(data);
stream.close();
stream.write(data);
} finally {
IOUtils.closeStream(stream);
}
}
public void verifyUpload(String name, int fileSize) throws IOException {
Path dest = path(name);
describe(name + " upload to " + dest);
ContractTestUtils.createAndVerifyFile(
getFileSystem(),
dest,
fileSize);
}
}

View File

@@ -0,0 +1,30 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a;
/**
* Use {@link Constants#FAST_UPLOAD_BYTEBUFFER} for buffering.
*/
public class ITestS3ABlockOutputByteBuffer extends ITestS3ABlockOutputArray {
protected String getBlockOutputBufferName() {
return Constants.FAST_UPLOAD_BYTEBUFFER;
}
}

View File

@@ -0,0 +1,30 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a;
/**
* Use {@link Constants#FAST_UPLOAD_BUFFER_DISK} for buffering.
*/
public class ITestS3ABlockOutputDisk extends ITestS3ABlockOutputArray {
protected String getBlockOutputBufferName() {
return Constants.FAST_UPLOAD_BUFFER_DISK;
}
}

View File

@@ -72,6 +72,8 @@ public void testRegularMultiPartUpload() throws Exception {
@Test
public void testFastMultiPartUpload() throws Exception {
conf.setBoolean(Constants.FAST_UPLOAD, true);
conf.set(Constants.FAST_UPLOAD_BUFFER,
Constants.FAST_UPLOAD_BYTEBUFFER);
fs = S3ATestUtils.createTestFileSystem(conf);
ContractTestUtils.createAndVerifyFile(fs, getTestPath(), 16 * 1024 *
1024);

View File

@@ -28,6 +28,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.contract.ContractTestUtils;
import org.apache.hadoop.fs.s3native.S3xLoginHelper;
import org.apache.hadoop.test.GenericTestUtils;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.Timeout;
@@ -35,6 +36,7 @@
import org.slf4j.LoggerFactory;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertEquals;
@@ -417,6 +419,33 @@ public void testCloseIdempotent() throws Throwable {
fs.close();
}
@Test
public void testDirectoryAllocatorDefval() throws Throwable {
conf = new Configuration();
conf.unset(Constants.BUFFER_DIR);
fs = S3ATestUtils.createTestFileSystem(conf);
File tmp = fs.createTmpFileForWrite("out-", 1024, conf);
assertTrue("not found: " + tmp, tmp.exists());
tmp.delete();
}
@Test
public void testDirectoryAllocatorRR() throws Throwable {
File dir1 = GenericTestUtils.getRandomizedTestDir();
File dir2 = GenericTestUtils.getRandomizedTestDir();
dir1.mkdirs();
dir2.mkdirs();
conf = new Configuration();
conf.set(Constants.BUFFER_DIR, dir1 +", " + dir2);
fs = S3ATestUtils.createTestFileSystem(conf);
File tmp1 = fs.createTmpFileForWrite("out-", 1024, conf);
tmp1.delete();
File tmp2 = fs.createTmpFileForWrite("out-", 1024, conf);
tmp2.delete();
assertNotEquals("round robin not working",
tmp1.getParent(), tmp2.getParent());
}
/**
* Reads and returns a field from an object using reflection. If the field
* cannot be found, is null, or is not the expected type, then this method

View File

@@ -21,15 +21,16 @@
import org.apache.hadoop.conf.Configuration;
/**
* Run the encryption tests against the Fast output stream.
* This verifies that both file writing paths can encrypt their data.
* Run the encryption tests against the block output stream.
*/
public class ITestS3AEncryptionFastOutputStream extends ITestS3AEncryption {
public class ITestS3AEncryptionBlockOutputStream extends ITestS3AEncryption {
@Override
protected Configuration createConfiguration() {
Configuration conf = super.createConfiguration();
conf.setBoolean(Constants.FAST_UPLOAD, true);
conf.set(Constants.FAST_UPLOAD_BUFFER,
Constants.FAST_UPLOAD_BYTEBUFFER);
return conf;
}
}

View File

@@ -1,74 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.contract.ContractTestUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.Timeout;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
/**
* Tests regular and multi-part upload functionality for S3AFastOutputStream.
* File sizes are kept small to reduce test duration on slow connections
*/
public class ITestS3AFastOutputStream {
private FileSystem fs;
@Rule
public Timeout testTimeout = new Timeout(30 * 60 * 1000);
@Before
public void setUp() throws Exception {
Configuration conf = new Configuration();
conf.setLong(Constants.MIN_MULTIPART_THRESHOLD, 5 * 1024 * 1024);
conf.setInt(Constants.MULTIPART_SIZE, 5 * 1024 * 1024);
conf.setBoolean(Constants.FAST_UPLOAD, true);
fs = S3ATestUtils.createTestFileSystem(conf);
}
@After
public void tearDown() throws Exception {
if (fs != null) {
fs.delete(getTestPath(), true);
}
}
protected Path getTestPath() {
return new Path("/tests3a");
}
@Test
public void testRegularUpload() throws IOException {
ContractTestUtils.createAndVerifyFile(fs, getTestPath(), 1024 * 1024);
}
@Test
public void testMultiPartUpload() throws IOException {
ContractTestUtils.createAndVerifyFile(fs, getTestPath(), 6 * 1024 *
1024);
}
}

View File

@@ -0,0 +1,98 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a;
import org.apache.hadoop.conf.Configuration;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.hadoop.fs.s3a.S3ATestUtils.*;
/**
* Test the test utils. Why an integration test? it's needed to
* verify property pushdown.
*/
public class ITestS3ATestUtils extends Assert {
private static final Logger LOG =
LoggerFactory.getLogger(ITestS3ATestUtils.class);
public static final String KEY = "undefined.property";
@Before
public void clear() {
System.clearProperty(KEY);
}
@Test
public void testGetTestProperty() throws Throwable {
Configuration conf = new Configuration(false);
assertEquals("a", getTestProperty(conf, KEY, "a"));
conf.set(KEY, "\t b \n");
assertEquals("b", getTestProperty(conf, KEY, "a"));
System.setProperty(KEY, "c");
assertEquals("c", getTestProperty(conf, KEY, "a"));
unsetSysprop();
assertEquals("b", getTestProperty(conf, KEY, "a"));
}
@Test
public void testGetTestPropertyLong() throws Throwable {
Configuration conf = new Configuration(false);
assertEquals(1, getTestPropertyLong(conf, KEY, 1));
conf.setInt(KEY, 2);
assertEquals(2, getTestPropertyLong(conf, KEY, 1));
System.setProperty(KEY, "3");
assertEquals(3, getTestPropertyLong(conf, KEY, 1));
}
@Test
public void testGetTestPropertyInt() throws Throwable {
Configuration conf = new Configuration(false);
assertEquals(1, getTestPropertyInt(conf, KEY, 1));
conf.setInt(KEY, 2);
assertEquals(2, getTestPropertyInt(conf, KEY, 1));
System.setProperty(KEY, "3");
assertEquals(3, getTestPropertyInt(conf, KEY, 1));
conf.unset(KEY);
assertEquals(3, getTestPropertyInt(conf, KEY, 1));
unsetSysprop();
assertEquals(5, getTestPropertyInt(conf, KEY, 5));
}
@Test
public void testGetTestPropertyBool() throws Throwable {
Configuration conf = new Configuration(false);
assertTrue(getTestPropertyBool(conf, KEY, true));
conf.set(KEY, "\tfalse \n");
assertFalse(getTestPropertyBool(conf, KEY, true));
System.setProperty(KEY, "true");
assertTrue(getTestPropertyBool(conf, KEY, true));
unsetSysprop();
assertEquals("false", getTestProperty(conf, KEY, "true"));
conf.unset(KEY);
assertTrue(getTestPropertyBool(conf, KEY, true));
}
protected void unsetSysprop() {
System.setProperty(KEY, UNSET_PROPERTY);
}
}

View File

@@ -43,15 +43,36 @@ public interface S3ATestConstants {
*/
String TEST_FS_S3A_NAME = TEST_FS_S3A + "name";
/**
* Run the encryption tests?
*/
String KEY_ENCRYPTION_TESTS = TEST_FS_S3A + "encryption.enabled";
/**
* Tell tests that they are being executed in parallel: {@value}.
*/
String KEY_PARALLEL_TEST_EXECUTION = "test.parallel.execution";
/**
* A property set to true in maven if scale tests are enabled: {@value}.
*/
String KEY_SCALE_TESTS_ENABLED = S3A_SCALE_TEST + "enabled";
/**
* The number of operations to perform: {@value}.
*/
String KEY_OPERATION_COUNT = SCALE_TEST + "operation.count";
/**
* The number of directory operations to perform: {@value}.
*/
String KEY_DIRECTORY_COUNT = SCALE_TEST + "directory.count";
/**
* The readahead buffer: {@value}.
*/
String KEY_READ_BUFFER_SIZE = S3A_SCALE_TEST + "read.buffer.size";
int DEFAULT_READ_BUFFER_SIZE = 16384;
/**
@@ -64,13 +85,63 @@ public interface S3ATestConstants {
*/
String DEFAULT_CSVTEST_FILE = "s3a://landsat-pds/scene_list.gz";
/**
* Endpoint for the S3 CSV/scale tests. This defaults to
* being us-east.
*/
String KEY_CSVTEST_ENDPOINT = S3A_SCALE_TEST + "csvfile.endpoint";
/**
* Endpoint for the S3 CSV/scale tests. This defaults to
* being us-east.
*/
String DEFAULT_CSVTEST_ENDPOINT = "s3.amazonaws.com";
/**
* Name of the property to define the timeout for scale tests: {@value}.
* Measured in seconds.
*/
String KEY_TEST_TIMEOUT = S3A_SCALE_TEST + "timeout";
/**
* Name of the property to define the file size for the huge file
* tests: {@value}.
* Measured in KB; a suffix like "M", or "G" will change the unit.
*/
String KEY_HUGE_FILESIZE = S3A_SCALE_TEST + "huge.filesize";
/**
* Name of the property to define the partition size for the huge file
* tests: {@value}.
* Measured in KB; a suffix like "M", or "G" will change the unit.
*/
String KEY_HUGE_PARTITION_SIZE = S3A_SCALE_TEST + "huge.partitionsize";
/**
* The default huge size is small; full 5GB+ scale tests are something
* to run in long test runs on EC2 VMs. {@value}.
*/
String DEFAULT_HUGE_FILESIZE = "10M";
/**
* The default number of operations to perform: {@value}.
*/
long DEFAULT_OPERATION_COUNT = 2005;
/**
* Run the encryption tests?
* Default number of directories to create when performing
* directory performance/scale tests.
*/
String KEY_ENCRYPTION_TESTS = TEST_FS_S3A + "encryption.enabled";
int DEFAULT_DIRECTORY_COUNT = 2;
/**
* Default scale test timeout in seconds: {@value}.
*/
int DEFAULT_TEST_TIMEOUT = 30 * 60;
/**
* Default policy on scale tests: {@value}.
*/
boolean DEFAULT_SCALE_TESTS_ENABLED = false;
}

View File

@@ -39,6 +39,12 @@
*/
public class S3ATestUtils {
/**
* Value to set a system property to (in maven) to declare that
* a property has been unset.
*/
public static final String UNSET_PROPERTY = "unset";
/**
* Create the test filesystem.
*
@@ -53,8 +59,25 @@ public class S3ATestUtils {
*/
public static S3AFileSystem createTestFileSystem(Configuration conf)
throws IOException {
String fsname = conf.getTrimmed(TEST_FS_S3A_NAME, "");
return createTestFileSystem(conf, true);
}
/**
* Create the test filesystem with or without multipart purging
*
* If the test.fs.s3a.name property is not set, this will
* trigger a JUnit failure.
* @param conf configuration
* @param purge flag to enable Multipart purging
* @return the FS
* @throws IOException IO Problems
* @throws AssumptionViolatedException if the FS is not named
*/
public static S3AFileSystem createTestFileSystem(Configuration conf,
boolean purge)
throws IOException {
String fsname = conf.getTrimmed(TEST_FS_S3A_NAME, "");
boolean liveTest = !StringUtils.isEmpty(fsname);
URI testURI = null;
@@ -70,8 +93,12 @@ public static S3AFileSystem createTestFileSystem(Configuration conf)
}
S3AFileSystem fs1 = new S3AFileSystem();
//enable purging in tests
conf.setBoolean(PURGE_EXISTING_MULTIPART, true);
conf.setInt(PURGE_EXISTING_MULTIPART_AGE, 0);
if (purge) {
conf.setBoolean(PURGE_EXISTING_MULTIPART, true);
// but a long delay so that parallel multipart tests don't
// suddenly start timing out
conf.setInt(PURGE_EXISTING_MULTIPART_AGE, 30 * 60);
}
fs1.initialize(testURI, conf);
return fs1;
}
@@ -148,6 +175,121 @@ public static void useCSVDataEndpoint(Configuration conf) {
}
}
/**
* Get a long test property.
* <ol>
* <li>Look up configuration value (which can pick up core-default.xml),
* using {@code defVal} as the default value (if conf != null).
* </li>
* <li>Fetch the system property.</li>
* <li>If the system property is not empty or "(unset)":
* it overrides the conf value.
* </li>
* </ol>
* This puts the build properties in charge of everything. It's not a
* perfect design; having maven set properties based on a file, as ant let
* you do, is better for customization.
*
* As to why there's a special (unset) value, see
* {@link http://stackoverflow.com/questions/7773134/null-versus-empty-arguments-in-maven}
* @param conf config: may be null
* @param key key to look up
* @param defVal default value
* @return the evaluated test property.
*/
public static long getTestPropertyLong(Configuration conf,
String key, long defVal) {
return Long.valueOf(
getTestProperty(conf, key, Long.toString(defVal)));
}
/**
* Get a test property value in bytes, using k, m, g, t, p, e suffixes.
* {@link org.apache.hadoop.util.StringUtils.TraditionalBinaryPrefix#string2long(String)}
* <ol>
* <li>Look up configuration value (which can pick up core-default.xml),
* using {@code defVal} as the default value (if conf != null).
* </li>
* <li>Fetch the system property.</li>
* <li>If the system property is not empty or "(unset)":
* it overrides the conf value.
* </li>
* </ol>
* This puts the build properties in charge of everything. It's not a
* perfect design; having maven set properties based on a file, as ant let
* you do, is better for customization.
*
* As to why there's a special (unset) value, see
* {@link http://stackoverflow.com/questions/7773134/null-versus-empty-arguments-in-maven}
* @param conf config: may be null
* @param key key to look up
* @param defVal default value
* @return the evaluated test property.
*/
public static long getTestPropertyBytes(Configuration conf,
String key, String defVal) {
return org.apache.hadoop.util.StringUtils.TraditionalBinaryPrefix
.string2long(getTestProperty(conf, key, defVal));
}
/**
* Get an integer test property; algorithm described in
* {@link #getTestPropertyLong(Configuration, String, long)}.
* @param key key to look up
* @param defVal default value
* @return the evaluated test property.
*/
public static int getTestPropertyInt(Configuration conf,
String key, int defVal) {
return (int) getTestPropertyLong(conf, key, defVal);
}
/**
* Get a boolean test property; algorithm described in
* {@link #getTestPropertyLong(Configuration, String, long)}.
* @param key key to look up
* @param defVal default value
* @return the evaluated test property.
*/
public static boolean getTestPropertyBool(Configuration conf,
String key,
boolean defVal) {
return Boolean.valueOf(
getTestProperty(conf, key, Boolean.toString(defVal)));
}
/**
* Get a string test property.
* <ol>
* <li>Look up configuration value (which can pick up core-default.xml),
* using {@code defVal} as the default value (if conf != null).
* </li>
* <li>Fetch the system property.</li>
* <li>If the system property is not empty or "(unset)":
* it overrides the conf value.
* </li>
* </ol>
* This puts the build properties in charge of everything. It's not a
* perfect design; having maven set properties based on a file, as ant let
* you do, is better for customization.
*
* As to why there's a special (unset) value, see
* @see <a href="http://stackoverflow.com/questions/7773134/null-versus-empty-arguments-in-maven">
* Stack Overflow</a>
* @param conf config: may be null
* @param key key to look up
* @param defVal default value
* @return the evaluated test property.
*/
public static String getTestProperty(Configuration conf,
String key,
String defVal) {
String confVal = conf != null ? conf.getTrimmed(key, defVal) : defVal;
String propval = System.getProperty(key);
return StringUtils.isNotEmpty(propval) && !UNSET_PROPERTY.equals(propval)
? propval : confVal;
}
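// A usage sketch of the lookup above (the key and default below are only
// examples, not necessarily defined test constants): a scale test could
// resolve its file size with
//   long filesize = getTestPropertyBytes(conf,
//       "fs.s3a.scale.test.huge.filesize", "128M");
// A build/system property such as -Dfs.s3a.scale.test.huge.filesize=1G
// then overrides the configuration value, while the special value
// "(unset)" leaves the configuration/default in charge.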
/**
* The exception to raise so as to exit fast from
* {@link #eventually(int, Callable)}.

View File

@ -0,0 +1,124 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a;
import org.apache.hadoop.fs.contract.ContractTestUtils;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.Timeout;
/**
* Unit tests for {@link S3ADataBlocks}.
*/
public class TestDataBlocks extends Assert {
@Rule
public Timeout testTimeout = new Timeout(30 * 1000);
@Before
public void nameThread() {
Thread.currentThread().setName("JUnit");
}
/**
* Test the {@link S3ADataBlocks.ByteBufferBlockFactory}.
* That code implements an input stream over a ByteBuffer, and has to
* return the buffer to the pool after the read completes.
*
* This test verifies the basic contract of the process.
*/
@Test
public void testByteBufferIO() throws Throwable {
try (S3ADataBlocks.ByteBufferBlockFactory factory =
new S3ADataBlocks.ByteBufferBlockFactory(null)) {
int limit = 128;
S3ADataBlocks.ByteBufferBlockFactory.ByteBufferBlock block
= factory.create(limit);
assertEquals("outstanding buffers in " + factory,
1, factory.getOutstandingBufferCount());
byte[] buffer = ContractTestUtils.toAsciiByteArray("test data");
int bufferLen = buffer.length;
block.write(buffer, 0, bufferLen);
assertEquals(bufferLen, block.dataSize());
assertEquals("capacity in " + block,
limit - bufferLen, block.remainingCapacity());
assertTrue("hasCapacity(64) in " + block, block.hasCapacity(64));
assertTrue("No capacity in " + block,
block.hasCapacity(limit - bufferLen));
// now start the write
S3ADataBlocks.ByteBufferBlockFactory.ByteBufferInputStream
stream = block.startUpload();
assertTrue("!hasRemaining() in " + stream, stream.hasRemaining());
int expected = bufferLen;
assertEquals("wrong available() in " + stream,
expected, stream.available());
assertEquals('t', stream.read());
expected--;
assertEquals("wrong available() in " + stream,
expected, stream.available());
// close the block. The buffer must remain outstanding here;
// the stream manages the lifecycle of it now
block.close();
assertEquals("outstanding buffers in " + factory,
1, factory.getOutstandingBufferCount());
block.close();
// read into a byte array with an offset
int offset = 5;
byte[] in = new byte[limit];
assertEquals(2, stream.read(in, offset, 2));
assertEquals('e', in[offset]);
assertEquals('s', in[offset + 1]);
expected -= 2;
assertEquals("wrong available() in " + stream,
expected, stream.available());
// read to end
byte[] remainder = new byte[limit];
int c;
int index = 0;
while ((c = stream.read()) >= 0) {
remainder[index++] = (byte) c;
}
assertEquals(expected, index);
assertEquals('a', remainder[--index]);
assertEquals("wrong available() in " + stream,
0, stream.available());
assertTrue("hasRemaining() in " + stream, !stream.hasRemaining());
// when the stream is closed, the data should be returned
stream.close();
assertEquals("outstanding buffers in " + factory,
0, factory.getOutstandingBufferCount());
stream.close();
assertEquals("outstanding buffers in " + factory,
0, factory.getOutstandingBufferCount());
}
}
}

View File

@ -34,6 +34,7 @@ public void setUp() throws Exception {
fc = S3ATestUtils.createTestFileContext(conf);
fc.mkdir(fileContextTestHelper.getTestRootPath(fc, "test"),
FileContext.DEFAULT_PERM, true);
FileContext.clearStatistics();
}
@After

View File

@ -0,0 +1,412 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a.scale;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import com.amazonaws.event.ProgressEvent;
import com.amazonaws.event.ProgressEventType;
import com.amazonaws.event.ProgressListener;
import org.junit.FixMethodOrder;
import org.junit.Test;
import org.junit.runners.MethodSorters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.StorageStatistics;
import org.apache.hadoop.fs.contract.ContractTestUtils;
import org.apache.hadoop.fs.s3a.S3AFileStatus;
import org.apache.hadoop.fs.s3a.Statistic;
import org.apache.hadoop.util.Progressable;
import static org.apache.hadoop.fs.contract.ContractTestUtils.*;
import static org.apache.hadoop.fs.s3a.Constants.*;
import static org.apache.hadoop.fs.s3a.S3ATestUtils.*;
/**
* Scale test which creates a huge file.
*
* <b>Important:</b> the order in which these tests execute is fixed to
* alphabetical order. Test cases are numbered {@code test_123_} to impose
* an ordering based on the numbers.
*
* Having this ordering allows the tests to assume that the huge file
* exists. Even so: they should all have a {@link #assumeHugeFileExists()}
* check at the start, in case an individual test is executed.
*/
@FixMethodOrder(MethodSorters.NAME_ASCENDING)
public abstract class AbstractSTestS3AHugeFiles extends S3AScaleTestBase {
private static final Logger LOG = LoggerFactory.getLogger(
AbstractSTestS3AHugeFiles.class);
public static final int DEFAULT_UPLOAD_BLOCKSIZE = 64 * _1KB;
public static final String DEFAULT_PARTITION_SIZE = "8M";
private Path scaleTestDir;
private Path hugefile;
private Path hugefileRenamed;
private int uploadBlockSize = DEFAULT_UPLOAD_BLOCKSIZE;
private int partitionSize;
@Override
public void setUp() throws Exception {
super.setUp();
final Path testPath = getTestPath();
scaleTestDir = new Path(testPath, "scale");
hugefile = new Path(scaleTestDir, "hugefile");
hugefileRenamed = new Path(scaleTestDir, "hugefileRenamed");
}
@Override
public void tearDown() throws Exception {
// do nothing. Specifically: do not delete the test dir
}
/**
* Note that this can get called before test setup.
* @return the configuration to use.
*/
@Override
protected Configuration createConfiguration() {
Configuration conf = super.createConfiguration();
partitionSize = (int)getTestPropertyBytes(conf,
KEY_HUGE_PARTITION_SIZE,
DEFAULT_PARTITION_SIZE);
assertTrue("Partition size too small: " + partitionSize,
partitionSize > MULTIPART_MIN_SIZE);
conf.setLong(SOCKET_SEND_BUFFER, _1MB);
conf.setLong(SOCKET_RECV_BUFFER, _1MB);
conf.setLong(MIN_MULTIPART_THRESHOLD, partitionSize);
conf.setInt(MULTIPART_SIZE, partitionSize);
conf.set(USER_AGENT_PREFIX, "STestS3AHugeFileCreate");
conf.setBoolean(FAST_UPLOAD, true);
conf.set(FAST_UPLOAD_BUFFER, getBlockOutputBufferName());
return conf;
}
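// Intent of the settings above: with both the multipart threshold and the
// part size tied to the partition size, uploads should switch to multipart
// once the data exceeds a single partition, and each uploaded part should
// be one partition long.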
/**
* The name of the buffering mechanism to use.
* @return a buffering mechanism
*/
protected abstract String getBlockOutputBufferName();
@Test
public void test_010_CreateHugeFile() throws IOException {
assertFalse("Please run this test sequentially to avoid timeouts" +
" and bandwidth problems", isParallelExecution());
long filesize = getTestPropertyBytes(getConf(), KEY_HUGE_FILESIZE,
DEFAULT_HUGE_FILESIZE);
long filesizeMB = filesize / _1MB;
// clean up from any previous attempts
deleteHugeFile();
describe("Creating file %s of size %d MB" +
" with partition size %d buffered by %s",
hugefile, filesizeMB, partitionSize, getBlockOutputBufferName());
// now do a check of available upload time, with a pessimistic bandwidth
// (that of remote upload tests). If the test times out then not only is
// the test outcome lost: the follow-on tests will continue and overlap
// with the still-ongoing upload, causing much confusion.
int timeout = getTestTimeoutSeconds();
// assume 1 MB/s upload bandwidth
int bandwidth = _1MB;
long uploadTime = filesize / bandwidth;
assertTrue(String.format("Timeout set in %s seconds is too low;" +
" estimating upload time of %d seconds at 1 MB/s." +
" Rerun tests with -D%s=%d",
timeout, uploadTime, KEY_TEST_TIMEOUT, uploadTime * 2),
uploadTime < timeout);
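// Worked example of the estimate above (an assumption, not a measured
// figure): a 1 GB upload at the pessimistic 1 MB/s needs more than
// 1024 seconds, so the configured test timeout must comfortably exceed that.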
assertEquals("File size set in " + KEY_HUGE_FILESIZE + " = " + filesize
+ " is not a multiple of " + uploadBlockSize,
0, filesize % uploadBlockSize);
byte[] data = new byte[uploadBlockSize];
for (int i = 0; i < uploadBlockSize; i++) {
data[i] = (byte) (i % 256);
}
long blocks = filesize / uploadBlockSize;
long blocksPerMB = _1MB / uploadBlockSize;
// perform the upload.
// there's lots of logging here, so that a tail -f on the output log
// can give a view of what is happening.
StorageStatistics storageStatistics = fs.getStorageStatistics();
String putRequests = Statistic.OBJECT_PUT_REQUESTS.getSymbol();
String putBytes = Statistic.OBJECT_PUT_BYTES.getSymbol();
Statistic putRequestsActive = Statistic.OBJECT_PUT_REQUESTS_ACTIVE;
Statistic putBytesPending = Statistic.OBJECT_PUT_BYTES_PENDING;
ContractTestUtils.NanoTimer timer = new ContractTestUtils.NanoTimer();
long blocksPer10MB = blocksPerMB * 10;
ProgressCallback progress = new ProgressCallback(timer);
try (FSDataOutputStream out = fs.create(hugefile,
true,
uploadBlockSize,
progress)) {
for (long block = 1; block <= blocks; block++) {
out.write(data);
long written = block * uploadBlockSize;
// every 10 MB and on file upload @ 100%, print some stats
if (block % blocksPer10MB == 0 || written == filesize) {
long percentage = written * 100 / filesize;
double elapsedTime = timer.elapsedTime() / 1.0e9;
double writtenMB = 1.0 * written / _1MB;
LOG.info(String.format("[%02d%%] Buffered %.2f MB out of %d MB;" +
" PUT %d bytes (%d pending) in %d operations (%d active);" +
" elapsedTime=%.2fs; write to buffer bandwidth=%.2f MB/s",
percentage,
writtenMB,
filesizeMB,
storageStatistics.getLong(putBytes),
gaugeValue(putBytesPending),
storageStatistics.getLong(putRequests),
gaugeValue(putRequestsActive),
elapsedTime,
writtenMB / elapsedTime));
}
}
// now close the file
LOG.info("Closing file and completing write operation");
ContractTestUtils.NanoTimer closeTimer
= new ContractTestUtils.NanoTimer();
out.close();
closeTimer.end("time to close() output stream");
}
timer.end("time to write %d MB in blocks of %d",
filesizeMB, uploadBlockSize);
logFSState();
bandwidth(timer, filesize);
long putRequestCount = storageStatistics.getLong(putRequests);
Long putByteCount = storageStatistics.getLong(putBytes);
LOG.info("PUT {} bytes in {} operations; {} MB/operation",
putByteCount, putRequestCount,
putByteCount / (putRequestCount * _1MB));
LOG.info("Time per PUT {} nS",
toHuman(timer.nanosPerOperation(putRequestCount)));
assertEquals("active put requests in \n" + fs,
0, gaugeValue(putRequestsActive));
ContractTestUtils.assertPathExists(fs, "Huge file", hugefile);
S3AFileStatus status = fs.getFileStatus(hugefile);
ContractTestUtils.assertIsFile(hugefile, status);
assertEquals("File size in " + status, filesize, status.getLen());
progress.verifyNoFailures("Put file " + hugefile + " of size " + filesize);
}
/**
* Progress callback from AWS. Likely to come in on a different thread.
*/
private final class ProgressCallback implements Progressable,
ProgressListener {
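// the counters below are updated from the AWS SDK's callback threads,
// hence the atomic types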
private AtomicLong bytesTransferred = new AtomicLong(0);
private AtomicInteger failures = new AtomicInteger(0);
private final ContractTestUtils.NanoTimer timer;
private ProgressCallback(NanoTimer timer) {
this.timer = timer;
}
@Override
public void progress() {
}
@Override
public void progressChanged(ProgressEvent progressEvent) {
ProgressEventType eventType = progressEvent.getEventType();
if (eventType.isByteCountEvent()) {
bytesTransferred.addAndGet(progressEvent.getBytesTransferred());
}
switch (eventType) {
case TRANSFER_PART_FAILED_EVENT:
// failure
failures.incrementAndGet();
LOG.warn("Transfer failure");
break;
case TRANSFER_PART_COMPLETED_EVENT:
// completion
long elapsedTime = timer.elapsedTime();
double elapsedTimeS = elapsedTime / 1.0e9;
long written = bytesTransferred.get();
long writtenMB = written / _1MB;
LOG.info(String.format(
"Event %s; total uploaded=%d MB in %.1fs;" +
" effective upload bandwidth = %.2f MB/s",
progressEvent,
writtenMB, elapsedTimeS, writtenMB / elapsedTimeS));
break;
default:
if (eventType.isByteCountEvent()) {
LOG.debug("Event {}", progressEvent);
} else {
LOG.info("Event {}", progressEvent);
}
break;
}
}
@Override
public String toString() {
String sb = "ProgressCallback{"
+ "bytesTransferred=" + bytesTransferred +
", failures=" + failures +
'}';
return sb;
}
private void verifyNoFailures(String operation) {
assertEquals("Failures in " + operation +": " + this, 0, failures.get());
}
}
void assumeHugeFileExists() throws IOException {
ContractTestUtils.assertPathExists(fs, "huge file not created", hugefile);
ContractTestUtils.assertIsFile(fs, hugefile);
}
private void logFSState() {
LOG.info("File System state after operation:\n{}", fs);
}
@Test
public void test_040_PositionedReadHugeFile() throws Throwable {
assumeHugeFileExists();
final String encryption = getConf().getTrimmed(
SERVER_SIDE_ENCRYPTION_ALGORITHM);
boolean encrypted = encryption != null;
if (encrypted) {
LOG.info("File is encrypted with algorithm {}", encryption);
}
String filetype = encrypted ? "encrypted file" : "file";
describe("Positioned reads of %s %s", filetype, hugefile);
S3AFileStatus status = fs.getFileStatus(hugefile);
long filesize = status.getLen();
int ops = 0;
final int bufferSize = 8192;
byte[] buffer = new byte[bufferSize];
long eof = filesize - 1;
ContractTestUtils.NanoTimer timer = new ContractTestUtils.NanoTimer();
ContractTestUtils.NanoTimer readAtByte0, readAtByte0Again, readAtEOF;
try (FSDataInputStream in = fs.open(hugefile, uploadBlockSize)) {
readAtByte0 = new ContractTestUtils.NanoTimer();
in.readFully(0, buffer);
readAtByte0.end("time to read data at start of file");
ops++;
readAtEOF = new ContractTestUtils.NanoTimer();
in.readFully(eof - bufferSize, buffer);
readAtEOF.end("time to read data at end of file");
ops++;
readAtByte0Again = new ContractTestUtils.NanoTimer();
in.readFully(0, buffer);
readAtByte0Again.end("time to read data at start of file again");
ops++;
LOG.info("Final stream state: {}", in);
}
long mb = Math.max(filesize / _1MB, 1);
logFSState();
timer.end("time to performed positioned reads of %s of %d MB ",
filetype, mb);
LOG.info("Time per positioned read = {} nS",
toHuman(timer.nanosPerOperation(ops)));
}
@Test
public void test_050_readHugeFile() throws Throwable {
assumeHugeFileExists();
describe("Reading %s", hugefile);
S3AFileStatus status = fs.getFileStatus(hugefile);
long filesize = status.getLen();
long blocks = filesize / uploadBlockSize;
byte[] data = new byte[uploadBlockSize];
ContractTestUtils.NanoTimer timer = new ContractTestUtils.NanoTimer();
try (FSDataInputStream in = fs.open(hugefile, uploadBlockSize)) {
for (long block = 0; block < blocks; block++) {
in.readFully(data);
}
LOG.info("Final stream state: {}", in);
}
long mb = Math.max(filesize / _1MB, 1);
timer.end("time to read file of %d MB ", mb);
LOG.info("Time per MB to read = {} nS",
toHuman(timer.nanosPerOperation(mb)));
bandwidth(timer, filesize);
logFSState();
}
@Test
public void test_100_renameHugeFile() throws Throwable {
assumeHugeFileExists();
describe("renaming %s to %s", hugefile, hugefileRenamed);
S3AFileStatus status = fs.getFileStatus(hugefile);
long filesize = status.getLen();
fs.delete(hugefileRenamed, false);
ContractTestUtils.NanoTimer timer = new ContractTestUtils.NanoTimer();
fs.rename(hugefile, hugefileRenamed);
long mb = Math.max(filesize / _1MB, 1);
timer.end("time to rename file of %d MB", mb);
LOG.info("Time per MB to rename = {} nS",
toHuman(timer.nanosPerOperation(mb)));
bandwidth(timer, filesize);
logFSState();
S3AFileStatus destFileStatus = fs.getFileStatus(hugefileRenamed);
assertEquals(filesize, destFileStatus.getLen());
// rename back
ContractTestUtils.NanoTimer timer2 = new ContractTestUtils.NanoTimer();
fs.rename(hugefileRenamed, hugefile);
timer2.end("Renaming back");
LOG.info("Time per MB to rename = {} nS",
toHuman(timer2.nanosPerOperation(mb)));
bandwidth(timer2, filesize);
}
@Test
public void test_999_DeleteHugeFiles() throws IOException {
deleteHugeFile();
ContractTestUtils.NanoTimer timer2 = new ContractTestUtils.NanoTimer();
fs.delete(hugefileRenamed, false);
timer2.end("time to delete %s", hugefileRenamed);
ContractTestUtils.rm(fs, getTestPath(), true, true);
}
protected void deleteHugeFile() throws IOException {
describe("Deleting %s", hugefile);
NanoTimer timer = new NanoTimer();
fs.delete(hugefile, false);
timer.end("time to delete %s", hugefile);
}
}

View File

@ -116,20 +116,9 @@ public Boolean call() throws IOException {
@Test
public void testOpenCreate() throws IOException {
Path dir = new Path("/tests3a");
ContractTestUtils.createAndVerifyFile(fs, dir, 1024);
ContractTestUtils.createAndVerifyFile(fs, dir, 5 * 1024 * 1024);
ContractTestUtils.createAndVerifyFile(fs, dir, 20 * 1024 * 1024);
/*
Enable to test the multipart upload
try {
ContractTestUtils.createAndVerifyFile(fs, dir,
(long)6 * 1024 * 1024 * 1024);
} catch (IOException e) {
fail(e.getMessage());
}
*/
final Path scaleTestDir = getTestPath();
final Path srcDir = new Path(scaleTestDir, "opencreate");
ContractTestUtils.createAndVerifyFile(fs, srcDir, 1024);
ContractTestUtils.createAndVerifyFile(fs, srcDir, 50 * 1024);
}
}

View File

@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a.scale;
import org.apache.hadoop.fs.s3a.Constants;
/**
* Use {@link Constants#FAST_UPLOAD_BUFFER_ARRAY} for buffering.
*/
public class ITestS3AHugeFilesArrayBlocks extends AbstractSTestS3AHugeFiles {
protected String getBlockOutputBufferName() {
return Constants.FAST_UPLOAD_BUFFER_ARRAY;
}
}

View File

@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a.scale;
import org.apache.hadoop.fs.s3a.Constants;
import static org.apache.hadoop.fs.s3a.Constants.FAST_UPLOAD_BYTEBUFFER;
/**
* Use {@link Constants#FAST_UPLOAD_BYTEBUFFER} for buffering.
*/
public class ITestS3AHugeFilesByteBufferBlocks
extends AbstractSTestS3AHugeFiles {
protected String getBlockOutputBufferName() {
return FAST_UPLOAD_BYTEBUFFER;
}
}

View File

@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a.scale;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.s3a.Constants;
/**
* Use classic output for writing things; tweaks the configuration to do
* this after it has been set up in the superclass.
* The generator test has been copied and re
*/
public class ITestS3AHugeFilesClassicOutput extends AbstractSTestS3AHugeFiles {
@Override
protected Configuration createConfiguration() {
final Configuration conf = super.createConfiguration();
conf.setBoolean(Constants.FAST_UPLOAD, false);
return conf;
}
protected String getBlockOutputBufferName() {
return "classic";
}
}

View File

@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a.scale;
import org.apache.hadoop.fs.s3a.Constants;
/**
* Use {@link Constants#FAST_UPLOAD_BUFFER_DISK} for buffering.
*/
public class ITestS3AHugeFilesDiskBlocks extends AbstractSTestS3AHugeFiles {
protected String getBlockOutputBufferName() {
return Constants.FAST_UPLOAD_BUFFER_DISK;
}
}

View File

@ -20,18 +20,18 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.contract.ContractTestUtils;
import org.apache.hadoop.fs.s3a.S3AFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3a.S3AInputStream;
import org.apache.hadoop.fs.s3a.S3AInstrumentation;
import org.apache.hadoop.fs.s3a.S3ATestConstants;
import org.apache.hadoop.fs.s3a.S3ATestUtils;
import org.apache.hadoop.fs.s3a.Statistic;
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
import org.junit.After;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.rules.TestName;
import org.junit.rules.Timeout;
@ -40,6 +40,8 @@
import java.io.InputStream;
import static org.apache.hadoop.fs.s3a.S3ATestUtils.*;
/**
* Base class for scale tests; here is where the common scale configuration
* keys are defined.
@ -47,71 +49,18 @@
public class S3AScaleTestBase extends Assert implements S3ATestConstants {
@Rule
public TestName methodName = new TestName();
public final TestName methodName = new TestName();
@Rule
public Timeout testTimeout = new Timeout(30 * 60 * 1000);
public Timeout testTimeout = createTestTimeout();
@BeforeClass
public static void nameThread() {
@Before
public void nameThread() {
Thread.currentThread().setName("JUnit");
}
/**
* The number of operations to perform: {@value}.
*/
public static final String KEY_OPERATION_COUNT =
SCALE_TEST + "operation.count";
/**
* The number of directory operations to perform: {@value}.
*/
public static final String KEY_DIRECTORY_COUNT =
SCALE_TEST + "directory.count";
/**
* The readahead buffer: {@value}.
*/
public static final String KEY_READ_BUFFER_SIZE =
S3A_SCALE_TEST + "read.buffer.size";
public static final int DEFAULT_READ_BUFFER_SIZE = 16384;
/**
* Key for a multi MB test file: {@value}.
*/
public static final String KEY_CSVTEST_FILE =
S3A_SCALE_TEST + "csvfile";
/**
* Default path for the multi MB test file: {@value}.
*/
public static final String DEFAULT_CSVTEST_FILE
= "s3a://landsat-pds/scene_list.gz";
/**
* Endpoint for the S3 CSV/scale tests. This defaults to
* being us-east.
*/
public static final String KEY_CSVTEST_ENDPOINT =
S3A_SCALE_TEST + "csvfile.endpoint";
/**
* Endpoint for the S3 CSV/scale tests. This defaults to
* being us-east.
*/
public static final String DEFAULT_CSVTEST_ENDPOINT =
"s3.amazonaws.com";
/**
* The default number of operations to perform: {@value}.
*/
public static final long DEFAULT_OPERATION_COUNT = 2005;
/**
* Default number of directories to create when performing
* directory performance/scale tests.
*/
public static final int DEFAULT_DIRECTORY_COUNT = 2;
public static final int _1KB = 1024;
public static final int _1MB = _1KB * _1KB;
protected S3AFileSystem fs;
@ -120,6 +69,8 @@ public static void nameThread() {
private Configuration conf;
private boolean enabled;
/**
* Configuration generator. May be overridden to inject
* some custom options.
@ -137,11 +88,33 @@ public Configuration getConf() {
return conf;
}
/**
* Setup. This triggers creation of the configuration.
*/
@Before
public void setUp() throws Exception {
conf = createConfiguration();
demandCreateConfiguration();
LOG.debug("Scale test operation count = {}", getOperationCount());
fs = S3ATestUtils.createTestFileSystem(conf);
// multipart purges are disabled on the scale tests
fs = createTestFileSystem(conf, false);
// check for the test being enabled
enabled = getTestPropertyBool(
getConf(),
KEY_SCALE_TESTS_ENABLED,
DEFAULT_SCALE_TESTS_ENABLED);
Assume.assumeTrue("Scale test disabled: to enable set property " +
KEY_SCALE_TESTS_ENABLED, enabled);
}
/**
* Create the configuration if it is not already set up.
* @return the configuration.
*/
private synchronized Configuration demandCreateConfiguration() {
if (conf == null) {
conf = createConfiguration();
}
return conf;
}
@After
@ -160,7 +133,27 @@ protected long getOperationCount() {
}
/**
* Describe a test in the logs
* Create the timeout for tests. Some large tests may need a larger value.
* @return the test timeout to use
*/
protected Timeout createTestTimeout() {
demandCreateConfiguration();
return new Timeout(
getTestTimeoutSeconds() * 1000);
}
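// A subclass needing more time could, as a sketch, override this with e.g.
//   @Override
//   protected Timeout createTestTimeout() {
//     return new Timeout(60 * 60 * 1000); // one hour for the largest uploads
//   }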
/**
* Get the test timeout in seconds.
* @return the test timeout as set in system properties or the default.
*/
protected static int getTestTimeoutSeconds() {
return getTestPropertyInt(null,
KEY_TEST_TIMEOUT,
DEFAULT_TEST_TIMEOUT);
}
/**
* Describe a test in the logs.
* @param text text to print
* @param args arguments to format in the printing
*/
@ -189,4 +182,30 @@ protected S3AInstrumentation.InputStreamStatistics getInputStreamStatistics(
}
}
/**
* Get the gauge value of a statistic. Raises an assertion failure if
* there is no such gauge.
* @param statistic statistic to look up
* @return the value.
*/
public long gaugeValue(Statistic statistic) {
S3AInstrumentation instrumentation = fs.getInstrumentation();
MutableGaugeLong gauge = instrumentation.lookupGauge(statistic.getSymbol());
assertNotNull("No gauge " + statistic
+ " in " + instrumentation.dump("", " = ", "\n", true), gauge);
return gauge.value();
}
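// Typical use, as in the huge-file tests: assert that no put requests
// remain active once a stream has been closed, e.g.
//   assertEquals("active put requests", 0,
//       gaugeValue(Statistic.OBJECT_PUT_REQUESTS_ACTIVE));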
protected boolean isEnabled() {
return enabled;
}
/**
* Is this test being executed in parallel with other tests?
* Some of the scale tests use this to validate their test time expectations.
* @return true if the build indicates this test is being run in parallel.
*/
protected boolean isParallelExecution() {
return Boolean.getBoolean(S3ATestConstants.KEY_PARALLEL_TEST_EXECUTION);
}
}