From c2790932bd01c0a8a7c32a38ea99ed47c0fe05f8 Mon Sep 17 00:00:00 2001
From: cnauroth
Date: Tue, 28 Oct 2014 14:06:27 -0700
Subject: [PATCH] HDFS-7291. Persist in-memory replicas with appropriate
 unbuffered copy API on POSIX and Windows. Contributed by Xiaoyu Yao.

(cherry picked from commit c6f04f391b54e57467b3ae5d0a95bd96e7feaa82)

Conflicts:
	hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
---
 .../apache/hadoop/io/nativeio/NativeIO.java  | 66 +++++++++++++++----
 .../org/apache/hadoop/io/nativeio/NativeIO.c | 42 +-----------
 .../hadoop/io/nativeio/TestNativeIO.java     |  5 +-
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt  |  3 +
 .../hadoop/hdfs/server/common/Storage.java   |  9 +--
 5 files changed, 66 insertions(+), 59 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/nativeio/NativeIO.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/nativeio/NativeIO.java
index 24009586a31..4a1ae7a6b70 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/nativeio/NativeIO.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/nativeio/NativeIO.java
@@ -22,22 +22,20 @@ import java.io.FileDescriptor;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
 import java.io.RandomAccessFile;
 import java.lang.reflect.Field;
 import java.nio.ByteBuffer;
 import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
-import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.fs.HardLink;
-import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.io.SecureIOUtils.AlreadyExistsException;
 import org.apache.hadoop.util.NativeCodeLoader;
 import org.apache.hadoop.util.Shell;
@@ -662,7 +660,7 @@ public class NativeIO {
    * user account name, of the format DOMAIN\UserName. This method
    * will remove the domain part of the full logon name.
    *
-   * @param the full principal name containing the domain
+   * @param name the full principal name containing the domain
    * @return name with domain removed
    */
  private static String stripDomain(String name) {
@@ -855,24 +853,66 @@ public class NativeIO {
 
  /**
    * Unbuffered file copy from src to dst without tainting OS buffer cache
-   * In Linux, it uses sendfile() which uses O_DIRECT flag internally
-   * In Windows, it uses CopyFileEx with COPY_FILE_NO_BUFFERING flag
    *
-   * Note: This does not support FreeBSD/OSX which have a different sendfile()
-   * semantic. Also, this simple native wrapper does minimal parameter checking
+   * On POSIX platforms:
+   * It uses FileChannel#transferTo(), which internally attempts
+   * unbuffered IO on an OS with native sendfile64() support and falls back to
+   * buffered IO otherwise.
+   *
+   * It minimizes the number of FileChannel#transferTo() calls by passing the
+   * src file size directly as the 3rd parameter instead of a smaller size.
+   * This reduces the number of sendfile64() system calls when native sendfile64()
+   * is supported. In the two fallback cases where sendfile is not supported,
+   * FileChannel#transferTo already has its own batching of 8 MB and 8 KB,
+   * respectively.
+   *
+   * On Windows platforms:
+   * It uses its own native wrapper of CopyFileEx with the COPY_FILE_NO_BUFFERING
+   * flag, which is supported on Windows Server 2008 and above.
+   *
+   * Ideally, we should use FileChannel#transferTo() across both POSIX and Windows
+   * platforms. Unfortunately, the wrapper (Java_sun_nio_ch_FileChannelImpl_transferTo0)
+   * used by FileChannel#transferTo for unbuffered IO is not implemented on Windows.
+   * Based on OpenJDK 6/7/8 source code, Java_sun_nio_ch_FileChannelImpl_transferTo0
+   * on Windows simply returns IOS_UNSUPPORTED.
+   *
+   * Note: This simple native wrapper does minimal parameter checking before the copy
+   * and minimal consistency checking (e.g., size) after the copy.
    * It is recommended to use wrapper function like
-   * the Storage#nativeCopyFileUnbuffered() function in hadoop-hdfs.
-   *
+   * the Storage#nativeCopyFileUnbuffered() function in hadoop-hdfs with pre/post copy
+   * checks.
    *
    * @param src The source path
    * @param dst The destination path
    * @throws IOException
    */
  public static void copyFileUnbuffered(File src, File dst) throws IOException {
-    if ((nativeLoaded) && (Shell.WINDOWS || Shell.LINUX)) {
+    if (nativeLoaded && Shell.WINDOWS) {
      copyFileUnbuffered0(src.getAbsolutePath(), dst.getAbsolutePath());
    } else {
-      FileUtils.copyFile(src, dst);
+      FileInputStream fis = null;
+      FileOutputStream fos = null;
+      FileChannel input = null;
+      FileChannel output = null;
+      try {
+        fis = new FileInputStream(src);
+        fos = new FileOutputStream(dst);
+        input = fis.getChannel();
+        output = fos.getChannel();
+        long remaining = input.size();
+        long position = 0;
+        long transferred = 0;
+        while (remaining > 0) {
+          transferred = input.transferTo(position, remaining, output);
+          remaining -= transferred;
+          position += transferred;
+        }
+      } finally {
+        IOUtils.cleanup(LOG, output);
+        IOUtils.cleanup(LOG, fos);
+        IOUtils.cleanup(LOG, input);
+        IOUtils.cleanup(LOG, fis);
+      }
    }
  }
 
diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/nativeio/NativeIO.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/nativeio/NativeIO.c
index 792205d5a9e..f0f9ebcb7f4 100644
--- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/nativeio/NativeIO.c
+++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/nativeio/NativeIO.c
@@ -1155,46 +1155,8 @@ Java_org_apache_hadoop_io_nativeio_NativeIO_copyFileUnbuffered0(
  JNIEnv *env, jclass clazz, jstring jsrc, jstring jdst)
 {
 #ifdef UNIX
-#if (defined(__FreeBSD__) || defined(__MACH__))
-  THROW(env, "java/io/IOException",
-    "The function copyFileUnbuffered() is not supported on FreeBSD or Mac OS");
-  return;
-#else
-  const char *src = NULL, *dst = NULL;
-  int srcFd = -1;
-  int dstFd = -1;
-  struct stat s;
-  off_t offset = 0;
-
-  src = (*env)->GetStringUTFChars(env, jsrc, NULL);
-  if (!src) goto cleanup; // exception was thrown
-  dst = (*env)->GetStringUTFChars(env, jdst, NULL);
-  if (!dst) goto cleanup; // exception was thrown
-
-  srcFd = open(src, O_RDONLY);
-  if (srcFd == -1) {
-    throw_ioe(env, errno);
-    goto cleanup;
-  }
-  if (fstat(srcFd, &s) == -1){
-    throw_ioe(env, errno);
-    goto cleanup;
-  }
-  dstFd = open(dst, O_WRONLY | O_CREAT, s.st_mode);
-  if (dstFd == -1) {
-    throw_ioe(env, errno);
-    goto cleanup;
-  }
-  if (sendfile(dstFd, srcFd, &offset, s.st_size) == -1) {
-    throw_ioe(env, errno);
-  }
-
-cleanup:
-  if (src) (*env)->ReleaseStringUTFChars(env, jsrc, src);
-  if (dst) (*env)->ReleaseStringUTFChars(env, jdst, dst);
-  if (srcFd != -1) close(srcFd);
-  if (dstFd != -1) close(dstFd);
-#endif
+  THROW(env, "java/lang/UnsupportedOperationException",
+    "The function copyFileUnbuffered0 should not be used on Unix. Use FileChannel#transferTo instead.");
 #endif
 
 #ifdef WINDOWS
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/nativeio/TestNativeIO.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/nativeio/TestNativeIO.java
index 5425c4994be..bf3ece7894c 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/nativeio/TestNativeIO.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/nativeio/TestNativeIO.java
@@ -632,7 +632,7 @@ public class TestNativeIO {
    final String METHOD_NAME = GenericTestUtils.getMethodName();
    File srcFile = new File(TEST_DIR, METHOD_NAME + ".src.dat");
    File dstFile = new File(TEST_DIR, METHOD_NAME + ".dst.dat");
-    final int fileSize = 0x8FFFFFF; // 128 MB
+    final int fileSize = 0x8000000; // 128 MB
    final int SEED = 0xBEEF;
    final int batchSize = 4096;
    final int numBatches = fileSize / batchSize;
@@ -650,7 +650,8 @@ public class TestNativeIO {
        mapBuf.put(bytesToWrite);
      }
      NativeIO.copyFileUnbuffered(srcFile, dstFile);
-    }finally {
+      Assert.assertEquals(srcFile.length(), dstFile.length());
+    } finally {
      IOUtils.cleanup(LOG, channel);
      IOUtils.cleanup(LOG, raSrcFile);
      FileUtils.deleteQuietly(TEST_DIR);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 48915ab4d3d..290b3ef31c9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -802,6 +802,9 @@ Release 2.6.0 - UNRELEASED
    HDFS-6934. Move checksum computation off the hot path when writing to
    RAM disk. (cnauroth)
 
+    HDFS-7291. Persist in-memory replicas with appropriate unbuffered copy API
+    on POSIX and Windows. (Xiaoyu Yao via cnauroth)
+
  BREAKDOWN OF HDFS-6134 AND HADOOP-10150 SUBTASKS AND RELATED JIRAS
 
    HDFS-6387. HDFS CLI admin tool for creating & deleting an
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
index 4320e229529..735e0c1b1c2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
@@ -1003,10 +1003,11 @@ public abstract class Storage extends StorageInfo {
    * This method copies the contents of the specified source file
    * to the specified destination file using OS specific unbuffered IO.
    * The goal is to avoid churning the file system buffer cache when copying
-   * large files. TheFileUtils#copyLarge function from apache-commons-io library
-   * can be used to achieve this with an internal memory buffer but is less
-   * efficient than the native unbuffered APIs such as sendfile() in Linux and
-   * CopyFileEx() in Windows wrapped in {@link NativeIO#copyFileUnbuffered}.
+   * large files.
+   *
+   * We can't use FileUtils#copyFile from apache-commons-io because it
+   * is a buffered IO based on FileChannel#transferFrom, which uses MappedByteBuffer
+   * internally.
    *
    * The directory holding the destination file is created if it does not exist.
    * If the destination file exists, then this method will delete it first.
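
The javadoc added above recommends calling NativeIO#copyFileUnbuffered through a wrapper that performs its own pre/post copy checks, the way Storage#nativeCopyFileUnbuffered in hadoop-hdfs is described to do. The following is a minimal illustrative sketch of that pattern, not the actual Storage implementation; the class and method names (UnbufferedCopyExample, copyWithChecks) are hypothetical.

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.io.nativeio.NativeIO;

public class UnbufferedCopyExample {
  /**
   * Illustrative wrapper: validate inputs before the copy and verify the
   * result afterwards, since copyFileUnbuffered itself does neither.
   */
  static void copyWithChecks(File src, File dst) throws IOException {
    // Pre-copy checks: source must exist, destination directory must exist,
    // and any stale destination file is removed first.
    if (!src.isFile()) {
      throw new IOException("Source file " + src + " does not exist");
    }
    File parent = dst.getParentFile();
    if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
      throw new IOException("Cannot create directory " + parent);
    }
    if (dst.exists() && !dst.delete()) {
      throw new IOException("Cannot delete existing file " + dst);
    }

    // Unbuffered copy: CopyFileEx on Windows when the native library is
    // loaded, otherwise the FileChannel#transferTo loop added by this patch.
    NativeIO.copyFileUnbuffered(src, dst);

    // Post-copy consistency check on size, mirroring the assertion added to
    // TestNativeIO in this patch.
    if (src.length() != dst.length()) {
      throw new IOException("Copy failed: expected " + src.length()
          + " bytes at " + dst + " but found " + dst.length());
    }
  }
}

Either path of NativeIO#copyFileUnbuffered behaves the same from the caller's point of view, so a wrapper like this stays platform independent.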