From c2790932bd01c0a8a7c32a38ea99ed47c0fe05f8 Mon Sep 17 00:00:00 2001
From: cnauroth
Date: Tue, 28 Oct 2014 14:06:27 -0700
Subject: [PATCH] HDFS-7291. Persist in-memory replicas with appropriate
 unbuffered copy API on POSIX and Windows. Contributed by Xiaoyu Yao.

(cherry picked from commit c6f04f391b54e57467b3ae5d0a95bd96e7feaa82)

Conflicts:
	hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
---
 .../apache/hadoop/io/nativeio/NativeIO.java  | 66 +++++++++++++++----
 .../org/apache/hadoop/io/nativeio/NativeIO.c | 42 +-----------
 .../hadoop/io/nativeio/TestNativeIO.java     |  5 +-
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt  |  3 +
 .../hadoop/hdfs/server/common/Storage.java   |  9 +--
 5 files changed, 66 insertions(+), 59 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/nativeio/NativeIO.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/nativeio/NativeIO.java
index 24009586a31..4a1ae7a6b70 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/nativeio/NativeIO.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/nativeio/NativeIO.java
@@ -22,22 +22,20 @@ import java.io.FileDescriptor;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
 import java.io.RandomAccessFile;
 import java.lang.reflect.Field;
 import java.nio.ByteBuffer;
 import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
-import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.fs.HardLink;
-import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.io.SecureIOUtils.AlreadyExistsException;
 import org.apache.hadoop.util.NativeCodeLoader;
 import org.apache.hadoop.util.Shell;
@@ -662,7 +660,7 @@ public class NativeIO {
    * user account name, of the format DOMAIN\UserName. This method
    * will remove the domain part of the full logon name.
    *
-   * @param the full principal name containing the domain
+   * @param name the full principal name containing the domain
    * @return name with domain removed
    */
  private static String stripDomain(String name) {
@@ -855,24 +853,66 @@ public class NativeIO {
 
  /**
    * Unbuffered file copy from src to dst without tainting OS buffer cache
-   * In Linux, it uses sendfile() which uses O_DIRECT flag internally
-   * In Windows, it uses CopyFileEx with COPY_FILE_NO_BUFFERING flag
    *
-   * Note: This does not support FreeBSD/OSX which have a different sendfile()
-   * semantic. Also, this simple native wrapper does minimal parameter checking
+   * On POSIX platforms:
+   * It uses FileChannel#transferTo(), which internally attempts
+   * unbuffered IO on an OS with native sendfile64() support and falls back to
+   * buffered IO otherwise.
+   *
+   * It minimizes the number of FileChannel#transferTo() calls by passing the
+   * src file size directly as the 3rd parameter instead of a smaller size.
+   * This reduces the number of sendfile64() system calls when native sendfile64()
+   * is supported. In the two fallback cases where sendfile is not supported,
+   * FileChannel#transferTo already has its own batching of 8 MB and 8 KB,
+   * respectively.
+   *
+   * On Windows platforms:
+   * It uses its own native wrapper of CopyFileEx with the COPY_FILE_NO_BUFFERING
+   * flag, which is supported on Windows Server 2008 and above.
+   *
+   * Ideally, we should use FileChannel#transferTo() across both POSIX and Windows
+   * platforms. Unfortunately, the wrapper (Java_sun_nio_ch_FileChannelImpl_transferTo0)
+   * used by FileChannel#transferTo for unbuffered IO is not implemented on Windows.
+   * Based on OpenJDK 6/7/8 source code, Java_sun_nio_ch_FileChannelImpl_transferTo0
+   * on Windows simply returns IOS_UNSUPPORTED.
+   *
+   * Note: This simple native wrapper does minimal parameter checking before the copy
+   * and minimal consistency checking (e.g., size) after the copy.
    * It is recommended to use wrapper function like
-   * the Storage#nativeCopyFileUnbuffered() function in hadoop-hdfs.
-   *
+   * the Storage#nativeCopyFileUnbuffered() function in hadoop-hdfs with pre/post copy
+   * checks.
    *
    * @param src The source path
    * @param dst The destination path
    * @throws IOException
    */
  public static void copyFileUnbuffered(File src, File dst) throws IOException {
-    if ((nativeLoaded) && (Shell.WINDOWS || Shell.LINUX)) {
+    if (nativeLoaded && Shell.WINDOWS) {
      copyFileUnbuffered0(src.getAbsolutePath(), dst.getAbsolutePath());
    } else {
-      FileUtils.copyFile(src, dst);
+      FileInputStream fis = null;
+      FileOutputStream fos = null;
+      FileChannel input = null;
+      FileChannel output = null;
+      try {
+        fis = new FileInputStream(src);
+        fos = new FileOutputStream(dst);
+        input = fis.getChannel();
+        output = fos.getChannel();
+        long remaining = input.size();
+        long position = 0;
+        long transferred = 0;
+        while (remaining > 0) {
+          transferred = input.transferTo(position, remaining, output);
+          remaining -= transferred;
+          position += transferred;
+        }
+      } finally {
+        IOUtils.cleanup(LOG, output);
+        IOUtils.cleanup(LOG, fos);
+        IOUtils.cleanup(LOG, input);
+        IOUtils.cleanup(LOG, fis);
+      }
    }
  }
 
diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/nativeio/NativeIO.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/nativeio/NativeIO.c
index 792205d5a9e..f0f9ebcb7f4 100644
--- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/nativeio/NativeIO.c
+++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/nativeio/NativeIO.c
@@ -1155,46 +1155,8 @@ Java_org_apache_hadoop_io_nativeio_NativeIO_copyFileUnbuffered0(
  JNIEnv *env, jclass clazz, jstring jsrc, jstring jdst)
 {
 #ifdef UNIX
-#if (defined(__FreeBSD__) || defined(__MACH__))
-  THROW(env, "java/io/IOException",
-    "The function copyFileUnbuffered() is not supported on FreeBSD or Mac OS");
-  return;
-#else
-  const char *src = NULL, *dst = NULL;
-  int srcFd = -1;
-  int dstFd = -1;
-  struct stat s;
-  off_t offset = 0;
-
-  src = (*env)->GetStringUTFChars(env, jsrc, NULL);
-  if (!src) goto cleanup; // exception was thrown
-  dst = (*env)->GetStringUTFChars(env, jdst, NULL);
-  if (!dst) goto cleanup; // exception was thrown
-
-  srcFd = open(src, O_RDONLY);
-  if (srcFd == -1) {
-    throw_ioe(env, errno);
-    goto cleanup;
-  }
-  if (fstat(srcFd, &s) == -1){
-    throw_ioe(env, errno);
-    goto cleanup;
-  }
-  dstFd = open(dst, O_WRONLY | O_CREAT, s.st_mode);
-  if (dstFd == -1) {
-    throw_ioe(env, errno);
-    goto cleanup;
-  }
-  if (sendfile(dstFd, srcFd, &offset, s.st_size) == -1) {
-    throw_ioe(env, errno);
-  }
-
-cleanup:
-  if (src) (*env)->ReleaseStringUTFChars(env, jsrc, src);
-  if (dst) (*env)->ReleaseStringUTFChars(env, jdst, dst);
-  if (srcFd != -1) close(srcFd);
-  if (dstFd != -1) close(dstFd);
-#endif
+  THROW(env, "java/lang/UnsupportedOperationException",
+    "The function copyFileUnbuffered0 should not be used on Unix. Use FileChannel#transferTo instead.");
 #endif
 
 #ifdef WINDOWS
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/nativeio/TestNativeIO.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/nativeio/TestNativeIO.java
index 5425c4994be..bf3ece7894c 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/nativeio/TestNativeIO.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/nativeio/TestNativeIO.java
@@ -632,7 +632,7 @@ public class TestNativeIO {
    final String METHOD_NAME = GenericTestUtils.getMethodName();
    File srcFile = new File(TEST_DIR, METHOD_NAME + ".src.dat");
    File dstFile = new File(TEST_DIR, METHOD_NAME + ".dst.dat");
-    final int fileSize = 0x8FFFFFF; // 128 MB
+    final int fileSize = 0x8000000; // 128 MB
    final int SEED = 0xBEEF;
    final int batchSize = 4096;
    final int numBatches = fileSize / batchSize;
@@ -650,7 +650,8 @@ public class TestNativeIO {
        mapBuf.put(bytesToWrite);
      }
      NativeIO.copyFileUnbuffered(srcFile, dstFile);
-    }finally {
+      Assert.assertEquals(srcFile.length(), dstFile.length());
+    } finally {
      IOUtils.cleanup(LOG, channel);
      IOUtils.cleanup(LOG, raSrcFile);
      FileUtils.deleteQuietly(TEST_DIR);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 48915ab4d3d..290b3ef31c9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -802,6 +802,9 @@ Release 2.6.0 - UNRELEASED
    HDFS-6934. Move checksum computation off the hot path when writing to
    RAM disk. (cnauroth)
 
+    HDFS-7291. Persist in-memory replicas with appropriate unbuffered copy API
+    on POSIX and Windows. (Xiaoyu Yao via cnauroth)
+
  BREAKDOWN OF HDFS-6134 AND HADOOP-10150 SUBTASKS AND RELATED JIRAS
 
    HDFS-6387. HDFS CLI admin tool for creating & deleting an
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
index 4320e229529..735e0c1b1c2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
@@ -1003,10 +1003,11 @@ public abstract class Storage extends StorageInfo {
    * This method copies the contents of the specified source file
    * to the specified destination file using OS specific unbuffered IO.
    * The goal is to avoid churning the file system buffer cache when copying
-   * large files. TheFileUtils#copyLarge function from apache-commons-io library
-   * can be used to achieve this with an internal memory buffer but is less
-   * efficient than the native unbuffered APIs such as sendfile() in Linux and
-   * CopyFileEx() in Windows wrapped in {@link NativeIO#copyFileUnbuffered}.
+   * large files.
+   *
+   * We can't use FileUtils#copyFile from apache-commons-io because it
+   * is a buffered IO based on FileChannel#transferFrom, which uses MappedByteBuffer
+   * internally.
    *
    * The directory holding the destination file is created if it does not exist.
    * If the destination file exists, then this method will delete it first.
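
The javadoc added above recommends calling NativeIO#copyFileUnbuffered through a wrapper that performs its own pre/post copy checks, the way Storage#nativeCopyFileUnbuffered in hadoop-hdfs is described to do. The following is a minimal illustrative sketch of that pattern, not the actual Storage implementation; the class and method names (UnbufferedCopyExample, copyWithChecks) are hypothetical.

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.io.nativeio.NativeIO;

public class UnbufferedCopyExample {
  /**
   * Illustrative wrapper: validate inputs before the copy and verify the
   * result afterwards, since copyFileUnbuffered itself does neither.
   */
  static void copyWithChecks(File src, File dst) throws IOException {
    // Pre-copy checks: source must exist, destination directory must exist,
    // and any stale destination file is removed first.
    if (!src.isFile()) {
      throw new IOException("Source file " + src + " does not exist");
    }
    File parent = dst.getParentFile();
    if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
      throw new IOException("Cannot create directory " + parent);
    }
    if (dst.exists() && !dst.delete()) {
      throw new IOException("Cannot delete existing file " + dst);
    }

    // Unbuffered copy: CopyFileEx on Windows when the native library is
    // loaded, otherwise the FileChannel#transferTo loop added by this patch.
    NativeIO.copyFileUnbuffered(src, dst);

    // Post-copy consistency check on size, mirroring the assertion added to
    // TestNativeIO in this patch.
    if (src.length() != dst.length()) {
      throw new IOException("Copy failed: expected " + src.length()
          + " bytes at " + dst + " but found " + dst.length());
    }
  }
}

Either path of NativeIO#copyFileUnbuffered behaves the same from the caller's point of view, so a wrapper like this stays platform independent.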