HDFS-7090. Use unbuffered writes when persisting in-memory replicas. Contributed by Xiaoyu Yao.

2014-10-13 10:50:25 -07:00 · 2014-10-13 10:50:25 -07:00 · 1770bb942f
parent 4aed2d8e91
commit 1770bb942f
8 changed files with 223 additions and 2 deletions
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/nativeio/Errno.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/nativeio/Errno.java
@ -58,6 +58,7 @@ public enum Errno {
  ELOOP,
  ENAMETOOLONG,
  ENOTEMPTY,
+  EOVERFLOW,

  UNKNOWN;
 }
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/nativeio/NativeIO.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/nativeio/NativeIO.java
@ -29,6 +29,7 @@ import java.nio.MappedByteBuffer;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;

+import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
@ -846,4 +847,30 @@ public class NativeIO {

  private static native void link0(String src, String dst)
      throws NativeIOException;
+
+  /**
+   * Unbuffered file copy from src to dst without tainting OS buffer cache
+   * In Linux, it uses sendfile() which uses O_DIRECT flag internally
+   * In Windows, it uses CopyFileEx with COPY_FILE_NO_BUFFERING flag
+   *
+   * Note: This does not support FreeBSD/OSX which have a different sendfile()
+   * semantic. Also, this simple native wrapper does minimal parameter checking
+   * It is recommended to use wrapper function like
+   * the Storage#nativeCopyFileUnbuffered() function in hadoop-hdfs.
+   *
+   *
+   * @param src                  The source path
+   * @param dst                  The destination path
+   * @throws IOException
+   */
+  public static void copyFileUnbuffered(File src, File dst) throws IOException {
+    if ((nativeLoaded) && (Shell.WINDOWS || Shell.LINUX)) {
+      copyFileUnbuffered0(src.getAbsolutePath(), dst.getAbsolutePath());
+    } else {
+      FileUtils.copyFile(src, dst);
+    }
+  }
+
+  private static native void copyFileUnbuffered0(String src, String dst)
+      throws NativeIOException;
 }
--- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/nativeio/NativeIO.c
+++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/nativeio/NativeIO.c
@ -35,6 +35,9 @@
 #include <sys/resource.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
+#if !(defined(__FreeBSD__) || defined(__MACH__))
+#include <sys/sendfile.h>
+#endif
 #include <sys/time.h>
 #include <sys/types.h>
 #include <unistd.h>
@ -1142,6 +1145,70 @@ JNIEnv *env, jclass clazz)
 #endif
 }

+JNIEXPORT void JNICALL
+Java_org_apache_hadoop_io_nativeio_NativeIO_copyFileUnbuffered0(
+JNIEnv *env, jclass clazz, jstring jsrc, jstring jdst)
+{
+#ifdef UNIX
+#if (defined(__FreeBSD__) || defined(__MACH__))
+  THROW(env, "java/io/IOException",
+      "The function copyFileUnbuffered() is not supported on FreeBSD or Mac OS");
+  return;
+#else
+  const char *src = NULL, *dst = NULL;
+  int srcFd = -1;
+  int dstFd = -1;
+  struct stat s;
+  off_t offset = 0;
+
+  src = (*env)->GetStringUTFChars(env, jsrc, NULL);
+  if (!src) goto cleanup; // exception was thrown
+  dst = (*env)->GetStringUTFChars(env, jdst, NULL);
+  if (!dst) goto cleanup; // exception was thrown
+
+  srcFd = open(src, O_RDONLY);
+  if (srcFd == -1) {
+    throw_ioe(env, errno);
+    goto cleanup;
+  }
+  if (fstat(srcFd, &s) == -1){
+    throw_ioe(env, errno);
+    goto cleanup;
+  }
+  dstFd = open(dst, O_WRONLY | O_CREAT, s.st_mode);
+  if (dstFd == -1) {
+    throw_ioe(env, errno);
+    goto cleanup;
+  }
+  if (sendfile(dstFd, srcFd, &offset, s.st_size) == -1) {
+    throw_ioe(env, errno);
+  }
+
+cleanup:
+  if (src) (*env)->ReleaseStringUTFChars(env, jsrc, src);
+  if (dst) (*env)->ReleaseStringUTFChars(env, jdst, dst);
+  if (srcFd != -1) close(srcFd);
+  if (dstFd != -1) close(dstFd);
+#endif
+#endif
+
+#ifdef WINDOWS
+  LPCWSTR src = NULL, dst = NULL;
+
+  src = (LPCWSTR) (*env)->GetStringChars(env, jsrc, NULL);
+  if (!src) goto cleanup; // exception was thrown
+  dst = (LPCWSTR) (*env)->GetStringChars(env, jdst, NULL);
+  if (!dst) goto cleanup; // exception was thrown
+  if (!CopyFileEx(src, dst, NULL, NULL, NULL, COPY_FILE_NO_BUFFERING)) {
+    throw_ioe(env, GetLastError());
+  }
+
+cleanup:
+  if (src) (*env)->ReleaseStringChars(env, jsrc, src);
+  if (dst) (*env)->ReleaseStringChars(env, jdst, dst);
+#endif
+}
+
 /**
 * vim: sw=2: ts=2: et:
 */
--- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/nativeio/errno_enum.c
+++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/nativeio/errno_enum.c
@ -66,6 +66,7 @@ static errno_mapping_t ERRNO_MAPPINGS[] = {
  MAPPING(ELOOP),
  MAPPING(ENAMETOOLONG),
  MAPPING(ENOTEMPTY),
+  MAPPING(EOVERFLOW),
  {-1, NULL}
 };

--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/nativeio/TestNativeIO.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/nativeio/TestNativeIO.java
@ -24,14 +24,18 @@ import java.io.FileOutputStream;
 import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
+import java.io.RandomAccessFile;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.channels.FileChannel.MapMode;
+import java.util.Random;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;

+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.test.GenericTestUtils;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
@ -622,4 +626,34 @@ public class TestNativeIO {
    assumeTrue(NativeIO.isAvailable());
    NativeIO.getMemlockLimit();
  }
+
+  @Test (timeout = 30000)
+  public void testCopyFileUnbuffered() throws Exception {
+    final String METHOD_NAME = GenericTestUtils.getMethodName();
+    File srcFile = new File(TEST_DIR, METHOD_NAME + ".src.dat");
+    File dstFile = new File(TEST_DIR, METHOD_NAME + ".dst.dat");
+    final int fileSize = 0x8FFFFFF; // 128 MB
+    final int SEED = 0xBEEF;
+    final int batchSize = 4096;
+    final int numBatches = fileSize / batchSize;
+    Random rb = new Random(SEED);
+    FileChannel channel = null;
+    RandomAccessFile raSrcFile = null;
+    try {
+      raSrcFile = new RandomAccessFile(srcFile, "rw");
+      channel = raSrcFile.getChannel();
+      byte bytesToWrite[] = new byte[batchSize];
+      MappedByteBuffer mapBuf;
+      mapBuf = channel.map(MapMode.READ_WRITE, 0, fileSize);
+      for (int i = 0; i < numBatches; i++) {
+        rb.nextBytes(bytesToWrite);
+        mapBuf.put(bytesToWrite);
+      }
+      NativeIO.copyFileUnbuffered(srcFile, dstFile);
+    }finally {
+      IOUtils.cleanup(LOG, channel);
+      IOUtils.cleanup(LOG, raSrcFile);
+      FileUtils.deleteQuietly(TEST_DIR);
+    }
+  }
 }
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@ -355,6 +355,9 @@ Trunk (Unreleased)
    HDFS-7112. LazyWriter should use either async IO or one thread per physical
    disk. (Xiaoyu Yao via cnauroth)

+    HDFS-7090. Use unbuffered writes when persisting in-memory replicas.
+    (Xiaoyu Yao via cnauroth)
+
 Release 2.7.0 - UNRELEASED

  INCOMPATIBLE CHANGES
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.common;

 import java.io.File;
 import java.io.FileOutputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.lang.management.ManagementFactory;
@ -997,6 +998,93 @@ public abstract class Storage extends StorageInfo {
    }
  }

+  /**
+   * Copies a file (usually large) to a new location using native unbuffered IO.
+   * <p>
+   * This method copies the contents of the specified source file
+   * to the specified destination file using OS specific unbuffered IO.
+   * The goal is to avoid churning the file system buffer cache when copying
+   * large files. TheFileUtils#copyLarge function from apache-commons-io library
+   * can be used to achieve this with an internal memory buffer but is less
+   * efficient than the native unbuffered APIs such as sendfile() in Linux and
+   * CopyFileEx() in Windows wrapped in {@link NativeIO#copyFileUnbuffered}.
+   *
+   * The directory holding the destination file is created if it does not exist.
+   * If the destination file exists, then this method will delete it first.
+   * <p>
+   * <strong>Note:</strong> Setting <code>preserveFileDate</code> to
+   * {@code true} tries to preserve the file's last modified
+   * date/times using {@link File#setLastModified(long)}, however it is
+   * not guaranteed that the operation will succeed.
+   * If the modification operation fails, no indication is provided.
+   *
+   * @param srcFile  an existing file to copy, must not be {@code null}
+   * @param destFile  the new file, must not be {@code null}
+   * @param preserveFileDate  true if the file date of the copy
+   *  should be the same as the original
+   *
+   * @throws NullPointerException if source or destination is {@code null}
+   * @throws IOException if source or destination is invalid
+   * @throws IOException if an IO error occurs during copying
+   */
+  public static void nativeCopyFileUnbuffered(File srcFile, File destFile,
+      boolean preserveFileDate) throws IOException {
+    if (srcFile == null) {
+      throw new NullPointerException("Source must not be null");
+    }
+    if (destFile == null) {
+      throw new NullPointerException("Destination must not be null");
+    }
+    if (srcFile.exists() == false) {
+      throw new FileNotFoundException("Source '" + srcFile + "' does not exist");
+    }
+    if (srcFile.isDirectory()) {
+      throw new IOException("Source '" + srcFile + "' exists but is a directory");
+    }
+    if (srcFile.getCanonicalPath().equals(destFile.getCanonicalPath())) {
+      throw new IOException("Source '" + srcFile + "' and destination '" +
+          destFile + "' are the same");
+    }
+    File parentFile = destFile.getParentFile();
+    if (parentFile != null) {
+      if (!parentFile.mkdirs() && !parentFile.isDirectory()) {
+        throw new IOException("Destination '" + parentFile
+            + "' directory cannot be created");
+      }
+    }
+    if (destFile.exists()) {
+      if (FileUtil.canWrite(destFile) == false) {
+        throw new IOException("Destination '" + destFile
+            + "' exists but is read-only");
+      } else {
+        if (destFile.delete() == false) {
+          throw new IOException("Destination '" + destFile
+              + "' exists but cannot be deleted");
+        }
+      }
+    }
+    try {
+      NativeIO.copyFileUnbuffered(srcFile, destFile);
+    } catch (NativeIOException e) {
+      throw new IOException("Failed to copy " + srcFile.getCanonicalPath()
+          + " to " + destFile.getCanonicalPath()
+          + " due to failure in NativeIO#copyFileUnbuffered(). "
+          + e.toString());
+    }
+    if (srcFile.length() != destFile.length()) {
+      throw new IOException("Failed to copy full contents from '" + srcFile
+          + "' to '" + destFile + "'");
+    }
+    if (preserveFileDate) {
+      if (destFile.setLastModified(srcFile.lastModified()) == false) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Failed to preserve last modified date from'" + srcFile
+            + "' to '" + destFile + "'");
+        }
+      }
+    }
+  }
+
  /**
   * Recursively delete all the content of the directory first and then 
   * the directory itself from the local filesystem.
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
@ -723,12 +723,12 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
    final File dstFile = new File(destDir, srcFile.getName());
    final File dstMeta = FsDatasetUtil.getMetaFile(dstFile, genStamp);
    try {
-      FileUtils.copyFile(srcMeta, dstMeta);
+      Storage.nativeCopyFileUnbuffered(srcMeta, dstMeta, true);
    } catch (IOException e) {
      throw new IOException("Failed to copy " + srcMeta + " to " + dstMeta, e);
    }
    try {
-      FileUtils.copyFile(srcFile, dstFile);
+      Storage.nativeCopyFileUnbuffered(srcFile, dstFile, true);
    } catch (IOException e) {
      throw new IOException("Failed to copy " + srcFile + " to " + dstFile, e);
    }